3 # reboot a ganeti cluster, making sure instances are moved around before and after
5 # Copyright 2018, 2019 Peter Palfrader
7 # Permission is hereby granted, free of charge, to any person obtaining
8 # a copy of this software and associated documentation files (the
9 # "Software"), to deal in the Software without restriction, including
10 # without limitation the rights to use, copy, modify, merge, publish,
11 # distribute, sublicense, and/or sell copies of the Software, and to
12 # permit persons to whom the Software is furnished to do so, subject to
13 # the following conditions:
15 # The above copyright notice and this permission notice shall be
16 # included in all copies or substantial portions of the Software.
18 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
22 # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
32 echo "Usage: $0 [-n <node-list>] [-f] [ -M <newmaster> ] [up|down]"
33 echo " -M is for internal use only (used in n>2 clusters if we want to reboot the master first)"
41 cnt=$((${#cleanup[*]}-1))
42 for i in $(seq ${cnt} -1 0); do
# Each cleanup entry is a complete command line that carries its own shell
# quoting (entries are registered as "rm -f '<path>'"), so it must be
# re-parsed with eval.  A bare ${cleanup[$i]} expansion is only word-split:
# the single quotes would reach rm as literal characters and the temporary
# file would never be removed.  A failing handler is deliberately ignored.
eval "${cleanup[$i]}" || true
55 while getopts "fhn:M:" OPTION; do
69 if ! [ -e "$nodelist" ]; then
70 echo >&2 "nodelist $nodelist not found."
# Discard the option words already consumed by getopts so that only the
# positional arguments remain in "$@".
shift "$((OPTIND - 1))"
81 [ "$#" -ge 1 ] && shift
84 down) print_list=cat;;
88 [ "$#" -gt 0 ] && error_usage
91 gnt-instance list --no-headers -o status --filter '(pnode == "'"$1"'")' | grep -c -v ADMIN_down
94 if [ "$(count_instances "$1")" != 0 ]; then
109 if has_instances "$tgt"; then
110 echo >&2 "$tgt not empty."
114 ssh -n -l root "$tgt" shutdown -r 1 "'reboot requested by $0 on $(hostname -f)'"
116 # wait for target to go down:
117 max_wait='300 seconds'
118 wait_until=$(date -d "now +$max_wait" +%s)
119 while ping -c 5 -q "$tgt" > /dev/null; do
# Progress line: shows the deadline after which we stop waiting for the
# target to go down.
echo "[$(date)] $tgt is still up (will wait until $(date -d "@$wait_until"))."
122 if [ "$(date +%s)" -gt "$wait_until" ]; then
123 echo >&2 "Giving up on waiting for $tgt to go down."
129 echo "[$(date)] $tgt is down. Pausing for $sleep_time seconds"
132 max_wait='15 minutes'
133 wait_until=$(date -d "now +$max_wait" +%s)
134 while ! ping -c 5 -q "$tgt" > /dev/null; do
# Progress line: shows the deadline after which we stop waiting for the
# target to answer pings again.
echo "[$(date)] $tgt is still down (will wait until $(date -d "@$wait_until"))."
136 if [ "$(date +%s)" -gt "$wait_until" ]; then
137 echo >&2 "Giving up on waiting for $tgt to come back."
144 echo "[$(date)] $tgt is up. Pausing for $sleep_time seconds"
147 max_wait='180 minutes'
148 wait_until=$(date -d "now +$max_wait" +%s)
149 while ! ssh -n -l root "$tgt" systemctl is-system-running; do
# Progress line: shows the deadline after which we stop waiting for
# "systemctl is-system-running" to succeed on the target.
echo "[$(date)] $tgt is still booting up (will wait until $(date -d "@$wait_until"))."
151 if [ "$(date +%s)" -gt "$wait_until" ]; then
152 echo >&2 "Giving up on waiting for $tgt to come back."
159 echo "[$(date)] $tgt has finished booting. Pausing for $sleep_time seconds"
163 # move down, i.e. from 2 to 1, ..., 14 to 13.
# Node names are the first whitespace-separated field of each line that
# ${print_list} emits for the node list: take the one on the first line ...
first_tgt="$(${print_list} "$nodelist" | awk 'NR == 1 { print $1; exit }')"
# ... and the one on the last line.
last_node="$(${print_list} "$nodelist" | tail -n 1 | awk '{ print $1 }')"
169 if has_instances "$first_tgt"; then
170 echo "$first_tgt not empty."
174 if [ "$me" != "$last_node" ]; then
echo "Making $last_node the new master"
ssh -n -l root "$last_node" gnt-cluster master-failover
echo "relaunching reboot-cluster on $last_node"
# Copy the node list into a fresh temporary file on the new master.
# mktemp is used because tempfile(1) is deprecated in favour of it.
tmp="$(ssh -n -l root -t "$last_node" mktemp)"
scp "$nodelist" "$last_node:$tmp"
# Re-run this tool on the new master inside a screen session, passing this
# node ($me) as -M <newmaster>.  The trailing "sleep 12h" presumably keeps
# the screen window around so the reported exit status can still be read.
ssh -l root -t "$last_node" screen -S reboot-cluster -m sh -c "\"echo Relaunched on $last_node; ganeti-reboot-cluster -f -n '$tmp' -M '$me' '$direction'; echo ganeti-reboot-cluster exited with \$?.; sleep 12h\""
# Only reached once the remote session has returned; this is a diagnostic,
# so it belongs on stderr.
echo >&2 "fell through!"
185 ${print_list} "$nodelist" | (
187 while read src dummy; do
190 if has_instances "$src"; then
191 echo "Migrating from $src to $tgt."
192 if ! gnt-node migrate -f -n "$tgt" "$src"; then
193 echo >&2 "gnt-node migrate exited with an error. Bailing out."
197 echo "nothing to migrate from $src to $tgt"
202 if has_instances "$tgt"; then
203 echo "$tgt not empty."
207 if ! [ "$tgt" = "$me" ]; then
208 echo >&2 "I was expecting $tgt to be me ($me) here."
212 if [ "$newmaster" != "" ]; then
213 echo "Making $newmaster the new master"
214 ssh -n -l root "$newmaster" gnt-cluster master-failover
216 shutdown -r 1 "reboot requested by $0"
223 if ! grep -q --line-regexp --fixed-strings "$me" "$nodelist"; then
224 echo >&2 "my hostname ($me) not found in nodelist"
# move ourselves last
# Build a copy of the node list with our own entry removed and re-appended
# at the end.  mktemp is used because tempfile(1) is deprecated in favour
# of it; the copy is registered for removal by the cleanup handler.
newlist="$(mktemp)"
cleanup+=("rm -f '$newlist'")
grep -v --line-regexp --fixed-strings "$me" "$nodelist" > "$newlist"
echo "$me" >> "$newlist"
235 if ! hbal -L -C -v -v --no-disk-moves --offline="$node" -X; then
236 echo >&2 "hbal failed at node $node. Bailing out."
239 if ! gnt-node migrate -f "$node"; then
240 echo >&2 "gnt-node migrate failed for node $node. Bailing out."
243 if [ "$node" = "$me" ] ; then
248 echo "Bringing back disks using the watcher"
250 # wait for a cron-launched ganeti-watcher to finish
251 while pgrep ganeti-watcher > /dev/null ; do
258 at 'now + 5 min' << 'EOF'
259 screen -S hbal -d -m sh -c '
260 echo "Activating disks using the watcher.."
262 while pgrep ganeti-watcher > /dev/null ; do
265 hbal -L -C -v -v --no-disk-moves -X
275 if ! grep -q --line-regexp --fixed-strings "$me" "$nodelist"; then
276 echo >&2 "my hostname ($me) not found in nodelist"
279 them="$(grep -v --line-regexp --fixed-strings "$me" "$nodelist")"
281 echo "Migrating from $them to $me."
282 if ! gnt-node migrate -f -n "$me" "$them"; then
283 echo >&2 "gnt-node migrate exited with an error. Bailing out."
288 echo "Activating disks.."
289 for instance in $( gnt-instance list -o name --no-headers --filter 'status == "running"' ); do
290 echo " - $instance ..."
291 if ! gnt-instance activate-disks "$instance"; then
292 echo >&2 "gnt-instance activate-disks $instance failed. Bailing out."
297 if [ -e /proc/drbd ]; then
298 echo "Waiting for drbd to be consistent."
300 while egrep -C2 --color -i 'iconsistent|finish' /proc/drbd || ! /usr/lib/nagios/plugins/dsa-check-drbd -d All ; do
301 echo "Still waiting.."
306 echo "Migrating from $me to $them."
307 if ! gnt-node migrate -f -n "$them" "$me"; then
308 echo >&2 "gnt-node migrate exited with an error. Bailing out."
312 at 'now + 30 min' << 'EOF'
313 screen -S hbal -d -m sh -c '
314 echo "Activating disks.."
315 for instance in $( gnt-instance list -o name --no-headers --filter "status == \"running\"" ); do
316 echo " - $instance ..."
317 if ! gnt-instance activate-disks "$instance"; then
318 echo >&2 "Warning: gnt-instance activate-disks $instance failed."
331 /sbin/shutdown -k 30 < /dev/null
333 gnt-cluster watcher pause 30m
335 for i in $(gnt-instance list --no-headers -o name); do
# Submit a shutdown job for this instance without recording the new state
# (--no-remember); the name is quoted so it is always passed as one word.
gnt-instance shutdown --no-remember --submit "$i"
339 while pgrep -c '^qemu-|^kvm$' -u root ; do
341 gnt-cluster watcher pause 30m
344 at 'now + 5 min' << EOF
346 gnt-cluster watcher continue
351 /sbin/shutdown -r 1 </dev/null
354 if [ "${TMUX:-}" = "" ] && [ "${STY:-}" = "" ] ; then
355 echo >&2 "Might want to launch me in a screen or tmux."
359 if ! [ "$force" = 1 ]; then
367 if ! [ -e "$nodelist" ]; then
369 cleanup+=("rm -f '$tmp'")
370 gnt-node list --no-headers -o name > "$tmp"
374 lines=$(wc -l < "$nodelist")
377 echo >&2 "nodelist $nodelist empty."
381 case "$(hostname -f)" in
386 echo >&2 "Only one node."
394 echo "WARNING: this is untested. ^C now if you want to stop"