3 # reboot a ganeti cluster, making sure instances are moved around before and after
5 # Copyright 2018, 2019 Peter Palfrader
7 # Permission is hereby granted, free of charge, to any person obtaining
8 # a copy of this software and associated documentation files (the
9 # "Software"), to deal in the Software without restriction, including
10 # without limitation the rights to use, copy, modify, merge, publish,
11 # distribute, sublicense, and/or sell copies of the Software, and to
12 # permit persons to whom the Software is furnished to do so, subject to
13 # the following conditions:
15 # The above copyright notice and this permission notice shall be
16 # included in all copies or substantial portions of the Software.
18 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
22 # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
32 echo "Usage: $0 [-n <node-list>] [-f] [ -M <newmaster> ] [up|down]"  # synopsis line; $0 is the name this script was invoked as
33 echo " -M is for internal use only (used in n>2 clusters if we want to reboot the master first)"
44 while getopts "fhn:M:" OPTION; do  # -f and -h are plain flags; -n and -M each take an argument
58 if ! [ -e "$nodelist" ]; then  # the nodelist path must exist
59 echo >&2 "nodelist $nodelist not found."
67 shift $((OPTIND - 1))  # drop the words getopts consumed; no $ needed on names inside $(( ))
70 [ "$#" -ge 1 ] && shift  # discard the first remaining positional argument, if there is one
73 down) print_list=cat;;  # "down": the nodelist will be printed with plain cat
77 [ "$#" -gt 0 ] && error_usage  # anything still left over is handed to error_usage
80 gnt-instance list --no-headers -o status --filter '(pnode == "'"$1"'")' | grep -c -v ADMIN_down  # count status lines for primary node $1 that do not contain ADMIN_down
83 if [ "$(count_instances "$1")" != 0 ]; then  # count for node $1 is something other than 0
98 if has_instances "$tgt"; then  # $tgt still has instances: report it on stderr
99 echo >&2 "$tgt not empty."
103 ssh -n -l root "$tgt" shutdown -r 1 "'reboot requested by $0 on $(hostname -f)'"  # inner single quotes keep the message a single argument for the remote shell
105 # wait for target to go down:
106 max_wait='300 seconds'
107 wait_until=$(date -d "now +$max_wait" +%s)  # deadline as seconds since the epoch
108 while ping -c 5 -q "$tgt" > /dev/null; do  # loop for as long as ping still succeeds
109 echo "[$(date)] $tgt is still up (will wait until $(date -d "@$wait_until"))."
111 if [ "$(date +%s)" -gt "$wait_until" ]; then  # deadline passed
112 echo >&2 "Giving up on waiting for $tgt to go down."
118 echo "[$(date)] $tgt is down. Pausing for $sleep_time seconds"
121 max_wait='15 minutes'
122 wait_until=$(date -d "now +$max_wait" +%s)  # fresh deadline for the host to answer ping again
123 while ! ping -c 5 -q "$tgt" > /dev/null; do  # loop for as long as ping fails
124 echo "[$(date)] $tgt is still down (will wait until $(date -d "@$wait_until"))."
125 if [ "$(date +%s)" -gt "$wait_until" ]; then  # deadline passed
126 echo >&2 "Giving up on waiting for $tgt to come back."
133 echo "[$(date)] $tgt is up. Pausing for $sleep_time seconds"
136 max_wait='180 minutes'
137 wait_until=$(date -d "now +$max_wait" +%s)  # fresh deadline for systemd to report a running system
138 while ! ssh -n -l root "$tgt" systemctl is-system-running; do  # loop until the remote systemctl check succeeds
139 echo "[$(date)] $tgt is still booting up (will wait until $(date -d "@$wait_until"))."
140 if [ "$(date +%s)" -gt "$wait_until" ]; then  # deadline passed
141 echo >&2 "Giving up on waiting for $tgt to come back."
148 echo "[$(date)] $tgt has finished booting. Pausing for $sleep_time seconds"
152 # move down, i.e. from 2 to 1, ..., 14 to 13.
154 first_tgt="$(${print_list} "$nodelist" | head -n1 | awk '{print $1}')"  # first field of the first line printed
155 last_node="$(${print_list} "$nodelist" | tail -n1 | awk '{print $1}')"  # first field of the last line printed
158 if has_instances "$first_tgt"; then  # the first target still has instances
159 echo "$first_tgt not empty."
163 if [ "$me" != "$last_node" ]; then  # this host is not the last node: hand over to $last_node
164 echo "Making $last_node the new master"
165 ssh -n -l root "$last_node" gnt-cluster master-failover
166 echo "relaunching reboot-cluster on $last_node"
167 tmp="$(ssh -n -l root -t "$last_node" mktemp)"  # remote scratch file for the nodelist copy; mktemp replaces the deprecated tempfile
168 scp "$nodelist" "$last_node:$tmp"
169 ssh -l root -t "$last_node" screen -S reboot-cluster -m sh -c "\"echo Relaunched on $last_node; ganeti-reboot-cluster -f -n '$tmp' -M '$me' '$direction'; echo ganeti-reboot-cluster exited with \$?.; sleep 12h\""
170 echo >&2 "fell through!"  # reached once the relaunch ssh has returned; diagnostic goes to stderr
174 ${print_list} "$nodelist" | (
176 while read -r src dummy; do  # first field is the source node, the rest of the line is ignored; -r keeps backslashes literal
177 if has_instances "$tgt"; then  # the target still has instances
178 echo "$tgt not empty."
183 if has_instances "$src"; then  # only migrate when the source has something to move
184 echo "Migrating from $src to $tgt."
185 if ! gnt-node migrate -f -n "$tgt" "$src"; then
186 echo >&2 "gnt-node migrate exited with an error. Bailing out."
190 echo "nothing to migrate from $src to $tgt"
195 if has_instances "$tgt"; then  # the last target still has instances
196 echo "$tgt not empty."
200 if ! [ "$tgt" = "$me" ]; then  # sanity check: the last target is expected to be this host
201 echo >&2 "I was expecting $tgt to be me ($me) here."
205 if [ "$newmaster" != "" ]; then  # a new master was named: fail over to it
206 echo "Making $newmaster the new master"
207 ssh -n -l root "$newmaster" gnt-cluster master-failover
209 shutdown -r 1 "reboot requested by $0"  # reboot the local host, using 1 as shutdown's time argument
216 if ! grep -q -F "$me" "$nodelist"; then  # fixed-string search; NOTE(review): without -x this also matches $me as a substring of a longer name — confirm that is acceptable
217 echo >&2 "my hostname ($me) not found in nodelist"
220 them="$(grep -v -F "$me" "$nodelist")"  # every nodelist line that does not contain $me
222 echo "Migrating from $them to $me."
223 if ! gnt-node migrate -f -n "$me" "$them"; then
224 echo >&2 "gnt-node migrate exited with an error. Bailing out."
229 echo "Activating disks.."
230 for instance in $( gnt-instance list -o name --no-headers --filter 'status == "running"' ); do  # iterate over the names of instances whose status is "running"
231 echo " - $instance ..."
232 if ! gnt-instance activate-disks "$instance"; then
233 echo >&2 "gnt-instance activate-disks $instance failed. Bailing out."
238 if [ -e /proc/drbd ]; then  # only wait when /proc/drbd exists
239 echo "Waiting for drbd to be consistent."
241 while grep -E -C2 --color -i 'inconsistent|finish' /proc/drbd || ! /usr/lib/nagios/plugins/dsa-check-drbd -d All ; do  # keep waiting while /proc/drbd mentions "inconsistent" or "finish", or the check plugin fails
242 echo "Still waiting.."
247 echo "Migrating from $me to $them."
248 if ! gnt-node migrate -f -n "$them" "$me"; then  # move instances from this host to the other node
249 echo >&2 "gnt-node migrate exited with an error. Bailing out."
253 at 'now + 30 min' << 'EOF'  # quoted 'EOF': the here-document reaches at verbatim, with no expansion by this shell
254 screen -S hbal -d -m sh -c '
255 echo "Activating disks.."
256 for instance in $( gnt-instance list -o name --no-headers --filter "status == \"running\"" ); do
257 echo " - $instance ..."
258 if ! gnt-instance activate-disks "$instance"; then
259 echo >&2 "Warning: gnt-instance activate-disks $instance failed."
272 /sbin/shutdown -k 30 < /dev/null  # -k: only send the warning, do not actually shut down
274 gnt-cluster watcher pause 30m
276 for i in $(gnt-instance list --no-headers -o name); do
277 gnt-instance shutdown --no-remember --submit "$i"  # quoted so the name is passed as exactly one word
280 while pgrep -c '^qemu-|^kvm$' -u root ; do  # loop while root still owns processes matching qemu-* or kvm
282 gnt-cluster watcher pause 30m
285 at 'now + 5 min' << EOF
287 gnt-cluster watcher continue
292 /sbin/shutdown -r 1 </dev/null
295 if [ "${TMUX:-}" = "" ] && [ "${STY:-}" = "" ] ; then  # both TMUX and STY are empty or unset
296 echo >&2 "Might want to launch me in a screen or tmux."
300 if ! [ "$force" = 1 ]; then  # taken unless $force is 1
308 if ! [ -e "$nodelist" ]; then  # the nodelist path does not exist
310 trap "rm -f '$tmp'" EXIT  # double quotes: $tmp is expanded now, when the trap is installed
311 gnt-node list --no-headers -o name > "$tmp"  # write the node names reported by gnt-node into $tmp
315 lines=$(wc -l < "$nodelist")  # number of lines in the nodelist
318 echo >&2 "nodelist $nodelist empty."
322 case "$(hostname -f)" in
327 echo >&2 "Only one node."