--- /dev/null
+#!/bin/bash
+
+set -e
+set -o pipefail
+set -u
+
+usage() {
+ echo "Usage: $0 [-n <node-list>] [-f] [ -M <newmaster> ] [up|down]"
+ echo " -M is for internal use only (used in n>2 clusters if we want to reboot the master first)"
+}
+# Print the synopsis on stderr, then terminate the script with status 1.
+error_usage() {
+  { usage; } 1>&2
+  exit 1
+}
+
+# Defaults; the options parsed below may override them.
+nodelist="node-list"   # path of the file listing the nodes (-n)
+newmaster=""           # node to fail the master role over to at the end (-M)
+force="0"              # "1" skips the interactive confirmation (-f)
+
+while getopts "fhn:M:" opt; do
+  case "$opt" in
+    f) force="1" ;;
+    h)
+      usage
+      exit 0
+      ;;
+    M) newmaster="$OPTARG" ;;
+    n)
+      nodelist="$OPTARG"
+      # An explicitly requested node list has to exist already.
+      if [ ! -e "$nodelist" ]; then
+        echo "nodelist $nodelist not found." >&2
+        exit 1
+      fi
+      ;;
+    *) error_usage ;;
+  esac
+done
+shift "$((OPTIND - 1))"
+
+# Optional positional argument: the direction, "up" when omitted.
+direction="${1:-up}"
+if [ "$#" -ge 1 ]; then
+  shift
+fi
+
+# print_list is the command used to emit the node list in processing
+# order: reversed (tac) for "up", as written (cat) for "down".
+case "$direction" in
+  up) print_list=tac ;;
+  down) print_list=cat ;;
+  *) error_usage ;;
+esac
+
+# Anything left over is a usage error.
+if [ "$#" -gt 0 ]; then
+  error_usage
+fi
+
+count_instances() {
+ gnt-instance list --no-headers -o status --filter '(pnode == "'"$1"'")' | grep -c -v ADMIN_down
+}
+has_instances() {
+ if [ "$(count_instances "$1")" != 0 ]; then
+ return 0
+ else
+ return 1
+ fi
+}
+
+reboot_host() {
+ local tgt
+ local max_wait
+ local wait_until
+ local sleep_time
+
+ tgt="$1"
+
+ if has_instances "$tgt"; then
+ echo >&2 "$tgt not empty."
+ exit 1
+ fi
+
+ ssh -n -l root "$tgt" shutdown -r 1 "'reboot requested by $0 on $(hostname -f)'"
+
+ # wait for target to go down:
+ max_wait='300 seconds'
+ wait_until=$(date -d "now +$max_wait" +%s)
+ while ping -c 5 -q "$tgt" > /dev/null; do
+ echo "[$(date)] $tgt is still up (will wait until $(date -d "@$wait_until")."
+ sleep 10
+ if [ "$(date +%s)" -gt "$wait_until" ]; then
+ echo >&2 "Giving up on waiting for $tgt to go down."
+ exit 1
+ fi
+ done
+
+ sleep_time=30
+ echo "[$(date)] $tgt is down. Pausing for $sleep_time seconds"
+ sleep "$sleep_time"
+
+ max_wait='15 minutes'
+ wait_until=$(date -d "now +$max_wait" +%s)
+ while ! ping -c 5 -q "$tgt" > /dev/null; do
+ echo "[$(date)] $tgt is still down (will wait until $(date -d "@$wait_until")."
+ if [ "$(date +%s)" -gt "$wait_until" ]; then
+ echo >&2 "Giving up on waiting for $tgt to come back."
+ exit 1
+ fi
+ sleep 10
+ done
+
+ sleep_time=30
+ echo "[$(date)] $tgt is up. Pausing for $sleep_time seconds"
+ sleep "$sleep_time"
+
+ max_wait='15 minutes'
+ wait_until=$(date -d "now +$max_wait" +%s)
+ while ! ssh -n -l root "$tgt" systemctl is-system-running; do
+ echo "[$(date)] $tgt is still booting up (will wait until $(date -d "@$wait_until")."
+ if [ "$(date +%s)" -gt "$wait_until" ]; then
+ echo >&2 "Giving up on waiting for $tgt to come back."
+ exit 1
+ fi
+ sleep 10
+ done
+
+ sleep_time=30
+ echo "[$(date)] $tgt has finished booting. Pausing for $sleep_time seconds"
+ sleep "$sleep_time"
+}
+
+# move down, i.e. from 2 to 1, ..., 14 to 13.
+moveupdown() {
+ first_tgt="$(${print_list} "$nodelist" | head -n1 | awk '{print $1}')"
+ last_node="$(${print_list} "$nodelist" | tail -n1 | awk '{print $1}')"
+ me=$(hostname -f)
+
+ if has_instances "$first_tgt"; then
+ echo "$first_tgt not empty."
+ exit 1
+ fi
+
+ if [ "$me" != "$last_node" ]; then
+ echo "Making $last_node the new master"
+ ssh -n -l root "$last_node" gnt-cluster master-failover
+ echo "relaunching reboot-cluster on $last_node"
+ tmp="$(ssh -n -l root -t "$last_node" tempfile)"
+ scp "$nodelist" "$last_node:$tmp"
+ ssh -l root -t "$last_node" screen -S reboot-cluster -m sh -c "\"echo Relaunched on $last_node; ganeti-reboot-cluster -f -n '$tmp' -M '$me' '$direction'; echo ganeti-reboot-cluster exited with \$?.; sleep 12h\""
+ echo >&1 "fell through!"
+ exit 1
+ fi
+
+ ${print_list} "$nodelist" | (
+ read tgt dummy
+ while read src dummy; do
+ if has_instances "$tgt"; then
+ echo "$tgt not empty."
+ exit 1
+ fi
+ reboot_host "$tgt"
+
+ if has_instances "$src"; then
+ echo "Migrating from $src to $tgt."
+ if ! gnt-node migrate -f -n "$tgt" "$src"; then
+ echo >&2 "gnt-node migrate exited with an error. Bailing out."
+ exit 1
+ fi
+ else
+ echo "nothing to migrate from $src to $tgt"
+ fi
+ tgt="$src"
+ done
+
+ if has_instances "$tgt"; then
+ echo "$tgt not empty."
+ exit 1
+ fi
+
+ if ! [ "$tgt" = "$me" ]; then
+ echo >&2 "I was expecting $tgt to be me ($me) here."
+ exit 1
+ fi
+
+ if [ "$newmaster" != "" ]; then
+ echo "Making $newmaster the new master"
+ ssh -n -l root "$newmaster" gnt-cluster master-failover
+ fi
+ shutdown -r 1 "reboot requested by $0"
+ exit
+ )
+}
+
+crossmigrate() {
+ me=$(hostname -f)
+ if ! grep -q -F "$me" "$nodelist"; then
+ echo >&2 "my hostname ($me) not found in nodelist"
+ exit 1
+ fi
+ them="$(grep -v -F "$me" "$nodelist")"
+
+ echo "Migrating from $them to $me."
+ if ! gnt-node migrate -f -n "$me" "$them"; then
+ echo >&2 "gnt-node migrate exited with an error. Bailing out."
+ exit 1
+ fi
+ reboot_host "$them"
+
+ echo "Activating disks.."
+ for instance in $( gnt-instance list -o name --no-headers --filter 'status == "running"' ); do
+ echo " - $instance ..."
+ if ! gnt-instance activate-disks "$instance"; then
+ echo >&2 "gnt-instance activate-disks $instance failed. Bailing out."
+ exit 1
+ fi
+ done
+
+ echo "Migrating from $me to $them."
+ if ! gnt-node migrate -f -n "$them" "$me"; then
+ echo >&2 "gnt-node migrate exited with an error. Bailing out."
+ exit 1
+ fi
+
+ at 'now + 30 min' << 'EOF'
+screen -S hbal -d -m sh -c '
+ echo "Activating disks.."
+ for instance in $( gnt-instance list -o name --no-headers --filter "status == \"running\"" ); do
+ echo " - $instance ..."
+ if ! gnt-instance activate-disks "$instance"; then
+ echo >&2 "Warning: gnt-instance activate-disks $instance failed."
+ fi
+ done
+
+ hbal -L -C -v -X
+ echo "done."
+ sleep 1h
+'
+EOF
+ reboot_host "$me"
+}
+
+reboot_byrd() {
+ /sbin/shutdown -k 30 < /dev/null
+ sleep 15m
+ gnt-cluster watcher pause 30m
+
+ for i in $(gnt-instance list --no-headers -o name); do
+ gnt-instance shutdown --no-remember --submit $i
+ done
+
+ while pgrep -c '^qemu-|^kvm$' -u root ; do
+ sleep 15;
+ gnt-cluster watcher pause 30m
+ done
+
+ at 'now + 5 min' << EOF
+sleep 4m;
+gnt-cluster watcher continue
+EOF
+
+ /sbin/shutdown -c
+ sleep 5
+ /sbin/shutdown -r 1 </dev/null
+}
+
+# Refuse to run unless TMUX or STY is set, i.e. unless we are inside a
+# tmux or screen session.
+if [ "${TMUX:-}" = "" ] && [ "${STY:-}" = "" ] ; then
+  echo >&2 "Might want to launch me in a screen or tmux."
+  exit 1
+fi
+
+# Unless -f was given, require an explicit "y" before touching anything.
+# Declining (or EOF on stdin) aborts with a message and status 1 instead
+# of dying silently through "set -e".
+if ! [ "$force" = 1 ]; then
+  echo -n 'really? '
+  read -r really || really=""
+  if [ "$really" != "y" ]; then
+    echo >&2 "Aborted."
+    exit 1
+  fi
+fi
+
+### ensure_nodelist
+###################
+if ! [ -e "$nodelist" ]; then
+ tmp="$(tempfile)"
+ trap "rm -f '$tmp'" EXIT
+ gnt-node list --no-headers -o name > "$tmp"
+ nodelist="$tmp"
+fi
+
+lines=$(wc -l < "$nodelist")
+case "$lines" in
+ 0)
+ echo >&2 "nodelist $nodelist empty."
+ exit 1
+ ;;
+ 1)
+ case "$(hostname -f)" in
+ byrd.debian.org)
+ reboot_byrd
+ ;;
+ *)
+ echo >&2 "Only one node."
+ exit 1
+ esac
+ ;;
+ 2)
+ crossmigrate
+ ;;
+ *)
+ moveupdown
+ ;;
+esac
+