#!/bin/bash
#
# ganeti-reboot-cluster: reboot the nodes of a Ganeti cluster one after the
# other, migrating instances out of the way before each reboot.
#
# Usage: ganeti-reboot-cluster [-n <nodelist>] [-f] [-M <newmaster>] [up|down]
#   -n <nodelist>   file with one node per line (the first column is the node
#                   name).  Defaults to "node-list" in the current directory;
#                   when that file does not exist the list is generated with
#                   "gnt-node list".
#   -f              skip the interactive "really?" confirmation.
#   -M <newmaster>  internal use: node that becomes master again right before
#                   this node reboots itself (set by moveupdown on relaunch).
#   up|down         order in which the node list is walked: "down" walks it
#                   top to bottom (cat), "up" bottom to top (tac).  Default: up.
#
# Strategy, chosen by the number of lines in the node list:
#   1 node    only byrd.debian.org is supported (reboot_byrd)
#   2 nodes   crossmigrate
#   >2 nodes  moveupdown
#
# Provenance: added to dsa-puppet as modules/ganeti2/files/ganeti-reboot-cluster
# by commit 1ed456470b331c694707ed9b4ec6f5dfab1aa432 ("Install
# ganeti-reboot-cluster", Peter Palfrader, Tue, 19 Jun 2018 15:19:20 +0000
# (+0200)).  The same commit installs it from modules/ganeti2/manifests/init.pp:
#
#   file { '/usr/local/sbin/ganeti-reboot-cluster':
#     source => 'puppet:///modules/ganeti2/ganeti-reboot-cluster',
#     mode   => '0555',
#   }

set -e
set -o pipefail
set -u

# Print the command-line synopsis to stdout.
usage() {
  echo "Usage: $0 [-n <nodelist>] [-f] [ -M <newmaster> ] [up|down]"
  echo "  -M is for internal use only (used in n>2 clusters if we want to reboot the master first)"
}

# Print the synopsis to stderr and exit with status 1.
error_usage() {
  usage >&2
  exit 1
}

nodelist="node-list"
newmaster=""
force="0"

while getopts "fhn:M:" OPTION; do
  case "$OPTION" in
    f)
      force="1"
      ;;
    h)
      usage
      exit 0
      ;;
    M)
      newmaster="$OPTARG"
      ;;
    n)
      nodelist="$OPTARG"
      # An explicitly requested list must exist; only the default name falls
      # back to "gnt-node list" further down.
      if ! [ -e "$nodelist" ]; then
        echo >&2 "nodelist $nodelist not found."
        exit 1
      fi
      ;;
    *)
      error_usage
      ;;
  esac
done
shift $((OPTIND - 1))

direction="${1:-up}"
if [ "$#" -ge 1 ]; then
  shift
fi
case "$direction" in
  up) print_list=tac ;;
  down) print_list=cat ;;
  *) error_usage ;;
esac

if [ "$#" -gt 0 ]; then
  error_usage
fi

#######################################
# Count the instances whose primary node is $1 and whose status line does
# not contain ADMIN_down.
# Arguments: $1 - node name
# Outputs:   the count on stdout
# Returns:   0 on success, 1 when "gnt-instance list" itself failed (nothing
#            is printed in that case)
#######################################
count_instances() {
  local statuses

  # Capture the listing first: in a plain "list | grep -c" pipeline a failing
  # list still makes grep print 0, which reads as "node is empty".
  statuses="$(gnt-instance list --no-headers -o status --filter "(pnode == \"$1\")")" || return 1

  # grep -c exits 1 when the count is 0; that is a valid answer, not an error.
  printf '%s' "$statuses" | grep -c -v ADMIN_down || true
}

#######################################
# Tell whether node $1 still is primary for instances that are not ADMIN_down.
# Aborts the whole script when the instance list cannot be obtained, so that
# a broken query is never mistaken for an empty node.
# Arguments: $1 - node name
# Returns:   0 when there are such instances, 1 when there are none
#######################################
has_instances() {
  local count

  if ! count="$(count_instances "$1")"; then
    echo >&2 "Unable to list the instances on $1. Bailing out."
    exit 1
  fi
  [ "$count" != 0 ]
}

#######################################
# Reboot node $1 over ssh and wait until it is fully back.
# Refuses to act when the node still has instances.  Exits the script when
# the node does not go down within 300 seconds, does not answer pings again
# within 15 minutes, or does not finish booting within another 15 minutes.
# Arguments: $1 - node name
#######################################
reboot_host() {
  local tgt
  local max_wait
  local wait_until
  local sleep_time

  tgt="$1"

  if has_instances "$tgt"; then
    echo >&2 "$tgt not empty."
    exit 1
  fi

  ssh -n -l root "$tgt" shutdown -r 1 "'reboot requested by $0 on $(hostname -f)'"

  # wait for target to go down:
  max_wait='300 seconds'
  wait_until=$(date -d "now +$max_wait" +%s)
  while ping -c 5 -q "$tgt" > /dev/null; do
    echo "[$(date)] $tgt is still up (will wait until $(date -d "@$wait_until")."
    sleep 10
    if [ "$(date +%s)" -gt "$wait_until" ]; then
      echo >&2 "Giving up on waiting for $tgt to go down."
      exit 1
    fi
  done

  sleep_time=30
  echo "[$(date)] $tgt is down. Pausing for $sleep_time seconds"
  sleep "$sleep_time"

  # wait for target to answer pings again:
  max_wait='15 minutes'
  wait_until=$(date -d "now +$max_wait" +%s)
  while ! ping -c 5 -q "$tgt" > /dev/null; do
    echo "[$(date)] $tgt is still down (will wait until $(date -d "@$wait_until")."
    if [ "$(date +%s)" -gt "$wait_until" ]; then
      echo >&2 "Giving up on waiting for $tgt to come back."
      exit 1
    fi
    sleep 10
  done

  sleep_time=30
  echo "[$(date)] $tgt is up. Pausing for $sleep_time seconds"
  sleep "$sleep_time"

  # wait for systemd on the target to report a completed boot:
  max_wait='15 minutes'
  wait_until=$(date -d "now +$max_wait" +%s)
  while ! ssh -n -l root "$tgt" systemctl is-system-running; do
    echo "[$(date)] $tgt is still booting up (will wait until $(date -d "@$wait_until")."
    if [ "$(date +%s)" -gt "$wait_until" ]; then
      echo >&2 "Giving up on waiting for $tgt to come back."
      exit 1
    fi
    sleep 10
  done

  sleep_time=30
  echo "[$(date)] $tgt has finished booting. Pausing for $sleep_time seconds"
  sleep "$sleep_time"
}

#######################################
# Rolling reboot for clusters with more than two nodes.
# Walks the node list in $direction order: the first node must be empty and
# is rebooted, then the next node's instances are migrated onto it, that
# node is rebooted, and so on ("down" moves from 2 to 1, ..., 14 to 13).
# The last node in walking order has to be the node running this script; if
# it is not, that node is made master and the script is relaunched there in
# a screen session with -M pointing back at this node.
# Globals:   nodelist, print_list, direction, newmaster (all read)
#######################################
moveupdown() {
  local node_names first_tgt last_node me remote_list tgt src
  local -a nodes

  # Read the whole list up front: the commands run per node (ssh, gnt-*)
  # must not share stdin with the list being iterated.
  node_names="$("$print_list" "$nodelist" | awk '{print $1}')"
  mapfile -t nodes <<< "$node_names"

  first_tgt="${nodes[0]}"
  last_node="${nodes[${#nodes[@]} - 1]}"
  me="$(hostname -f)"

  if has_instances "$first_tgt"; then
    echo >&2 "$first_tgt not empty."
    exit 1
  fi

  if [ "$me" != "$last_node" ]; then
    echo "Making $last_node the new master"
    ssh -n -l root "$last_node" gnt-cluster master-failover
    echo "relaunching reboot-cluster on $last_node"
    remote_list="$(ssh -n -l root "$last_node" mktemp)"
    if [ -z "$remote_list" ]; then
      echo >&2 "Could not create a temporary file on $last_node."
      exit 1
    fi
    scp "$nodelist" "root@$last_node:$remote_list"
    ssh -l root -t "$last_node" screen -S reboot-cluster -m sh -c "\"echo Relaunched on $last_node; ganeti-reboot-cluster -f -n '$remote_list' -M '$me' '$direction'; echo ganeti-reboot-cluster exited with \$?.; sleep 12h\""
    # Only reached when the remote session ended with status 0.
    echo >&2 "fell through!"
    exit 1
  fi

  tgt="$first_tgt"
  for src in "${nodes[@]:1}"; do
    if has_instances "$tgt"; then
      echo >&2 "$tgt not empty."
      exit 1
    fi
    reboot_host "$tgt"

    if has_instances "$src"; then
      echo "Migrating from $src to $tgt."
      if ! gnt-node migrate -f -n "$tgt" "$src"; then
        echo >&2 "gnt-node migrate exited with an error. Bailing out."
        exit 1
      fi
    else
      echo "nothing to migrate from $src to $tgt"
    fi
    tgt="$src"
  done

  if has_instances "$tgt"; then
    echo >&2 "$tgt not empty."
    exit 1
  fi

  if [ "$tgt" != "$me" ]; then
    echo >&2 "I was expecting $tgt to be me ($me) here."
    exit 1
  fi

  if [ -n "$newmaster" ]; then
    echo "Making $newmaster the new master"
    ssh -n -l root "$newmaster" gnt-cluster master-failover
  fi
  shutdown -r 1 "reboot requested by $0"
}

#######################################
# Reboot strategy for two-node clusters.
# Migrates everything from the peer to this node, reboots the peer,
# re-activates the disks of all running instances, migrates everything to
# the peer, schedules an at(1) job (in 30 minutes) that re-activates disks
# and runs hbal inside a detached screen, and finally reboots this node.
# Globals:   nodelist (read)
#######################################
crossmigrate() {
  local me them running instance

  me="$(hostname -f)"
  # Compare the first column exactly; a substring match could confuse
  # host names that contain one another.
  if ! awk -v me="$me" '$1 == me { found = 1 } END { exit !found }' "$nodelist"; then
    echo >&2 "my hostname ($me) not found in nodelist"
    exit 1
  fi
  them="$(awk -v me="$me" '$1 != me { print $1 }' "$nodelist")"
  if [ -z "$them" ]; then
    echo >&2 "no node other than $me found in nodelist"
    exit 1
  fi

  echo "Migrating from $them to $me."
  if ! gnt-node migrate -f -n "$me" "$them"; then
    echo >&2 "gnt-node migrate exited with an error. Bailing out."
    exit 1
  fi
  reboot_host "$them"

  echo "Activating disks.."
  # Assign first so that a failing listing aborts under set -e instead of
  # silently yielding an empty loop.
  running="$(gnt-instance list -o name --no-headers --filter 'status == "running"')"
  # shellcheck disable=SC2086 -- one instance name per word, splitting intended
  for instance in $running; do
    echo " - $instance ..."
    if ! gnt-instance activate-disks "$instance"; then
      echo >&2 "gnt-instance activate-disks $instance failed. Bailing out."
      exit 1
    fi
  done

  echo "Migrating from $me to $them."
  if ! gnt-node migrate -f -n "$them" "$me"; then
    echo >&2 "gnt-node migrate exited with an error. Bailing out."
    exit 1
  fi

  # Quoted delimiter: the job text is handed to at(1) verbatim, nothing is
  # expanded by this shell.
  at 'now + 30 min' << 'EOF'
screen -S hbal -d -m sh -c '
  echo "Activating disks.."
  for instance in $( gnt-instance list -o name --no-headers --filter "status == \"running\"" ); do
    echo " - $instance ..."
    if ! gnt-instance activate-disks "$instance"; then
      echo >&2 "Warning: gnt-instance activate-disks $instance failed."
    fi
  done

  hbal -L -C -v -X
  echo "done."
  sleep 1h
'
EOF
  reboot_host "$me"
}

#######################################
# Reboot the single-node cluster on byrd.debian.org.
# Announces the shutdown (shutdown -k only warns), waits 15 minutes, pauses
# the Ganeti watcher, shuts every instance down without recording the new
# state, waits for all qemu/kvm processes of root to disappear, schedules
# the watcher to continue, and reboots.
#######################################
reboot_byrd() {
  local instances instance

  /sbin/shutdown -k 30 < /dev/null
  sleep 15m
  gnt-cluster watcher pause 30m

  instances="$(gnt-instance list --no-headers -o name)"
  # shellcheck disable=SC2086 -- one instance name per word, splitting intended
  for instance in $instances; do
    gnt-instance shutdown --no-remember --submit "$instance"
  done

  # Keep the watcher paused for as long as instances are still shutting down.
  while pgrep -c '^qemu-|^kvm$' -u root; do
    sleep 15
    gnt-cluster watcher pause 30m
  done

  at 'now + 5 min' << EOF
sleep 4m;
gnt-cluster watcher continue
EOF

  /sbin/shutdown -c
  sleep 5
  # NOTE(review): the text following "shutdown -r 1" up to the screen/tmux
  # message below was lost in the copy this file was restored from; the
  # redirection mirrors the "shutdown -k" call above -- confirm.
  /sbin/shutdown -r 1 < /dev/null
}

# NOTE(review): the original condition of this guard was lost together with
# the end of reboot_byrd; testing the STY (screen) and TMUX (tmux)
# environment variables is an assumption derived from the message -- confirm
# against the upstream script.
if [ -z "${STY:-}" ] && [ -z "${TMUX:-}" ]; then
  echo >&2 "Might want to launch me in a screen or tmux."
  exit 1
fi

if ! [ "$force" = 1 ]; then
  echo -n 'really? '
  read -r really
  if [ "$really" != "y" ]; then
    echo >&2 "Aborted."
    exit 1
  fi
fi

### ensure_nodelist
###################
if ! [ -e "$nodelist" ]; then
  generated_nodelist="$(mktemp)"
  trap 'rm -f -- "$generated_nodelist"' EXIT
  gnt-node list --no-headers -o name > "$generated_nodelist"
  nodelist="$generated_nodelist"
fi

lines=$(wc -l < "$nodelist")
case "$lines" in
  0)
    echo >&2 "nodelist $nodelist empty."
    exit 1
    ;;
  1)
    case "$(hostname -f)" in
      byrd.debian.org)
        reboot_byrd
        ;;
      *)
        echo >&2 "Only one node."
        exit 1
        ;;
    esac
    ;;
  2)
    crossmigrate
    ;;
  *)
    moveupdown
    ;;
esac