From ba9d4e7eceff2f88efbac2b16a4f08997ccd9d80 Mon Sep 17 00:00:00 2001
From: Peter Palfrader
Date: Tue, 6 Mar 2018 16:38:51 +0100
Subject: [PATCH] run base backups spread over time. This also should help us
 to recover from failures or reboots better

The cron job now fires every 10 minutes on every day of the week and the
script itself decides which backups are due:

 - a backup is not retried within 4 hours of its last attempt,
 - after a success it is left alone for 7 days, and it is always run once
   the last success is more than 10 days old,
 - in between, a stable pseudo-random offset derived from the backup
   server's hostname, the target host and port and the time of the last
   success picks the moment at which it becomes due.

Non-forced runs take a lock on the state directory, so overlapping cron
invocations exit immediately instead of piling up.

---
 modules/postgres/manifests/backup_server.pp   |  20 +--
 .../postgres-make-base-backups.erb            | 123 +++++++++++++++---
 2 files changed, 114 insertions(+), 29 deletions(-)

diff --git a/modules/postgres/manifests/backup_server.pp b/modules/postgres/manifests/backup_server.pp
index c9c3d470f..913905eb1 100644
--- a/modules/postgres/manifests/backup_server.pp
+++ b/modules/postgres/manifests/backup_server.pp
@@ -37,20 +37,12 @@ class postgres::backup_server {
 		order => '99',
 	}
 	file { '/etc/cron.d/puppet-postgres-make-base-backups': ensure => absent; }
-	if $::hostname in [backuphost] {
-		concat::fragment { 'dsa-puppet-stuff--porterbox-chroot-update':
-			target => '/etc/cron.d/dsa-puppet-stuff',
-			content => @("EOF")
-					20 1 * * 0 debbackup chronic ${$postgres::backup_server::globals::make_base_backups}
-					| EOF
-		}
-	} else {
-		concat::fragment { 'dsa-puppet-stuff--porterbox-chroot-update':
-			target => '/etc/cron.d/dsa-puppet-stuff',
-			content => @("EOF")
-					20 0 * * 6 debbackup chronic ${$postgres::backup_server::globals::make_base_backups}
-					| EOF
-		}
+	# Invoke the script every 10 minutes on every day; it works out which backups are due.
+	concat::fragment { 'dsa-puppet-stuff--postgres-make_base_backups':
+		target => '/etc/cron.d/dsa-puppet-stuff',
+		content => @("EOF")
+				*/10 * * * * debbackup chronic ${$postgres::backup_server::globals::make_base_backups}
+				| EOF
 	}
 
 ####
diff --git a/modules/postgres/templates/backup_server/postgres-make-base-backups.erb b/modules/postgres/templates/backup_server/postgres-make-base-backups.erb
index 82d7886b0..eaac39a17 100755
--- a/modules/postgres/templates/backup_server/postgres-make-base-backups.erb
+++ b/modules/postgres/templates/backup_server/postgres-make-base-backups.erb
@@ -1,6 +1,13 @@
 #!/bin/bash
 
+# vim:syn=sh:
+# vim:ts=4:
+# vim:et:
+
+
 # run a bunch of full postgresql backups
+#  if given a host:port, run this backup,
+#  else run all defined ones if they have not run recently
 
 # vim:syn=sh:
 
@@ -26,37 +33,123 @@
 
 
 
-set -e
+# All intervals below are in seconds.
+# MIN_WAIT: never retry a backup sooner than this after its last attempt.
+# MIN_WAIT_SUCCESS/MAX_WAIT_SUCCESS: a backup that succeeded becomes due again
+# at a stable pseudo-random age between these two bounds.
+MIN_WAIT=$(( 60*60*4 ))
+MIN_WAIT_SUCCESS=$(( 60*60*24*7 ))
+MAX_WAIT_SUCCESS=$(( 60*60*24*10 ))
+
+STATEDIR=/var/lib/dsa/postgres-make-base-backups
+
 set -u
 
 if [ -t 0 ]; then
-	verbose=1
+    verbose=1
 else
-	verbose=0
+    verbose=0
 fi
 
+# Print the arguments, but only when verbose (i.e. stdin is a terminal).
+log() {
+    [ "$verbose" -gt 0 ] && echo "$*"
+}
+
+
 if [ "${1:-}" = "-h" ] || [ "${1:-}" = "--help" ]; then
-	echo "Usage: $0 [<host>:<port>]"
-	exit 0
+    echo "Usage: $0 [<host>:<port>]"
+    exit 0
 fi
 
 if [ "$#" -gt 0 ]; then
-	forcehostport="$1"
-	shift
+    forcehostport="$1"
+    shift
 else
-	forcehostport=""
+    forcehostport=""
 fi
 
+mkdir -p "$STATEDIR"
+
+# get a lock, but only if we did not force the run
+# (the lock is taken on the state directory itself; -w 0 means do not wait)
+if [ -z "$forcehostport" ]; then
+    exec 200< "$STATEDIR"
+    if ! flock -w 0 -e 200; then
+        log "Cannot acquire lock on $STATEDIR."
+        exit 0
+    fi
+fi
+
+DELTA_WAIT_SUCCESS=$(( MAX_WAIT_SUCCESS - MIN_WAIT_SUCCESS ))
+MYHOSTNAME=$(hostname -f)
+
 while read host port username cluster version; do
-	[ "${host#\#}" = "$host" ] || continue
-	[ -z "$host" ] && continue
+    [ "${host#\#}" = "$host" ] || continue
+    [ -z "$host" ] && continue
 
-	if [ -n "$forcehostport" ] && [ "$forcehostport" != "$host:$port" ]; then
-		[ "$verbose" -gt 0 ] && echo "Skipping $host:$port $version/$cluster."
-		continue
-	fi
+    # Decide whether this backup is due now; the verdict ends up in $runme.
+    flagfile="$STATEDIR/$host-$port.last-attempt"
+    flagfilesuccess="$STATEDIR/$host-$port.last-success"
+    if [ -n "$forcehostport" ]; then
+        if [ "$forcehostport" != "$host:$port" ]; then
+            log "Skipping $host:$port $version/$cluster."
+            runme=0
+        else
+            log "Running forced $host:$port $version/$cluster."
+            runme=1
+        fi
+    else
+        if ! [ -e "$flagfile" ]; then
+            runme=1
+            log "Running $host:$port $version/$cluster because no flag file exists."
+        else
+            now=$(date +%s)
+            mtime="$(stat --printf "%Y" "$flagfile")"
+            delta=$(( now - mtime ))
+            if [ "$delta" -lt "$MIN_WAIT" ]; then
+                runme=0
+                log "Not running $host:$port $version/$cluster because last attempt was only ${delta}s ago."
+            else
+                if ! [ -e "$flagfilesuccess" ]; then
+                    runme=1
+                    log "Running $host:$port $version/$cluster because no success flag exists."
+                else
+                    mtime="$(stat --printf "%Y" "$flagfilesuccess")"
+                    delta=$(( now - mtime ))
+                    if [ "$delta" -lt "$MIN_WAIT_SUCCESS" ]; then
+                        runme=0
+                        log "Not running $host:$port $version/$cluster because last success was only ${delta}s ago."
+                    elif [ "$delta" -gt "$MAX_WAIT_SUCCESS" ]; then
+                        runme=1
+                        log "Running $host:$port $version/$cluster because last success was ${delta}s ago."
+                    else
+                        # get a "randomish" but stable value for this backup run
+                        val=$(echo "$MYHOSTNAME-$host-$port-$mtime" | sha256sum | head -c 8)
+                        val=$((16#$val))
+                        valmod=$(($val % $DELTA_WAIT_SUCCESS))
+                        after_min=$((delta - MIN_WAIT_SUCCESS))
+                        if [ "$after_min" -gt "$valmod" ]; then
+                            runme=1
+                            log "Running $host:$port $version/$cluster because random computer says so ($after_min > $valmod)."
+                        else
+                            runme=0
+                            log "Not running $host:$port $version/$cluster because random computer says wait ($after_min <= $valmod)."
+                        fi
+                    fi
+                fi
+            fi
+        fi
+    fi
 
-	/usr/local/bin/postgres-make-one-base-backup "$host" "$port" "$username" "$cluster" "$version"
+    if [ "$runme" -gt 0 ]; then
+        touch "$flagfile"
+        # Give the backup an empty stdin so it cannot swallow the host list
+        # that this loop is still reading.
+        /usr/local/bin/postgres-make-one-base-backup "$host" "$port" "$username" "$cluster" "$version" </dev/null
+        rc=$?
+        [ "$rc" = 0 ] && touch "$flagfilesuccess"
+    fi
 done << EOF
 seger.debian.org	5432	debian-backup	dak		9.6
 bmdb1.debian.org	5435	debian-backup	main		9.6
-- 
2.20.1