run base backups spread over time. This also should help us to recover from failures...
author Peter Palfrader <peter@palfrader.org>
Tue, 6 Mar 2018 15:38:51 +0000 (16:38 +0100)
committer Peter Palfrader <peter@palfrader.org>
Tue, 6 Mar 2018 15:38:51 +0000 (16:38 +0100)
modules/postgres/manifests/backup_server.pp
modules/postgres/templates/backup_server/postgres-make-base-backups.erb

index c9c3d47..913905e 100644 (file)
@@ -37,20 +37,11 @@ class postgres::backup_server {
                order  => '99',
        }
        file { '/etc/cron.d/puppet-postgres-make-base-backups': ensure => absent; }
-       if $::hostname in [backuphost] {
-               concat::fragment { 'dsa-puppet-stuff--porterbox-chroot-update':
-                       target => '/etc/cron.d/dsa-puppet-stuff',
-                       content  => @("EOF")
-                               20 1 * * 0 debbackup chronic ${$postgres::backup_server::globals::make_base_backups}
-                               | EOF
-               }
-       } else  {
-               concat::fragment { 'dsa-puppet-stuff--porterbox-chroot-update':
-                       target => '/etc/cron.d/dsa-puppet-stuff',
-                       content  => @("EOF")
-                               20 0 * * 6 debbackup chronic ${$postgres::backup_server::globals::make_base_backups}
-                               | EOF
-               }
+       concat::fragment { 'dsa-puppet-stuff--postgres-make_base_backups':
+               target => '/etc/cron.d/dsa-puppet-stuff',
+               content  => @("EOF")
+                       */10 * * * 0 debbackup chronic ${$postgres::backup_server::globals::make_base_backups}
+                       | EOF
        }
 
        ####
index 82d7886..eaac39a 100755 (executable)
@@ -1,6 +1,13 @@
 #!/bin/bash
 
+# vim:syn=sh:
+# vim:ts=4:
+# vim:et:
+
+
 # run a bunch of full postgresql backups
+#  if given a host:port, run this backup,
+#  else run all defined ones if they have not run recently
 # vim:syn=sh:
 
 
 # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 
-set -e
+MIN_WAIT=$(( 60*60*4 ))
+MIN_WAIT_SUCCESS=$(( 60*60*24*7 ))
+MAX_WAIT_SUCCESS=$(( 60*60*24*10 ))
+
+STATEDIR=/var/lib/dsa/postgres-make-base-backups
+
 set -u
 
 if [ -t 0 ]; then
-       verbose=1
+    verbose=1
 else
-       verbose=0
+    verbose=0
 fi
 
+log() {
+    [ "$verbose" -gt 0 ] && echo "$*"
+}
+
+
 if [ "${1:-}" = "-h" ] || [ "${1:-}" = "--help" ]; then
-       echo "Usage: $0 [<host>:<port>]"
-       exit 0
+    echo "Usage: $0 [<host>:<port>]"
+    exit 0
 fi
 
 if [ "$#" -gt 0 ]; then
-       forcehostport="$1"
-       shift
+    forcehostport="$1"
+    shift
 else
-       forcehostport=""
+    forcehostport=""
 fi
 
+mkdir -p "$STATEDIR"
+
+# get a lock, but only if we did not force the run
+if [ -z "$forcehostport" ]; then
+    exec 200< "$STATEDIR"
+    if ! flock -w 0 -e 200; then
+        log "Cannot acquire lock on $STATEDIR."
+        exit 0
+    fi
+fi
+
+DELTA_WAIT_SUCCESS=$(( MAX_WAIT_SUCCESS - MIN_WAIT_SUCCESS ))
+MYHOSTNAME=$(hostname -f)
+
 while read host port username  cluster version; do
-       [ "${host#\#}" = "$host" ] || continue
-       [ -z "$host" ] && continue
+    [ "${host#\#}" = "$host" ] || continue
+    [ -z "$host" ] && continue
 
-       if [ -n "$forcehostport" ] && [ "$forcehostport" != "$host:$port" ]; then
-               [ "$verbose" -gt 0 ] && echo "Skipping $host:$port $version/$cluster."
-               continue
-       fi
+    flagfile="$STATEDIR/$host-$port.last-attempt"
+    flagfilesuccess="$STATEDIR/$host-$port.last-success"
+    if [ -n "$forcehostport" ]; then
+        if [ "$forcehostport" != "$host:$port" ]; then
+            log "Skipping $host:$port $version/$cluster."
+            runme=0
+        else
+            log "Running forced $host:$port $version/$cluster."
+            runme=1
+        fi
+    else
+        if ! [ -e "$flagfile" ]; then
+            runme=1
+            log "Running $host:$port $version/$cluster because no flag file exists."
+        else
+            now=$(date +%s)
+            mtime="$(stat --printf "%Y" "$flagfile")"
+            delta=$(( now - mtime ))
+            if [ "$delta" -lt "$MIN_WAIT" ]; then
+                runme=0
+                log "Not running $host:$port $version/$cluster because last attempt was only ${delta}s ago."
+            else
+                if ! [ -e "$flagfilesuccess" ]; then
+                    runme=1
+                    log "Running $host:$port $version/$cluster because no success flag exists."
+                else
+                    mtime="$(stat --printf "%Y" "$flagfilesuccess")"
+                    delta=$(( now - mtime ))
+                    if [ "$delta" -lt "$MIN_WAIT_SUCCESS" ]; then
+                        runme=0
+                        log "Not running $host:$port $version/$cluster because last success was only ${delta}s ago."
+                    elif [ "$delta" -gt "$MAX_WAIT_SUCCESS" ]; then
+                        runme=1
+                        log "Running $host:$port $version/$cluster because last success was ${delta}s ago."
+                    else
+                        # get a "randomish" but stable value for this backup run
+                        val=$(echo "$MYHOSTNAME-$host-$port-$mtime" | sha256sum | head -c 8)
+                        val=$((16#$val))
+                        valmod=$(($val % $DELTA_WAIT_SUCCESS))
+                        after_min=$((delta - MIN_WAIT_SUCCESS))
+                        if [ "$after_min" -gt "$valmod" ]; then
+                            runme=1
+                            log "Running $host:$port $version/$cluster because random computer says so ($after_min > $valmod)."
+                        else
+                            runme=0
+                            log "Not running $host:$port $version/$cluster because random computer says wait ($after_min <= $valmod)."
+                        fi
+                    fi
+                fi
+            fi
+        fi
+    fi
 
-       /usr/local/bin/postgres-make-one-base-backup "$host" "$port" "$username" "$cluster" "$version"
+    if [ "$runme" -gt 0 ]; then
+        touch "$flagfile"
+        /usr/local/bin/postgres-make-one-base-backup "$host" "$port" "$username" "$cluster" "$version"
+        rc=$?
+        [ "$rc" = 0 ] && touch "$flagfilesuccess"
+    fi
 done << EOF
 seger.debian.org       5432    debian-backup           dak             9.6
 bmdb1.debian.org       5435    debian-backup           main            9.6