Move the list of clusters needing base backups from the script into a conffile
[mirror/dsa-puppet.git] / modules / postgres / templates / backup_server / postgres-make-base-backups.erb
index eaac39a..fc56411 100755 (executable)
 MIN_WAIT=$(( 60*60*4 ))
 MIN_WAIT_SUCCESS=$(( 60*60*24*7 ))
 MAX_WAIT_SUCCESS=$(( 60*60*24*10 ))
-
 STATEDIR=/var/lib/dsa/postgres-make-base-backups
 
+####
 set -u
 
+if [ "$(id -u)" = 0 ]; then
+    echo >&2 "Do not run me as root.  Probably you want sudo -u debbackup."
+    exit 1
+fi
+
+SELF="`basename "$0"`[$$]"
+DELTA_WAIT_SUCCESS=$(( MAX_WAIT_SUCCESS - MIN_WAIT_SUCCESS ))
+MYHOSTNAME=$(hostname -f)
+
 if [ -t 0 ]; then
     verbose=1
 else
@@ -49,6 +58,15 @@ fi
 
 log() {
     [ "$verbose" -gt 0 ] && echo "$*"
+    logger -p daemon.info -t "$SELF" "$*"
+}
+format_timedelta() {
+    local secs="$1"; shift
+    if [ "$secs" -ge 86400 ]; then
+        printf '%d+%02d:%02d:%02d\n' $(($secs/3600/24)) $(($secs/3600%24)) $(($secs/60%60)) $(($secs%60))
+    else
+        printf '%02d:%02d:%02d\n' $(($secs/3600)) $(($secs/60%60)) $(($secs%60))
+    fi
 }
 
 
@@ -75,9 +93,6 @@ if [ -z "$forcehostport" ]; then
     fi
 fi
 
-DELTA_WAIT_SUCCESS=$(( MAX_WAIT_SUCCESS - MIN_WAIT_SUCCESS ))
-MYHOSTNAME=$(hostname -f)
-
 while read host port username  cluster version; do
     [ "${host#\#}" = "$host" ] || continue
     [ -z "$host" ] && continue
@@ -86,48 +101,48 @@ while read host port username  cluster version; do
     flagfilesuccess="$STATEDIR/$host-$port.last-success"
     if [ -n "$forcehostport" ]; then
         if [ "$forcehostport" != "$host:$port" ]; then
-            log "Skipping $host:$port $version/$cluster."
+            log "Skipping $host:$port $version/$cluster because this run is limited to $forcehostport."
             runme=0
         else
-            log "Running forced $host:$port $version/$cluster."
+            log "Forcing $host:$port $version/$cluster run."
             runme=1
         fi
     else
         if ! [ -e "$flagfile" ]; then
             runme=1
-            log "Running $host:$port $version/$cluster because no flag file exists."
+            log "Planning to run $host:$port $version/$cluster because no flag file exists."
         else
             now=$(date +%s)
             mtime="$(stat --printf "%Y" "$flagfile")"
             delta=$(( now - mtime ))
             if [ "$delta" -lt "$MIN_WAIT" ]; then
                 runme=0
-                log "Not running $host:$port $version/$cluster because last attempt was only ${delta}s ago."
+                log "Skipping $host:$port $version/$cluster because last attempt was only $(format_timedelta "${delta}") (< $(format_timedelta "${MIN_WAIT}")) ago."
             else
                 if ! [ -e "$flagfilesuccess" ]; then
                     runme=1
-                    log "Running $host:$port $version/$cluster because no success flag exists."
+                    log "Planning to run $host:$port $version/$cluster because no success flag exists."
                 else
                     mtime="$(stat --printf "%Y" "$flagfilesuccess")"
                     delta=$(( now - mtime ))
                     if [ "$delta" -lt "$MIN_WAIT_SUCCESS" ]; then
                         runme=0
-                        log "Not running $host:$port $version/$cluster because last success was only ${delta}s ago."
+                        log "Skipping $host:$port $version/$cluster because last success was only $(format_timedelta "${delta}") (< $(format_timedelta "${MIN_WAIT_SUCCESS}")) ago."
                     elif [ "$delta" -gt "$MAX_WAIT_SUCCESS" ]; then
                         runme=1
-                        log "Running $host:$port $version/$cluster because last success was ${delta}s ago."
+                        log "Planning to run $host:$port $version/$cluster because last success was $(format_timedelta "${delta}") (>= $(format_timedelta "${MAX_WAIT_SUCCESS}")) ago."
                     else
                         # get a "randomish" but stable value for this backup run
                         val=$(echo "$MYHOSTNAME-$host-$port-$mtime" | sha256sum | head -c 8)
                         val=$((16#$val))
-                        valmod=$(($val % $DELTA_WAIT_SUCCESS))
-                        after_min=$((delta - MIN_WAIT_SUCCESS))
-                        if [ "$after_min" -gt "$valmod" ]; then
-                            runme=1
-                            log "Running $host:$port $version/$cluster because random computer says so ($after_min > $valmod)."
-                        else
+                        rnd_cuttoff=$(($val % $DELTA_WAIT_SUCCESS))
+                        age_after_min=$((delta - MIN_WAIT_SUCCESS))
+                        if [ "$age_after_min" -lt "$rnd_cuttoff" ]; then
                             runme=0
-                            log "Not running $host:$port $version/$cluster because random computer says wait ($after_min <= $valmod)."
+                            log "Skipping $host:$port $version/$cluster because random computer says wait ([$(format_timedelta "${age_after_min}") < $(format_timedelta "${rnd_cuttoff}") (< $(format_timedelta "${DELTA_WAIT_SUCCESS}"))] + $(format_timedelta "${MIN_WAIT_SUCCESS}"))."
+                        else
+                            runme=1
+                            log "Planning to run $host:$port $version/$cluster because random computer says so ($(format_timedelta "${age_after_min}") >= $(format_timedelta "${rnd_cuttoff}"))."
                         fi
                     fi
                 fi
@@ -137,20 +152,16 @@ while read host port username  cluster version; do
 
     if [ "$runme" -gt 0 ]; then
         touch "$flagfile"
-        /usr/local/bin/postgres-make-one-base-backup "$host" "$port" "$username" "$cluster" "$version"
-        rc=$?
-        [ "$rc" = 0 ] && touch "$flagfilesuccess"
+        exec 201< "$flagfile"
+        if flock -w 0 -e 201; then
+            log "Running $host:$port $version/$cluster."
+            /usr/local/bin/postgres-make-one-base-backup "$host" "$port" "$username" "$cluster" "$version"
+            rc=$?
+            log "Base backup for $host:$port $version/$cluster exited with rc $rc."
+            [ "$rc" = 0 ] && touch "$flagfilesuccess"
+            flock -u 201
+        else
+            log "Cannot acquire lock on $flagfile, skipping $host:$port $version/$cluster."
+        fi
     fi
-done << EOF
-seger.debian.org       5432    debian-backup           dak             9.6
-bmdb1.debian.org       5435    debian-backup           main            9.6
-bmdb1.debian.org       5436    debian-backup           wannabuild      9.6
-bmdb1.debian.org       5440    debian-backup           debsources      9.6
-fasolo.debian.org      5433    debian-backup           dak             9.6
-sibelius.debian.org    5433    debian-backup           snapshot        9.4
-<%- if @hostname != "backuphost" -%>
-moszumanska.debian.org 5432    debian-backup           main            9.1
-<%- end -%>
-#
-# puppet notice:  this is just a partial file.  The tail EOF comes
-# from a different concat fragment
+done < '<%= scope['postgres::backup_server::globals::base_backup_clusters'] %>'