postgres-make-base-backups: locks and logs
author Peter Palfrader <peter@palfrader.org>
Tue, 6 Mar 2018 22:03:59 +0000 (23:03 +0100)
committer Peter Palfrader <peter@palfrader.org>
Tue, 6 Mar 2018 22:04:04 +0000 (23:04 +0100)
- get locks for each individual base backup so we do not run parallel ones in the presence of forced runs
- also log to syslog

modules/postgres/templates/backup_server/postgres-make-base-backups.erb

index eaac39a..18046bb 100755 (executable)
 MIN_WAIT=$(( 60*60*4 ))
 MIN_WAIT_SUCCESS=$(( 60*60*24*7 ))
 MAX_WAIT_SUCCESS=$(( 60*60*24*10 ))
-
 STATEDIR=/var/lib/dsa/postgres-make-base-backups
 
+####
 set -u
 
+SELF="`basename "$0"`[$$]"
+DELTA_WAIT_SUCCESS=$(( MAX_WAIT_SUCCESS - MIN_WAIT_SUCCESS ))
+MYHOSTNAME=$(hostname -f)
+
 if [ -t 0 ]; then
     verbose=1
 else
@@ -49,6 +53,7 @@ fi
 
 log() {
     [ "$verbose" -gt 0 ] && echo "$*"
+    logger -p daemon.info -t "$SELF" "$*"
 }
 
 
@@ -75,9 +80,6 @@ if [ -z "$forcehostport" ]; then
     fi
 fi
 
-DELTA_WAIT_SUCCESS=$(( MAX_WAIT_SUCCESS - MIN_WAIT_SUCCESS ))
-MYHOSTNAME=$(hostname -f)
-
 while read host port username  cluster version; do
     [ "${host#\#}" = "$host" ] || continue
     [ -z "$host" ] && continue
@@ -86,36 +88,36 @@ while read host port username  cluster version; do
     flagfilesuccess="$STATEDIR/$host-$port.last-success"
     if [ -n "$forcehostport" ]; then
         if [ "$forcehostport" != "$host:$port" ]; then
-            log "Skipping $host:$port $version/$cluster."
+            log "Skipping $host:$port $version/$cluster because this run is limited to $host:$port."
             runme=0
         else
-            log "Running forced $host:$port $version/$cluster."
+            log "Forcing $host:$port $version/$cluster run."
             runme=1
         fi
     else
         if ! [ -e "$flagfile" ]; then
             runme=1
-            log "Running $host:$port $version/$cluster because no flag file exists."
+            log "Planning to run $host:$port $version/$cluster because no flag file exists."
         else
             now=$(date +%s)
             mtime="$(stat --printf "%Y" "$flagfile")"
             delta=$(( now - mtime ))
             if [ "$delta" -lt "$MIN_WAIT" ]; then
                 runme=0
-                log "Not running $host:$port $version/$cluster because last attempt was only ${delta}s ago."
+                log "Skipping $host:$port $version/$cluster because last attempt was only ${delta}s ago."
             else
                 if ! [ -e "$flagfilesuccess" ]; then
                     runme=1
-                    log "Running $host:$port $version/$cluster because no success flag exists."
+                    log "Planning to run $host:$port $version/$cluster because no success flag exists."
                 else
                     mtime="$(stat --printf "%Y" "$flagfilesuccess")"
                     delta=$(( now - mtime ))
                     if [ "$delta" -lt "$MIN_WAIT_SUCCESS" ]; then
                         runme=0
-                        log "Not running $host:$port $version/$cluster because last success was only ${delta}s ago."
+                        log "Skipping $host:$port $version/$cluster because last success was only ${delta}s ago."
                     elif [ "$delta" -gt "$MAX_WAIT_SUCCESS" ]; then
                         runme=1
-                        log "Running $host:$port $version/$cluster because last success was ${delta}s ago."
+                        log "Planning to run $host:$port $version/$cluster because last success was ${delta}s ago."
                     else
                         # get a "randomish" but stable value for this backup run
                         val=$(echo "$MYHOSTNAME-$host-$port-$mtime" | sha256sum | head -c 8)
@@ -124,10 +126,10 @@ while read host port username  cluster version; do
                         after_min=$((delta - MIN_WAIT_SUCCESS))
                         if [ "$after_min" -gt "$valmod" ]; then
                             runme=1
-                            log "Running $host:$port $version/$cluster because random computer says so ($after_min > $valmod)."
+                            log "Planning to run $host:$port $version/$cluster because random computer says so ($after_min > $valmod)."
                         else
                             runme=0
-                            log "Not running $host:$port $version/$cluster because random computer says wait ($after_min <= $valmod)."
+                            log "Skipping $host:$port $version/$cluster because random computer says wait ($after_min <= $valmod)."
                         fi
                     fi
                 fi
@@ -137,9 +139,16 @@ while read host port username  cluster version; do
 
     if [ "$runme" -gt 0 ]; then
         touch "$flagfile"
-        /usr/local/bin/postgres-make-one-base-backup "$host" "$port" "$username" "$cluster" "$version"
-        rc=$?
-        [ "$rc" = 0 ] && touch "$flagfilesuccess"
+        exec 201< "$flagfile"
+        if flock -w 0 -e 201; then
+            log "Running $host:$port $version/$cluster."
+            /usr/local/bin/postgres-make-one-base-backup "$host" "$port" "$username" "$cluster" "$version"
+            rc=$?
+            [ "$rc" = 0 ] && touch "$flagfilesuccess"
+            flock -u 201
+        else
+            log "Cannot acquire lock on $flagfile, skipping $host:$port $version/$cluster."
+        fi
     fi
 done << EOF
 seger.debian.org       5432    debian-backup           dak             9.6