Merge remote-tracking branch 'origin/master' into staging
[mirror/dsa-puppet.git] / modules / postgres / templates / backup_server / postgres-make-base-backups.erb
index 2e6d7be..b0d7d32 100755 (executable)
@@ -1,6 +1,13 @@
 #!/bin/bash
 
+# vim:syn=sh:
+# vim:ts=4:
+# vim:et:
+
+
 # run a bunch of full postgresql backups
+#  if given a host:port, run only that backup,
+#  else run all defined ones if they have not run recently
 # vim:syn=sh:
 
 
 # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 
-set -e
+MIN_WAIT=$(( 60*60*4 ))  # 4 hours, in seconds: a cluster whose last attempt is younger than this is skipped
+MIN_WAIT_SUCCESS=$(( 60*60*24*7 ))  # 7 days, in seconds: a cluster whose last success is younger than this is skipped
+MAX_WAIT_SUCCESS=$(( 60*60*24*10 ))  # 10 days, in seconds: a last success older than this always triggers a run
+STATEDIR=/var/lib/dsa/postgres-make-base-backups  # holds the <host>-<port>.last-attempt / .last-success flag files; also the global lock target
+
+####
 set -u  # treat expansion of an unset variable as an error
 
-CONFFILE=/etc/nagios/dsa-check-backuppg.conf
-ROOTDIR=$(perl -MYAML -e "print YAML::LoadFile('$CONFFILE')->{'rootdir'}")
-if [ -z "$ROOTDIR" ]; then
-       echo >&2 "Could not learn rootdir from $CONFFILE"
-       exit 1
-fi
+SELF="`basename "$0"`[$$]"  # tag handed to logger by log(): script basename followed by this shell's PID in brackets
+DELTA_WAIT_SUCCESS=$(( MAX_WAIT_SUCCESS - MIN_WAIT_SUCCESS ))  # width, in seconds, of the window between MIN_WAIT_SUCCESS and MAX_WAIT_SUCCESS
+MYHOSTNAME=$(hostname -f)  # fully qualified name of this machine; part of the hash input for the pseudo-random cutoff
 
 if [ -t 0 ]; then
-       verbose=1
+    verbose=1
 else
-       verbose=0
+    verbose=0
 fi
 
-if [ "$verbose" -gt 0 ]; then
-       console="--progress --verbose"
-else
-       console=""
-fi
+log() {
     # Report a message. All arguments are joined into a single string ("$*").
     # The message is echoed to stdout only when $verbose is greater than 0,
     # and is always passed to logger with priority daemon.info and tag $SELF.
+    [ "$verbose" -gt 0 ] && echo "$*"
+    logger -p daemon.info -t "$SELF" "$*"
+}
+format_timedelta() {
     # Print a duration on stdout, followed by a newline.
     #   $1 - duration in whole seconds
     # Durations of one day (86400 s) or more print as D+HH:MM:SS,
     # shorter ones as HH:MM:SS; every field after the day count is
     # zero-padded to two digits.
+    local secs="$1"; shift
+    if [ "$secs" -ge 86400 ]; then
+        printf '%d+%02d:%02d:%02d\n' $(($secs/3600/24)) $(($secs/3600%24)) $(($secs/60%60)) $(($secs%60))
+    else
+        printf '%02d:%02d:%02d\n' $(($secs/3600)) $(($secs/60%60)) $(($secs%60))
+    fi
+}
+
 
 if [ "${1:-}" = "-h" ] || [ "${1:-}" = "--help" ]; then
-       echo "Usage: $0 [<host>:<port>]"
-       exit 0
+    echo "Usage: $0 [<host>:<port>]"
+    exit 0
 fi
 
 if [ "$#" -gt 0 ]; then
-       forcehostport="$1"
-       shift
+    forcehostport="$1"
+    shift
 else
-       forcehostport=""
+    forcehostport=""
 fi
 
-export PGSSLMODE=verify-full
-export PGSSLROOTCERT=/etc/ssl/debian/certs/ca.crt
+mkdir -p "$STATEDIR"
 
-date=$(date "+%Y%m%d-%H%M%S")
-thishost=$(hostname -f)
+# get a lock, but only if we did not force the run
+if [ -z "$forcehostport" ]; then
+    exec 200< "$STATEDIR"
+    if ! flock -w 0 -e 200; then
+        log "Cannot acquire lock on $STATEDIR."
+        exit 0
+    fi
+fi
 
 while read host port username  cluster version; do
-       [ "${host#\#}" = "$host" ] || continue
-       [ -z "$host" ] && continue
-
-       if [ -n "$forcehostport" ] && [ "$forcehostport" != "$host:$port" ]; then
-               [ "$verbose" -gt 0 ] && echo "Skipping $host:$port $version/$cluster."
-               continue
-       fi
-
-       label="$thishost-$date-$host-$cluster-$version-backup"
-       [ "$verbose" -gt 0 ] && echo "Doing $host:$port $version/$cluster: $label"
-
-       target="$cluster.BASE.$label.tar.gz"
-       tmp=$(tempfile -d "$ROOTDIR" -p "BASE-$host:$port-" -s ".tar.gz")
-       trap "rm -f '$tmp'" EXIT
-
-       /usr/lib/postgresql/"$version"/bin/pg_basebackup --format=tar --pgdata=- --label="$label" --host="$host" --port="$port" --username="$username" --no-password $console | pigz > "$tmp"
-       if ! [ "${PIPESTATUS[0]}" -eq 0 ]; then
-               echo >&2 "pg_basebackup failed with exit code ${PIPESTATUS[0]}"
-               exit 1
-       fi
-       mv "$tmp" "$ROOTDIR/${host%%.*}/$target"
+    [ "${host#\#}" = "$host" ] || continue
+    [ -z "$host" ] && continue
+
+    flagfile="$STATEDIR/$host-$port.last-attempt"
+    flagfilesuccess="$STATEDIR/$host-$port.last-success"
+    if [ -n "$forcehostport" ]; then
+        if [ "$forcehostport" != "$host:$port" ]; then
+            log "Skipping $host:$port $version/$cluster because this run is limited to $host:$port."
+            runme=0
+        else
+            log "Forcing $host:$port $version/$cluster run."
+            runme=1
+        fi
+    else
+        if ! [ -e "$flagfile" ]; then
+            runme=1
+            log "Planning to run $host:$port $version/$cluster because no flag file exists."
+        else
+            now=$(date +%s)
+            mtime="$(stat --printf "%Y" "$flagfile")"
+            delta=$(( now - mtime ))
+            if [ "$delta" -lt "$MIN_WAIT" ]; then
+                runme=0
+                log "Skipping $host:$port $version/$cluster because last attempt was only $(format_timedelta "${delta}") (< $(format_timedelta "${MIN_WAIT}")) ago."
+            else
+                if ! [ -e "$flagfilesuccess" ]; then
+                    runme=1
+                    log "Planning to run $host:$port $version/$cluster because no success flag exists."
+                else
+                    mtime="$(stat --printf "%Y" "$flagfilesuccess")"
+                    delta=$(( now - mtime ))
+                    if [ "$delta" -lt "$MIN_WAIT_SUCCESS" ]; then
+                        runme=0
+                        log "Skipping $host:$port $version/$cluster because last success was only $(format_timedelta "${delta}") (< $(format_timedelta "${MIN_WAIT_SUCCESS}")) ago."
+                    elif [ "$delta" -gt "$MAX_WAIT_SUCCESS" ]; then
+                        runme=1
+                        log "Planning to run $host:$port $version/$cluster because last success was $(format_timedelta "${delta}") (>= $(format_timedelta "${MAX_WAIT_SUCCESS}")) ago."
+                    else
+                        # get a "randomish" but stable value for this backup run
+                        val=$(echo "$MYHOSTNAME-$host-$port-$mtime" | sha256sum | head -c 8)
+                        val=$((16#$val))
+                        rnd_cuttoff=$(($val % $DELTA_WAIT_SUCCESS))
+                        age_after_min=$((delta - MIN_WAIT_SUCCESS))
+                        if [ "$age_after_min" -lt "$rnd_cuttoff" ]; then
+                            runme=0
+                            log "Skipping $host:$port $version/$cluster because random computer says wait ([$(format_timedelta "${age_after_min}") < $(format_timedelta "${rnd_cuttoff}") (< $(format_timedelta "${DELTA_WAIT_SUCCESS}"))] + $(format_timedelta "${MIN_WAIT_SUCCESS}"))."
+                        else
+                            runme=1
+                            log "Planning to run $host:$port $version/$cluster because random computer says so ($(format_timedelta "${age_after_min}") >= $(format_timedelta "${rnd_cuttoff}"))."
+                        fi
+                    fi
+                fi
+            fi
+        fi
+    fi
+
+    if [ "$runme" -gt 0 ]; then
+        touch "$flagfile"
+        exec 201< "$flagfile"
+        if flock -w 0 -e 201; then
+            log "Running $host:$port $version/$cluster."
+            /usr/local/bin/postgres-make-one-base-backup "$host" "$port" "$username" "$cluster" "$version"
+            rc=$?
+            [ "$rc" = 0 ] && touch "$flagfilesuccess"
+            flock -u 201
+        else
+            log "Cannot acquire lock on $flagfile, skipping $host:$port $version/$cluster."
+        fi
+    fi
 done << EOF
 seger.debian.org       5432    debian-backup           dak             9.6
 bmdb1.debian.org       5435    debian-backup           main            9.6
@@ -95,6 +165,7 @@ bmdb1.debian.org     5436    debian-backup           wannabuild      9.6
 bmdb1.debian.org       5440    debian-backup           debsources      9.6
 fasolo.debian.org      5433    debian-backup           dak             9.6
 sibelius.debian.org    5433    debian-backup           snapshot        9.4
+sallinen.debian.org    5473    debian-backup           snapshot        9.6
 <%- if @hostname != "backuphost" -%>
 moszumanska.debian.org 5432    debian-backup           main            9.1
 <%- end -%>