port dsa-check-memory to python3
[mirror/dsa-nagios.git] / dsa-nagios-checks / checks / dsa-check-backuppg
index 92ce6cb..ce6a9a0 100755 (executable)
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/python3
 
 # Copyright 2010 Peter Palfrader
 #
@@ -47,9 +47,11 @@ import copy
 import time
 import re
 import os
+import errno
 import sys
 import yaml
 import optparse
+import socket
 
 def load_conf(cf):
     if cf is not None:
@@ -60,11 +62,12 @@ def load_conf(cf):
         configfile = '/etc/nagios/dsa-check-backuppg.conf'
 
     f = open(configfile)
-    config = yaml.load(f.read())
+    config = yaml.safe_load(f.read())
     f.close()
     return config
 
 
+notices_seq = []
 problems_seq = []
 problems_per_db = {}
 global_expires = []
@@ -75,6 +78,12 @@ global_expires = []
 #    global problems_per_db
 #    if not host in problems_per_db: problems_per_db[host] = {}
 #    problems_per_db[host][db] = True
+def note_info(key, value, pre=None):
+    """Queue an informational notice, prefixed with [pre] when given."""
+    if pre is not None:
+        notices_seq.append("[%s] %s: %s"%(pre, key, value))
+        return
+    notices_seq.append("%s: %s"%(key, value))
 
 def note_warning(key, value, pre=None):
     global problems_seq
@@ -89,11 +98,11 @@ def note_warning_db(host, db, key, value):
     problems_per_db[host][db] = True
 
 
-def wal_pre(w):
+def wal_pre(w, host, db):
     (w1,w2) = w
     if w2 == 0:
         w1 -= 1
-        w2 = 0xFE
+        w2 = 0xFF
     else:
         w2 -= 1
 
@@ -108,6 +117,25 @@ def parse_pg_backup_info(fn):
     f.close()
     return i
 
+def get_retention(config, host, db):
+    """Return the int retention for host/db: the db's own 'retention',
+    else the host's '_retention', else the global config['retention']."""
+    assert('retention' in config)
+    assert('backups' in config)
+    assert(isinstance(config['backups'], dict))
+    assert(host in config['backups'])
+    hostconf = config['backups'][host]
+    assert(isinstance(hostconf, dict))
+    assert(db in hostconf)
+    dbconf = hostconf[db]
+    if isinstance(dbconf, dict) and 'retention' in dbconf:
+        r = dbconf['retention']
+    elif '_retention' in hostconf:
+        r = hostconf['_retention']
+    else:
+        r = config['retention']
+    assert(isinstance(r, int))
+    return r
 
 parser = optparse.OptionParser()
 parser.set_usage("%prog [-c=<CONFFILE>]               (nagios mode)\n" +
@@ -131,8 +159,23 @@ config = load_conf(options.conffile)
 
 os.chdir(config['rootdir'])
 for dir in os.listdir('.'):
+    if dir.startswith('.') or dir.endswith('.old') or dir == 'lost+found':
+        note_info('IGNORED', dir)
+        continue
+
     if not os.path.isdir(dir):
-        note_warning('NOT-A-DIR', dir)
+        try:
+            st = os.stat(dir)
+        except FileNotFoundError:
+            # Gone since listdir() (ENOENT): nothing left to report.
+            continue
+        # Stay quiet only while both mtime and ctime are under four hours
+        # old; a stray non-directory older than that is worth a warning.
+        oldest_stamp = min(st.st_mtime, st.st_ctime)
+        if oldest_stamp + 3600*4 > time.time():
+            note_info('IGNORED', dir)
+        else:
+            note_warning('NOT-A-DIR', dir)
         continue
 
     if not dir in config['backups']:
@@ -146,7 +189,7 @@ for dir in os.listdir('.'):
 
     files.sort()
 
-    unhandled_backups = copy.copy(config['backups'][dir])
+    notyetseen_dbs = copy.copy(config['backups'][dir])
     ignored_dbs = {}
     backup_state = {}
 
@@ -168,7 +211,7 @@ for dir in os.listdir('.'):
             continue
 
         (db, type) = r.groups(1)
-        if not isinstance(config['backups'][dir], list) or not db in config['backups'][dir]:
+        if not isinstance(config['backups'][dir], dict) or not db in config['backups'][dir]:
             if not db in ignored_dbs:
                 note_warning_db(dir, db, 'NOT-CONFIGURED', '%s/%s'%(dir, db))
             ignored_dbs[db] = True
@@ -184,6 +227,10 @@ for dir in os.listdir('.'):
             # can_expire_next: Can expire all files that we handle from now on
             backup_state[db]['can_expire_next'] = False
             backup_state[db]['expires'] = []
+            db_settings = config['backups'][dir][db]
+            # Expected WAL timeline: the db's own 'timeline' setting, else 1.
+            has_timeline = isinstance(db_settings, dict) and 'timeline' in db_settings
+            backup_state[db]['timeline'] = db_settings['timeline'] if has_timeline else 1
 
         # Apparently we already have seen a base backup and all its wal files
         # which we want to keep, so everything what we see now is older than
@@ -205,8 +252,17 @@ for dir in os.listdir('.'):
                 basefn = '%s.BASE.%s-%s.tar.gz'%(db, info['label'], info['start wal location'].split(' ',2)[0].replace('/', '_'))
                 baseffn = os.path.join(dir, basefn)
                 if not basefn in files:
-                    note_warning_db(dir, db, 'MISSING-BASE', basefn)
-                    continue
+                    basefn = '%s.BASE.%s.tar.gz'%(db, info['label'])  # retry without the WAL-location suffix
+                    baseffn = os.path.join(dir, basefn)
+                    if not basefn in files:
+                        # Label naming another host's FQDN: note only (raw string, '\d' is no str escape).
+                        m = re.match(r'([a-z0-9.]+)-\d{8}-\d{6}', info['label'])
+                        if m and (m.group(1) != socket.getfqdn()):
+                            note_info(dir, 'IGNORED-OTHER-BASE: '+basefn)
+                        else:
+                            note_warning_db(dir, db, 'MISSING-BASE', basefn)
+                        continue
+                if db in notyetseen_dbs: del notyetseen_dbs[db]
                 files.remove(basefn)
                 if backup_state[db]['can_expire_next']:
                     backup_state[db]['expires'].append(baseffn)
@@ -224,7 +280,7 @@ for dir in os.listdir('.'):
                 backup_state[db]['base_needs_wal_until'] = walbase
 
                 start = time.mktime(time.strptime(info['start time'], '%Y-%m-%d %H:%M:%S %Z'))
-                if start + config['retention'] < time.time():
+                if start + get_retention(config, dir, db) < time.time():
                     backup_state[db]['can_expire_for_base_hit'] = True
                 continue
 
@@ -238,8 +294,8 @@ for dir in os.listdir('.'):
                             backup_state[db]['can_expire_next'] = True
 
                 (timeline, wal1, wal2) = map(lambda x: int(x,16), r.groups())
-                if not timeline == 1:
-                    note_warning_db(dir, db, 'CANNOT-HANDLE-TIMELINES_NOT_1', ffn)
+                if not timeline == backup_state[db]['timeline']:
+                    note_warning_db(dir, db, 'UNEXPECTED-TIMELINE', ffn)
                     continue
 
                 thissegment = (wal1, wal2)
@@ -247,7 +303,7 @@ for dir in os.listdir('.'):
                     backup_state[db]['newest-wal'] = thissegment
                     backup_state[db]['newest-wal-file'] = ffn
                 else:
-                    if not wal_pre(backup_state[db]['oldest-wal']) == thissegment:
+                    if not wal_pre(backup_state[db]['oldest-wal'], dir, db) == thissegment:
                         note_warning_db(dir, db, 'WAL-MISSING-AFTER', ffn)
                         ignored_dbs[db] = True
                         continue
@@ -288,6 +344,10 @@ for dir in os.listdir('.'):
                 for f in backup_state[db]['expires']:
                     global_expires.append(f)
 
+    # Keys starting with '_' (e.g. '_retention') are host settings, not dbs.
+    for db in notyetseen_dbs:
+        if not db.startswith('_'):
+            note_warning_db(dir, db, 'NO-BACKUP', 'no backups! (no .backup files found)')
     #if not db in backup_state:
     #    note_warning('BASE-WITHOUT-WAL', ffn)
     #    ignored_dbs[db] = True
@@ -301,18 +361,21 @@ for dir in os.listdir('.'):
 
 
 for p in problems_seq:
-    print p
+    print(p)
+if options.verbose:
+    for p in notices_seq:
+        print(p)
 
 if options.expire:
     for f in global_expires:
-        if options.verbose: print "Expiring %s"%(f)
+        if options.verbose: print("Expiring %s" % f)
         if not options.dry_run: os.unlink(f)
 
 if len(problems_seq) > 0:
     sys.exit(1)
 
 if not options.expire or options.verbose:
-    print "OK: no problems detected"
+    print("OK: no problems detected")
 sys.exit(0)
 
 # vim:set et: