dsa-check-backuppg: cluster names starting with _ are not really cluster names
[mirror/dsa-nagios.git] / dsa-nagios-checks / checks / dsa-check-backuppg
index 10d0033..8704f3c 100755 (executable)
@@ -62,7 +62,7 @@ def load_conf(cf):
         configfile = '/etc/nagios/dsa-check-backuppg.conf'
 
     f = open(configfile)
-    config = yaml.load(f.read())
+    config = yaml.safe_load(f.read())
     f.close()
     return config
 
@@ -120,6 +120,25 @@ def parse_pg_backup_info(fn):
     f.close()
     return i
 
+def get_retention(config, host, db):  # effective retention for db on host; the most specific setting wins
+    assert('retention' in config)
+    # config['backups'] must be a dict keyed by host.
+    assert('backups' in config)
+    assert(isinstance(config['backups'], dict))
+    # The host's entry must itself be a dict keyed by db name.
+    assert(host in config['backups'])
+    assert(isinstance(config['backups'][host], dict))
+    # Lookup order: per-db 'retention', then per-host '_retention', then the global 'retention'.
+    assert(db in config['backups'][host])
+    if isinstance(config['backups'][host][db], dict) and 'retention' in config['backups'][host][db]:
+        r = config['backups'][host][db]['retention']
+    elif '_retention' in config['backups'][host]:
+        r = config['backups'][host]['_retention']
+    else:
+        r = config['retention']
+
+    assert(isinstance(r, int))  # NOTE(review): asserts are stripped under python -O, leaving the config unvalidated
+    return r
 
 parser = optparse.OptionParser()
 parser.set_usage("%prog [-c=<CONFFILE>]               (nagios mode)\n" +
@@ -173,7 +192,7 @@ for dir in os.listdir('.'):
 
     files.sort()
 
-    unhandled_backups = copy.copy(config['backups'][dir])
+    notyetseen_dbs = copy.copy(config['backups'][dir])
     ignored_dbs = {}
     backup_state = {}
 
@@ -195,7 +214,7 @@ for dir in os.listdir('.'):
             continue
 
         (db, type) = r.groups(1)
-        if not isinstance(config['backups'][dir], list) or not db in config['backups'][dir]:
+        if not isinstance(config['backups'][dir], dict) or not db in config['backups'][dir]:
             if not db in ignored_dbs:
                 note_warning_db(dir, db, 'NOT-CONFIGURED', '%s/%s'%(dir, db))
             ignored_dbs[db] = True
@@ -211,6 +230,10 @@ for dir in os.listdir('.'):
             # can_expire_next: Can expire all files that we handle from now on
             backup_state[db]['can_expire_next'] = False
             backup_state[db]['expires'] = []
+            if isinstance(config['backups'][dir][db], dict) and 'timeline' in config['backups'][dir][db]:
+                backup_state[db]['timeline'] = config['backups'][dir][db]['timeline']
+            else:
+                backup_state[db]['timeline'] = 1
 
         # Apparently we already have seen a base backup and all its wal files
         # which we want to keep, so everything what we see now is older than
@@ -242,6 +265,7 @@ for dir in os.listdir('.'):
                         else:
                             note_warning_db(dir, db, 'MISSING-BASE', basefn)
                             continue
+                if db in notyetseen_dbs: del notyetseen_dbs[db]
                 files.remove(basefn)
                 if backup_state[db]['can_expire_next']:
                     backup_state[db]['expires'].append(baseffn)
@@ -259,7 +283,7 @@ for dir in os.listdir('.'):
                 backup_state[db]['base_needs_wal_until'] = walbase
 
                 start = time.mktime(time.strptime(info['start time'], '%Y-%m-%d %H:%M:%S %Z'))
-                if start + config['retention'] < time.time():
+                if start + get_retention(config, dir, db) < time.time():
                     backup_state[db]['can_expire_for_base_hit'] = True
                 continue
 
@@ -273,8 +297,8 @@ for dir in os.listdir('.'):
                             backup_state[db]['can_expire_next'] = True
 
                 (timeline, wal1, wal2) = map(lambda x: int(x,16), r.groups())
-                if not timeline == 1:
-                    note_warning_db(dir, db, 'CANNOT-HANDLE-TIMELINES_NOT_1', ffn)
+                if not timeline == backup_state[db]['timeline']:
+                    note_warning_db(dir, db, 'UNEXPECTED-TIMELINE', ffn)
                     continue
 
                 thissegment = (wal1, wal2)
@@ -323,6 +347,10 @@ for dir in os.listdir('.'):
                 for f in backup_state[db]['expires']:
                     global_expires.append(f)
 
+    for db in notyetseen_dbs:
+        if db.startswith('_'): continue
+        note_warning_db(dir, db, 'NO-BACKUP', 'no backups! (no .backup files found)')
+
     #if not db in backup_state:
     #    note_warning('BASE-WITHOUT-WAL', ffn)
     #    ignored_dbs[db] = True
@@ -337,8 +365,9 @@ for dir in os.listdir('.'):
 
 for p in problems_seq:
     print p
-for p in notices_seq:
-    print p
+if options.verbose:
+    for p in notices_seq:
+        print p
 
 if options.expire:
     for f in global_expires: