X-Git-Url: https://git.adam-barratt.org.uk/?a=blobdiff_plain;f=dsa-nagios-checks%2Fchecks%2Fdsa-check-backuppg;h=ce6a9a0efacb470d4aa96d4bb108e66c7fb51934;hb=2fa1a6e542f4cc81d5eafcb89856a25fdee94d16;hp=92ce6cb6731bfdbb08ecc843ee3f8efbfad8b74c;hpb=3e579ecf2993b0d53b52d9cf5f2b65f65d56058e;p=mirror%2Fdsa-nagios.git diff --git a/dsa-nagios-checks/checks/dsa-check-backuppg b/dsa-nagios-checks/checks/dsa-check-backuppg index 92ce6cb..ce6a9a0 100755 --- a/dsa-nagios-checks/checks/dsa-check-backuppg +++ b/dsa-nagios-checks/checks/dsa-check-backuppg @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 # Copyright 2010 Peter Palfrader # @@ -47,9 +47,11 @@ import copy import time import re import os +import errno import sys import yaml import optparse +import socket def load_conf(cf): if cf is not None: @@ -60,11 +62,12 @@ def load_conf(cf): configfile = '/etc/nagios/dsa-check-backuppg.conf' f = open(configfile) - config = yaml.load(f.read()) + config = yaml.safe_load(f.read()) f.close() return config +notices_seq = [] problems_seq = [] problems_per_db = {} global_expires = [] @@ -75,6 +78,12 @@ global_expires = [] # global problems_per_db # if not host in problems_per_db: problems_per_db[host] = {} # problems_per_db[host][db] = True +def note_info(key, value, pre=None): + global notices_seq + if pre is None: + notices_seq.append("%s: %s"%(key, value)) + else: + notices_seq.append("[%s] %s: %s"%(pre, key, value)) def note_warning(key, value, pre=None): global problems_seq @@ -89,11 +98,11 @@ def note_warning_db(host, db, key, value): problems_per_db[host][db] = True -def wal_pre(w): +def wal_pre(w, host, db): (w1,w2) = w if w2 == 0: w1 -= 1 - w2 = 0xFE + w2 = 0xFF else: w2 -= 1 @@ -108,6 +117,25 @@ def parse_pg_backup_info(fn): f.close() return i +def get_retention(config, host, db): + assert('retention' in config) + + assert('backups' in config) + assert(isinstance(config['backups'], dict)) + + assert(host in config['backups']) + assert(isinstance(config['backups'][host], dict)) + + assert(db in config['backups'][host]) + if isinstance(config['backups'][host][db], dict) and 'retention' in config['backups'][host][db]: + r = config['backups'][host][db]['retention'] + elif '_retention' in config['backups'][host]: + r = config['backups'][host]['_retention'] + else: + r = config['retention'] + + assert(isinstance(r, int)) + return r parser = optparse.OptionParser() parser.set_usage("%prog [-c=] (nagios mode)\n" + @@ -131,8 +159,23 @@ config = load_conf(options.conffile) os.chdir(config['rootdir']) for dir in os.listdir('.'): + if dir.startswith('.') or dir.endswith('.old') or dir == 'lost+found': + note_info('IGNORED', dir) + continue + if not os.path.isdir(dir): - note_warning('NOT-A-DIR', dir) + try: + mtime = os.path.getmtime(dir) + ctime = os.path.getctime(dir) + except OSError as e: + if e.errno == errno.ENOENT: + continue + else: + raise e + if min(mtime, ctime) + 3600*4 > time.time(): + note_info('IGNORED', dir) + else: + note_warning('NOT-A-DIR', dir) continue if not dir in config['backups']: @@ -146,7 +189,7 @@ for dir in os.listdir('.'): files.sort() - unhandled_backups = copy.copy(config['backups'][dir]) + notyetseen_dbs = copy.copy(config['backups'][dir]) ignored_dbs = {} backup_state = {} @@ -168,7 +211,7 @@ for dir in os.listdir('.'): continue (db, type) = r.groups(1) - if not isinstance(config['backups'][dir], list) or not db in config['backups'][dir]: + if not isinstance(config['backups'][dir], dict) or not db in config['backups'][dir]: if not db in ignored_dbs: note_warning_db(dir, db, 'NOT-CONFIGURED', '%s/%s'%(dir, db)) ignored_dbs[db] = True @@ -184,6 +227,10 @@ for dir in os.listdir('.'): # can_expire_next: Can expire all files that we handle from now on backup_state[db]['can_expire_next'] = False backup_state[db]['expires'] = [] + if isinstance(config['backups'][dir][db], dict) and 'timeline' in config['backups'][dir][db]: + backup_state[db]['timeline'] = config['backups'][dir][db]['timeline'] + else: + backup_state[db]['timeline'] = 1 # Apparently we already have seen a base backup and all its wal files # which we want to keep, so everything what we see now is older than @@ -205,8 +252,17 @@ for dir in os.listdir('.'): basefn = '%s.BASE.%s-%s.tar.gz'%(db, info['label'], info['start wal location'].split(' ',2)[0].replace('/', '_')) baseffn = os.path.join(dir, basefn) if not basefn in files: - note_warning_db(dir, db, 'MISSING-BASE', basefn) - continue + basefn = '%s.BASE.%s.tar.gz'%(db, info['label']) + baseffn = os.path.join(dir, basefn) + if not basefn in files: + m = re.match('([a-z0-9.]+)-\d{8}-\d{6}', info['label']) + if m and (m.group(1) != socket.getfqdn()): + note_info(dir, 'IGNORED-OTHER-BASE: '+basefn) + continue + else: + note_warning_db(dir, db, 'MISSING-BASE', basefn) + continue + if db in notyetseen_dbs: del notyetseen_dbs[db] files.remove(basefn) if backup_state[db]['can_expire_next']: backup_state[db]['expires'].append(baseffn) @@ -224,7 +280,7 @@ for dir in os.listdir('.'): backup_state[db]['base_needs_wal_until'] = walbase start = time.mktime(time.strptime(info['start time'], '%Y-%m-%d %H:%M:%S %Z')) - if start + config['retention'] < time.time(): + if start + get_retention(config, dir, db) < time.time(): backup_state[db]['can_expire_for_base_hit'] = True continue @@ -238,8 +294,8 @@ for dir in os.listdir('.'): backup_state[db]['can_expire_next'] = True (timeline, wal1, wal2) = map(lambda x: int(x,16), r.groups()) - if not timeline == 1: - note_warning_db(dir, db, 'CANNOT-HANDLE-TIMELINES_NOT_1', ffn) + if not timeline == backup_state[db]['timeline']: + note_warning_db(dir, db, 'UNEXPECTED-TIMELINE', ffn) continue thissegment = (wal1, wal2) @@ -247,7 +303,7 @@ for dir in os.listdir('.'): backup_state[db]['newest-wal'] = thissegment backup_state[db]['newest-wal-file'] = ffn else: - if not wal_pre(backup_state[db]['oldest-wal']) == thissegment: + if not wal_pre(backup_state[db]['oldest-wal'], dir, db) == thissegment: note_warning_db(dir, db, 'WAL-MISSING-AFTER', ffn) ignored_dbs[db] = True continue @@ -288,6 +344,10 @@ for dir in os.listdir('.'): for f in backup_state[db]['expires']: global_expires.append(f) + for db in notyetseen_dbs: + if db.startswith('_'): continue + note_warning_db(dir, db, 'NO-BACKUP', 'no backups! (no .backup files found)') + #if not db in backup_state: # note_warning('BASE-WITHOUT-WAL', ffn) # ignored_dbs[db] = True @@ -301,18 +361,21 @@ for dir in os.listdir('.'): for p in problems_seq: - print p + print(p) +if options.verbose: + for p in notices_seq: + print(p) if options.expire: for f in global_expires: - if options.verbose: print "Expiring %s"%(f) + if options.verbose: print("Expiring %s" % f) if not options.dry_run: os.unlink(f) if len(problems_seq) > 0: sys.exit(1) if not options.expire or options.verbose: - print "OK: no problems detected" + print("OK: no problems detected") sys.exit(0) # vim:set et: