#!/usr/bin/python3

# Copyright 2010 Peter Palfrader
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

# Checks if the WAL backups for several postgres clusters from
# different hosts are current.  Might not catch all error instances.
#
# If called with -e will expire WALs and BASE backups no longer required.
#
# Needs files laid out like so:
# beethoven:/srv/pgbackup/pg# ls -l ries/ | head
# total 6794956
# -rw------- 1 debbackup debbackup 378099591 May 1 02:26 dak.BASE.20100501-ries-dak-8.4-backup-F_37000020.tar.gz
# -rw------- 1 debbackup debbackup 382267407 May 8 02:25 dak.BASE.20100508-ries-dak-8.4-backup-F_A2000020.tar.gz
# -rw------- 1 debbackup debbackup 384054069 May 15 02:24 dak.BASE.20100515-ries-dak-8.4-backup-F_FB000020.tar.gz
# -rw------- 1 debbackup debbackup 386407500 May 22 02:27 dak.BASE.20100522-ries-dak-8.4-backup-10_58000020.tar.gz
# -rw------- 1 debbackup debbackup 16777216 May 1 02:26 dak.WAL.000000010000000F00000037
# -rw------- 1 debbackup debbackup 264 May 1 02:27 dak.WAL.000000010000000F00000037.00000020.backup
# -rw------- 1 debbackup debbackup 16777216 May 1 03:25 dak.WAL.000000010000000F00000038
# -rw------- 1 debbackup debbackup 16777216 May 1 09:11 dak.WAL.000000010000000F00000039
# -rw------- 1 debbackup debbackup 16777216 May 1 09:45 dak.WAL.000000010000000F0000003A
# ...
#
# needs write privileges to at least the .backup files

import copy
import time
import re
import os
import errno
import sys
import optparse
import socket

# Default configuration file, used when neither -c nor the
# DSA_CHECK_BACKUPPG_CONF environment variable is given.
DEFAULT_CONFIGFILE = '/etc/nagios/dsa-check-backuppg.conf'

# Non-directory entries in rootdir younger than this many seconds are only
# reported as notices, older ones as NOT-A-DIR warnings.
STRAY_FILE_GRACE_SECONDS = 3600 * 4

# <db>.<WAL|BASE>.<anything>
BACKUP_FILE_RE = re.compile(r'([a-z0-9-]+)\.(WAL|BASE)\..*')
# <db>.WAL.<timeline><wal1><wal2>.<offset>.backup
WAL_BACKUP_RE = re.compile(
    r'[a-z0-9-]+\.WAL\.([0-9A-F]{8})([0-9A-F]{8})([0-9A-F]{8})\.[0-9A-F]{8}\.backup')
# <db>.WAL.<timeline><wal1><wal2>
WAL_SEGMENT_RE = re.compile(
    r'[a-z0-9-]+\.WAL\.([0-9A-F]{8})([0-9A-F]{8})([0-9A-F]{8})')
# Backup label of the form <hostname>-<8 digits>-<6 digits>
LABEL_HOST_RE = re.compile(r'([a-z0-9.]+)-\d{8}-\d{6}')
# "(file <24 hex digits>)" part of the START WAL LOCATION line
START_WAL_FILE_RE = re.compile(r'\(file ([0-9A-F]{24})\)')

notices_seq = []
problems_seq = []
problems_per_db = {}
global_expires = []


def load_conf(cf):
    """Load and return the YAML configuration.

    The file is taken from *cf* if it is not None, else from the
    DSA_CHECK_BACKUPPG_CONF environment variable, else from
    DEFAULT_CONFIGFILE.
    """
    # Imported here because this is the only user of PyYAML; the remaining
    # helpers stay importable on systems without it.
    import yaml

    if cf is not None:
        configfile = cf
    elif 'DSA_CHECK_BACKUPPG_CONF' in os.environ:
        configfile = os.environ['DSA_CHECK_BACKUPPG_CONF']
    else:
        configfile = DEFAULT_CONFIGFILE

    with open(configfile) as f:
        return yaml.safe_load(f.read())


def note_info(key, value, pre=None):
    """Record an informational notice (only printed in verbose mode)."""
    if pre is None:
        notices_seq.append("%s: %s" % (key, value))
    else:
        notices_seq.append("[%s] %s: %s" % (pre, key, value))


def note_warning(key, value, pre=None):
    """Record a problem; any recorded problem makes the check exit 1."""
    if pre is None:
        problems_seq.append("%s: %s" % (key, value))
    else:
        problems_seq.append("[%s] %s: %s" % (pre, key, value))


def note_warning_db(host, db, key, value):
    """Record a problem for one database and mark that (host, db) as faulty.

    Databases marked here are never expired (see check_host_dir).
    """
    note_warning(key, value, "%s, %s" % (host, db))
    if host not in problems_per_db:
        problems_per_db[host] = {}
    problems_per_db[host][db] = True


def wal_pre(w, host, db):
    """Return the WAL segment id that directly precedes *w*.

    *w* is a (wal1, wal2) tuple made of the second and third 8-hex-digit
    fields of a WAL file name.  When wal2 is 0 the predecessor is
    (wal1 - 1, 0xFF).  *host* and *db* are accepted for interface
    compatibility and are not used.
    """
    (w1, w2) = w
    if w2 == 0:
        w1 -= 1
        w2 = 0xFF
    else:
        w2 -= 1
    return (w1, w2)


def parse_pg_backup_info(fn):
    """Parse a .backup file of "KEY: value" lines into a dict.

    Keys are lower-cased.  Each line is split on the first ': ' only, so
    values that themselves contain ': ' are kept intact.  Blank lines are
    skipped; a non-blank line without ': ' raises ValueError.
    """
    info = {}
    with open(fn) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            (k, v) = line.split(': ', 1)
            info[k.lower()] = v
    return info


def get_retention(config, host, db):
    """Return the retention period (an int) configured for *db* on *host*.

    Precedence: the database's own 'retention' entry, then the host's
    '_retention' entry, then the global 'retention' entry.  The value is
    added to a time.mktime() timestamp and compared with time.time() by
    the caller, i.e. it is used as a number of seconds.
    """
    assert('retention' in config)
    assert('backups' in config)
    assert(isinstance(config['backups'], dict))
    assert(host in config['backups'])
    assert(isinstance(config['backups'][host], dict))
    assert(db in config['backups'][host])

    host_conf = config['backups'][host]
    db_conf = host_conf[db]
    if isinstance(db_conf, dict) and 'retention' in db_conf:
        r = db_conf['retention']
    elif '_retention' in host_conf:
        r = host_conf['_retention']
    else:
        r = config['retention']

    assert(isinstance(r, int))
    return r


def check_host_dir(config, hostdir):
    """Check one entry of the backup root directory.

    *hostdir* is a name relative to the current working directory (main()
    changes into config['rootdir'] first).  Findings are recorded through
    note_info()/note_warning()/note_warning_db(); files that may be
    removed are appended to global_expires.
    """
    if hostdir.startswith('.') or hostdir.endswith('.old') or hostdir == 'lost+found':
        note_info('IGNORED', hostdir)
        return

    if not os.path.isdir(hostdir):
        try:
            mtime = os.path.getmtime(hostdir)
            ctime = os.path.getctime(hostdir)
        except OSError as e:
            # The entry vanished between listdir() and stat(): nothing to do.
            if e.errno == errno.ENOENT:
                return
            raise
        if min(mtime, ctime) + STRAY_FILE_GRACE_SECONDS > time.time():
            note_info('IGNORED', hostdir)
        else:
            note_warning('NOT-A-DIR', hostdir)
        return

    if hostdir not in config['backups']:
        note_warning('NOT-CONFIGURED', hostdir)
        return

    files = os.listdir(hostdir)
    if not files:
        note_warning('EMPTY-DIR', hostdir)
        return
    files.sort()

    host_conf = config['backups'][hostdir]
    notyetseen_dbs = copy.copy(host_conf)
    ignored_dbs = {}
    backup_state = {}

    # Go over all the files in a directory and check for various things
    #  - for a given cluster's backups we want the latest WAL file to be no
    #    older than a certain age,
    #  - we want all consecutive WAL files, i.e. no holes
    #  - we want a full backup at one point, and it shouldn't be too old
    #  - If our retention period is say 2 weeks, then we look for the
    #    tar file that's older than that, and everything before that can
    #    be expired
    #
    # The sorted list is consumed from its end, i.e. in reverse name order.
    while files:
        fn = files.pop()
        ffn = os.path.join(hostdir, fn)

        r = BACKUP_FILE_RE.match(fn)
        if not r:
            note_warning('CANNOT-PARSE', ffn)
            continue

        (db, kind) = r.groups()
        if not isinstance(host_conf, dict) or db not in host_conf:
            # Warn only once per unconfigured database.
            if db not in ignored_dbs:
                note_warning_db(hostdir, db, 'NOT-CONFIGURED', '%s/%s' % (hostdir, db))
            ignored_dbs[db] = True
        if db in ignored_dbs:
            continue

        if db not in backup_state:
            db_conf = host_conf[db]
            if isinstance(db_conf, dict) and 'timeline' in db_conf:
                expected_timeline = db_conf['timeline']
            else:
                expected_timeline = 1
            backup_state[db] = {
                # can_expire_for_base_hit: We hit a BASE backup that is old
                # enough so that once we hit all the required WAL files for
                # this base backup to work we can start expiring everything
                # older than that oldest WAL file
                'can_expire_for_base_hit': False,
                # can_expire_next: Can expire all files that we handle from
                # now on
                'can_expire_next': False,
                'expires': [],
                'timeline': expected_timeline,
            }
        state = backup_state[db]

        # Apparently we already have seen a base backup and all its wal files
        # which we want to keep, so everything what we see now is older than
        # that and we can get rid of it
        if state['can_expire_next']:
            state['expires'].append(ffn)

        if kind == 'BASE':
            # should have been taken care of before
            # while handling a WAL.backup file
            note_warning_db(hostdir, db, 'STRAY-BASE', ffn)
            continue
        elif kind == 'WAL':
            # handle .backup files - they live near the WAL "file namespace"
            # and reference the corresponding full backup
            r = WAL_BACKUP_RE.match(fn)
            if r:
                info = parse_pg_backup_info(ffn)
                start_location = info['start wal location'].split(' ', 1)[0]
                basefn = '%s.BASE.%s-%s.tar.gz' % (db, info['label'], start_location.replace('/', '_'))
                baseffn = os.path.join(hostdir, basefn)
                if basefn not in files:
                    # Fall back to the base file name without WAL location.
                    basefn = '%s.BASE.%s.tar.gz' % (db, info['label'])
                    baseffn = os.path.join(hostdir, basefn)
                    if basefn not in files:
                        m = LABEL_HOST_RE.match(info['label'])
                        if m and (m.group(1) != socket.getfqdn()):
                            # The label names a different host than this one.
                            note_info(hostdir, 'IGNORED-OTHER-BASE: ' + basefn)
                            continue
                        else:
                            note_warning_db(hostdir, db, 'MISSING-BASE', basefn)
                            continue
                if db in notyetseen_dbs:
                    del notyetseen_dbs[db]
                # The base tarball is accounted for here, so it must not be
                # reported as STRAY-BASE later on.
                files.remove(basefn)
                if state['can_expire_next']:
                    state['expires'].append(baseffn)

                if 'newest-base' not in state:
                    state['newest-base'] = baseffn
                state['oldest-base'] = baseffn

                startre = START_WAL_FILE_RE.search(info['start wal location'])
                if not startre:
                    note_warning_db(hostdir, db, 'CANNOT-PARSE-START_WAL_LOCATION', ffn)
                    continue
                start_file = startre.group(1)
                walbase = '%s.WAL.%s' % (db, start_file)
                state['base_needs_wal_until'] = walbase

                start = time.mktime(time.strptime(info['start time'], '%Y-%m-%d %H:%M:%S %Z'))
                if start + get_retention(config, hostdir, db) < time.time():
                    state['can_expire_for_base_hit'] = True
                continue

            # handle WAL files
            r = WAL_SEGMENT_RE.match(fn)
            if r:
                if 'base_needs_wal_until' in state:
                    if state['base_needs_wal_until'] == fn:
                        # This is the first WAL file the pending base backup
                        # needs; anything older is only needed by older bases.
                        del state['base_needs_wal_until']
                        if state['can_expire_for_base_hit']:
                            state['can_expire_next'] = True

                (timeline, wal1, wal2) = [int(x, 16) for x in r.groups()]
                if not timeline == state['timeline']:
                    note_warning_db(hostdir, db, 'UNEXPECTED-TIMELINE', ffn)
                    continue

                thissegment = (wal1, wal2)
                if 'newest-wal' not in state:
                    state['newest-wal'] = thissegment
                    state['newest-wal-file'] = ffn
                else:
                    if not wal_pre(state['oldest-wal'], hostdir, db) == thissegment:
                        # Hole in the WAL sequence: stop looking at this db.
                        note_warning_db(hostdir, db, 'WAL-MISSING-AFTER', ffn)
                        ignored_dbs[db] = True
                        continue
                state['oldest-wal'] = thissegment

                continue

            note_warning_db(hostdir, db, 'CANNOT-PARSE-WAL', ffn)
        else:
            note_warning_db(hostdir, db, 'INVALID-TYPE', ffn)

    for db in backup_state:
        if 'base_needs_wal_until' in backup_state[db]:
            note_warning_db(hostdir, db, 'MISSING_WAL_FOR_BASE', backup_state[db]['base_needs_wal_until'])

    for db in backup_state:
        state = backup_state[db]
        if 'newest-base' not in state:
            note_warning_db(hostdir, db, 'NO-BASE', 'no base backup found?')
        else:
            age = time.time() - os.stat(state['newest-base']).st_mtime
            if age > config['warn-age']['base']:
                note_warning_db(hostdir, db, 'BASE-IS-OLD', 'latest base backup is too old')

        if 'newest-wal-file' not in state:
            # NOTE(review): the key 'NO-BASE' for a missing-WAL condition
            # looks like a copy-paste slip; kept as-is because monitoring
            # may match on the emitted text - confirm before renaming.
            note_warning_db(hostdir, db, 'NO-BASE', 'no WAL files found?')
        else:
            age = time.time() - os.stat(state['newest-wal-file']).st_mtime
            if age > config['warn-age']['wal']:
                note_warning_db(hostdir, db, 'WAL-IS-OLD', 'latest wal file is too old')

    for db in backup_state:
        state = backup_state[db]
        if len(state['expires']) > 0:
            if hostdir in problems_per_db and db in problems_per_db[hostdir] and problems_per_db[hostdir][db]:
                note_warning_db(hostdir, db, 'NOT-EXPIRING-DUE-TO-WARNINGS', 'have seen warnings, will not expire anything')
            else:
                # Files were collected in reverse name order; queue them in
                # ascending name order.
                state['expires'].reverse()
                global_expires.extend(state['expires'])

    for db in notyetseen_dbs:
        # Keys starting with '_' are host-level settings (e.g. '_retention'),
        # not databases.
        if db.startswith('_'):
            continue
        note_warning_db(hostdir, db, 'NO-BACKUP', 'no backups! (no .backup files found)')


def main():
    """Parse the command line, run all checks, report and optionally expire."""
    parser = optparse.OptionParser()
    parser.set_usage("%prog [-c=] (nagios mode)\n" +
                     "Usage: %prog [-c=] -e [-d] [-v] (expire mode)")
    parser.add_option("-c", "--config", dest="conffile", metavar="CONFFILE",
                      help="Config file location.")
    parser.add_option("-e", "--expire", dest="expire", action="store_true",
                      help="Expire old files.")
    parser.add_option("-d", "--dry-run", dest="dry_run", action="store_true",
                      help="Do not really remove files.")
    parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
                      help="List files we are expiring.")
    (options, args) = parser.parse_args()

    if len(args) > 0:
        parser.print_help()
        sys.exit(1)

    config = load_conf(options.conffile)

    # All paths handled below are relative to the backup root.
    os.chdir(config['rootdir'])
    for hostdir in os.listdir('.'):
        check_host_dir(config, hostdir)

    for p in problems_seq:
        print(p)
    if options.verbose:
        for p in notices_seq:
            print(p)

    if options.expire:
        for f in global_expires:
            if options.verbose:
                print("Expiring %s" % f)
            if not options.dry_run:
                os.unlink(f)

    if len(problems_seq) > 0:
        sys.exit(1)

    if not options.expire or options.verbose:
        print("OK: no problems detected")
    sys.exit(0)


if __name__ == '__main__':
    main()

# vim:set et:
# vim:set ts=4:
# vim:set shiftwidth=4: