3 # Copyright 2010 Peter Palfrader
5 # Permission is hereby granted, free of charge, to any person obtaining
6 # a copy of this software and associated documentation files (the
7 # "Software"), to deal in the Software without restriction, including
8 # without limitation the rights to use, copy, modify, merge, publish,
9 # distribute, sublicense, and/or sell copies of the Software, and to
10 # permit persons to whom the Software is furnished to do so, subject to
11 # the following conditions:
13 # The above copyright notice and this permission notice shall be
14 # included in all copies or substantial portions of the Software.
16 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 # Checks if the WAL backups for several postgres clusters from
25 # different hosts are current. Might not catch all error instances.
27 # If called with -e will expire WALs and BASE backups no longer required.
29 # Needs files laid out like so:
30 # beethoven:/srv/pgbackup/pg# ls -l ries/ | head
32 # -rw------- 1 debbackup debbackup 378099591 May 1 02:26 dak.BASE.20100501-ries-dak-8.4-backup-F_37000020.tar.gz
33 # -rw------- 1 debbackup debbackup 382267407 May 8 02:25 dak.BASE.20100508-ries-dak-8.4-backup-F_A2000020.tar.gz
34 # -rw------- 1 debbackup debbackup 384054069 May 15 02:24 dak.BASE.20100515-ries-dak-8.4-backup-F_FB000020.tar.gz
35 # -rw------- 1 debbackup debbackup 386407500 May 22 02:27 dak.BASE.20100522-ries-dak-8.4-backup-10_58000020.tar.gz
36 # -rw------- 1 debbackup debbackup 16777216 May 1 02:26 dak.WAL.000000010000000F00000037
37 # -rw------- 1 debbackup debbackup 264 May 1 02:27 dak.WAL.000000010000000F00000037.00000020.backup
38 # -rw------- 1 debbackup debbackup 16777216 May 1 03:25 dak.WAL.000000010000000F00000038
39 # -rw------- 1 debbackup debbackup 16777216 May 1 09:11 dak.WAL.000000010000000F00000039
40 # -rw------- 1 debbackup debbackup 16777216 May 1 09:45 dak.WAL.000000010000000F0000003A
43 # needs write privileges to at least the .backup files
59 elif 'DSA_CHECK_BACKUPPG_CONF' in os.environ:
60 configfile = os.environ['DSA_CHECK_BACKUPPG_CONF']
62 configfile = '/etc/nagios/dsa-check-backuppg.conf'
65 config = yaml.safe_load(f.read())
74 #def note_warning(key, host, db, value):
76 # problems_seq.append("[%s, %s]: %s: %s"%(host, db, key, value))
78 # global problems_per_db
79 # if not host in problems_per_db: problems_per_db[host] = {}
80 # problems_per_db[host][db] = True
def note_info(key, value, pre=None):
    """Queue an informational notice for the final report.

    The entry is appended to the module-level ``notices_seq`` list, formatted
    as ``"key: value"`` or, when *pre* is given, as ``"[pre] key: value"``.

    key   -- short tag naming the kind of notice (e.g. 'IGNORED')
    value -- the item the notice is about
    pre   -- optional prefix shown in square brackets
    """
    # NOTE(review): the branch lines were absent from the reviewed text.  The
    # ``pre is None`` test is reconstructed from the two surviving append
    # calls and the ``pre=None`` default -- confirm against the upstream file.
    if pre is None:
        notices_seq.append("%s: %s" % (key, value))
    else:
        notices_seq.append("[%s] %s: %s" % (pre, key, value))
def note_warning(key, value, pre=None):
    """Queue a problem message for the final report.

    The entry is appended to the module-level ``problems_seq`` list, formatted
    as ``"key: value"`` or, when *pre* is given, as ``"[pre] key: value"``.

    key   -- short tag naming the kind of problem (e.g. 'NOT-A-DIR')
    value -- the item the problem is about
    pre   -- optional prefix shown in square brackets
    """
    # NOTE(review): the branch lines were absent from the reviewed text.  The
    # ``pre is None`` test is reconstructed from the two surviving append
    # calls and the ``pre=None`` default -- confirm against the upstream file.
    if pre is None:
        problems_seq.append("%s: %s" % (key, value))
    else:
        problems_seq.append("[%s] %s: %s" % (pre, key, value))
def note_warning_db(host, db, key, value):
    """Record a problem that belongs to one database of one host.

    The message is queued through note_warning() with a "host, db" prefix,
    and the (host, db) pair is flagged in the module-level
    ``problems_per_db`` mapping (host -> {db: True}).

    host  -- host (backup directory) name
    db    -- database/cluster name on that host
    key   -- short tag naming the kind of problem
    value -- detail text for the problem
    """
    note_warning(key, value, "%s, %s" % (host, db))
    # The mapping is only mutated, never rebound, so no ``global`` statement
    # is required; setdefault creates the per-host dict on first use.
    problems_per_db.setdefault(host, {})[db] = True
101 def wal_pre(w, host, db):
105 if (host,db) in ( ('moszumanska', 'main'), ):
114 def parse_pg_backup_info(fn):
118 (k,v) = l.strip().split(': ', 2)
def get_retention(config, host, db):
    """Return the retention period configured for one database backup.

    Lookup order, most specific first:
      1. config['backups'][host][db]['retention'] (only when the db entry
         is itself a dict)
      2. config['backups'][host]['_retention']
      3. config['retention']

    config -- the loaded configuration mapping
    host   -- key into config['backups']
    db     -- key into config['backups'][host]

    Raises AssertionError when the configuration lacks the expected
    structure or the selected value is not an int.
    """
    assert('retention' in config)
    assert('backups' in config)
    assert(isinstance(config['backups'], dict))

    assert(host in config['backups'])
    host_conf = config['backups'][host]
    assert(isinstance(host_conf, dict))

    assert(db in host_conf)
    db_conf = host_conf[db]

    if isinstance(db_conf, dict) and 'retention' in db_conf:
        r = db_conf['retention']
    elif '_retention' in host_conf:
        r = host_conf['_retention']
    else:
        # Without this branch the global value would overwrite the more
        # specific settings chosen above.
        r = config['retention']

    assert(isinstance(r, int))
    # The result is consumed arithmetically by the caller, so it has to be
    # returned explicitly.
    return r
# Command-line handling.  The usage text describes two ways of running the
# script: a plain "nagios mode" run, and an "expire mode" run selected
# with -e (optionally combined with -d and -v).
143 parser = optparse.OptionParser()
144 parser.set_usage("%prog [-c=<CONFFILE>] (nagios mode)\n" +
145 "Usage: %prog [-c=<CONFFILE>] -e [-d] [-v] (expire mode)")
# -c/--config: stored in options.conffile
146 parser.add_option("-c", "--config", dest="conffile", metavar="CONFFILE",
147 help="Config file location.")
# -e/--expire: boolean flag stored in options.expire
148 parser.add_option("-e", "--expire", dest="expire", action="store_true",
149 help="Expire old files.")
# -d/--dry-run: boolean flag stored in options.dry_run
150 parser.add_option("-d", "--dry-run", dest="dry_run", action="store_true",
151 help="Do not really remove files.")
# -v/--verbose: boolean flag stored in options.verbose
152 parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
153 help="List files we are expiring.")
# parse_args() yields the option values plus any leftover positional arguments.
154 (options, args) = parser.parse_args()
# Load the configuration and walk every entry below config['rootdir'].
# NOTE(review): several control-flow lines (else/continue/try/except) are
# absent from this text, so the exact nesting below cannot be confirmed here.
161 config = load_conf(options.conffile)
# All later paths are relative to the backup root.
163 os.chdir(config['rootdir'])
164 for dir in os.listdir('.'):
# Entries whose name starts with '.' or ends in '.old' are only noted.
165 if dir.startswith('.') or dir.endswith('.old'):
166 note_info('IGNORED', dir)
# Entries that are not directories: both timestamps are read, and the
# errno check implies an exception handler around these calls.
169 if not os.path.isdir(dir):
171 mtime = os.path.getmtime(dir)
172 ctime = os.path.getctime(dir)
174 if e.errno == errno.ENOENT:
# An entry whose older timestamp is less than four hours (3600*4 seconds)
# in the past is noted as IGNORED; presumably the NOT-A-DIR warning is the
# alternative branch -- confirm against the full file.
178 if min(mtime, ctime) + 3600*4 > time.time():
179 note_info('IGNORED', dir)
181 note_warning('NOT-A-DIR', dir)
# Directories without an entry under config['backups'] are reported.
184 if not dir in config['backups']:
185 note_warning('NOT-CONFIGURED', dir)
188 files = os.listdir(dir)
# NOTE(review): the condition guarding this warning is not visible here.
190 note_warning('EMPTY-DIR', dir)
# Shallow copy of the configured entries for this directory; entries are
# deleted from it as backups are found, and whatever remains is reported
# later as NO-BACKUP.
195 notyetseen_dbs = copy.copy(config['backups'][dir])
199 # Go over all the files in a directory and check for various things
200 # - for a given cluster's backups we want the latest WAL file to be no
201 # older than a certain age,
202 # - we want all consecutive WAL files, i.e. no holes
203 # - we want a full backup at one point, and it shouldn't be too old
204 # - If our retention period is say 2 weeks, then we look for the
205 # tar file that's older than that, and everything before that can
# (the sentence above is cut off in this text; presumably it ends
# "be expired" -- see the can_expire_* flags below)
#
# NOTE(review): the line that takes `fn` out of `files`, and several
# else/continue lines, are absent from this text.
207 while len(files) > 0:
209 ffn = os.path.join(dir, fn)
# File names must look like "<db>.WAL.<rest>" or "<db>.BASE.<rest>", where
# <db> consists of lowercase letters, digits and hyphens.
211 r = re.match('([a-z0-9-]+)\.(WAL|BASE)\..*', fn)
213 note_warning('CANNOT-PARSE', ffn)
# NOTE(review): `type` and `dir` shadow Python builtins in this script.
216 (db, type) = r.groups(1)
# A db that is not configured for this directory is reported once and then
# remembered in ignored_dbs.
217 if not isinstance(config['backups'][dir], dict) or not db in config['backups'][dir]:
218 if not db in ignored_dbs:
219 note_warning_db(dir, db, 'NOT-CONFIGURED', '%s/%s'%(dir, db))
220 ignored_dbs[db] = True
221 if db in ignored_dbs:
# First file seen for this db: create its per-db state.
223 if not db in backup_state:
224 backup_state[db] = {}
225 # can_expire_for_base_hit: We hit a BASE backup that is old enough
226 # so that once we hit all the required WAL files for this base
227 # backup to work we can start expiring everything older than that
229 backup_state[db]['can_expire_for_base_hit'] = False
230 # can_expire_next: Can expire all files that we handle from now on
231 backup_state[db]['can_expire_next'] = False
# expires: paths queued for removal for this db
232 backup_state[db]['expires'] = []
# The expected timeline comes from the db's config entry when it has a
# 'timeline' key; the value 1 is the other assignment seen here.
233 if isinstance(config['backups'][dir][db], dict) and 'timeline' in config['backups'][dir][db]:
234 backup_state[db]['timeline'] = config['backups'][dir][db]['timeline']
236 backup_state[db]['timeline'] = 1
238 # Apparently we already have seen a base backup and all its wal files
239 # which we want to keep, so everything that we see now is older than
240 # that and we can get rid of it
241 if backup_state[db]['can_expire_next']:
242 backup_state[db]['expires'].append(ffn)
245 # should have been taken care of before
246 # while handling a WAL.backup file
247 note_warning_db(dir, db, 'STRAY-BASE', ffn)
250 # handle .backup files - they live near the WAL "file namespace" and reference
251 # the corresponding full backup
252 r = re.match('[a-z0-9-]+\.WAL\.([0-9A-F]{8})([0-9A-F]{8})([0-9A-F]{8})\.[0-9A-F]{8}\.backup', fn)
254 info = parse_pg_backup_info(ffn)
# Expected tarball name: "<db>.BASE.<label>-<first word of the start WAL
# location with '/' replaced by '_'>.tar.gz".
255 basefn = '%s.BASE.%s-%s.tar.gz'%(db, info['label'], info['start wal location'].split(' ',2)[0].replace('/', '_'))
256 baseffn = os.path.join(dir, basefn)
# Fallback name without the WAL-location suffix.
257 if not basefn in files:
258 basefn = '%s.BASE.%s.tar.gz'%(db, info['label'])
259 baseffn = os.path.join(dir, basefn)
# Neither name exists.  A label of the form "<name>-<8 digits>-<6 digits>"
# whose <name> differs from this machine's FQDN is only noted as belonging
# to another host; the MISSING-BASE warning is presumably the alternative
# branch -- confirm against the full file.
260 if not basefn in files:
261 m = re.match('([a-z0-9.]+)-\d{8}-\d{6}', info['label'])
262 if m and (m.group(1) != socket.getfqdn()):
263 note_info(dir, 'IGNORED-OTHER-BASE: '+basefn)
266 note_warning_db(dir, db, 'MISSING-BASE', basefn)
# A .backup file was found for this db, so it no longer counts as unseen.
268 if db in notyetseen_dbs: del notyetseen_dbs[db]
270 if backup_state[db]['can_expire_next']:
271 backup_state[db]['expires'].append(baseffn)
# The first base seen is stored as 'newest-base'; 'oldest-base' is
# assigned as well.
273 if not 'newest-base' in backup_state[db]:
274 backup_state[db]['newest-base'] = baseffn
275 backup_state[db]['oldest-base'] = baseffn
# The start WAL location is expected to contain "(file <24 hex digits>)".
277 startre = re.search('\(file ([0-9A-F]{24})\)', info['start wal location'])
279 note_warning_db(dir, db, 'CANNOT-PARSE-START_WAL_LOCATION', ffn)
# Remember which WAL file this base backup starts at; the entry is cleared
# again when that WAL file is encountered below.
281 start_file = startre.group(1)
282 walbase = '%s.WAL.%s'%(db, start_file)
283 backup_state[db]['base_needs_wal_until'] = walbase
# A base whose start time plus the retention period lies in the past is
# old enough to allow expiry of what follows.
285 start = time.mktime(time.strptime(info['start time'], '%Y-%m-%d %H:%M:%S %Z'))
286 if start + get_retention(config, dir, db) < time.time():
287 backup_state[db]['can_expire_for_base_hit'] = True
# Plain WAL segment: three 8-hex-digit fields, unpacked below as
# (timeline, wal1, wal2).
291 r = re.match('[a-z0-9-]+\.WAL\.([0-9A-F]{8})([0-9A-F]{8})([0-9A-F]{8})', fn)
# Reaching the WAL file the pending base starts at clears the pending
# marker and, if that base was old enough, enables expiry from here on.
293 if 'base_needs_wal_until' in backup_state[db]:
294 if backup_state[db]['base_needs_wal_until'] == fn:
295 del backup_state[db]['base_needs_wal_until']
296 if backup_state[db]['can_expire_for_base_hit']:
297 backup_state[db]['can_expire_next'] = True
299 (timeline, wal1, wal2) = map(lambda x: int(x,16), r.groups())
300 if not timeline == backup_state[db]['timeline']:
301 note_warning_db(dir, db, 'UNEXPECTED-TIMELINE', ffn)
304 thissegment = (wal1, wal2)
305 if not 'newest-wal' in backup_state[db]:
306 backup_state[db]['newest-wal'] = thissegment
307 backup_state[db]['newest-wal-file'] = ffn
# Continuity check: this segment must equal wal_pre() of the segment
# currently stored as 'oldest-wal'; otherwise a warning is recorded and the
# db is added to ignored_dbs.
309 if not wal_pre(backup_state[db]['oldest-wal'], dir, db) == thissegment:
310 note_warning_db(dir, db, 'WAL-MISSING-AFTER', ffn)
311 ignored_dbs[db] = True
313 backup_state[db]['oldest-wal'] = thissegment
317 note_warning_db(dir, db, 'CANNOT-PARSE-WAL', ffn)
319 note_warning_db(dir, db, 'INVALID-TYPE', ffn)
# Checks run over the per-db state collected for this directory.
# NOTE(review): else/continue lines are absent from this text, so the
# nesting of the age checks below cannot be confirmed here.

# A base backup whose starting WAL file was never encountered.
322 for db in backup_state:
323 if 'base_needs_wal_until' in backup_state[db]:
324 note_warning_db(dir, db, 'MISSING_WAL_FOR_BASE', backup_state[db]['base_needs_wal_until'])
# Presence and age checks; age is the time since the file's mtime and is
# compared against config['warn-age']['base'] and config['warn-age']['wal'].
326 for db in backup_state:
327 if not 'newest-base' in backup_state[db]:
328 note_warning_db(dir, db, 'NO-BASE', 'no base backup found?')
330 age = time.time() - os.stat(backup_state[db]['newest-base']).st_mtime
331 if age > config['warn-age']['base']:
332 note_warning_db(dir, db, 'BASE-IS-OLD', 'latest base backup is too old')
# NOTE(review): the key 'NO-BASE' is reused for the missing-WAL case below
# -- confirm whether a distinct key was intended.
334 if not 'newest-wal-file' in backup_state[db]:
335 note_warning_db(dir, db, 'NO-BASE', 'no WAL files found?')
337 age = time.time() - os.stat(backup_state[db]['newest-wal-file']).st_mtime
338 if age > config['warn-age']['wal']:
339 note_warning_db(dir, db, 'WAL-IS-OLD', 'latest wal file is too old')
# Hand queued expiries over to the global list.  A db that has recorded
# warnings gets a NOT-EXPIRING-DUE-TO-WARNINGS message; the per-db list is
# reversed before its entries are appended.
341 for db in backup_state:
342 if len(backup_state[db]['expires']) > 0:
343 if dir in problems_per_db and db in problems_per_db[dir] and problems_per_db[dir][db]:
344 note_warning_db(dir, db, 'NOT-EXPIRING-DUE-TO-WARNINGS', 'have seen warnings, will not expire anything')
346 backup_state[db]['expires'].reverse()
347 for f in backup_state[db]['expires']:
348 global_expires.append(f)
# Configured entries for which no .backup file was seen; keys starting
# with '_' (such as '_retention') are skipped.
350 for db in notyetseen_dbs:
351 if db.startswith('_'): continue
352 note_warning_db(dir, db, 'NO-BACKUP', 'no backups! (no .backup files found)')
354 #if not db in backup_state:
355 # note_warning('BASE-WITHOUT-WAL', ffn)
356 # ignored_dbs[db] = True
359 #age = time.time() - os.stat(ffn).st_mtime
360 #if age > config['warn-age']['wal']:
361 # note_warning('OLD-WAL', backup_state[db]['newest-wal-file'])
362 # ignored_dbs[db] = True
# Final reporting and expiry.
# NOTE(review): the bodies of the two reporting loops and the statements
# following the problem-count test are absent from this text.
366 for p in problems_seq:
369 for p in notices_seq:
# Remove the queued files: each path is announced with --verbose and is
# unlinked unless --dry-run was given.  Whatever guard decides whether this
# loop runs at all is not visible here.
373 for f in global_expires:
374 if options.verbose: print "Expiring %s"%(f)
375 if not options.dry_run: os.unlink(f)
377 if len(problems_seq) > 0:
# The OK line is printed when not in expire mode, or when --verbose is set.
380 if not options.expire or options.verbose:
381 print "OK: no problems detected"
386 # vim:set shiftwidth=4: