07843485ab6d4596b74dc0fd67948f7486fb1c1e
[mirror/dsa-nagios.git] / dsa-nagios-checks / checks / dsa-check-backuppg
1 #!/usr/bin/python
2
3 # Copyright 2010 Peter Palfrader
4 #
5 # Permission is hereby granted, free of charge, to any person obtaining
6 # a copy of this software and associated documentation files (the
7 # "Software"), to deal in the Software without restriction, including
8 # without limitation the rights to use, copy, modify, merge, publish,
9 # distribute, sublicense, and/or sell copies of the Software, and to
10 # permit persons to whom the Software is furnished to do so, subject to
11 # the following conditions:
12 #
13 # The above copyright notice and this permission notice shall be
14 # included in all copies or substantial portions of the Software.
15 #
16 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
24 # Checks if the WAL backups for several postgres clusters from
25 # different hosts are current.  Might not catch all error instances.
26 #
27 # If called with -e will expire WALs and BASE backups no longer required.
28 #
# Needs files laid out like so:
30 # beethoven:/srv/pgbackup/pg# ls -l ries/ | head
31 # total 6794956
32 # -rw------- 1 debbackup debbackup 378099591 May  1 02:26 dak.BASE.20100501-ries-dak-8.4-backup-F_37000020.tar.gz
33 # -rw------- 1 debbackup debbackup 382267407 May  8 02:25 dak.BASE.20100508-ries-dak-8.4-backup-F_A2000020.tar.gz
34 # -rw------- 1 debbackup debbackup 384054069 May 15 02:24 dak.BASE.20100515-ries-dak-8.4-backup-F_FB000020.tar.gz
35 # -rw------- 1 debbackup debbackup 386407500 May 22 02:27 dak.BASE.20100522-ries-dak-8.4-backup-10_58000020.tar.gz
36 # -rw------- 1 debbackup debbackup  16777216 May  1 02:26 dak.WAL.000000010000000F00000037
37 # -rw------- 1 debbackup debbackup       264 May  1 02:27 dak.WAL.000000010000000F00000037.00000020.backup
38 # -rw------- 1 debbackup debbackup  16777216 May  1 03:25 dak.WAL.000000010000000F00000038
39 # -rw------- 1 debbackup debbackup  16777216 May  1 09:11 dak.WAL.000000010000000F00000039
40 # -rw------- 1 debbackup debbackup  16777216 May  1 09:45 dak.WAL.000000010000000F0000003A
41 # ...
42 #
43 # needs write privileges to at least the .backup files
44
45
46 import copy
47 import time
48 import re
49 import os
50 import errno
51 import sys
52 import yaml
53 import optparse
54 import socket
55
def load_conf(cf):
    """Load and return the YAML configuration.

    The file is chosen in this order: the explicit path ``cf`` if it is not
    None, else the DSA_CHECK_BACKUPPG_CONF environment variable if set, else
    /etc/nagios/dsa-check-backuppg.conf.

    Returns whatever yaml.safe_load() produces for the file's content.
    Errors from open() or the YAML parser propagate to the caller.
    """
    if cf is not None:
        configfile = cf
    elif 'DSA_CHECK_BACKUPPG_CONF' in os.environ:
        configfile = os.environ['DSA_CHECK_BACKUPPG_CONF']
    else:
        configfile = '/etc/nagios/dsa-check-backuppg.conf'

    # 'with' closes the file even when the YAML parser raises.
    with open(configfile) as f:
        return yaml.safe_load(f.read())
68
69
# Message accumulators filled in by the note_*() helpers.
# Warnings; printed unconditionally, and any entry makes the run exit 1.
problems_seq = []
# Informational lines; printed only in verbose mode.
notices_seq = []
# host -> {db: True} for every database that produced a warning.
problems_per_db = {}
# Paths collected for removal when running in expire mode.
global_expires = []
74 #def note_warning(key, host, db, value):
75 #    global problems_seq
76 #    problems_seq.append("[%s, %s]: %s: %s"%(host, db, key, value))
77 #
78 #    global problems_per_db
79 #    if not host in problems_per_db: problems_per_db[host] = {}
80 #    problems_per_db[host][db] = True
def note_info(key, value, pre=None):
    """Append an informational "key: value" line to notices_seq.

    When ``pre`` is given the line is prefixed with "[pre] ".
    """
    if pre is not None:
        message = "[%s] %s: %s" % (pre, key, value)
    else:
        message = "%s: %s" % (key, value)
    # The list is only mutated, never rebound, so no 'global' is needed.
    notices_seq.append(message)
87
def note_warning(key, value, pre=None):
    """Append a "key: value" problem line to problems_seq.

    When ``pre`` is given the line is prefixed with "[pre] ".
    """
    if pre is not None:
        message = "[%s] %s: %s" % (pre, key, value)
    else:
        message = "%s: %s" % (key, value)
    # The list is only mutated, never rebound, so no 'global' is needed.
    problems_seq.append(message)
def note_warning_db(host, db, key, value):
    """Record a warning for one database of one host.

    The warning text is stored via note_warning() with a "host, db" prefix,
    and problems_per_db[host][db] is set to True so that later code can tell
    that this database has seen a problem.
    """
    prefix = "%s, %s" % (host, db)
    note_warning(key, value, prefix)
    problems_per_db.setdefault(host, {})[db] = True
99
100
def wal_pre(w, host, db):
    """Return the (high, low) segment pair that directly precedes ``w``.

    ``w`` is a 2-tuple of integers.  The low part is decremented; when it is
    already 0 it wraps to 0xFF and the high part is decremented instead.
    ``host`` and ``db`` are accepted but not used.
    """
    high, low = w
    if low != 0:
        return (high, low - 1)
    # Low part underflows: borrow from the high part.
    return (high - 1, 0xFF)
110
def parse_pg_backup_info(fn):
    """Parse a "KEY: value" style .backup file into a dict.

    Each line of the file ``fn`` is stripped and split at the first ': '.
    The part before it, lower-cased, becomes the key; everything after it
    (which may itself contain ': ') becomes the value.  Lines that do not
    contain ': ' after stripping, such as blank lines, are skipped.

    Returns the resulting dict.  Errors from open() propagate.
    """
    info = {}
    # 'with' closes the file even if reading fails part-way through.
    with open(fn) as f:
        for line in f:
            # partition() splits on the first separator only, so a value
            # containing ': ' is kept whole instead of breaking the unpack.
            key, sep, value = line.strip().partition(': ')
            if not sep:
                continue
            info[key.lower()] = value
    return info
119
def get_retention(config, host, db):
    """Look up the retention value that applies to ``db`` on ``host``.

    Precedence: a 'retention' key in the database's own config dict, then
    the host-level '_retention' key, then the global config['retention'].

    The config layout is checked with assert statements, so a malformed
    config (or a non-int result) raises AssertionError.
    """
    assert 'retention' in config

    assert 'backups' in config
    backups = config['backups']
    assert isinstance(backups, dict)

    assert host in backups
    host_conf = backups[host]
    assert isinstance(host_conf, dict)

    assert db in host_conf
    db_conf = host_conf[db]

    # Most specific setting wins.
    if isinstance(db_conf, dict) and 'retention' in db_conf:
        retention = db_conf['retention']
    elif '_retention' in host_conf:
        retention = host_conf['_retention']
    else:
        retention = config['retention']

    assert isinstance(retention, int)
    return retention
139
# Command line handling.  Without -e the script only checks and reports;
# with -e the files selected for expiry are removed (unless -d is given).
parser = optparse.OptionParser()
# optparse takes the value of a short option from the rest of the same
# argument or from the next one, so "-c=FILE" would yield "=FILE"; the usage
# text therefore shows the space-separated form.
parser.set_usage("%prog [-c <CONFFILE>]               (nagios mode)\n" +
          "Usage: %prog [-c <CONFFILE>] -e [-d] [-v]  (expire mode)")
parser.add_option("-c", "--config", dest="conffile", metavar="CONFFILE",
  help="Config file location.")
parser.add_option("-e", "--expire", dest="expire", action="store_true",
  help="Expire old files.")
parser.add_option("-d", "--dry-run", dest="dry_run", action="store_true",
  help="Do not really remove files.")
parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
  help="List files we are expiring.")
(options, args) = parser.parse_args()

# Positional arguments are not supported.
if args:
    parser.print_help()
    sys.exit(1)


config = load_conf(options.conffile)
159
# Scan every entry below config['rootdir'].  Each directory that is listed in
# config['backups'] is treated as one host: its files are examined and
# problems, notices and expiry candidates are recorded through the note_*()
# helpers and global_expires.
os.chdir(config['rootdir'])
for dir in os.listdir('.'):
    if dir.startswith('.') or dir.endswith('.old') or dir == 'lost+found':
        note_info('IGNORED', dir)
        continue

    if not os.path.isdir(dir):
        try:
            mtime = os.path.getmtime(dir)
            ctime = os.path.getctime(dir)
        except OSError as e:
            # The entry disappeared between listdir() and the stat calls.
            if e.errno == errno.ENOENT:
                continue
            else:
                # Bare raise keeps the original traceback.
                raise
        # Non-directories touched within the last four hours are only noted;
        # older ones are reported as a problem.
        if min(mtime, ctime) + 3600*4 > time.time():
            note_info('IGNORED', dir)
        else:
            note_warning('NOT-A-DIR', dir)
        continue

    if not dir in config['backups']:
        note_warning('NOT-CONFIGURED', dir)
        continue

    files = os.listdir(dir)
    if len(files) == 0:
        note_warning('EMPTY-DIR', dir)
        continue

    files.sort()

    host_conf = config['backups'][dir]
    # Databases configured for this host for which no .backup file has been
    # seen yet.  A host entry that is not a dict (e.g. an empty YAML value)
    # has no configured databases; every file is then reported as
    # NOT-CONFIGURED below instead of crashing the final loop.
    notyetseen_dbs = copy.copy(host_conf) if isinstance(host_conf, dict) else {}
    ignored_dbs = {}
    backup_state = {}

    # Go over all the files in a directory and check for various things
    # - for a given cluster's backups we want the latest WAL file to be no
    #   older than a certain age,
    # - we want all consecutive WAL files, i.e. no holes
    # - we want a full backup at one point, and it shouldn't be too old
    # - If our retention period is say 2 weeks, then we look for the
    #   tar file that's older than that, and everything before that can
    #   be expired
    #
    # The list is sorted ascending and consumed from the end, so file names
    # are handled in reverse lexicographic order.
    while len(files) > 0:
        fn = files.pop()
        ffn = os.path.join(dir, fn)

        r = re.match(r'([a-z0-9-]+)\.(WAL|BASE)\..*', fn)
        if not r:
            note_warning('CANNOT-PARSE', ffn)
            continue

        (db, type) = r.groups()
        if not isinstance(host_conf, dict) or not db in host_conf:
            # Warn only once per unconfigured database.
            if not db in ignored_dbs:
                note_warning_db(dir, db, 'NOT-CONFIGURED', '%s/%s'%(dir, db))
            ignored_dbs[db] = True
        if db in ignored_dbs:
            continue
        if not db in backup_state:
            backup_state[db] = {}
            # can_expire_for_base_hit: We hit a BASE backup that is old enough
            #   so that once we hit all the required WAL files for this base
            #   backup to work we can start expiring everything older than that
            #   oldest WAL file
            backup_state[db]['can_expire_for_base_hit'] = False
            # can_expire_next: Can expire all files that we handle from now on
            backup_state[db]['can_expire_next'] = False
            backup_state[db]['expires'] = []
            # Expected timeline: per-database 'timeline' setting, default 1.
            if isinstance(host_conf[db], dict) and 'timeline' in host_conf[db]:
                backup_state[db]['timeline'] = host_conf[db]['timeline']
            else:
                backup_state[db]['timeline'] = 1

        # Apparently we have already seen a base backup and all the WAL files
        # it needs, so everything we see from now on is older than that and
        # we can get rid of it.
        if backup_state[db]['can_expire_next']:
            backup_state[db]['expires'].append(ffn)

        if type == 'BASE':
            # should have been taken care of before
            # while handling a WAL.backup file
            note_warning_db(dir, db, 'STRAY-BASE', ffn)
            continue
        elif type == 'WAL':
            # handle .backup files  -  they live near the WAL "file namespace" and reference
            # the corresponding full backup
            r = re.match(r'[a-z0-9-]+\.WAL\.([0-9A-F]{8})([0-9A-F]{8})([0-9A-F]{8})\.[0-9A-F]{8}\.backup', fn)
            if r:
                info = parse_pg_backup_info(ffn)
                # First try the base file name that carries the start WAL
                # location, then the one built from the label alone.
                basefn = '%s.BASE.%s-%s.tar.gz'%(db, info['label'], info['start wal location'].split(' ',2)[0].replace('/', '_'))
                baseffn = os.path.join(dir, basefn)
                if not basefn in files:
                    basefn = '%s.BASE.%s.tar.gz'%(db, info['label'])
                    baseffn = os.path.join(dir, basefn)
                    if not basefn in files:
                        # A label of the form <name>-<8 digits>-<6 digits>
                        # whose name is not this machine's FQDN is only
                        # noted; anything else is a missing base backup.
                        m = re.match(r'([a-z0-9.]+)-\d{8}-\d{6}', info['label'])
                        if m and (m.group(1) != socket.getfqdn()):
                            note_info(dir, 'IGNORED-OTHER-BASE: '+basefn)
                            continue
                        else:
                            note_warning_db(dir, db, 'MISSING-BASE', basefn)
                            continue
                if db in notyetseen_dbs: del notyetseen_dbs[db]
                # The base file is accounted for here, so take it out of the
                # work list; otherwise it would show up as STRAY-BASE later.
                files.remove(basefn)
                if backup_state[db]['can_expire_next']:
                    backup_state[db]['expires'].append(baseffn)

                if not 'newest-base' in backup_state[db]:
                    backup_state[db]['newest-base'] = baseffn
                backup_state[db]['oldest-base'] = baseffn

                startre = re.search(r'\(file ([0-9A-F]{24})\)', info['start wal location'])
                if not startre:
                    note_warning_db(dir, db, 'CANNOT-PARSE-START_WAL_LOCATION', ffn)
                    continue
                start_file = startre.group(1)
                walbase = '%s.WAL.%s'%(db, start_file)
                # Remember which WAL file this base backup starts at; the
                # marker is cleared when that file is encountered.
                backup_state[db]['base_needs_wal_until'] = walbase

                start = time.mktime(time.strptime(info['start time'], '%Y-%m-%d %H:%M:%S %Z'))
                if start + get_retention(config, dir, db) < time.time():
                    backup_state[db]['can_expire_for_base_hit'] = True
                continue

            # handle WAL files
            r = re.match(r'[a-z0-9-]+\.WAL\.([0-9A-F]{8})([0-9A-F]{8})([0-9A-F]{8})', fn)
            if r:
                if 'base_needs_wal_until' in backup_state[db]:
                    if backup_state[db]['base_needs_wal_until'] == fn:
                        del backup_state[db]['base_needs_wal_until']
                        if backup_state[db]['can_expire_for_base_hit']:
                            backup_state[db]['can_expire_next'] = True

                (timeline, wal1, wal2) = [int(x, 16) for x in r.groups()]
                if not timeline == backup_state[db]['timeline']:
                    note_warning_db(dir, db, 'UNEXPECTED-TIMELINE', ffn)
                    continue

                thissegment = (wal1, wal2)
                if not 'newest-wal' in backup_state[db]:
                    backup_state[db]['newest-wal'] = thissegment
                    backup_state[db]['newest-wal-file'] = ffn
                else:
                    # This segment must be the direct predecessor of the
                    # previously handled one, otherwise there is a hole.
                    if not wal_pre(backup_state[db]['oldest-wal'], dir, db) == thissegment:
                        note_warning_db(dir, db, 'WAL-MISSING-AFTER', ffn)
                        ignored_dbs[db] = True
                        continue
                backup_state[db]['oldest-wal'] = thissegment

                continue

            note_warning_db(dir, db, 'CANNOT-PARSE-WAL', ffn)
        else:
            note_warning_db(dir, db, 'INVALID-TYPE', ffn)


    # A marker that is still set means the WAL file a base backup starts at
    # was never encountered.
    for db in backup_state:
        if 'base_needs_wal_until' in backup_state[db]:
            note_warning_db(dir, db, 'MISSING_WAL_FOR_BASE', backup_state[db]['base_needs_wal_until'])

    # Age checks on the newest base backup and the newest WAL file, based on
    # file mtime and the config['warn-age'] limits.
    for db in backup_state:
        if not 'newest-base' in backup_state[db]:
            note_warning_db(dir, db, 'NO-BASE', 'no base backup found?')
        else:
            age = time.time() - os.stat(backup_state[db]['newest-base']).st_mtime
            if age > config['warn-age']['base']:
                note_warning_db(dir, db, 'BASE-IS-OLD', 'latest base backup is too old')

        if not 'newest-wal-file' in backup_state[db]:
            # Own key for this case, so it is not confused with the missing
            # base backup reported above.
            note_warning_db(dir, db, 'NO-WAL', 'no WAL files found?')
        else:
            age = time.time() - os.stat(backup_state[db]['newest-wal-file']).st_mtime
            if age > config['warn-age']['wal']:
                note_warning_db(dir, db, 'WAL-IS-OLD', 'latest wal file is too old')

    # Hand expiry candidates over to global_expires, but only for databases
    # that produced no warning at all.
    for db in backup_state:
        if len(backup_state[db]['expires']) > 0:
            if dir in problems_per_db and db in problems_per_db[dir] and problems_per_db[dir][db]:
                note_warning_db(dir, db, 'NOT-EXPIRING-DUE-TO-WARNINGS', 'have seen warnings, will not expire anything')
            else:
                # Candidates were collected newest first; reverse the order
                # before queueing them.
                backup_state[db]['expires'].reverse()
                for f in backup_state[db]['expires']:
                    global_expires.append(f)

    # Configured databases (keys starting with '_' are settings, not
    # databases) for which no .backup file was found.
    for db in notyetseen_dbs:
        if db.startswith('_'): continue
        note_warning_db(dir, db, 'NO-BACKUP', 'no backups! (no .backup files found)')
348         if db.startswith('_'): continue
349         note_warning_db(dir, db, 'NO-BACKUP', 'no backups! (no .backup files found)')
350
351     #if not db in backup_state:
352     #    note_warning('BASE-WITHOUT-WAL', ffn)
353     #    ignored_dbs[db] = True
354     #    continue
355
356     #age = time.time() - os.stat(ffn).st_mtime
357     #if age > config['warn-age']['wal']:
358     #    note_warning('OLD-WAL', backup_state[db]['newest-wal-file'])
359     #    ignored_dbs[db] = True
360     #    continue
361
362
# Final report.  print() with a single parenthesised argument produces the
# same output under Python 2 and is also valid Python 3 syntax.
for p in problems_seq:
    print(p)
if options.verbose:
    for p in notices_seq:
        print(p)

# Expire mode: remove the collected files, unless this is a dry run.
if options.expire:
    for f in global_expires:
        if options.verbose:
            print("Expiring %s" % (f))
        if not options.dry_run:
            os.unlink(f)

# Any recorded problem makes the run exit with status 1.
if len(problems_seq) > 0:
    sys.exit(1)

# In expire mode the OK line is only printed when verbose.
if not options.expire or options.verbose:
    print("OK: no problems detected")
sys.exit(0)
380
381 # vim:set et:
382 # vim:set ts=4:
383 # vim:set shiftwidth=4: