dsa-check-backuppg: warn if backups for a database configured in dsa-check-backuppg...
[mirror/dsa-nagios.git] / dsa-nagios-checks / checks / dsa-check-backuppg
1 #!/usr/bin/python
2
3 # Copyright 2010 Peter Palfrader
4 #
5 # Permission is hereby granted, free of charge, to any person obtaining
6 # a copy of this software and associated documentation files (the
7 # "Software"), to deal in the Software without restriction, including
8 # without limitation the rights to use, copy, modify, merge, publish,
9 # distribute, sublicense, and/or sell copies of the Software, and to
10 # permit persons to whom the Software is furnished to do so, subject to
11 # the following conditions:
12 #
13 # The above copyright notice and this permission notice shall be
14 # included in all copies or substantial portions of the Software.
15 #
16 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
24 # Checks if the WAL backups for several postgres clusters from
25 # different hosts are current.  Might not catch all error instances.
26 #
27 # If called with -e will expire WALs and BASE backups no longer required.
28 #
29 # Needs files laid out like so:
30 # beethoven:/srv/pgbackup/pg# ls -l ries/ | head
31 # total 6794956
32 # -rw------- 1 debbackup debbackup 378099591 May  1 02:26 dak.BASE.20100501-ries-dak-8.4-backup-F_37000020.tar.gz
33 # -rw------- 1 debbackup debbackup 382267407 May  8 02:25 dak.BASE.20100508-ries-dak-8.4-backup-F_A2000020.tar.gz
34 # -rw------- 1 debbackup debbackup 384054069 May 15 02:24 dak.BASE.20100515-ries-dak-8.4-backup-F_FB000020.tar.gz
35 # -rw------- 1 debbackup debbackup 386407500 May 22 02:27 dak.BASE.20100522-ries-dak-8.4-backup-10_58000020.tar.gz
36 # -rw------- 1 debbackup debbackup  16777216 May  1 02:26 dak.WAL.000000010000000F00000037
37 # -rw------- 1 debbackup debbackup       264 May  1 02:27 dak.WAL.000000010000000F00000037.00000020.backup
38 # -rw------- 1 debbackup debbackup  16777216 May  1 03:25 dak.WAL.000000010000000F00000038
39 # -rw------- 1 debbackup debbackup  16777216 May  1 09:11 dak.WAL.000000010000000F00000039
40 # -rw------- 1 debbackup debbackup  16777216 May  1 09:45 dak.WAL.000000010000000F0000003A
41 # ...
42 #
43 # needs write privileges to at least the .backup files
44
45
46 import copy
47 import time
48 import re
49 import os
50 import errno
51 import sys
52 import yaml
53 import optparse
54 import socket
55
def load_conf(cf):
    """Load and return the YAML configuration.

    The file is chosen in this order: the explicit path `cf` if it is not
    None, else the DSA_CHECK_BACKUPPG_CONF environment variable, else
    /etc/nagios/dsa-check-backuppg.conf.

    Raises IOError/OSError if the file cannot be opened and yaml.YAMLError
    if it cannot be parsed.
    """
    if cf is not None:
        configfile = cf
    else:
        configfile = os.environ.get('DSA_CHECK_BACKUPPG_CONF',
                                    '/etc/nagios/dsa-check-backuppg.conf')

    # The with-block closes the file even when parsing fails.
    # safe_load only builds plain YAML types; a bare yaml.load() without a
    # Loader argument is deprecated and rejected by current PyYAML.
    with open(configfile) as f:
        return yaml.safe_load(f)
68
69
# Output accumulated while scanning; printed at the end of the run.
notices_seq = []      # informational lines
problems_seq = []     # warning lines; any entry makes the check fail
problems_per_db = {}  # host -> {db: True} for every db that had a warning
global_expires = []   # paths selected for removal in expire mode


def _format_note(key, value, pre):
    """Render one report line, with an optional "[pre] " prefix."""
    if pre is None:
        return "%s: %s"%(key, value)
    return "[%s] %s: %s"%(pre, key, value)


def note_info(key, value, pre=None):
    """Record an informational line."""
    notices_seq.append(_format_note(key, value, pre))


def note_warning(key, value, pre=None):
    """Record a warning line."""
    problems_seq.append(_format_note(key, value, pre))


def note_warning_db(host, db, key, value):
    """Record a warning prefixed with "host, db" and flag that db as having problems."""
    note_warning(key, value, "%s, %s"%(host, db))
    problems_per_db.setdefault(host, {})[db] = True
99
100
def wal_pre(w, host, db):
    """Return the (w1, w2) pair that directly precedes the pair `w`.

    Normally only the second component is decremented.  When it is already
    0 the first component is decremented instead and the second wraps to
    0xFF, or to 0xFE for the clusters special-cased by (host, db) below.
    """
    log_id, segment = w
    if segment != 0:
        return (log_id, segment - 1)

    # NOTE(review): presumably this cluster's logs end at segment 0xFE
    # rather than 0xFF -- confirm before extending the list.
    if (host, db) in (('moszumanska', 'main'),):
        last_segment = 0xFE
    else:
        last_segment = 0xFF
    return (log_id - 1, last_segment)
113
def parse_pg_backup_info(fn):
    """Parse the "KEY: value" lines of the file `fn` into a dict.

    Keys are lower-cased; values are kept verbatim after the first ": ".
    Lines that contain no ": " separator (blank lines, for instance) are
    skipped.

    Raises IOError/OSError if the file cannot be opened.
    """
    info = {}
    with open(fn) as f:
        for line in f:
            # Split on the first separator only, so a value that itself
            # contains ": " stays intact.
            key, sep, value = line.strip().partition(': ')
            if not sep:
                continue
            info[key.lower()] = value
    return info
122
123
# Command line handling: an optional config file location plus three
# boolean switches that control expire mode.
parser = optparse.OptionParser(
    usage="%prog [-c=<CONFFILE>]               (nagios mode)\n" +
          "Usage: %prog [-c=<CONFFILE>] -e [-d] [-v]  (expire mode)")
parser.add_option("-c", "--config", dest="conffile", metavar="CONFFILE",
  help="Config file location.")
for short_opt, long_opt, dest_name, help_text in (
        ("-e", "--expire", "expire", "Expire old files."),
        ("-d", "--dry-run", "dry_run", "Do not really remove files."),
        ("-v", "--verbose", "verbose", "List files we are expiring."),
    ):
    parser.add_option(short_opt, long_opt, dest=dest_name,
      action="store_true", help=help_text)
(options, args) = parser.parse_args()

# Positional arguments are not accepted.
if args:
    parser.print_help()
    sys.exit(1)
140
141
# Load the configuration, then walk every entry below config['rootdir'].
# Each configured directory is scanned newest file first; problems are
# recorded through note_warning*/note_info and files that may be removed
# are collected in global_expires.
config = load_conf(options.conffile)

# File name patterns, compiled once instead of on every loop iteration.
# <db>.<WAL|BASE>.<anything>
backup_file_re = re.compile(r'([a-z0-9-]+)\.(WAL|BASE)\..*')
# <db>.WAL.<8 hex><8 hex><8 hex>.<8 hex>.backup
wal_backup_re = re.compile(r'[a-z0-9-]+\.WAL\.([0-9A-F]{8})([0-9A-F]{8})([0-9A-F]{8})\.[0-9A-F]{8}\.backup')
# <db>.WAL.<8 hex><8 hex><8 hex>, read below as (timeline, wal1, wal2)
wal_segment_re = re.compile(r'[a-z0-9-]+\.WAL\.([0-9A-F]{8})([0-9A-F]{8})([0-9A-F]{8})')
# backup label of the form <name>-<8 digits>-<6 digits>
label_host_re = re.compile(r'([a-z0-9.]+)-\d{8}-\d{6}')
# "(file <24 hex>)" part of the "start wal location" value
start_wal_file_re = re.compile(r'\(file ([0-9A-F]{24})\)')

os.chdir(config['rootdir'])
for hostdir in os.listdir('.'):
    if hostdir.startswith('.') or hostdir.endswith('.old'):
        note_info('IGNORED', hostdir)
        continue

    if not os.path.isdir(hostdir):
        try:
            mtime = os.path.getmtime(hostdir)
            ctime = os.path.getctime(hostdir)
        except OSError as e:
            # The entry vanished between listdir() and the stat calls.
            if e.errno == errno.ENOENT:
                continue
            # Bare raise keeps the original traceback.
            raise
        # Non-directories younger than four hours are only noted, older
        # ones are reported as a problem.
        if min(mtime, ctime) + 3600*4 > time.time():
            note_info('IGNORED', hostdir)
        else:
            note_warning('NOT-A-DIR', hostdir)
        continue

    if hostdir not in config['backups']:
        note_warning('NOT-CONFIGURED', hostdir)
        continue

    files = os.listdir(hostdir)
    if not files:
        note_warning('EMPTY-DIR', hostdir)
        continue

    files.sort()

    host_conf = config['backups'][hostdir]
    # Shallow copy: entries are deleted as .backup files are found, and
    # whatever remains is reported as NO-BACKUP after the scan.
    notyetseen_dbs = copy.copy(host_conf)
    ignored_dbs = {}
    backup_state = {}

    # Go over all the files in a directory and check for various things
    # - for a given cluster's backups we want the latest WAL file to be no
    #   older than a certain age,
    # - we want all consecutive WAL files, i.e. no holes
    # - we want a full backup at one point, and it shouldn't be too old
    # - If our retention period is say 2 weeks, then we look for the
    #   tar file that's older than that, and everything before that can
    #   be expired
    #
    # files is sorted and consumed with pop(), so names are visited in
    # descending order.  The list is also modified inside the loop (BASE
    # tarballs are removed once their .backup file has been handled).
    while files:
        fn = files.pop()
        ffn = os.path.join(hostdir, fn)

        r = backup_file_re.match(fn)
        if not r:
            note_warning('CANNOT-PARSE', ffn)
            continue

        (db, ftype) = r.groups()
        if not isinstance(host_conf, dict) or db not in host_conf:
            # Warn only once per unconfigured db, then skip all its files.
            if db not in ignored_dbs:
                note_warning_db(hostdir, db, 'NOT-CONFIGURED', '%s/%s'%(hostdir, db))
            ignored_dbs[db] = True
        if db in ignored_dbs:
            continue
        if db not in backup_state:
            backup_state[db] = {}
            # can_expire_for_base_hit: We hit a BASE backup that is old enough
            #   so that once we hit all the required WAL files for this base
            #   backup to work we can start expiring everything older than that
            #   oldest WAL file
            backup_state[db]['can_expire_for_base_hit'] = False
            # can_expire_next: Can expire all files that we handle from now on
            backup_state[db]['can_expire_next'] = False
            backup_state[db]['expires'] = []
            # Expected timeline comes from the per-db config, default 1.
            if isinstance(host_conf[db], dict) and 'timeline' in host_conf[db]:
                backup_state[db]['timeline'] = host_conf[db]['timeline']
            else:
                backup_state[db]['timeline'] = 1
        state = backup_state[db]

        # Apparently we already have seen a base backup and all its wal files
        # which we want to keep, so everything what we see now is older than
        # that and we can get rid of it
        if state['can_expire_next']:
            state['expires'].append(ffn)

        if ftype == 'BASE':
            # should have been taken care of before
            # while handling a WAL.backup file
            note_warning_db(hostdir, db, 'STRAY-BASE', ffn)
            continue
        elif ftype == 'WAL':
            # handle .backup files  -  they live near the WAL "file namespace" and reference
            # the corresponding full backup
            r = wal_backup_re.match(fn)
            if r:
                info = parse_pg_backup_info(ffn)
                # Try the tarball name that includes the start WAL location
                # first, then the plain <db>.BASE.<label>.tar.gz form.
                basefn = '%s.BASE.%s-%s.tar.gz'%(db, info['label'], info['start wal location'].split(' ',2)[0].replace('/', '_'))
                baseffn = os.path.join(hostdir, basefn)
                if basefn not in files:
                    basefn = '%s.BASE.%s.tar.gz'%(db, info['label'])
                    baseffn = os.path.join(hostdir, basefn)
                    if basefn not in files:
                        # A label naming a host other than this machine's
                        # FQDN is only noted; presumably that base backup
                        # was stored on another backup host.
                        m = label_host_re.match(info['label'])
                        if m and (m.group(1) != socket.getfqdn()):
                            note_info(hostdir, 'IGNORED-OTHER-BASE: '+basefn)
                            continue
                        else:
                            note_warning_db(hostdir, db, 'MISSING-BASE', basefn)
                            continue
                if db in notyetseen_dbs: del notyetseen_dbs[db]
                # The tarball is accounted for here, so take it out of the
                # work list; a BASE file reached by the loop itself is stray.
                files.remove(basefn)
                if state['can_expire_next']:
                    state['expires'].append(baseffn)

                if 'newest-base' not in state:
                    state['newest-base'] = baseffn
                state['oldest-base'] = baseffn

                startre = start_wal_file_re.search(info['start wal location'])
                if not startre:
                    note_warning_db(hostdir, db, 'CANNOT-PARSE-START_WAL_LOCATION', ffn)
                    continue
                start_file = startre.group(1)
                walbase = '%s.WAL.%s'%(db, start_file)
                # Remember which WAL file this base backup starts at; the
                # marker is cleared once that file is seen.
                state['base_needs_wal_until'] = walbase

                start = time.mktime(time.strptime(info['start time'], '%Y-%m-%d %H:%M:%S %Z'))
                if start + config['retention'] < time.time():
                    state['can_expire_for_base_hit'] = True
                continue

            # handle WAL files
            r = wal_segment_re.match(fn)
            if r:
                if 'base_needs_wal_until' in state:
                    if state['base_needs_wal_until'] == fn:
                        del state['base_needs_wal_until']
                        if state['can_expire_for_base_hit']:
                            state['can_expire_next'] = True

                (timeline, wal1, wal2) = [int(x, 16) for x in r.groups()]
                if not timeline == state['timeline']:
                    note_warning_db(hostdir, db, 'UNEXPECTED-TIMELINE', ffn)
                    continue

                thissegment = (wal1, wal2)
                if 'newest-wal' not in state:
                    state['newest-wal'] = thissegment
                    state['newest-wal-file'] = ffn
                else:
                    # Each WAL must be the direct predecessor of the one
                    # handled before it; otherwise there is a hole and the
                    # rest of this db's files are skipped.
                    if not wal_pre(state['oldest-wal'], hostdir, db) == thissegment:
                        note_warning_db(hostdir, db, 'WAL-MISSING-AFTER', ffn)
                        ignored_dbs[db] = True
                        continue
                state['oldest-wal'] = thissegment

                continue

            note_warning_db(hostdir, db, 'CANNOT-PARSE-WAL', ffn)
        else:
            note_warning_db(hostdir, db, 'INVALID-TYPE', ffn)


    # A marker that is still set means the WAL file a base backup starts
    # at was never seen.
    for db in backup_state:
        if 'base_needs_wal_until' in backup_state[db]:
            note_warning_db(hostdir, db, 'MISSING_WAL_FOR_BASE', backup_state[db]['base_needs_wal_until'])

    # Age checks on the newest base backup and the newest WAL file.
    for db in backup_state:
        if 'newest-base' not in backup_state[db]:
            note_warning_db(hostdir, db, 'NO-BASE', 'no base backup found?')
        else:
            age = time.time() - os.stat(backup_state[db]['newest-base']).st_mtime
            if age > config['warn-age']['base']:
                note_warning_db(hostdir, db, 'BASE-IS-OLD', 'latest base backup is too old')

        if 'newest-wal-file' not in backup_state[db]:
            # Tagged NO-WAL so it is distinguishable from a missing base backup.
            note_warning_db(hostdir, db, 'NO-WAL', 'no WAL files found?')
        else:
            age = time.time() - os.stat(backup_state[db]['newest-wal-file']).st_mtime
            if age > config['warn-age']['wal']:
                note_warning_db(hostdir, db, 'WAL-IS-OLD', 'latest wal file is too old')

    # Hand expire candidates over, but only for dbs without any warning.
    for db in backup_state:
        if len(backup_state[db]['expires']) > 0:
            if problems_per_db.get(hostdir, {}).get(db):
                note_warning_db(hostdir, db, 'NOT-EXPIRING-DUE-TO-WARNINGS', 'have seen warnings, will not expire anything')
            else:
                # Candidates were collected in descending name order;
                # reverse so they are queued in ascending order.
                backup_state[db]['expires'].reverse()
                global_expires.extend(backup_state[db]['expires'])

    for db in notyetseen_dbs:
        note_warning_db(hostdir, db, 'NO-BACKUP', 'no backups! (no .backup files found)')
333
334     #if not db in backup_state:
335     #    note_warning('BASE-WITHOUT-WAL', ffn)
336     #    ignored_dbs[db] = True
337     #    continue
338
339     #age = time.time() - os.stat(ffn).st_mtime
340     #if age > config['warn-age']['wal']:
341     #    note_warning('OLD-WAL', backup_state[db]['newest-wal-file'])
342     #    ignored_dbs[db] = True
343     #    continue
344
345
# Report: problems first, then informational notices.
# print is called with a single parenthesised argument so the same source
# is valid on Python 2 and Python 3.
for p in problems_seq:
    print(p)
for p in notices_seq:
    print(p)

# Expire mode: remove the collected files unless this is a dry run.
if options.expire:
    for f in global_expires:
        if options.verbose:
            print("Expiring %s"%(f))
        if not options.dry_run:
            os.unlink(f)

# Any recorded problem makes the check exit with status 1.
if problems_seq:
    sys.exit(1)

# Expire mode stays silent on success unless -v was given.
if not options.expire or options.verbose:
    print("OK: no problems detected")
sys.exit(0)
362
363 # vim:set et:
364 # vim:set ts=4:
365 # vim:set shiftwidth=4: