dsa-check-soas: fix error when 0 (or more than 1) records returned
[mirror/dsa-nagios.git] / dsa-nagios-checks / checks / dsa-check-backuppg
1 #!/usr/bin/python
2
3 # Copyright 2010 Peter Palfrader
4 #
5 # Permission is hereby granted, free of charge, to any person obtaining
6 # a copy of this software and associated documentation files (the
7 # "Software"), to deal in the Software without restriction, including
8 # without limitation the rights to use, copy, modify, merge, publish,
9 # distribute, sublicense, and/or sell copies of the Software, and to
10 # permit persons to whom the Software is furnished to do so, subject to
11 # the following conditions:
12 #
13 # The above copyright notice and this permission notice shall be
14 # included in all copies or substantial portions of the Software.
15 #
16 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
24 # Checks if the WAL backups for several postgres clusters from
25 # different hosts are current.  Might not catch all error instances.
26 #
27 # If called with -e will expire WALs and BASE backups no longer required.
28 #
29 # Needs files laid out like so:
30 # beethoven:/srv/pgbackup/pg# ls -l ries/ | head
31 # total 6794956
32 # -rw------- 1 debbackup debbackup 378099591 May  1 02:26 dak.BASE.20100501-ries-dak-8.4-backup-F_37000020.tar.gz
33 # -rw------- 1 debbackup debbackup 382267407 May  8 02:25 dak.BASE.20100508-ries-dak-8.4-backup-F_A2000020.tar.gz
34 # -rw------- 1 debbackup debbackup 384054069 May 15 02:24 dak.BASE.20100515-ries-dak-8.4-backup-F_FB000020.tar.gz
35 # -rw------- 1 debbackup debbackup 386407500 May 22 02:27 dak.BASE.20100522-ries-dak-8.4-backup-10_58000020.tar.gz
36 # -rw------- 1 debbackup debbackup  16777216 May  1 02:26 dak.WAL.000000010000000F00000037
37 # -rw------- 1 debbackup debbackup       264 May  1 02:27 dak.WAL.000000010000000F00000037.00000020.backup
38 # -rw------- 1 debbackup debbackup  16777216 May  1 03:25 dak.WAL.000000010000000F00000038
39 # -rw------- 1 debbackup debbackup  16777216 May  1 09:11 dak.WAL.000000010000000F00000039
40 # -rw------- 1 debbackup debbackup  16777216 May  1 09:45 dak.WAL.000000010000000F0000003A
41 # ...
42 #
43 # needs write privileges to at least the .backup files
44
45
46 import copy
47 import time
48 import re
49 import os
50 import errno
51 import sys
52 import yaml
53 import optparse
54 import socket
55
def load_conf(cf):
    """Load and return the YAML configuration.

    The file is chosen in this order: the explicit path *cf* if it is not
    None, else the DSA_CHECK_BACKUPPG_CONF environment variable, else
    /etc/nagios/dsa-check-backuppg.conf.

    Returns whatever yaml.safe_load() produces for the file's contents.
    Errors from open() or the YAML parser propagate to the caller.
    """
    if cf is not None:
        configfile = cf
    elif 'DSA_CHECK_BACKUPPG_CONF' in os.environ:
        configfile = os.environ['DSA_CHECK_BACKUPPG_CONF']
    else:
        configfile = '/etc/nagios/dsa-check-backuppg.conf'

    # The with-block closes the file even when parsing fails.
    with open(configfile) as f:
        return yaml.safe_load(f.read())
68
69
# Informational messages; printed only when running verbosely.
notices_seq = list()
# Warning messages; a non-empty list makes the script exit with status 1.
problems_seq = list()
# host -> {db: True} for every (host, db) that produced a warning.
problems_per_db = dict()
# Files collected for removal; unlinked at the end in expire mode.
global_expires = list()
74 #def note_warning(key, host, db, value):
75 #    global problems_seq
76 #    problems_seq.append("[%s, %s]: %s: %s"%(host, db, key, value))
77 #
78 #    global problems_per_db
79 #    if not host in problems_per_db: problems_per_db[host] = {}
80 #    problems_per_db[host][db] = True
def note_info(key, value, pre=None):
    """Record an informational "key: value" message in notices_seq.

    When *pre* is given, the message is prefixed with "[pre] ".
    """
    if pre is not None:
        notices_seq.append("[%s] %s: %s"%(pre, key, value))
        return
    notices_seq.append("%s: %s"%(key, value))
87
def note_warning(key, value, pre=None):
    """Record a "key: value" warning in problems_seq.

    When *pre* is given, the message is prefixed with "[pre] ".
    """
    text = "%s: %s"%(key, value) if pre is None else "[%s] %s: %s"%(pre, key, value)
    problems_seq.append(text)
def note_warning_db(host, db, key, value):
    """Record a warning for one database and mark that (host, db) as troubled.

    The warning is tagged "[host, db]"; the mark lands in problems_per_db.
    """
    note_warning(key, value, "%s, %s"%(host, db))
    problems_per_db.setdefault(host, {})[db] = True
99
100
def wal_pre(w, host, db):
    """Return the (w1, w2) pair that immediately precedes the pair *w*.

    A non-zero second component is simply decremented.  When it is zero the
    first component is decremented and the second wraps around to 0xFF
    (or 0xFE for the special-cased entries below).
    """
    high, low = w
    if low != 0:
        return (high, low - 1)

    # NOTE(review): ('main') is a plain string, not a 1-tuple, so a
    # (host, db) pair can never equal it and the 0xFE branch is currently
    # unreachable.  Confirm which (host, db) pairs were meant to be listed.
    if (host,db) in ( ('main'), ):
        wrapped = 0xFE
    else:
        wrapped = 0xFF
    return (high - 1, wrapped)
113
def parse_pg_backup_info(fn):
    """Parse a "KEY: value" file (a WAL .backup file) into a dict.

    fn -- path of the file to read.

    Returns a dict mapping the lower-cased key of each line to its value.
    Each line is split at the first ': ' only, so values may themselves
    contain ': '.  Lines without that separator (e.g. blank lines) are
    skipped instead of aborting the whole parse.
    """
    info = {}
    # The with-block closes the file even if reading fails midway.
    with open(fn) as f:
        for line in f:
            key, sep, value = line.strip().partition(': ')
            if not sep:
                continue
            info[key.lower()] = value
    return info
122
def get_retention(config, host, db):
    """Look up the retention value that applies to *db* on *host*.

    Precedence: the database's own 'retention' entry, then the host's
    '_retention' entry, then the top-level 'retention' default.

    Raises AssertionError when the configuration lacks the expected
    structure or the selected value is not an int.
    """
    assert 'retention' in config
    assert 'backups' in config

    all_backups = config['backups']
    assert isinstance(all_backups, dict)
    assert host in all_backups

    host_conf = all_backups[host]
    assert isinstance(host_conf, dict)
    assert db in host_conf

    db_conf = host_conf[db]
    if isinstance(db_conf, dict) and 'retention' in db_conf:
        retention = db_conf['retention']
    elif '_retention' in host_conf:
        retention = host_conf['_retention']
    else:
        retention = config['retention']

    assert isinstance(retention, int)
    return retention
142
# Command line handling.  Without -e the script only reports (nagios mode);
# with -e it also expires files.  Positional arguments are not accepted.
parser = optparse.OptionParser(
    usage="%prog [-c=<CONFFILE>]               (nagios mode)\n" +
          "Usage: %prog [-c=<CONFFILE>] -e [-d] [-v]  (expire mode)")
parser.add_option(
    "-c", "--config",
    dest="conffile",
    metavar="CONFFILE",
    help="Config file location.")
parser.add_option(
    "-e", "--expire",
    dest="expire",
    action="store_true",
    help="Expire old files.")
parser.add_option(
    "-d", "--dry-run",
    dest="dry_run",
    action="store_true",
    help="Do not really remove files.")
parser.add_option(
    "-v", "--verbose",
    dest="verbose",
    action="store_true",
    help="List files we are expiring.")
(options, args) = parser.parse_args()

# Any leftover positional argument is a usage error.
if args:
    parser.print_help()
    sys.exit(1)
159
160
# Main scan.  Every entry below config['rootdir'] is expected to be a
# directory named after a configured host.  Its files are walked from the
# lexicographically last name to the first, tracking per-database state,
# recording warnings and collecting the files that may be expired.
config = load_conf(options.conffile)

# All paths handled below are relative to the backup root.
os.chdir(config['rootdir'])
for dir in os.listdir('.'):
    # Hidden entries and *.old entries are only mentioned as notices.
    if dir.startswith('.') or dir.endswith('.old'):
        note_info('IGNORED', dir)
        continue

    if not os.path.isdir(dir):
        try:
            mtime = os.path.getmtime(dir)
            ctime = os.path.getctime(dir)
        except OSError as e:
            # The entry vanished between listdir() and stat: nothing to do.
            if e.errno == errno.ENOENT:
                continue
            else:
                # Bare raise keeps the original traceback.
                raise
        # A non-directory whose mtime and ctime are both less than four
        # hours old is only noted; an older one is reported as a warning.
        if min(mtime, ctime) + 3600*4 > time.time():
            note_info('IGNORED', dir)
        else:
            note_warning('NOT-A-DIR', dir)
        continue

    if not dir in config['backups']:
        note_warning('NOT-CONFIGURED', dir)
        continue

    files = os.listdir(dir)
    if len(files) == 0:
        note_warning('EMPTY-DIR', dir)
        continue

    # Sorted ascending and consumed with pop(), so names are processed in
    # descending lexicographic order.
    files.sort()

    # Databases configured for this host for which no .backup file with a
    # matching base tarball has been seen yet.
    notyetseen_dbs = copy.copy(config['backups'][dir])
    # Databases whose remaining files are skipped (unconfigured, or a hole
    # in the WAL sequence was found).
    ignored_dbs = {}
    # Per-database scan state, created on first sight of the database.
    backup_state = {}

    # Go over all the files in a directory and check for various things
    # - for a given cluster's backups we want the latest WAL file to be no
    #   older than a certain age,
    # - we want all consecutive WAL files, i.e. no holes
    # - we want a full backup at one point, and it shouldn't be too old
    # - If our retention period is say 2 weeks, then we look for the
    #   tar file that's older than that, and everything before that can
    #   be expired
    while len(files) > 0:
        fn = files.pop()
        ffn = os.path.join(dir, fn)

        r = re.match(r'([a-z0-9-]+)\.(WAL|BASE)\..*', fn)
        if not r:
            note_warning('CANNOT-PARSE', ffn)
            continue

        (db, type) = r.groups(1)
        if not isinstance(config['backups'][dir], dict) or not db in config['backups'][dir]:
            # Warn only once per unconfigured database.
            if not db in ignored_dbs:
                note_warning_db(dir, db, 'NOT-CONFIGURED', '%s/%s'%(dir, db))
            ignored_dbs[db] = True
        if db in ignored_dbs:
            continue
        if not db in backup_state:
            backup_state[db] = {}
            # can_expire_for_base_hit: We hit a BASE backup that is old enough
            #   so that once we hit all the required WAL files for this base
            #   backup to work we can start expiring everything older than that
            #   oldest WAL file
            backup_state[db]['can_expire_for_base_hit'] = False
            # can_expire_next: Can expire all files that we handle from now on
            backup_state[db]['can_expire_next'] = False
            backup_state[db]['expires'] = []
            # Expected WAL timeline: configurable per database, default 1.
            if isinstance(config['backups'][dir][db], dict) and 'timeline' in config['backups'][dir][db]:
                backup_state[db]['timeline'] = config['backups'][dir][db]['timeline']
            else:
                backup_state[db]['timeline'] = 1

        # Apparently we already have seen a base backup and all its wal files
        # which we want to keep, so everything what we see now is older than
        # that and we can get rid of it
        if backup_state[db]['can_expire_next']:
            backup_state[db]['expires'].append(ffn)

        if type == 'BASE':
            # should have been taken care of before
            # while handling a WAL.backup file
            note_warning_db(dir, db, 'STRAY-BASE', ffn)
            continue
        elif type == 'WAL':
            # handle .backup files  -  they live near the WAL "file namespace" and reference
            # the corresponding full backup
            r = re.match(r'[a-z0-9-]+\.WAL\.([0-9A-F]{8})([0-9A-F]{8})([0-9A-F]{8})\.[0-9A-F]{8}\.backup', fn)
            if r:
                info = parse_pg_backup_info(ffn)
                # First try the tarball name that embeds the start WAL
                # location, then fall back to the label-only name.
                basefn = '%s.BASE.%s-%s.tar.gz'%(db, info['label'], info['start wal location'].split(' ',2)[0].replace('/', '_'))
                baseffn = os.path.join(dir, basefn)
                if not basefn in files:
                    basefn = '%s.BASE.%s.tar.gz'%(db, info['label'])
                    baseffn = os.path.join(dir, basefn)
                    if not basefn in files:
                        # A label of the form <name>-<8 digits>-<6 digits>
                        # whose <name> is not this machine's FQDN is only
                        # noted; anything else is a missing base backup.
                        m = re.match(r'([a-z0-9.]+)-\d{8}-\d{6}', info['label'])
                        if m and (m.group(1) != socket.getfqdn()):
                            note_info(dir, 'IGNORED-OTHER-BASE: '+basefn)
                            continue
                        else:
                            note_warning_db(dir, db, 'MISSING-BASE', basefn)
                            continue
                if db in notyetseen_dbs: del notyetseen_dbs[db]
                # The tarball is handled here, so take it off the work list.
                files.remove(basefn)
                if backup_state[db]['can_expire_next']:
                    backup_state[db]['expires'].append(baseffn)

                # The first base seen is the newest; the last one assigned
                # ends up as the oldest.
                if not 'newest-base' in backup_state[db]:
                    backup_state[db]['newest-base'] = baseffn
                backup_state[db]['oldest-base'] = baseffn

                startre = re.search(r'\(file ([0-9A-F]{24})\)', info['start wal location'])
                if not startre:
                    note_warning_db(dir, db, 'CANNOT-PARSE-START_WAL_LOCATION', ffn)
                    continue
                start_file = startre.group(1)
                # This base is only usable once its start WAL file has been
                # seen; remember the name we are waiting for.
                walbase = '%s.WAL.%s'%(db, start_file)
                backup_state[db]['base_needs_wal_until'] = walbase

                # A base whose start time lies further back than the
                # retention period allows expiry of everything older, once
                # its start WAL file has been reached.
                start = time.mktime(time.strptime(info['start time'], '%Y-%m-%d %H:%M:%S %Z'))
                if start + get_retention(config, dir, db) < time.time():
                    backup_state[db]['can_expire_for_base_hit'] = True
                continue

            # handle WAL files
            r = re.match(r'[a-z0-9-]+\.WAL\.([0-9A-F]{8})([0-9A-F]{8})([0-9A-F]{8})', fn)
            if r:
                if 'base_needs_wal_until' in backup_state[db]:
                    if backup_state[db]['base_needs_wal_until'] == fn:
                        # Reached the start WAL of the pending base backup.
                        del backup_state[db]['base_needs_wal_until']
                        if backup_state[db]['can_expire_for_base_hit']:
                            backup_state[db]['can_expire_next'] = True

                (timeline, wal1, wal2) = map(lambda x: int(x,16), r.groups())
                if not timeline == backup_state[db]['timeline']:
                    note_warning_db(dir, db, 'UNEXPECTED-TIMELINE', ffn)
                    continue

                thissegment = (wal1, wal2)
                if not 'newest-wal' in backup_state[db]:
                    backup_state[db]['newest-wal'] = thissegment
                    backup_state[db]['newest-wal-file'] = ffn
                else:
                    # Each WAL must be the direct predecessor of the one
                    # handled before it; otherwise there is a hole and the
                    # rest of this database's files are skipped.
                    if not wal_pre(backup_state[db]['oldest-wal'], dir, db) == thissegment:
                        note_warning_db(dir, db, 'WAL-MISSING-AFTER', ffn)
                        ignored_dbs[db] = True
                        continue
                backup_state[db]['oldest-wal'] = thissegment

                continue

            note_warning_db(dir, db, 'CANNOT-PARSE-WAL', ffn)
        else:
            note_warning_db(dir, db, 'INVALID-TYPE', ffn)


    # A base backup whose start WAL file never showed up is incomplete.
    for db in backup_state:
        if 'base_needs_wal_until' in backup_state[db]:
            note_warning_db(dir, db, 'MISSING_WAL_FOR_BASE', backup_state[db]['base_needs_wal_until'])

    # Freshness checks against the configured warn-age thresholds, based on
    # the mtime of the newest base tarball and the newest WAL file.
    for db in backup_state:
        if not 'newest-base' in backup_state[db]:
            note_warning_db(dir, db, 'NO-BASE', 'no base backup found?')
        else:
            age = time.time() - os.stat(backup_state[db]['newest-base']).st_mtime
            if age > config['warn-age']['base']:
                note_warning_db(dir, db, 'BASE-IS-OLD', 'latest base backup is too old')

        if not 'newest-wal-file' in backup_state[db]:
            # Reported under its own key; it used to reuse 'NO-BASE'.
            note_warning_db(dir, db, 'NO-WAL', 'no WAL files found?')
        else:
            age = time.time() - os.stat(backup_state[db]['newest-wal-file']).st_mtime
            if age > config['warn-age']['wal']:
                note_warning_db(dir, db, 'WAL-IS-OLD', 'latest wal file is too old')

    # Queue expirable files, but never for a database that raised warnings.
    for db in backup_state:
        if len(backup_state[db]['expires']) > 0:
            if dir in problems_per_db and db in problems_per_db[dir] and problems_per_db[dir][db]:
                note_warning_db(dir, db, 'NOT-EXPIRING-DUE-TO-WARNINGS', 'have seen warnings, will not expire anything')
            else:
                # Files were collected in processing order; reverse it for
                # the removal queue.
                backup_state[db]['expires'].reverse()
                for f in backup_state[db]['expires']:
                    global_expires.append(f)

    # Configured databases without any usable .backup file; keys starting
    # with '_' are host-level settings, not databases.
    for db in notyetseen_dbs:
        if db.startswith('_'): continue
        note_warning_db(dir, db, 'NO-BACKUP', 'no backups! (no .backup files found)')
353
354     #if not db in backup_state:
355     #    note_warning('BASE-WITHOUT-WAL', ffn)
356     #    ignored_dbs[db] = True
357     #    continue
358
359     #age = time.time() - os.stat(ffn).st_mtime
360     #if age > config['warn-age']['wal']:
361     #    note_warning('OLD-WAL', backup_state[db]['newest-wal-file'])
362     #    ignored_dbs[db] = True
363     #    continue
364
365
# Report, expire and exit.
# print is called with a single parenthesised argument throughout: that
# prints the same text under Python 2 and is also valid Python 3 syntax.

# Warnings are always shown; notices only in verbose mode.
for p in problems_seq:
    print(p)
if options.verbose:
    for p in notices_seq:
        print(p)

# Expire mode: remove the queued files unless this is a dry run.
if options.expire:
    for f in global_expires:
        if options.verbose: print("Expiring %s"%(f))
        if not options.dry_run: os.unlink(f)

# Any recorded warning makes the script exit with status 1.
if len(problems_seq) > 0:
    sys.exit(1)

# Stay quiet on success in expire mode unless verbose was requested.
if not options.expire or options.verbose:
    print("OK: no problems detected")
sys.exit(0)
383
384 # vim:set et:
385 # vim:set ts=4:
386 # vim:set shiftwidth=4: