10d003384c424a432f672208458b69ecaa5104fc
[mirror/dsa-nagios.git] / dsa-nagios-checks / checks / dsa-check-backuppg
1 #!/usr/bin/python
2
3 # Copyright 2010 Peter Palfrader
4 #
5 # Permission is hereby granted, free of charge, to any person obtaining
6 # a copy of this software and associated documentation files (the
7 # "Software"), to deal in the Software without restriction, including
8 # without limitation the rights to use, copy, modify, merge, publish,
9 # distribute, sublicense, and/or sell copies of the Software, and to
10 # permit persons to whom the Software is furnished to do so, subject to
11 # the following conditions:
12 #
13 # The above copyright notice and this permission notice shall be
14 # included in all copies or substantial portions of the Software.
15 #
16 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
24 # Checks if the WAL backups for several postgres clusters from
25 # different hosts are current.  Might not catch all error instances.
26 #
27 # If called with -e will expire WALs and BASE backups no longer required.
28 #
# Needs files laid out like so:
30 # beethoven:/srv/pgbackup/pg# ls -l ries/ | head
31 # total 6794956
32 # -rw------- 1 debbackup debbackup 378099591 May  1 02:26 dak.BASE.20100501-ries-dak-8.4-backup-F_37000020.tar.gz
33 # -rw------- 1 debbackup debbackup 382267407 May  8 02:25 dak.BASE.20100508-ries-dak-8.4-backup-F_A2000020.tar.gz
34 # -rw------- 1 debbackup debbackup 384054069 May 15 02:24 dak.BASE.20100515-ries-dak-8.4-backup-F_FB000020.tar.gz
35 # -rw------- 1 debbackup debbackup 386407500 May 22 02:27 dak.BASE.20100522-ries-dak-8.4-backup-10_58000020.tar.gz
36 # -rw------- 1 debbackup debbackup  16777216 May  1 02:26 dak.WAL.000000010000000F00000037
37 # -rw------- 1 debbackup debbackup       264 May  1 02:27 dak.WAL.000000010000000F00000037.00000020.backup
38 # -rw------- 1 debbackup debbackup  16777216 May  1 03:25 dak.WAL.000000010000000F00000038
39 # -rw------- 1 debbackup debbackup  16777216 May  1 09:11 dak.WAL.000000010000000F00000039
40 # -rw------- 1 debbackup debbackup  16777216 May  1 09:45 dak.WAL.000000010000000F0000003A
41 # ...
42 #
43 # needs write privileges to at least the .backup files
44
45
46 import copy
47 import time
48 import re
49 import os
50 import errno
51 import sys
52 import yaml
53 import optparse
54 import socket
55
def load_conf(cf):
    """Load the YAML configuration file and return the parsed object.

    cf -- explicit path of the config file, or None.  When None, the path
          is taken from the DSA_CHECK_BACKUPPG_CONF environment variable if
          that is set, and otherwise defaults to
          /etc/nagios/dsa-check-backuppg.conf.

    Raises IOError/OSError if the file cannot be opened; parse errors from
    the YAML library propagate unchanged.
    """
    if cf is not None:
        configfile = cf
    elif 'DSA_CHECK_BACKUPPG_CONF' in os.environ:
        configfile = os.environ['DSA_CHECK_BACKUPPG_CONF']
    else:
        configfile = '/etc/nagios/dsa-check-backuppg.conf'

    # The with-block closes the file even when parsing raises.
    with open(configfile) as f:
        # NOTE(review): yaml.load() without an explicit Loader can construct
        # arbitrary objects and is rejected by newer PyYAML releases; the
        # config looks like plain data, so yaml.safe_load() is presumably a
        # drop-in replacement -- confirm before switching.
        return yaml.load(f.read())
68
69
# Module-level accumulators filled in while scanning the backup tree.

# Human-readable problem lines; a non-empty list makes the check exit 1.
problems_seq = []
# problems_per_db[host][db] is True once a warning was recorded for that
# host/database pair.
problems_per_db = {}
# Purely informational lines, printed after the problems.
notices_seq = []
# Paths of files that may be removed when running in expire mode.
global_expires = []
74 #def note_warning(key, host, db, value):
75 #    global problems_seq
76 #    problems_seq.append("[%s, %s]: %s: %s"%(host, db, key, value))
77 #
78 #    global problems_per_db
79 #    if not host in problems_per_db: problems_per_db[host] = {}
80 #    problems_per_db[host][db] = True
def note_info(key, value, pre=None):
    """Record an informational notice in notices_seq.

    The stored line is "key: value", or "[pre] key: value" when a prefix
    is given.
    """
    if pre is not None:
        line = "[%s] %s: %s"%(pre, key, value)
    else:
        line = "%s: %s"%(key, value)
    notices_seq.append(line)
87
def note_warning(key, value, pre=None):
    """Record a problem in problems_seq.

    The stored line is "key: value", or "[pre] key: value" when a prefix
    is given.
    """
    if pre is not None:
        line = "[%s] %s: %s"%(pre, key, value)
    else:
        line = "%s: %s"%(key, value)
    problems_seq.append(line)
def note_warning_db(host, db, key, value):
    """Record a problem for one database of one host.

    The warning is logged via note_warning() with a "host, db" prefix, and
    the pair is flagged in problems_per_db.
    """
    note_warning(key, value, "%s, %s"%(host, db))
    problems_per_db.setdefault(host, {})[db] = True
99
100
def wal_pre(w, host, db):
    """Return the WAL segment that directly precedes segment *w*.

    w    -- pair (w1, w2) of integers identifying a segment.
    host -- host the cluster belongs to.
    db   -- cluster name on that host.

    Normally only the second component is decremented.  When it is already
    0 the first component is decremented instead and the second wraps
    around to 0xFF -- or to 0xFE for the ('moszumanska', 'main') cluster.
    NOTE(review): presumably that cluster runs a PostgreSQL version that
    never uses segment FF -- confirm.
    """
    high, low = w
    if low != 0:
        return (high, low - 1)

    # Wrap-around into the previous value of the first component.
    last_segment = 0xFE if (host, db) == ('moszumanska', 'main') else 0xFF
    return (high - 1, last_segment)
113
def parse_pg_backup_info(fn):
    """Parse a "KEY: value" style file (a WAL .backup file) into a dict.

    fn -- path of the file to read.

    Returns a dict mapping the lower-cased key of every line to its value
    (a string).  Lines that do not contain a ': ' separator, such as blank
    lines, are skipped.  Raises IOError/OSError if the file cannot be
    opened.
    """
    info = {}
    # The with-block closes the file even if reading fails half way.
    with open(fn) as f:
        for line in f:
            # Split on the first separator only: the value itself may
            # contain ': ' and must not break the key/value unpacking.
            key, sep, value = line.strip().partition(': ')
            if not sep:
                continue
            info[key.lower()] = value
    return info
122
123
# Command line handling.  Without -e the script only reports (nagios mode);
# with -e it additionally removes files that are no longer needed.
parser = optparse.OptionParser()
# optparse takes the value of a short option as "-c FILE" (or "-cFILE");
# "-c=FILE" would yield the value "=FILE", so the usage text must not
# advertise that form.
parser.set_usage("%prog [-c <CONFFILE>]               (nagios mode)\n" +
          "Usage: %prog [-c <CONFFILE>] -e [-d] [-v]  (expire mode)")
parser.add_option("-c", "--config", dest="conffile", metavar="CONFFILE",
  help="Config file location.")
parser.add_option("-e", "--expire", dest="expire", action="store_true",
  help="Expire old files.")
parser.add_option("-d", "--dry-run", dest="dry_run", action="store_true",
  help="Do not really remove files.")
parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
  help="List files we are expiring.")
(options, args) = parser.parse_args()

# Positional arguments are not supported.
if args:
    parser.print_help()
    sys.exit(1)


config = load_conf(options.conffile)
143
# Walk the backup root.  Every entry is expected to be a directory named
# after a key of config['backups'], holding the BASE and WAL files of that
# host's clusters.  All paths handled below are relative to the root.
os.chdir(config['rootdir'])
for hostdir in os.listdir('.'):
    # Hidden entries and '*.old' entries are skipped, but mentioned.
    if hostdir.startswith('.') or hostdir.endswith('.old'):
        note_info('IGNORED', hostdir)
        continue

    if not os.path.isdir(hostdir):
        try:
            mtime = os.path.getmtime(hostdir)
            ctime = os.path.getctime(hostdir)
        except OSError as e:
            # The entry vanished between listdir() and stat().
            if e.errno == errno.ENOENT:
                continue
            # A bare raise keeps the original traceback.
            raise
        # Non-directories younger than four hours only produce a notice;
        # older ones are reported as a problem.
        if min(mtime, ctime) + 3600*4 > time.time():
            note_info('IGNORED', hostdir)
        else:
            note_warning('NOT-A-DIR', hostdir)
        continue

    if hostdir not in config['backups']:
        note_warning('NOT-CONFIGURED', hostdir)
        continue

    files = os.listdir(hostdir)
    if len(files) == 0:
        note_warning('EMPTY-DIR', hostdir)
        continue

    # Sorted ascending and consumed with pop(), i.e. from the last name to
    # the first.
    files.sort()

    ignored_dbs = {}
    backup_state = {}

    # Go over all the files in a directory and check for various things
    # - for a given cluster's backups we want the latest WAL file to be no
    #   older than a certain age,
    # - we want all consecutive WAL files, i.e. no holes
    # - we want a full backup at one point, and it shouldn't be too old
    # - If our retention period is say 2 weeks, then we look for the
    #   tar file that's older than that, and everything before that can
    #   be expired
    while len(files) > 0:
        fn = files.pop()
        ffn = os.path.join(hostdir, fn)

        r = re.match(r'([a-z0-9-]+)\.(WAL|BASE)\..*', fn)
        if not r:
            note_warning('CANNOT-PARSE', ffn)
            continue

        (db, ftype) = r.groups()
        if not isinstance(config['backups'][hostdir], list) or db not in config['backups'][hostdir]:
            # Warn only once per unconfigured database.
            if db not in ignored_dbs:
                note_warning_db(hostdir, db, 'NOT-CONFIGURED', '%s/%s'%(hostdir, db))
            ignored_dbs[db] = True
        if db in ignored_dbs:
            continue
        if db not in backup_state:
            backup_state[db] = {
                # can_expire_for_base_hit: We hit a BASE backup that is old
                #   enough so that once we hit all the required WAL files
                #   for this base backup to work we can start expiring
                #   everything older than that oldest WAL file
                'can_expire_for_base_hit': False,
                # can_expire_next: Can expire all files that we handle from
                #   now on
                'can_expire_next': False,
                # expires: files of this database that may be removed
                'expires': [],
            }
        state = backup_state[db]

        # Apparently we have already seen a base backup and all the WAL
        # files it needs, so everything we see from now on is older than
        # that and we can get rid of it
        if state['can_expire_next']:
            state['expires'].append(ffn)

        if ftype == 'BASE':
            # should have been taken care of before
            # while handling a WAL.backup file
            note_warning_db(hostdir, db, 'STRAY-BASE', ffn)
            continue
        elif ftype == 'WAL':
            # handle .backup files  -  they live near the WAL "file namespace" and reference
            # the corresponding full backup
            r = re.match(r'[a-z0-9-]+\.WAL\.([0-9A-F]{8})([0-9A-F]{8})([0-9A-F]{8})\.[0-9A-F]{8}\.backup', fn)
            if r:
                info = parse_pg_backup_info(ffn)
                # Two file name schemes are tried for the tarball: with and
                # without the start WAL location appended to the label.
                basefn = '%s.BASE.%s-%s.tar.gz'%(db, info['label'], info['start wal location'].split(' ',2)[0].replace('/', '_'))
                baseffn = os.path.join(hostdir, basefn)
                if basefn not in files:
                    basefn = '%s.BASE.%s.tar.gz'%(db, info['label'])
                    baseffn = os.path.join(hostdir, basefn)
                    if basefn not in files:
                        # A label that starts with a name other than this
                        # machine's FQDN is only noted, not warned about.
                        m = re.match(r'([a-z0-9.]+)-\d{8}-\d{6}', info['label'])
                        if m and (m.group(1) != socket.getfqdn()):
                            note_info(hostdir, 'IGNORED-OTHER-BASE: '+basefn)
                            continue
                        else:
                            note_warning_db(hostdir, db, 'MISSING-BASE', basefn)
                            continue
                # The tarball is accounted for here, so it must not show up
                # again as a stray BASE file later in the loop.
                files.remove(basefn)
                if state['can_expire_next']:
                    state['expires'].append(baseffn)

                if 'newest-base' not in state:
                    state['newest-base'] = baseffn
                state['oldest-base'] = baseffn

                startre = re.search(r'\(file ([0-9A-F]{24})\)', info['start wal location'])
                if not startre:
                    note_warning_db(hostdir, db, 'CANNOT-PARSE-START_WAL_LOCATION', ffn)
                    continue
                start_file = startre.group(1)
                walbase = '%s.WAL.%s'%(db, start_file)
                state['base_needs_wal_until'] = walbase

                # NOTE(review): time.mktime() interprets the parsed value as
                # local time regardless of the zone name matched by %Z --
                # presumably acceptable at retention granularity; confirm.
                start = time.mktime(time.strptime(info['start time'], '%Y-%m-%d %H:%M:%S %Z'))
                if start + config['retention'] < time.time():
                    state['can_expire_for_base_hit'] = True
                continue

            # handle WAL files
            r = re.match(r'[a-z0-9-]+\.WAL\.([0-9A-F]{8})([0-9A-F]{8})([0-9A-F]{8})', fn)
            if r:
                if 'base_needs_wal_until' in state:
                    if state['base_needs_wal_until'] == fn:
                        # This is the WAL file the last seen base backup
                        # starts with; the backup is complete now.
                        del state['base_needs_wal_until']
                        if state['can_expire_for_base_hit']:
                            state['can_expire_next'] = True

                (timeline, wal1, wal2) = [int(x, 16) for x in r.groups()]
                if not timeline == 1:
                    note_warning_db(hostdir, db, 'CANNOT-HANDLE-TIMELINES_NOT_1', ffn)
                    continue

                thissegment = (wal1, wal2)
                if 'newest-wal' not in state:
                    state['newest-wal'] = thissegment
                    state['newest-wal-file'] = ffn
                else:
                    # Each WAL file must be the direct predecessor of the
                    # one handled before it, otherwise there is a hole.
                    if not wal_pre(state['oldest-wal'], hostdir, db) == thissegment:
                        note_warning_db(hostdir, db, 'WAL-MISSING-AFTER', ffn)
                        ignored_dbs[db] = True
                        continue
                state['oldest-wal'] = thissegment

                continue

            note_warning_db(hostdir, db, 'CANNOT-PARSE-WAL', ffn)
        else:
            note_warning_db(hostdir, db, 'INVALID-TYPE', ffn)


    # A base backup whose starting WAL file never showed up is unusable.
    for db in backup_state:
        if 'base_needs_wal_until' in backup_state[db]:
            note_warning_db(hostdir, db, 'MISSING_WAL_FOR_BASE', backup_state[db]['base_needs_wal_until'])

    # Age checks on the newest base backup and the newest WAL file.
    for db in backup_state:
        if 'newest-base' not in backup_state[db]:
            note_warning_db(hostdir, db, 'NO-BASE', 'no base backup found?')
        else:
            age = time.time() - os.stat(backup_state[db]['newest-base']).st_mtime
            if age > config['warn-age']['base']:
                note_warning_db(hostdir, db, 'BASE-IS-OLD', 'latest base backup is too old')

        if 'newest-wal-file' not in backup_state[db]:
            # Reported under its own key so it cannot be mistaken for a
            # missing base backup.
            note_warning_db(hostdir, db, 'NO-WAL', 'no WAL files found?')
        else:
            age = time.time() - os.stat(backup_state[db]['newest-wal-file']).st_mtime
            if age > config['warn-age']['wal']:
                note_warning_db(hostdir, db, 'WAL-IS-OLD', 'latest wal file is too old')

    # Only queue files for expiry when the database produced no warnings.
    for db in backup_state:
        if len(backup_state[db]['expires']) > 0:
            if problems_per_db.get(hostdir, {}).get(db):
                note_warning_db(hostdir, db, 'NOT-EXPIRING-DUE-TO-WARNINGS', 'have seen warnings, will not expire anything')
            else:
                # Files were collected in reverse name order; queue them in
                # ascending order.
                backup_state[db]['expires'].reverse()
                global_expires.extend(backup_state[db]['expires'])
336
337
# Report: problems first, then informational notices.  The parenthesised
# single-argument print form behaves the same as the print statement on
# Python 2 and is also valid on Python 3.
for p in problems_seq:
    print(p)
for p in notices_seq:
    print(p)

# Expire mode: remove (or, with --dry-run, only list) the queued files.
if options.expire:
    for f in global_expires:
        if options.verbose:
            print("Expiring %s"%(f))
        if not options.dry_run:
            os.unlink(f)

# Any recorded problem makes the check exit with status 1.
if problems_seq:
    sys.exit(1)

# Stay quiet on success in expire mode unless --verbose was given.
if not options.expire or options.verbose:
    print("OK: no problems detected")
sys.exit(0)
354
355 # vim:set et:
356 # vim:set ts=4:
357 # vim:set shiftwidth=4: