e8a859237f019a61126920b8b8371ca30b0df4d4
[mirror/dsa-nagios.git] / dsa-nagios-checks / checks / dsa-check-backuppg
1 #!/usr/bin/python
2
3 # Copyright 2010 Peter Palfrader
4 #
5 # Permission is hereby granted, free of charge, to any person obtaining
6 # a copy of this software and associated documentation files (the
7 # "Software"), to deal in the Software without restriction, including
8 # without limitation the rights to use, copy, modify, merge, publish,
9 # distribute, sublicense, and/or sell copies of the Software, and to
10 # permit persons to whom the Software is furnished to do so, subject to
11 # the following conditions:
12 #
13 # The above copyright notice and this permission notice shall be
14 # included in all copies or substantial portions of the Software.
15 #
16 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
24 # Checks if the WAL backups for several postgres clusters from
25 # different hosts are current.  Might not catch all error instances.
26 #
27 # If called with -e will expire WALs and BASE backups no longer required.
28 #
29 # Needs files laid out like so:
30 # beethoven:/srv/pgbackup/pg# ls -l ries/ | head
31 # total 6794956
32 # -rw------- 1 debbackup debbackup 378099591 May  1 02:26 dak.BASE.20100501-ries-dak-8.4-backup-F_37000020.tar.gz
33 # -rw------- 1 debbackup debbackup 382267407 May  8 02:25 dak.BASE.20100508-ries-dak-8.4-backup-F_A2000020.tar.gz
34 # -rw------- 1 debbackup debbackup 384054069 May 15 02:24 dak.BASE.20100515-ries-dak-8.4-backup-F_FB000020.tar.gz
35 # -rw------- 1 debbackup debbackup 386407500 May 22 02:27 dak.BASE.20100522-ries-dak-8.4-backup-10_58000020.tar.gz
36 # -rw------- 1 debbackup debbackup  16777216 May  1 02:26 dak.WAL.000000010000000F00000037
37 # -rw------- 1 debbackup debbackup       264 May  1 02:27 dak.WAL.000000010000000F00000037.00000020.backup
38 # -rw------- 1 debbackup debbackup  16777216 May  1 03:25 dak.WAL.000000010000000F00000038
39 # -rw------- 1 debbackup debbackup  16777216 May  1 09:11 dak.WAL.000000010000000F00000039
40 # -rw------- 1 debbackup debbackup  16777216 May  1 09:45 dak.WAL.000000010000000F0000003A
41 # ...
42 #
43 # needs write privileges to at least the .backup files
44
45
46 import copy
47 import time
48 import re
49 import os
50 import errno
51 import sys
52 import yaml
53 import optparse
54 import socket
55
def load_conf(cf):
    """Load the YAML configuration file and return the parsed data.

    The file is chosen in this order:
      1. cf, when it is not None,
      2. the DSA_CHECK_BACKUPPG_CONF environment variable, when set,
      3. /etc/nagios/dsa-check-backuppg.conf.

    Errors from opening the file or from the YAML parser are not caught
    here and propagate to the caller.
    """
    if cf is not None:
        configfile = cf
    elif 'DSA_CHECK_BACKUPPG_CONF' in os.environ:
        configfile = os.environ['DSA_CHECK_BACKUPPG_CONF']
    else:
        configfile = '/etc/nagios/dsa-check-backuppg.conf'

    # 'with' closes the file even when parsing raises.
    with open(configfile) as f:
        # safe_load instead of load: the rest of this script only reads
        # plain mappings, numbers and strings from the config, while
        # yaml.load() without an explicit Loader can construct arbitrary
        # Python objects and is refused by current PyYAML releases.
        return yaml.safe_load(f)
68
69
# Script-wide accumulators:
#   notices_seq     - informational lines, filled by note_info()
#   problems_seq    - warning lines, filled by note_warning()
#   global_expires  - paths of files selected for expiry
#   problems_per_db - problems_per_db[host][db] is True once a warning was
#                     recorded for that host/db pair
notices_seq, problems_seq, global_expires = [], [], []
problems_per_db = {}
74 #def note_warning(key, host, db, value):
75 #    global problems_seq
76 #    problems_seq.append("[%s, %s]: %s: %s"%(host, db, key, value))
77 #
78 #    global problems_per_db
79 #    if not host in problems_per_db: problems_per_db[host] = {}
80 #    problems_per_db[host][db] = True
def note_info(key, value, pre=None):
    """Append an informational line to notices_seq.

    The line reads "key: value"; when pre is given it is put in front
    in square brackets: "[pre] key: value".
    """
    message = "%s: %s"%(key, value) if pre is None else "[%s] %s: %s"%(pre, key, value)
    notices_seq.append(message)
87
def note_warning(key, value, pre=None):
    """Append a warning line to problems_seq.

    The line reads "key: value"; when pre is given it is put in front
    in square brackets: "[pre] key: value".
    """
    if pre is not None:
        fmt, args = "[%s] %s: %s", (pre, key, value)
    else:
        fmt, args = "%s: %s", (key, value)
    problems_seq.append(fmt % args)
def note_warning_db(host, db, key, value):
    """Record a warning that concerns one database of one host.

    The warning is filed through note_warning() with the prefix
    "host, db", and problems_per_db[host][db] is set to True so that the
    host/db pair can later be recognised as having had problems.
    """
    note_warning(key, value, "%s, %s"%(host, db))
    problems_per_db.setdefault(host, {})[db] = True
99
100
def wal_pre(w, host, db):
    """Return the WAL position that directly precedes w.

    w is a pair (w1, w2).  While w2 is non-zero only w2 is decremented.
    When w2 is zero the result wraps to the previous w1, with w2 set to
    the highest value in use: 0xFF, except for host 'moszumanska' / db
    'main', where it is 0xFE.
    """
    major, minor = w
    if minor != 0:
        return (major, minor - 1)

    # NOTE(review): the 0xFE special case is hard-coded for one cluster;
    # presumably its WAL numbering skips 0xFF -- confirm before extending.
    if (host, db) in (('moszumanska', 'main'),):
        last_minor = 0xFE
    else:
        last_minor = 0xFF
    return (major - 1, last_minor)
113
def parse_pg_backup_info(fn):
    """Parse a .backup file into a dict.

    Every non-blank line is expected to have the form "KEY: value".
    Surrounding whitespace is stripped from the line, the key is
    lower-cased, and the value is everything after the first ": ".

    Returns the dict of lower-cased keys to string values.

    Raises ValueError when a non-blank line contains no ": " separator.
    Errors from opening or reading fn propagate unchanged.
    """
    info = {}
    # 'with' closes the file even when a line fails to parse.
    with open(fn) as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # A blank line carries no information; do not choke on it.
                continue
            # Split at the first separator only, so that a value which
            # itself contains ": " stays in one piece.
            key, sep, value = line.partition(': ')
            if not sep:
                raise ValueError("%s: cannot parse line %r"%(fn, line))
            info[key.lower()] = value
    return info
122
123
# Command line handling.  Without -e the script only reports (nagios
# mode); with -e it additionally removes the files selected for expiry.
parser = optparse.OptionParser(
    usage="%prog [-c=<CONFFILE>]               (nagios mode)\n" +
          "Usage: %prog [-c=<CONFFILE>] -e [-d] [-v]  (expire mode)")
parser.add_option(
    "-c", "--config",
    dest="conffile", metavar="CONFFILE",
    help="Config file location.")
parser.add_option(
    "-e", "--expire",
    dest="expire", action="store_true",
    help="Expire old files.")
parser.add_option(
    "-d", "--dry-run",
    dest="dry_run", action="store_true",
    help="Do not really remove files.")
parser.add_option(
    "-v", "--verbose",
    dest="verbose", action="store_true",
    help="List files we are expiring.")
options, args = parser.parse_args()

# Positional arguments are not accepted.
if args:
    parser.print_help()
    sys.exit(1)
140
141
config = load_conf(options.conffile)

# File name patterns, compiled once instead of once per file.
# <db>.<WAL|BASE>.<rest>
backup_file_re = re.compile(r'([a-z0-9-]+)\.(WAL|BASE)\..*')
# <db>.WAL.<three 8-hex-digit fields>.<8 hex digits>.backup
wal_backup_re = re.compile(r'[a-z0-9-]+\.WAL\.([0-9A-F]{8})([0-9A-F]{8})([0-9A-F]{8})\.[0-9A-F]{8}\.backup')
# <db>.WAL.<three 8-hex-digit fields>; the fields are read below as the
# timeline and the two halves of the WAL position.
wal_segment_re = re.compile(r'[a-z0-9-]+\.WAL\.([0-9A-F]{8})([0-9A-F]{8})([0-9A-F]{8})')
# Backup label of the form <name>-<8 digits>-<6 digits>
base_label_re = re.compile(r'([a-z0-9.]+)-\d{8}-\d{6}')
# "(file <24 hex digits>)" inside the START WAL LOCATION value
start_wal_file_re = re.compile(r'\(file ([0-9A-F]{24})\)')

os.chdir(config['rootdir'])
for hostdir in os.listdir('.'):
    if hostdir.startswith('.') or hostdir.endswith('.old'):
        note_info('IGNORED', hostdir)
        continue

    if not os.path.isdir(hostdir):
        try:
            mtime = os.path.getmtime(hostdir)
            ctime = os.path.getctime(hostdir)
        except OSError as e:
            # The entry vanished between listdir() and the stat calls.
            if e.errno == errno.ENOENT:
                continue
            # Bare raise keeps the original traceback.
            raise
        # A non-directory touched within the last four hours is only
        # noted; anything older is reported as a problem.
        if min(mtime, ctime) + 3600*4 > time.time():
            note_info('IGNORED', hostdir)
        else:
            note_warning('NOT-A-DIR', hostdir)
        continue

    if hostdir not in config['backups']:
        note_warning('NOT-CONFIGURED', hostdir)
        continue
    host_conf = config['backups'][hostdir]

    files = os.listdir(hostdir)
    if not files:
        note_warning('EMPTY-DIR', hostdir)
        continue

    # Sorted ascending and consumed from the end, i.e. files are handled
    # in descending name order.  Within one db a WAL ".backup" file is
    # therefore seen before the WAL file it is named after, and all
    # "WAL" names before all "BASE" names.
    files.sort()

    ignored_dbs = {}
    backup_state = {}

    # Go over all the files in a directory and check for various things
    # - for a given cluster's backups we want the latest WAL file to be no
    #   older than a certain age,
    # - we want all consecutive WAL files, i.e. no holes
    # - we want a full backup at one point, and it shouldn't be too old
    # - If our retention period is say 2 weeks, then we look for the
    #   tar file that's older than that, and everything before that can
    #   be expired
    while files:
        fn = files.pop()
        ffn = os.path.join(hostdir, fn)

        r = backup_file_re.match(fn)
        if not r:
            note_warning('CANNOT-PARSE', ffn)
            continue

        (db, ftype) = r.groups()
        if not isinstance(host_conf, dict) or db not in host_conf:
            # Warn once per unconfigured db, then skip all of its files.
            if db not in ignored_dbs:
                note_warning_db(hostdir, db, 'NOT-CONFIGURED', '%s/%s'%(hostdir, db))
            ignored_dbs[db] = True
        if db in ignored_dbs:
            continue

        if db not in backup_state:
            db_conf = host_conf[db]
            if isinstance(db_conf, dict) and 'timeline' in db_conf:
                expected_timeline = db_conf['timeline']
            else:
                expected_timeline = 1
            backup_state[db] = {
                # can_expire_for_base_hit: We hit a BASE backup that is old
                #   enough so that once we hit all the required WAL files
                #   for this base backup to work we can start expiring
                #   everything older than that oldest WAL file
                'can_expire_for_base_hit': False,
                # can_expire_next: Can expire all files that we handle from
                #   now on
                'can_expire_next': False,
                'expires': [],
                'timeline': expected_timeline,
            }
        state = backup_state[db]

        # We have already seen a base backup and all the WAL files it
        # needs, and we want to keep those; everything we see from now on
        # is older than that and can be removed.
        if state['can_expire_next']:
            state['expires'].append(ffn)

        if ftype == 'BASE':
            # should have been taken care of before
            # while handling a WAL.backup file
            note_warning_db(hostdir, db, 'STRAY-BASE', ffn)
            continue

        # ftype is 'WAL' from here on; backup_file_re admits nothing else.

        # handle .backup files  -  they live near the WAL "file namespace"
        # and reference the corresponding full backup
        r = wal_backup_re.match(fn)
        if r:
            info = parse_pg_backup_info(ffn)
            # Preferred name: <db>.BASE.<label>-<start position with '/'
            # replaced by '_'>.tar.gz; fall back to <db>.BASE.<label>.tar.gz.
            start_pos = info['start wal location'].split(' ', 1)[0].replace('/', '_')
            basefn = '%s.BASE.%s-%s.tar.gz'%(db, info['label'], start_pos)
            baseffn = os.path.join(hostdir, basefn)
            if basefn not in files:
                basefn = '%s.BASE.%s.tar.gz'%(db, info['label'])
                baseffn = os.path.join(hostdir, basefn)
                if basefn not in files:
                    m = base_label_re.match(info['label'])
                    # A label naming something other than this machine's
                    # fqdn is only noted, not treated as a problem.
                    # NOTE(review): presumably that base backup was sent to
                    # a different backup host -- confirm.
                    if m and (m.group(1) != socket.getfqdn()):
                        note_info(hostdir, 'IGNORED-OTHER-BASE: '+basefn)
                    else:
                        note_warning_db(hostdir, db, 'MISSING-BASE', basefn)
                    continue
            # Consume the tarball here so that it is not reported as a
            # stray BASE file when its name would come up later.
            files.remove(basefn)
            if state['can_expire_next']:
                state['expires'].append(baseffn)

            if 'newest-base' not in state:
                state['newest-base'] = baseffn
            state['oldest-base'] = baseffn

            startre = start_wal_file_re.search(info['start wal location'])
            if not startre:
                note_warning_db(hostdir, db, 'CANNOT-PARSE-START_WAL_LOCATION', ffn)
                continue
            start_file = startre.group(1)
            # The base backup is only usable if the WAL files down to this
            # one are present.
            state['base_needs_wal_until'] = '%s.WAL.%s'%(db, start_file)

            start = time.mktime(time.strptime(info['start time'], '%Y-%m-%d %H:%M:%S %Z'))
            if start + config['retention'] < time.time():
                state['can_expire_for_base_hit'] = True
            continue

        # handle WAL files
        r = wal_segment_re.match(fn)
        if r:
            if state.get('base_needs_wal_until') == fn:
                # Reached the first WAL file the last seen base backup
                # needs.  If that base is past the retention period,
                # everything still to come may be expired.
                del state['base_needs_wal_until']
                if state['can_expire_for_base_hit']:
                    state['can_expire_next'] = True

            (timeline, wal1, wal2) = [int(x, 16) for x in r.groups()]
            if timeline != state['timeline']:
                note_warning_db(hostdir, db, 'UNEXPECTED-TIMELINE', ffn)
                continue

            thissegment = (wal1, wal2)
            if 'newest-wal' not in state:
                state['newest-wal'] = thissegment
                state['newest-wal-file'] = ffn
            elif wal_pre(state['oldest-wal'], hostdir, db) != thissegment:
                # Hole in the WAL sequence: stop looking at this db.
                note_warning_db(hostdir, db, 'WAL-MISSING-AFTER', ffn)
                ignored_dbs[db] = True
                continue
            state['oldest-wal'] = thissegment
            continue

        note_warning_db(hostdir, db, 'CANNOT-PARSE-WAL', ffn)


    # A base backup whose first required WAL file never showed up.
    for db in backup_state:
        if 'base_needs_wal_until' in backup_state[db]:
            note_warning_db(hostdir, db, 'MISSING_WAL_FOR_BASE', backup_state[db]['base_needs_wal_until'])

    # Age checks on the newest base backup and the newest WAL file.
    for db in backup_state:
        if 'newest-base' not in backup_state[db]:
            note_warning_db(hostdir, db, 'NO-BASE', 'no base backup found?')
        else:
            age = time.time() - os.stat(backup_state[db]['newest-base']).st_mtime
            if age > config['warn-age']['base']:
                note_warning_db(hostdir, db, 'BASE-IS-OLD', 'latest base backup is too old')

        if 'newest-wal-file' not in backup_state[db]:
            # Filed under its own key; this used to be reported as NO-BASE.
            note_warning_db(hostdir, db, 'NO-WAL', 'no WAL files found?')
        else:
            age = time.time() - os.stat(backup_state[db]['newest-wal-file']).st_mtime
            if age > config['warn-age']['wal']:
                note_warning_db(hostdir, db, 'WAL-IS-OLD', 'latest wal file is too old')

    # Hand the expiry candidates over, but only for databases that did not
    # produce any warning.
    for db in backup_state:
        expires = backup_state[db]['expires']
        if not expires:
            continue
        if problems_per_db.get(hostdir, {}).get(db):
            note_warning_db(hostdir, db, 'NOT-EXPIRING-DUE-TO-WARNINGS', 'have seen warnings, will not expire anything')
        else:
            # Collected newest-first; queue them oldest-first.
            expires.reverse()
            global_expires.extend(expires)
329
330     #if not db in backup_state:
331     #    note_warning('BASE-WITHOUT-WAL', ffn)
332     #    ignored_dbs[db] = True
333     #    continue
334
335     #age = time.time() - os.stat(ffn).st_mtime
336     #if age > config['warn-age']['wal']:
337     #    note_warning('OLD-WAL', backup_state[db]['newest-wal-file'])
338     #    ignored_dbs[db] = True
339     #    continue
340
341
# Report: all problems first, then the informational notices.
# print is called with a single parenthesised argument throughout, which
# prints the same text under Python 2 and is also valid Python 3.
for p in problems_seq:
    print(p)
for p in notices_seq:
    print(p)

# Expire mode: remove the files queued in global_expires.  With -d nothing
# is removed; with -v every file is listed.
if options.expire:
    for f in global_expires:
        if options.verbose:
            print("Expiring %s"%(f))
        if not options.dry_run:
            os.unlink(f)

# Any recorded problem makes the check exit with status 1.
if problems_seq:
    sys.exit(1)

# In expire mode stay silent on success unless -v was given.
if not options.expire or options.verbose:
    print("OK: no problems detected")
sys.exit(0)
358
359 # vim:set et:
360 # vim:set ts=4:
361 # vim:set shiftwidth=4: