Merge branch 'master' of git+ssh://db.debian.org/git/dsa-nagios
[mirror/dsa-nagios.git] / dsa-nagios-checks / checks / dsa-check-backuppg
1 #!/usr/bin/python
2
3 # Copyright 2010 Peter Palfrader
4 #
5 # Permission is hereby granted, free of charge, to any person obtaining
6 # a copy of this software and associated documentation files (the
7 # "Software"), to deal in the Software without restriction, including
8 # without limitation the rights to use, copy, modify, merge, publish,
9 # distribute, sublicense, and/or sell copies of the Software, and to
10 # permit persons to whom the Software is furnished to do so, subject to
11 # the following conditions:
12 #
13 # The above copyright notice and this permission notice shall be
14 # included in all copies or substantial portions of the Software.
15 #
16 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
24 # Checks if the WAL backups for several postgres clusters from
25 # different hosts are current.  Might not catch all error instances.
26 #
27 # If called with -e will expire WALs and BASE backups no longer required.
28 #
29 # Needs files laid out like so:
30 # beethoven:/srv/pgbackup/pg# ls -l ries/ | head
31 # total 6794956
32 # -rw------- 1 debbackup debbackup 378099591 May  1 02:26 dak.BASE.20100501-ries-dak-8.4-backup-F_37000020.tar.gz
33 # -rw------- 1 debbackup debbackup 382267407 May  8 02:25 dak.BASE.20100508-ries-dak-8.4-backup-F_A2000020.tar.gz
34 # -rw------- 1 debbackup debbackup 384054069 May 15 02:24 dak.BASE.20100515-ries-dak-8.4-backup-F_FB000020.tar.gz
35 # -rw------- 1 debbackup debbackup 386407500 May 22 02:27 dak.BASE.20100522-ries-dak-8.4-backup-10_58000020.tar.gz
36 # -rw------- 1 debbackup debbackup  16777216 May  1 02:26 dak.WAL.000000010000000F00000037
37 # -rw------- 1 debbackup debbackup       264 May  1 02:27 dak.WAL.000000010000000F00000037.00000020.backup
38 # -rw------- 1 debbackup debbackup  16777216 May  1 03:25 dak.WAL.000000010000000F00000038
39 # -rw------- 1 debbackup debbackup  16777216 May  1 09:11 dak.WAL.000000010000000F00000039
40 # -rw------- 1 debbackup debbackup  16777216 May  1 09:45 dak.WAL.000000010000000F0000003A
41 # ...
42 #
43 # needs write privileges to at least the .backup files
44
45
46 import copy
47 import time
48 import re
49 import os
50 import errno
51 import sys
52 import yaml
53 import optparse
54
def load_conf(cf):
    """Load the YAML configuration file and return the parsed object.

    The file to read is chosen in this order:
      1. ``cf``, when it is not None (the -c/--config option),
      2. the ``DSA_CHECK_BACKUPPG_CONF`` environment variable,
      3. ``/etc/nagios/dsa-check-backuppg.conf``.

    Errors from open() or from the YAML parser are not handled here and
    propagate to the caller.
    """
    if cf is not None:
        configfile = cf
    elif 'DSA_CHECK_BACKUPPG_CONF' in os.environ:
        configfile = os.environ['DSA_CHECK_BACKUPPG_CONF']
    else:
        configfile = '/etc/nagios/dsa-check-backuppg.conf'

    # 'with' makes sure the file is closed even when parsing raises.
    with open(configfile) as f:
        # NOTE(review): yaml.load() may construct arbitrary Python objects.
        # The config is presumably admin-controlled; if it only ever holds
        # plain mappings/lists/scalars, yaml.safe_load() would be the safer
        # call -- confirm before switching.
        return yaml.load(f.read())
67
68
# Informational lines collected by note_info(); printed after the problems.
notices_seq = []
# Warning lines collected by note_warning(); any entry makes the run exit 1.
problems_seq = []
# host -> db -> True for every (host, db) pair note_warning_db() was called
# for; databases recorded here are excluded from expiry.
problems_per_db = {}
# Paths queued for deletion; they are only unlinked in expire mode (-e)
# and when --dry-run is not given.
global_expires = []
73 #def note_warning(key, host, db, value):
74 #    global problems_seq
75 #    problems_seq.append("[%s, %s]: %s: %s"%(host, db, key, value))
76 #
77 #    global problems_per_db
78 #    if not host in problems_per_db: problems_per_db[host] = {}
79 #    problems_per_db[host][db] = True
def note_info(key, value, pre=None):
    """Queue an informational line on notices_seq.

    The line reads "key: value"; when ``pre`` is given it is put in front
    in square brackets: "[pre] key: value".
    """
    line = "%s: %s"%(key, value)
    if pre is not None:
        line = "[%s] %s"%(pre, line)
    notices_seq.append(line)
86
def note_warning(key, value, pre=None):
    """Queue a warning line on problems_seq.

    The line reads "key: value"; when ``pre`` is given it is put in front
    in square brackets: "[pre] key: value".
    """
    line = "%s: %s"%(key, value)
    if pre is not None:
        line = "[%s] %s"%(pre, line)
    problems_seq.append(line)
def note_warning_db(host, db, key, value):
    """Record a warning for one database of one host.

    The warning is queued through note_warning() with "host, db" as its
    prefix, and the (host, db) pair is flagged in problems_per_db.
    """
    note_warning(key, value, "%s, %s"%(host, db))
    problems_per_db.setdefault(host, {})[db] = True
98
99
def wal_pre(w):
    """Return the WAL position that directly precedes ``w``.

    ``w`` is a pair of integers (the two trailing 8-hex-digit groups of a
    WAL file name, as the caller passes them).  The second number is
    decremented; when it is already 0 the result wraps to 0xFE and the
    first number is decremented instead.
    """
    major, minor = w
    if minor != 0:
        return (major, minor - 1)
    # Wrap-around: the predecessor of (n, 0) is (n - 1, 0xFE).
    return (major - 1, 0xFE)
109
def parse_pg_backup_info(fn):
    """Parse a file of "KEY: value" lines into a dict.

    Each line is stripped and split at the first ": ".  Keys are
    lower-cased, values are kept verbatim (so a value may itself contain
    ": ").  Lines without a ": " separator, such as blank lines, are
    skipped.

    fn -- path of the file to read (the callers pass a WAL ".backup" file
          and look up 'label', 'start wal location' and 'start time').

    Returns the dict of parsed entries; errors from open() propagate.
    """
    info = {}
    # 'with' closes the file even if reading fails part-way through.
    with open(fn) as f:
        for line in f:
            # partition() splits at the first separator only, so the
            # unpacking cannot fail on values that contain ": ".
            key, sep, value = line.strip().partition(': ')
            if not sep:
                continue
            info[key.lower()] = value
    return info
118
119
# Command line handling.  Without -e the script only reports (nagios mode);
# with -e it additionally removes the files queued for expiry.
parser = optparse.OptionParser(
    usage="%prog [-c=<CONFFILE>]               (nagios mode)\n"
          "Usage: %prog [-c=<CONFFILE>] -e [-d] [-v]  (expire mode)")
parser.add_option("-c", "--config", dest="conffile", metavar="CONFFILE",
                  help="Config file location.")
# The remaining options are plain boolean switches.
for short_opt, long_opt, dest_name, help_text in (
        ("-e", "--expire", "expire", "Expire old files."),
        ("-d", "--dry-run", "dry_run", "Do not really remove files."),
        ("-v", "--verbose", "verbose", "List files we are expiring."),
    ):
    parser.add_option(short_opt, long_opt, dest=dest_name,
                      action="store_true", help=help_text)
options, args = parser.parse_args()

# Positional arguments are not accepted.
if args:
    parser.print_help()
    sys.exit(1)


config = load_conf(options.conffile)

# Everything below uses paths relative to the configured root directory.
os.chdir(config['rootdir'])
# File name patterns, compiled once for the whole scan.
#   <db>.WAL.<rest> or <db>.BASE.<rest>
BACKUP_FILE_RE = re.compile(r'([a-z0-9-]+)\.(WAL|BASE)\..*')
#   <db>.WAL.<timeline><wal1><wal2>.<offset>.backup
WAL_BACKUP_RE = re.compile(r'[a-z0-9-]+\.WAL\.([0-9A-F]{8})([0-9A-F]{8})([0-9A-F]{8})\.[0-9A-F]{8}\.backup')
#   <db>.WAL.<timeline><wal1><wal2>
WAL_SEGMENT_RE = re.compile(r'[a-z0-9-]+\.WAL\.([0-9A-F]{8})([0-9A-F]{8})([0-9A-F]{8})')
#   the "(file <24 hex digits>)" part of a "start wal location" value
START_WAL_FILE_RE = re.compile(r'\(file ([0-9A-F]{24})\)')

# Walk every entry of the root directory; each configured sub-directory
# holds the backups of one host.
for entry in os.listdir('.'):
    if entry.startswith('.') or entry.endswith('.old'):
        note_info('IGNORED', entry)
        continue

    if not os.path.isdir(entry):
        try:
            mtime = os.path.getmtime(entry)
            ctime = os.path.getctime(entry)
        except OSError as e:
            # The entry vanished between listdir() and stat: nothing to do.
            if e.errno == errno.ENOENT:
                continue
            # Bare raise keeps the original traceback.
            raise
        # Non-directories younger than four hours are only noted, older
        # ones are reported as a problem.
        if min(mtime, ctime) + 3600*4 > time.time():
            note_info('IGNORED', entry)
        else:
            note_warning('NOT-A-DIR', entry)
        continue

    if entry not in config['backups']:
        note_warning('NOT-CONFIGURED', entry)
        continue

    files = os.listdir(entry)
    if not files:
        note_warning('EMPTY-DIR', entry)
        continue

    # Sorted ascending and consumed with pop(), so files are handled in
    # reverse lexical order: for one db the WAL names come before the BASE
    # names, and a "<segment>.<offset>.backup" file comes before the
    # "<segment>" file it belongs to.
    files.sort()

    configured_dbs = config['backups'][entry]
    ignored_dbs = {}
    backup_state = {}

    # Go over all the files in a directory and check for various things
    # - for a given cluster's backups we want the latest WAL file to be no
    #   older than a certain age,
    # - we want all consecutive WAL files, i.e. no holes
    # - we want a full backup at one point, and it shouldn't be too old
    # - If our retention period is say 2 weeks, then we look for the
    #   tar file that's older than that, and everything before that can
    #   be expired
    while files:
        fn = files.pop()
        ffn = os.path.join(entry, fn)

        r = BACKUP_FILE_RE.match(fn)
        if not r:
            note_warning('CANNOT-PARSE', ffn)
            continue

        (db, kind) = r.groups()
        if not isinstance(configured_dbs, list) or db not in configured_dbs:
            # Warn only once per unconfigured db, then skip all its files.
            if db not in ignored_dbs:
                note_warning_db(entry, db, 'NOT-CONFIGURED', '%s/%s'%(entry, db))
            ignored_dbs[db] = True
        if db in ignored_dbs:
            continue
        if db not in backup_state:
            backup_state[db] = {
                # can_expire_for_base_hit: We hit a BASE backup that is old
                #   enough so that once we hit all the required WAL files
                #   for this base backup to work we can start expiring
                #   everything older than that oldest WAL file
                'can_expire_for_base_hit': False,
                # can_expire_next: Can expire all files that we handle
                #   from now on
                'can_expire_next': False,
                'expires': [],
            }
        state = backup_state[db]

        # Apparently we have already seen a base backup and all the WAL
        # files it needs, so everything we see from now on is older than
        # that and we can get rid of it
        if state['can_expire_next']:
            state['expires'].append(ffn)

        if kind == 'BASE':
            # should have been taken care of before
            # while handling a WAL.backup file
            note_warning_db(entry, db, 'STRAY-BASE', ffn)
            continue
        elif kind == 'WAL':
            # handle .backup files  -  they live near the WAL "file namespace" and reference
            # the corresponding full backup
            r = WAL_BACKUP_RE.match(fn)
            if r:
                info = parse_pg_backup_info(ffn)
                # Preferred base name carries the start location with '/'
                # replaced by '_'; fall back to the name without it.
                basefn = '%s.BASE.%s-%s.tar.gz'%(db, info['label'], info['start wal location'].split(' ',2)[0].replace('/', '_'))
                baseffn = os.path.join(entry, basefn)
                if basefn not in files:
                    basefn = '%s.BASE.%s.tar.gz'%(db, info['label'])
                    baseffn = os.path.join(entry, basefn)
                    if basefn not in files:
                        note_warning_db(entry, db, 'MISSING-BASE', basefn)
                        continue
                # The base tarball is accounted for here, so it must not be
                # popped later (where it would count as STRAY-BASE).
                files.remove(basefn)
                if state['can_expire_next']:
                    state['expires'].append(baseffn)

                if 'newest-base' not in state:
                    state['newest-base'] = baseffn
                state['oldest-base'] = baseffn

                startre = START_WAL_FILE_RE.search(info['start wal location'])
                if not startre:
                    note_warning_db(entry, db, 'CANNOT-PARSE-START_WAL_LOCATION', ffn)
                    continue
                start_file = startre.group(1)
                walbase = '%s.WAL.%s'%(db, start_file)
                # Remember which WAL file this base needs; it is cleared
                # again once that file has been seen.
                state['base_needs_wal_until'] = walbase

                start = time.mktime(time.strptime(info['start time'], '%Y-%m-%d %H:%M:%S %Z'))
                if start + config['retention'] < time.time():
                    state['can_expire_for_base_hit'] = True
                continue

            # handle WAL files
            r = WAL_SEGMENT_RE.match(fn)
            if r:
                if 'base_needs_wal_until' in state:
                    if state['base_needs_wal_until'] == fn:
                        del state['base_needs_wal_until']
                        if state['can_expire_for_base_hit']:
                            state['can_expire_next'] = True

                (timeline, wal1, wal2) = [int(x, 16) for x in r.groups()]
                if timeline != 1:
                    note_warning_db(entry, db, 'CANNOT-HANDLE-TIMELINES_NOT_1', ffn)
                    continue

                thissegment = (wal1, wal2)
                if 'newest-wal' not in state:
                    state['newest-wal'] = thissegment
                    state['newest-wal-file'] = ffn
                else:
                    # Every file must directly precede the one handled
                    # before it, otherwise there is a hole in the chain.
                    if wal_pre(state['oldest-wal']) != thissegment:
                        note_warning_db(entry, db, 'WAL-MISSING-AFTER', ffn)
                        ignored_dbs[db] = True
                        continue
                state['oldest-wal'] = thissegment

                continue

            note_warning_db(entry, db, 'CANNOT-PARSE-WAL', ffn)
        else:
            note_warning_db(entry, db, 'INVALID-TYPE', ffn)


    # A base backup whose starting WAL file never showed up.
    for db, state in backup_state.items():
        if 'base_needs_wal_until' in state:
            note_warning_db(entry, db, 'MISSING_WAL_FOR_BASE', state['base_needs_wal_until'])

    # Age checks on the newest base backup and the newest WAL file.
    for db, state in backup_state.items():
        if 'newest-base' not in state:
            note_warning_db(entry, db, 'NO-BASE', 'no base backup found?')
        else:
            age = time.time() - os.stat(state['newest-base']).st_mtime
            if age > config['warn-age']['base']:
                note_warning_db(entry, db, 'BASE-IS-OLD', 'latest base backup is too old')

        if 'newest-wal-file' not in state:
            # Reported under its own key; this used to reuse 'NO-BASE'.
            note_warning_db(entry, db, 'NO-WAL', 'no WAL files found?')
        else:
            age = time.time() - os.stat(state['newest-wal-file']).st_mtime
            if age > config['warn-age']['wal']:
                note_warning_db(entry, db, 'WAL-IS-OLD', 'latest wal file is too old')

    # Queue expiry candidates, but only for databases without any warning.
    for db, state in backup_state.items():
        if state['expires']:
            if problems_per_db.get(entry, {}).get(db):
                note_warning_db(entry, db, 'NOT-EXPIRING-DUE-TO-WARNINGS', 'have seen warnings, will not expire anything')
            else:
                # Candidates were collected newest first; queue them in
                # the opposite order.
                global_expires.extend(reversed(state['expires']))
316
317     #if not db in backup_state:
318     #    note_warning('BASE-WITHOUT-WAL', ffn)
319     #    ignored_dbs[db] = True
320     #    continue
321
322     #age = time.time() - os.stat(ffn).st_mtime
323     #if age > config['warn-age']['wal']:
324     #    note_warning('OLD-WAL', backup_state[db]['newest-wal-file'])
325     #    ignored_dbs[db] = True
326     #    continue
327
328
# Report: problems first, then informational notices, one per line.
# print(...) with a single argument behaves the same on Python 2 and 3.
for p in problems_seq:
    print(p)
for p in notices_seq:
    print(p)

# Expire mode: remove the queued files (only listed with -v, only actually
# removed without -d).
if options.expire:
    for f in global_expires:
        if options.verbose:
            print("Expiring %s"%(f))
        if not options.dry_run:
            os.unlink(f)

# Any recorded problem turns into exit status 1 (WARNING in the Nagios
# plugin convention).
if problems_seq:
    sys.exit(1)

# Stay quiet on success in expire mode unless -v was given.
if not options.expire or options.verbose:
    print("OK: no problems detected")
sys.exit(0)
345
346 # vim:set et:
347 # vim:set ts=4:
348 # vim:set shiftwidth=4: