dsa-check-backuppg: Ignore removed tmp files instead of crashing
[mirror/dsa-nagios.git] / dsa-nagios-checks / checks / dsa-check-backuppg
1 #!/usr/bin/python
2
3 # Copyright 2010 Peter Palfrader
4 #
5 # Permission is hereby granted, free of charge, to any person obtaining
6 # a copy of this software and associated documentation files (the
7 # "Software"), to deal in the Software without restriction, including
8 # without limitation the rights to use, copy, modify, merge, publish,
9 # distribute, sublicense, and/or sell copies of the Software, and to
10 # permit persons to whom the Software is furnished to do so, subject to
11 # the following conditions:
12 #
13 # The above copyright notice and this permission notice shall be
14 # included in all copies or substantial portions of the Software.
15 #
16 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
24 # Checks if the WAL backups for several postgres clusters from
25 # different hosts are current.  Might not catch all error instances.
26 #
27 # If called with -e will expire WALs and BASE backups no longer required.
28 #
29 # Needs files laid out like so:
30 # beethoven:/srv/pgbackup/pg# ls -l ries/ | head
31 # total 6794956
32 # -rw------- 1 debbackup debbackup 378099591 May  1 02:26 dak.BASE.20100501-ries-dak-8.4-backup-F_37000020.tar.gz
33 # -rw------- 1 debbackup debbackup 382267407 May  8 02:25 dak.BASE.20100508-ries-dak-8.4-backup-F_A2000020.tar.gz
34 # -rw------- 1 debbackup debbackup 384054069 May 15 02:24 dak.BASE.20100515-ries-dak-8.4-backup-F_FB000020.tar.gz
35 # -rw------- 1 debbackup debbackup 386407500 May 22 02:27 dak.BASE.20100522-ries-dak-8.4-backup-10_58000020.tar.gz
36 # -rw------- 1 debbackup debbackup  16777216 May  1 02:26 dak.WAL.000000010000000F00000037
37 # -rw------- 1 debbackup debbackup       264 May  1 02:27 dak.WAL.000000010000000F00000037.00000020.backup
38 # -rw------- 1 debbackup debbackup  16777216 May  1 03:25 dak.WAL.000000010000000F00000038
39 # -rw------- 1 debbackup debbackup  16777216 May  1 09:11 dak.WAL.000000010000000F00000039
40 # -rw------- 1 debbackup debbackup  16777216 May  1 09:45 dak.WAL.000000010000000F0000003A
41 # ...
42 #
43 # needs write privileges to at least the .backup files
44
45
import copy
import errno
import optparse
import os
import re
import sys
import time

import yaml
53
def load_conf(cf):
    """Load and return the YAML configuration of this check.

    The file is chosen in this order of precedence:
      1. cf, when it is not None,
      2. the DSA_CHECK_BACKUPPG_CONF environment variable, when set,
      3. /etc/nagios/dsa-check-backuppg.conf.

    Returns whatever yaml.load() produces for the file contents.  Errors
    from open() or from the YAML parser propagate to the caller.
    """
    if cf is not None:
        configfile = cf
    else:
        configfile = os.environ.get('DSA_CHECK_BACKUPPG_CONF',
                                    '/etc/nagios/dsa-check-backuppg.conf')

    # The with-block closes the file even when parsing fails.
    with open(configfile) as f:
        # NOTE(review): yaml.load() may construct arbitrary Python objects.
        # That is acceptable only as long as the config file is trusted;
        # consider yaml.safe_load() if that ever stops being the case.
        config = yaml.load(f.read())
    return config
66
67
# Shared result containers filled in while scanning the backup tree.
# notices_seq:     informational lines, printed after the problems
# problems_seq:    warning lines; a non-empty list makes the check fail
# problems_per_db: host -> db -> True for every database that had a warning
# global_expires:  files that may be removed when running in expire mode
notices_seq = list()
problems_seq = list()
problems_per_db = dict()
global_expires = list()
72 #def note_warning(key, host, db, value):
73 #    global problems_seq
74 #    problems_seq.append("[%s, %s]: %s: %s"%(host, db, key, value))
75 #
76 #    global problems_per_db
77 #    if not host in problems_per_db: problems_per_db[host] = {}
78 #    problems_per_db[host][db] = True
def note_info(key, value, pre=None):
    """Queue an informational line in notices_seq.

    The line reads "key: value"; when pre is given it is put in front
    in square brackets: "[pre] key: value".
    """
    message = "%s: %s"%(key, value)
    if pre is not None:
        message = "[%s] %s"%(pre, message)
    notices_seq.append(message)
85
def note_warning(key, value, pre=None):
    """Queue a problem line in problems_seq.

    The line reads "key: value"; when pre is given it is put in front
    in square brackets: "[pre] key: value".
    """
    message = "%s: %s"%(key, value)
    if pre is not None:
        message = "[%s] %s"%(pre, message)
    problems_seq.append(message)
def note_warning_db(host, db, key, value):
    """Queue a problem line for one database and mark that database.

    The line is prefixed with "[host, db]".  In addition
    problems_per_db[host][db] is set to True, which the expiry logic
    consults before scheduling any file of that database for removal.
    """
    note_warning(key, value, "%s, %s"%(host, db))
    problems_per_db.setdefault(host, {})[db] = True
97
98
def wal_pre(w):
    """Return the WAL position that directly precedes w.

    w is a pair of integers (first, second).  Normally only the second
    number is decremented.  When it is already 0, the first number is
    decremented instead and the second wraps around to 0xFE.
    NOTE(review): presumably 0xFE (not 0xFF) because the PostgreSQL
    releases this was written for never use segment FF -- confirm.
    """
    log_id, segment = w
    if segment != 0:
        return (log_id, segment - 1)
    return (log_id - 1, 0xFE)
108
def parse_pg_backup_info(fn):
    """Parse a "KEY: value" style .backup file into a dict.

    Every non-empty line of the file fn is split at the first ': '.
    The part before it, lower-cased, becomes the key and the rest of the
    line becomes the value, so values may themselves contain ': '.
    Blank lines are skipped.

    Raises ValueError for a non-empty line that has no ': ' separator;
    errors from open() propagate unchanged.
    """
    info = {}
    # The with-block closes the file even when a line cannot be parsed.
    with open(fn) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # maxsplit=1: split only at the first separator so that the
            # unpacking below always sees exactly two parts.
            (key, value) = line.split(': ', 1)
            info[key.lower()] = value
    return info
117
118
# Command line handling.  Without -e the script only reports (nagios
# mode); with -e it additionally removes files that are no longer needed.
parser = optparse.OptionParser(
    usage="%prog [-c=<CONFFILE>]               (nagios mode)\n"
          "Usage: %prog [-c=<CONFFILE>] -e [-d] [-v]  (expire mode)")
parser.add_option("-c", "--config",
                  dest="conffile",
                  metavar="CONFFILE",
                  help="Config file location.")
parser.add_option("-e", "--expire",
                  dest="expire",
                  action="store_true",
                  help="Expire old files.")
parser.add_option("-d", "--dry-run",
                  dest="dry_run",
                  action="store_true",
                  help="Do not really remove files.")
parser.add_option("-v", "--verbose",
                  dest="verbose",
                  action="store_true",
                  help="List files we are expiring.")
options, args = parser.parse_args()

# Positional arguments are not supported at all.
if args:
    parser.print_help()
    sys.exit(1)


config = load_conf(options.conffile)
138
# Main scan.  Every entry below config['rootdir'] is expected to be a
# directory holding the backups of one host.  Findings are collected via
# note_info()/note_warning()/note_warning_db(); files that may be removed
# end up in global_expires.  Nothing is printed or deleted here.
os.chdir(config['rootdir'])
for dir in os.listdir('.'):
    if dir.startswith('.') or dir.endswith('.old'):
        note_info('IGNORED', dir)
        continue

    if not os.path.isdir(dir):
        try:
            mtime = os.path.getmtime(dir)
            ctime = os.path.getctime(dir)
        except OSError as e:
            # The entry vanished between listdir() and the stat calls
            # (e.g. a temporary file); silently skip it.
            if e.errno == errno.ENOENT:
                continue
            # Anything else is unexpected; a bare raise keeps the
            # original traceback.
            raise
        # Non-directories younger than four hours are only mentioned,
        # older ones are reported as a problem.
        if min(mtime, ctime) + 3600*4 > time.time():
            note_info('IGNORED', dir)
        else:
            note_warning('NOT-A-DIR', dir)
        continue

    if not dir in config['backups']:
        note_warning('NOT-CONFIGURED', dir)
        continue

    files = os.listdir(dir)
    if len(files) == 0:
        note_warning('EMPTY-DIR', dir)
        continue

    files.sort()

    # NOTE(review): unhandled_backups is never read afterwards.
    unhandled_backups = copy.copy(config['backups'][dir])
    ignored_dbs = {}
    backup_state = {}

    # Go over all the files in a directory and check for various things
    # - for a given cluster's backups we want the latest WAL file to be no
    #   older than a certain age,
    # - we want all consecutive WAL files, i.e. no holes
    # - we want a full backup at one point, and it shouldn't be too old
    # - If our retention period is say 2 weeks, then we look for the
    #   tar file that's older than that, and everything before that can
    #   be expired
    #
    # The list is sorted ascending and consumed with pop(), so files are
    # visited from the lexicographically last name to the first.
    while len(files) > 0:
        fn = files.pop()
        ffn = os.path.join(dir, fn)

        r = re.match(r'([a-z0-9-]+)\.(WAL|BASE)\..*', fn)
        if not r:
            note_warning('CANNOT-PARSE', ffn)
            continue

        (db, type) = r.groups(1)
        if not isinstance(config['backups'][dir], list) or not db in config['backups'][dir]:
            # Warn only once per unconfigured database.
            if not db in ignored_dbs:
                note_warning_db(dir, db, 'NOT-CONFIGURED', '%s/%s'%(dir, db))
            ignored_dbs[db] = True
        if db in ignored_dbs:
            continue
        if not db in backup_state:
            backup_state[db] = {}
            # can_expire_for_base_hit: We hit a BASE backup that is old enough
            #   so that once we hit all the required WAL files for this base
            #   backup to work we can start expiring everything older than that
            #   oldest WAL file
            backup_state[db]['can_expire_for_base_hit'] = False
            # can_expire_next: Can expire all files that we handle from now on
            backup_state[db]['can_expire_next'] = False
            backup_state[db]['expires'] = []

        # Apparently we already have seen a base backup and all its wal files
        # which we want to keep, so everything what we see now is older than
        # that and we can get rid of it
        if backup_state[db]['can_expire_next']:
            backup_state[db]['expires'].append(ffn)

        if type == 'BASE':
            # should have been taken care of before
            # while handling a WAL.backup file
            note_warning_db(dir, db, 'STRAY-BASE', ffn)
            continue
        elif type == 'WAL':
            # handle .backup files  -  they live near the WAL "file namespace" and reference
            # the corresponding full backup
            r = re.match(r'[a-z0-9-]+\.WAL\.([0-9A-F]{8})([0-9A-F]{8})([0-9A-F]{8})\.[0-9A-F]{8}\.backup', fn)
            if r:
                info = parse_pg_backup_info(ffn)
                # Two naming schemes are accepted for the tarball: with the
                # start WAL location appended to the label, or label only.
                basefn = '%s.BASE.%s-%s.tar.gz'%(db, info['label'], info['start wal location'].split(' ',2)[0].replace('/', '_'))
                baseffn = os.path.join(dir, basefn)
                if not basefn in files:
                    basefn = '%s.BASE.%s.tar.gz'%(db, info['label'])
                    baseffn = os.path.join(dir, basefn)
                    if not basefn in files:
                        note_warning_db(dir, db, 'MISSING-BASE', basefn)
                        continue
                # The tarball is accounted for here, so take it out of the
                # work list; otherwise it would be flagged as STRAY-BASE.
                files.remove(basefn)
                if backup_state[db]['can_expire_next']:
                    backup_state[db]['expires'].append(baseffn)

                if not 'newest-base' in backup_state[db]:
                    backup_state[db]['newest-base'] = baseffn
                backup_state[db]['oldest-base'] = baseffn

                startre = re.search(r'\(file ([0-9A-F]{24})\)', info['start wal location'])
                if not startre:
                    note_warning_db(dir, db, 'CANNOT-PARSE-START_WAL_LOCATION', ffn)
                    continue
                start_file = startre.group(1)
                walbase = '%s.WAL.%s'%(db, start_file)
                # Remember which WAL file must still show up for this base
                # backup; the entry is removed once that file is seen.
                backup_state[db]['base_needs_wal_until'] = walbase

                start = time.mktime(time.strptime(info['start time'], '%Y-%m-%d %H:%M:%S %Z'))
                if start + config['retention'] < time.time():
                    backup_state[db]['can_expire_for_base_hit'] = True
                continue

            # handle WAL files
            r = re.match(r'[a-z0-9-]+\.WAL\.([0-9A-F]{8})([0-9A-F]{8})([0-9A-F]{8})', fn)
            if r:
                if 'base_needs_wal_until' in backup_state[db]:
                    if backup_state[db]['base_needs_wal_until'] == fn:
                        del backup_state[db]['base_needs_wal_until']
                        if backup_state[db]['can_expire_for_base_hit']:
                            backup_state[db]['can_expire_next'] = True

                (timeline, wal1, wal2) = map(lambda x: int(x,16), r.groups())
                if not timeline == 1:
                    note_warning_db(dir, db, 'CANNOT-HANDLE-TIMELINES_NOT_1', ffn)
                    continue

                thissegment = (wal1, wal2)
                if not 'newest-wal' in backup_state[db]:
                    backup_state[db]['newest-wal'] = thissegment
                    backup_state[db]['newest-wal-file'] = ffn
                else:
                    # Each WAL file must be the direct predecessor of the
                    # one handled before it; anything else is a hole.
                    if not wal_pre(backup_state[db]['oldest-wal']) == thissegment:
                        note_warning_db(dir, db, 'WAL-MISSING-AFTER', ffn)
                        ignored_dbs[db] = True
                        continue
                backup_state[db]['oldest-wal'] = thissegment

                continue

            note_warning_db(dir, db, 'CANNOT-PARSE-WAL', ffn)
        else:
            note_warning_db(dir, db, 'INVALID-TYPE', ffn)


    # A base backup whose starting WAL file never showed up.
    for db in backup_state:
        if 'base_needs_wal_until' in backup_state[db]:
            note_warning_db(dir, db, 'MISSING_WAL_FOR_BASE', backup_state[db]['base_needs_wal_until'])

    # Age checks for the newest base backup and the newest WAL file.
    for db in backup_state:
        if not 'newest-base' in backup_state[db]:
            note_warning_db(dir, db, 'NO-BASE', 'no base backup found?')
        else:
            age = time.time() - os.stat(backup_state[db]['newest-base']).st_mtime
            if age > config['warn-age']['base']:
                note_warning_db(dir, db, 'BASE-IS-OLD', 'latest base backup is too old')

        if not 'newest-wal-file' in backup_state[db]:
            # Reported as NO-WAL (this used to be a copy of the NO-BASE key).
            note_warning_db(dir, db, 'NO-WAL', 'no WAL files found?')
        else:
            age = time.time() - os.stat(backup_state[db]['newest-wal-file']).st_mtime
            if age > config['warn-age']['wal']:
                note_warning_db(dir, db, 'WAL-IS-OLD', 'latest wal file is too old')

    # Hand over expiry candidates, but only for databases without warnings.
    for db in backup_state:
        if len(backup_state[db]['expires']) > 0:
            if dir in problems_per_db and db in problems_per_db[dir] and problems_per_db[dir][db]:
                note_warning_db(dir, db, 'NOT-EXPIRING-DUE-TO-WARNINGS', 'have seen warnings, will not expire anything')
            else:
                # Candidates were collected in reverse name order; flip them
                # back before queueing.
                backup_state[db]['expires'].reverse()
                for f in backup_state[db]['expires']:
                    global_expires.append(f)
315
316     #if not db in backup_state:
317     #    note_warning('BASE-WITHOUT-WAL', ffn)
318     #    ignored_dbs[db] = True
319     #    continue
320
321     #age = time.time() - os.stat(ffn).st_mtime
322     #if age > config['warn-age']['wal']:
323     #    note_warning('OLD-WAL', backup_state[db]['newest-wal-file'])
324     #    ignored_dbs[db] = True
325     #    continue
326
327
# Report everything that was collected: problems first, then notices.
for message in problems_seq + notices_seq:
    print(message)

# Files are only removed when expiry was requested (-e); -d turns the
# removal into a dry run and -v lists each file.
if options.expire:
    for path in global_expires:
        if options.verbose:
            print("Expiring %s"%(path))
        if not options.dry_run:
            os.unlink(path)

# Any recorded problem makes the check exit with status 1.
if problems_seq:
    sys.exit(1)

# Stay quiet on success in expire mode unless verbose output was requested.
if options.verbose or not options.expire:
    print("OK: no problems detected")
sys.exit(0)
344
345 # vim:set et:
346 # vim:set ts=4:
347 # vim:set shiftwidth=4: