dsa-check-backuppg: ignore regular files in pg backup's root directory if they are...
[mirror/dsa-nagios.git] / dsa-nagios-checks / checks / dsa-check-backuppg
1 #!/usr/bin/python
2
3 # Copyright 2010 Peter Palfrader
4 #
5 # Permission is hereby granted, free of charge, to any person obtaining
6 # a copy of this software and associated documentation files (the
7 # "Software"), to deal in the Software without restriction, including
8 # without limitation the rights to use, copy, modify, merge, publish,
9 # distribute, sublicense, and/or sell copies of the Software, and to
10 # permit persons to whom the Software is furnished to do so, subject to
11 # the following conditions:
12 #
13 # The above copyright notice and this permission notice shall be
14 # included in all copies or substantial portions of the Software.
15 #
16 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
24 # Checks if the WAL backups for several postgres clusters from
25 # different hosts are current.  Might not catch all error instances.
26 #
27 # If called with -e will expire WALs and BASE backups no longer required.
28 #
29 # Needs files laid out like so:
30 # beethoven:/srv/pgbackup/pg# ls -l ries/ | head
31 # total 6794956
32 # -rw------- 1 debbackup debbackup 378099591 May  1 02:26 dak.BASE.20100501-ries-dak-8.4-backup-F_37000020.tar.gz
33 # -rw------- 1 debbackup debbackup 382267407 May  8 02:25 dak.BASE.20100508-ries-dak-8.4-backup-F_A2000020.tar.gz
34 # -rw------- 1 debbackup debbackup 384054069 May 15 02:24 dak.BASE.20100515-ries-dak-8.4-backup-F_FB000020.tar.gz
35 # -rw------- 1 debbackup debbackup 386407500 May 22 02:27 dak.BASE.20100522-ries-dak-8.4-backup-10_58000020.tar.gz
36 # -rw------- 1 debbackup debbackup  16777216 May  1 02:26 dak.WAL.000000010000000F00000037
37 # -rw------- 1 debbackup debbackup       264 May  1 02:27 dak.WAL.000000010000000F00000037.00000020.backup
38 # -rw------- 1 debbackup debbackup  16777216 May  1 03:25 dak.WAL.000000010000000F00000038
39 # -rw------- 1 debbackup debbackup  16777216 May  1 09:11 dak.WAL.000000010000000F00000039
40 # -rw------- 1 debbackup debbackup  16777216 May  1 09:45 dak.WAL.000000010000000F0000003A
41 # ...
42 #
43 # needs write privileges to at least the .backup files
44
45
46 import copy
47 import time
48 import re
49 import os
50 import sys
51 import yaml
52 import optparse
53
def load_conf(cf):
    """Load and return the YAML configuration of this check.

    The config file location is picked in this order:
      1. cf, if it is not None (the -c command line option),
      2. the DSA_CHECK_BACKUPPG_CONF environment variable, if set,
      3. /etc/nagios/dsa-check-backuppg.conf.

    Returns whatever the YAML document deserialises to.
    Raises IOError if the file cannot be opened.
    """
    if cf is not None:
        configfile = cf
    elif 'DSA_CHECK_BACKUPPG_CONF' in os.environ:
        configfile = os.environ['DSA_CHECK_BACKUPPG_CONF']
    else:
        configfile = '/etc/nagios/dsa-check-backuppg.conf'

    # 'with' closes the file even when the YAML parser raises.
    # safe_load only constructs plain YAML types (mappings, lists, scalars),
    # so a config file cannot trigger arbitrary object construction, and it
    # does not depend on a Loader argument the way yaml.load does.
    with open(configfile) as f:
        return yaml.safe_load(f.read())
66
67
# Informational lines; printed after the problems and without influence on
# the exit status.
notices_seq = list()
# Problem lines; the script exits with status 1 when this is not empty.
problems_seq = list()
# problems_per_db[host][db] is True once a warning was recorded for that
# database; such databases are excluded from expiry.
problems_per_db = dict()
# Paths of files that expire mode (-e) will remove.
global_expires = list()
72 #def note_warning(key, host, db, value):
73 #    global problems_seq
74 #    problems_seq.append("[%s, %s]: %s: %s"%(host, db, key, value))
75 #
76 #    global problems_per_db
77 #    if not host in problems_per_db: problems_per_db[host] = {}
78 #    problems_per_db[host][db] = True
def note_info(key, value, pre=None):
    """Record an informational line in notices_seq.

    The line reads "KEY: VALUE"; when pre is given it is put in front of
    the line in square brackets.
    """
    if pre is not None:
        line = "[%s] %s: %s"%(pre, key, value)
    else:
        line = "%s: %s"%(key, value)
    notices_seq.append(line)
85
def note_warning(key, value, pre=None):
    """Record a problem line in problems_seq.

    The line reads "KEY: VALUE"; when pre is given it is put in front of
    the line in square brackets.
    """
    if pre is not None:
        line = "[%s] %s: %s"%(pre, key, value)
    else:
        line = "%s: %s"%(key, value)
    problems_seq.append(line)
def note_warning_db(host, db, key, value):
    """Record a problem for one database of one host.

    The problem line is prefixed with "[HOST, DB]", and the pair is flagged
    in problems_per_db so that nothing of that database gets expired.
    """
    note_warning(key, value, "%s, %s"%(host, db))
    problems_per_db.setdefault(host, {})[db] = True
97
98
def wal_pre(w):
    """Return the WAL position that directly precedes w.

    w is a pair of integers as parsed from the second and third 8-digit hex
    groups of a WAL file name.  The second number counts down to 0; the
    step before 0 decrements the first number and restarts the second one
    at 0xFE, i.e. a segment 0xFF is never expected.
    NOTE(review): presumably this mirrors the segment numbering of the
    PostgreSQL releases this check was written for -- confirm before using
    it with other versions.
    """
    log, seg = w
    if seg != 0:
        return (log, seg - 1)
    return (log - 1, 0xFE)
108
def parse_pg_backup_info(fn):
    """Parse a "KEY: value" style file (a WAL .backup file) into a dict.

    Every line is stripped of surrounding whitespace and split at the
    first ': ' only, so values may themselves contain ': '.  Keys are
    lower-cased; values are kept as strings.  Lines without a ': '
    separator (for instance blank lines) are skipped.

    Raises IOError if fn cannot be opened.
    """
    info = {}
    # 'with' makes sure the file is closed even if reading fails.
    with open(fn) as f:
        for line in f:
            key, sep, value = line.strip().partition(': ')
            if not sep:
                continue
            info[key.lower()] = value
    return info
117
118
# Command line handling: without -e the script only reports (nagios mode),
# with -e it additionally removes the files found to be expirable.
parser = optparse.OptionParser()
parser.set_usage(
    "%prog [-c=<CONFFILE>]               (nagios mode)\n"
    "Usage: %prog [-c=<CONFFILE>] -e [-d] [-v]  (expire mode)")
parser.add_option(
    "-c", "--config",
    dest="conffile",
    metavar="CONFFILE",
    help="Config file location.")
parser.add_option(
    "-e", "--expire",
    dest="expire",
    action="store_true",
    help="Expire old files.")
parser.add_option(
    "-d", "--dry-run",
    dest="dry_run",
    action="store_true",
    help="Do not really remove files.")
parser.add_option(
    "-v", "--verbose",
    dest="verbose",
    action="store_true",
    help="List files we are expiring.")
options, args = parser.parse_args()

# Positional arguments are not supported.
if args:
    parser.print_help()
    sys.exit(1)


config = load_conf(options.conffile)
138
# Walk all per-host directories below config['rootdir'], check the backups
# of every configured database and collect the files that may be expired.
#
# Config keys read here: 'rootdir', 'backups' (host -> list of database
# names), 'retention' (seconds) and 'warn-age' ('base' and 'wal', seconds).
# Results go to problems_seq / notices_seq (via the note_* helpers) and to
# global_expires.

# <db>.<WAL|BASE>.<anything>
_BACKUP_FILE_RE = re.compile(r'([a-z0-9-]+)\.(WAL|BASE)\..*')
# <db>.WAL.<timeline><wal1><wal2>.<offset>.backup
_WAL_BACKUP_RE = re.compile(r'[a-z0-9-]+\.WAL\.([0-9A-F]{8})([0-9A-F]{8})([0-9A-F]{8})\.[0-9A-F]{8}\.backup')
# <db>.WAL.<timeline><wal1><wal2>
_WAL_SEGMENT_RE = re.compile(r'[a-z0-9-]+\.WAL\.([0-9A-F]{8})([0-9A-F]{8})([0-9A-F]{8})')
# the "(file ...)" part of a .backup file's START WAL LOCATION value
_START_WAL_FILE_RE = re.compile(r'\(file ([0-9A-F]{24})\)')
# Regular files in the root directory whose mtime and ctime are both more
# recent than this many seconds are merely reported as IGNORED.
_ROOT_FILE_GRACE = 3600*4

os.chdir(config['rootdir'])
for hostdir in os.listdir('.'):
    if not os.path.isdir(hostdir):
        if min(os.path.getmtime(hostdir), os.path.getctime(hostdir)) + _ROOT_FILE_GRACE > time.time():
            note_info('IGNORED', hostdir)
        else:
            note_warning('NOT-A-DIR', hostdir)
        continue

    if hostdir not in config['backups']:
        note_warning('NOT-CONFIGURED', hostdir)
        continue

    files = os.listdir(hostdir)
    if not files:
        note_warning('EMPTY-DIR', hostdir)
        continue

    # Sorted ascending and consumed with pop(): names are visited in
    # descending order, so for one database the highest-numbered WAL entry
    # comes first and "BASE" entries come after all "WAL" entries.
    files.sort()

    configured_dbs = config['backups'][hostdir]
    ignored_dbs = {}
    backup_state = {}

    # Go over all the files in a directory and check for various things
    # - for a given cluster's backups we want the latest WAL file to be no
    #   older than a certain age,
    # - we want all consecutive WAL files, i.e. no holes
    # - we want a full backup at one point, and it shouldn't be too old
    # - If our retention period is say 2 weeks, then we look for the
    #   tar file that's older than that, and everything before that can
    #   be expired
    while files:
        fn = files.pop()
        ffn = os.path.join(hostdir, fn)

        r = _BACKUP_FILE_RE.match(fn)
        if not r:
            note_warning('CANNOT-PARSE', ffn)
            continue

        (db, backup_type) = r.groups()
        if not isinstance(configured_dbs, list) or db not in configured_dbs:
            # warn only once per unconfigured database
            if db not in ignored_dbs:
                note_warning_db(hostdir, db, 'NOT-CONFIGURED', '%s/%s'%(hostdir, db))
            ignored_dbs[db] = True
        if db in ignored_dbs:
            continue
        if db not in backup_state:
            backup_state[db] = {
                # can_expire_for_base_hit: We hit a BASE backup that is old
                #   enough so that once we hit all the required WAL files
                #   for this base backup to work we can start expiring
                #   everything older than that oldest WAL file
                'can_expire_for_base_hit': False,
                # can_expire_next: Can expire all files that we handle from
                #   now on
                'can_expire_next': False,
                'expires': [],
            }
        state = backup_state[db]

        # Apparently we already have seen a base backup and all its wal files
        # which we want to keep, so everything what we see now is older than
        # that and we can get rid of it
        if state['can_expire_next']:
            state['expires'].append(ffn)

        if backup_type == 'BASE':
            # should have been taken care of before
            # while handling a WAL.backup file
            note_warning_db(hostdir, db, 'STRAY-BASE', ffn)
            continue
        if backup_type != 'WAL':
            # cannot happen with the current file name pattern; kept as a
            # safety net should the pattern ever admit more types
            note_warning_db(hostdir, db, 'INVALID-TYPE', ffn)
            continue

        # handle .backup files  -  they live near the WAL "file namespace" and reference
        # the corresponding full backup
        if _WAL_BACKUP_RE.match(fn):
            info = parse_pg_backup_info(ffn)
            # e.g. start location "F/37000020 (file ...)" -> "F_37000020"
            start_location = info['start wal location'].split(' ', 1)[0].replace('/', '_')
            basefn = '%s.BASE.%s-%s.tar.gz'%(db, info['label'], start_location)
            baseffn = os.path.join(hostdir, basefn)
            if basefn not in files:
                note_warning_db(hostdir, db, 'MISSING-BASE', basefn)
                continue
            # the base backup is accounted for here, do not visit it again
            files.remove(basefn)
            if state['can_expire_next']:
                state['expires'].append(baseffn)

            if 'newest-base' not in state:
                state['newest-base'] = baseffn
            state['oldest-base'] = baseffn

            startre = _START_WAL_FILE_RE.search(info['start wal location'])
            if not startre:
                note_warning_db(hostdir, db, 'CANNOT-PARSE-START_WAL_LOCATION', ffn)
                continue
            # WAL files down to this one are needed for the base backup
            state['base_needs_wal_until'] = '%s.WAL.%s'%(db, startre.group(1))

            # NOTE(review): mktime() interprets the parsed time as local
            # time regardless of the zone name -- confirm that is intended.
            start = time.mktime(time.strptime(info['start time'], '%Y-%m-%d %H:%M:%S %Z'))
            if start + config['retention'] < time.time():
                state['can_expire_for_base_hit'] = True
            continue

        # handle WAL files
        r = _WAL_SEGMENT_RE.match(fn)
        if r:
            if state.get('base_needs_wal_until') == fn:
                # the most recently seen base backup is complete now
                del state['base_needs_wal_until']
                if state['can_expire_for_base_hit']:
                    state['can_expire_next'] = True

            (timeline, wal1, wal2) = [int(x, 16) for x in r.groups()]
            if timeline != 1:
                note_warning_db(hostdir, db, 'CANNOT-HANDLE-TIMELINES_NOT_1', ffn)
                continue

            thissegment = (wal1, wal2)
            if 'newest-wal' not in state:
                state['newest-wal'] = thissegment
                state['newest-wal-file'] = ffn
            elif wal_pre(state['oldest-wal']) != thissegment:
                # hole in the WAL sequence: give up on this database
                note_warning_db(hostdir, db, 'WAL-MISSING-AFTER', ffn)
                ignored_dbs[db] = True
                continue
            state['oldest-wal'] = thissegment

            continue

        note_warning_db(hostdir, db, 'CANNOT-PARSE-WAL', ffn)


    # The three passes below are kept separate on purpose: it preserves the
    # order of the reported problems, and the expiry decision in the last
    # pass must see the warnings raised by the first two.
    for db in backup_state:
        if 'base_needs_wal_until' in backup_state[db]:
            note_warning_db(hostdir, db, 'MISSING_WAL_FOR_BASE', backup_state[db]['base_needs_wal_until'])

    for db in backup_state:
        state = backup_state[db]
        if 'newest-base' not in state:
            note_warning_db(hostdir, db, 'NO-BASE', 'no base backup found?')
        else:
            age = time.time() - os.stat(state['newest-base']).st_mtime
            if age > config['warn-age']['base']:
                note_warning_db(hostdir, db, 'BASE-IS-OLD', 'latest base backup is too old')

        if 'newest-wal-file' not in state:
            # used to be reported under the key 'NO-BASE' as well
            note_warning_db(hostdir, db, 'NO-WAL', 'no WAL files found?')
        else:
            age = time.time() - os.stat(state['newest-wal-file']).st_mtime
            if age > config['warn-age']['wal']:
                note_warning_db(hostdir, db, 'WAL-IS-OLD', 'latest wal file is too old')

    for db in backup_state:
        state = backup_state[db]
        if state['expires']:
            if problems_per_db.get(hostdir, {}).get(db):
                note_warning_db(hostdir, db, 'NOT-EXPIRING-DUE-TO-WARNINGS', 'have seen warnings, will not expire anything')
            else:
                # files were collected in descending name order; schedule
                # them in ascending order
                global_expires.extend(reversed(state['expires']))
300
301     #if not db in backup_state:
302     #    note_warning('BASE-WITHOUT-WAL', ffn)
303     #    ignored_dbs[db] = True
304     #    continue
305
306     #age = time.time() - os.stat(ffn).st_mtime
307     #if age > config['warn-age']['wal']:
308     #    note_warning('OLD-WAL', backup_state[db]['newest-wal-file'])
309     #    ignored_dbs[db] = True
310     #    continue
311
312
# Report: problems first, then the informational notices.
for line in problems_seq + notices_seq:
    print(line)

# Expire mode: remove what the scan collected; --dry-run suppresses the
# removal, --verbose lists the files.
if options.expire:
    for path in global_expires:
        if options.verbose:
            print("Expiring %s"%(path))
        if not options.dry_run:
            os.unlink(path)

# Any recorded problem turns into exit status 1.
if problems_seq:
    sys.exit(1)

# Stay quiet on success in expire mode unless asked to be verbose.
if options.verbose or not options.expire:
    print("OK: no problems detected")
sys.exit(0)
329
330 # vim:set et:
331 # vim:set ts=4:
332 # vim:set shiftwidth=4: