dsa-check-backuppg: allow BASE backup filenames without the WAL ID
[mirror/dsa-nagios.git] / dsa-nagios-checks / checks / dsa-check-backuppg
1 #!/usr/bin/python
2
3 # Copyright 2010 Peter Palfrader
4 #
5 # Permission is hereby granted, free of charge, to any person obtaining
6 # a copy of this software and associated documentation files (the
7 # "Software"), to deal in the Software without restriction, including
8 # without limitation the rights to use, copy, modify, merge, publish,
9 # distribute, sublicense, and/or sell copies of the Software, and to
10 # permit persons to whom the Software is furnished to do so, subject to
11 # the following conditions:
12 #
13 # The above copyright notice and this permission notice shall be
14 # included in all copies or substantial portions of the Software.
15 #
16 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
24 # Checks if the WAL backups for several postgres clusters from
25 # different hosts are current.  Might not catch all error instances.
26 #
27 # If called with -e will expire WALs and BASE backups no longer required.
28 #
29 # Needs files laid out like so:
30 # beethoven:/srv/pgbackup/pg# ls -l ries/ | head
31 # total 6794956
32 # -rw------- 1 debbackup debbackup 378099591 May  1 02:26 dak.BASE.20100501-ries-dak-8.4-backup-F_37000020.tar.gz
33 # -rw------- 1 debbackup debbackup 382267407 May  8 02:25 dak.BASE.20100508-ries-dak-8.4-backup-F_A2000020.tar.gz
34 # -rw------- 1 debbackup debbackup 384054069 May 15 02:24 dak.BASE.20100515-ries-dak-8.4-backup-F_FB000020.tar.gz
35 # -rw------- 1 debbackup debbackup 386407500 May 22 02:27 dak.BASE.20100522-ries-dak-8.4-backup-10_58000020.tar.gz
36 # -rw------- 1 debbackup debbackup  16777216 May  1 02:26 dak.WAL.000000010000000F00000037
37 # -rw------- 1 debbackup debbackup       264 May  1 02:27 dak.WAL.000000010000000F00000037.00000020.backup
38 # -rw------- 1 debbackup debbackup  16777216 May  1 03:25 dak.WAL.000000010000000F00000038
39 # -rw------- 1 debbackup debbackup  16777216 May  1 09:11 dak.WAL.000000010000000F00000039
40 # -rw------- 1 debbackup debbackup  16777216 May  1 09:45 dak.WAL.000000010000000F0000003A
41 # ...
42 #
43 # needs write privileges to at least the .backup files
44
45
46 import copy
47 import time
48 import re
49 import os
50 import sys
51 import yaml
52 import optparse
53
def load_conf(cf):
    """Load and return the YAML configuration.

    The config file location is, in order of preference: the `cf` argument
    (if not None), the DSA_CHECK_BACKUPPG_CONF environment variable, or
    /etc/nagios/dsa-check-backuppg.conf.

    Raises IOError/OSError if the file cannot be opened and a yaml error
    if it cannot be parsed.
    """
    if cf is not None:
        configfile = cf
    elif 'DSA_CHECK_BACKUPPG_CONF' in os.environ:
        configfile = os.environ['DSA_CHECK_BACKUPPG_CONF']
    else:
        configfile = '/etc/nagios/dsa-check-backuppg.conf'

    # 'with' makes sure the file is closed even when parsing fails.
    with open(configfile) as f:
        # safe_load instead of load: the config only needs plain YAML
        # (mappings, lists, scalars), and yaml.load() without an explicit
        # Loader can construct arbitrary Python objects and is rejected by
        # newer PyYAML releases.
        return yaml.safe_load(f.read())
66
67
# Informational lines collected by note_info(); printed after the problems.
notices_seq = []
# Warning lines collected by note_warning(); if this is non-empty at the end
# of the run the script exits with status 1.
problems_seq = []
# problems_per_db[host][db] is set to True by note_warning_db() for every
# database that produced a warning; nothing is expired for such a database.
problems_per_db = {}
# Files considered safe to delete; they are only unlinked in expire mode (-e).
global_expires = []
72 #def note_warning(key, host, db, value):
73 #    global problems_seq
74 #    problems_seq.append("[%s, %s]: %s: %s"%(host, db, key, value))
75 #
76 #    global problems_per_db
77 #    if not host in problems_per_db: problems_per_db[host] = {}
78 #    problems_per_db[host][db] = True
def note_info(key, value, pre=None):
    """Record an informational notice in notices_seq.

    The stored line is "key: value", prefixed with "[pre] " when `pre`
    is given.
    """
    if pre is not None:
        notices_seq.append("[%s] %s: %s"%(pre, key, value))
        return
    notices_seq.append("%s: %s"%(key, value))
85
def note_warning(key, value, pre=None):
    """Record a problem in problems_seq.

    The stored line is "key: value", prefixed with "[pre] " when `pre`
    is given.
    """
    line = "%s: %s"%(key, value) if pre is None else "[%s] %s: %s"%(pre, key, value)
    problems_seq.append(line)
def note_warning_db(host, db, key, value):
    """Record a problem for one database of one host.

    The warning is logged through note_warning() with a "host, db" prefix
    and the (host, db) pair is flagged in problems_per_db.
    """
    note_warning(key, value, "%s, %s"%(host, db))
    # Create the per-host dict on first use, then flag this database.
    problems_per_db.setdefault(host, {})[db] = True
97
98
def wal_pre(w, last_segment=0xFE):
    """Return the WAL segment that directly precedes `w`.

    `w` is a (log id, segment number) tuple as parsed from a WAL file name.
    When the segment number is 0 the predecessor is the last segment of the
    previous log id.

    `last_segment` is the highest segment number used within one log id.
    The default of 0xFE keeps the historic behaviour of this script;
    PostgreSQL 9.3 and later also use segment 0xFF, so pass 0xFF for
    clusters running those versions.
    """
    (log_id, segment) = w
    if segment == 0:
        # Wrap around to the end of the previous log id.
        return (log_id - 1, last_segment)
    return (log_id, segment - 1)
108
def parse_pg_backup_info(fn):
    """Parse a "KEY: value" style .backup file into a dict.

    Keys are lower-cased, values are kept as strings.  Each line is split
    at the first ': ' only, so values may themselves contain ': '.  Lines
    without a ': ' separator (e.g. blank lines) are skipped.

    Raises IOError/OSError if `fn` cannot be opened.
    """
    info = {}
    # 'with' closes the file even if reading fails half way through.
    with open(fn) as f:
        for line in f:
            (key, sep, value) = line.strip().partition(': ')
            if not sep:
                continue
            info[key.lower()] = value
    return info
117
118
# Command line handling.  Without -e the script only checks (nagios mode);
# with -e it additionally expires files that are no longer needed.
parser = optparse.OptionParser()
# optparse takes short-option values as "-c FILE" (or "-cFILE"); "-c=FILE"
# would make the value "=FILE", so the usage text shows the space form.
parser.set_usage("%prog [-c <CONFFILE>]               (nagios mode)\n" +
          "Usage: %prog [-c <CONFFILE>] -e [-d] [-v]  (expire mode)")
parser.add_option("-c", "--config", dest="conffile", metavar="CONFFILE",
  help="Config file location.")
parser.add_option("-e", "--expire", dest="expire", action="store_true",
  help="Expire old files.")
parser.add_option("-d", "--dry-run", dest="dry_run", action="store_true",
  help="Do not really remove files.")
parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
  help="List files we are expiring.")
(options, args) = parser.parse_args()

# Positional arguments are not supported.
if len(args) > 0:
    parser.print_help()
    sys.exit(1)
135
136
config = load_conf(options.conffile)

# Everything below works with paths relative to the backup root.  Each
# entry of the root is expected to be a directory (named after a host in
# config['backups']) holding <db>.BASE.* and <db>.WAL.* files.
os.chdir(config['rootdir'])
for dir in os.listdir('.'):
    if dir.startswith('.') or dir.endswith('.old'):
        note_info('IGNORED', dir)
        continue

    if not os.path.isdir(dir):
        # Non-directories younger than four hours are only noted, older
        # ones are reported as a problem.
        if min(os.path.getmtime(dir), os.path.getctime(dir)) + 3600*4 > time.time():
            note_info('IGNORED', dir)
        else:
            note_warning('NOT-A-DIR', dir)
        continue

    if not dir in config['backups']:
        note_warning('NOT-CONFIGURED', dir)
        continue

    files = os.listdir(dir)
    if len(files) == 0:
        note_warning('EMPTY-DIR', dir)
        continue

    files.sort()

    ignored_dbs = {}
    backup_state = {}

    # Go over all the files in a directory and check for various things
    # - for a given cluster's backups we want the latest WAL file to be no
    #   older than a certain age,
    # - we want all consecutive WAL files, i.e. no holes
    # - we want a full backup at one point, and it shouldn't be too old
    # - If our retention period is say 2 weeks, then we look for the
    #   tar file that's older than that, and everything before that can
    #   be expired
    #
    # The list is sorted and consumed from its end, so files are visited
    # in reverse name order.
    while len(files) > 0:
        fn = files.pop()
        ffn = os.path.join(dir, fn)

        r = re.match(r'([a-z0-9-]+)\.(WAL|BASE)\..*', fn)
        if not r:
            note_warning('CANNOT-PARSE', ffn)
            continue

        (db, type) = r.groups()
        if not isinstance(config['backups'][dir], list) or not db in config['backups'][dir]:
            # Warn only once per unconfigured database.
            if not db in ignored_dbs:
                note_warning_db(dir, db, 'NOT-CONFIGURED', '%s/%s'%(dir, db))
            ignored_dbs[db] = True
        if db in ignored_dbs:
            continue
        if not db in backup_state:
            backup_state[db] = {}
            # can_expire_for_base_hit: We hit a BASE backup that is old enough
            #   so that once we hit all the required WAL files for this base
            #   backup to work we can start expiring everything older than that
            #   oldest WAL file
            backup_state[db]['can_expire_for_base_hit'] = False
            # can_expire_next: Can expire all files that we handle from now on
            backup_state[db]['can_expire_next'] = False
            backup_state[db]['expires'] = []

        # Apparently we have already seen a base backup and all its WAL files
        # which we want to keep, so everything we see now is older than
        # that and we can get rid of it
        if backup_state[db]['can_expire_next']:
            backup_state[db]['expires'].append(ffn)

        if type == 'BASE':
            # should have been taken care of before
            # while handling a WAL.backup file
            note_warning_db(dir, db, 'STRAY-BASE', ffn)
            continue
        elif type == 'WAL':
            # handle .backup files  -  they live near the WAL "file namespace" and reference
            # the corresponding full backup
            r = re.match(r'[a-z0-9-]+\.WAL\.([0-9A-F]{8})([0-9A-F]{8})([0-9A-F]{8})\.[0-9A-F]{8}\.backup', fn)
            if r:
                info = parse_pg_backup_info(ffn)
                # The base tarball is named after the backup label, with or
                # without the start WAL location appended; try both forms.
                basefn = '%s.BASE.%s-%s.tar.gz'%(db, info['label'], info['start wal location'].split(' ',2)[0].replace('/', '_'))
                baseffn = os.path.join(dir, basefn)
                if not basefn in files:
                    basefn = '%s.BASE.%s.tar.gz'%(db, info['label'])
                    baseffn = os.path.join(dir, basefn)
                    if not basefn in files:
                        note_warning_db(dir, db, 'MISSING-BASE', basefn)
                        continue
                # Take the tarball out of the work list so that it is not
                # reported as STRAY-BASE later on.
                files.remove(basefn)
                if backup_state[db]['can_expire_next']:
                    backup_state[db]['expires'].append(baseffn)

                if not 'newest-base' in backup_state[db]:
                    backup_state[db]['newest-base'] = baseffn
                backup_state[db]['oldest-base'] = baseffn

                startre = re.search(r'\(file ([0-9A-F]{24})\)', info['start wal location'])
                if not startre:
                    note_warning_db(dir, db, 'CANNOT-PARSE-START_WAL_LOCATION', ffn)
                    continue
                start_file = startre.group(1)
                walbase = '%s.WAL.%s'%(db, start_file)
                # Remember which WAL file this base backup starts at; the
                # entry is removed again once that file has been seen.
                backup_state[db]['base_needs_wal_until'] = walbase

                start = time.mktime(time.strptime(info['start time'], '%Y-%m-%d %H:%M:%S %Z'))
                if start + config['retention'] < time.time():
                    backup_state[db]['can_expire_for_base_hit'] = True
                continue

            # handle WAL files
            r = re.match(r'[a-z0-9-]+\.WAL\.([0-9A-F]{8})([0-9A-F]{8})([0-9A-F]{8})', fn)
            if r:
                if 'base_needs_wal_until' in backup_state[db]:
                    if backup_state[db]['base_needs_wal_until'] == fn:
                        del backup_state[db]['base_needs_wal_until']
                        if backup_state[db]['can_expire_for_base_hit']:
                            backup_state[db]['can_expire_next'] = True

                (timeline, wal1, wal2) = map(lambda x: int(x,16), r.groups())
                if not timeline == 1:
                    note_warning_db(dir, db, 'CANNOT-HANDLE-TIMELINES_NOT_1', ffn)
                    continue

                thissegment = (wal1, wal2)
                if not 'newest-wal' in backup_state[db]:
                    backup_state[db]['newest-wal'] = thissegment
                    backup_state[db]['newest-wal-file'] = ffn
                else:
                    # Each WAL file must be the direct predecessor of the
                    # one handled before it, otherwise there is a hole.
                    if not wal_pre(backup_state[db]['oldest-wal']) == thissegment:
                        note_warning_db(dir, db, 'WAL-MISSING-AFTER', ffn)
                        ignored_dbs[db] = True
                        continue
                backup_state[db]['oldest-wal'] = thissegment

                continue

            note_warning_db(dir, db, 'CANNOT-PARSE-WAL', ffn)
        else:
            note_warning_db(dir, db, 'INVALID-TYPE', ffn)


    # A base backup whose starting WAL file never showed up is unusable.
    for db in backup_state:
        if 'base_needs_wal_until' in backup_state[db]:
            note_warning_db(dir, db, 'MISSING_WAL_FOR_BASE', backup_state[db]['base_needs_wal_until'])

    # Age checks, based on the mtime of the newest base backup / WAL file.
    for db in backup_state:
        if not 'newest-base' in backup_state[db]:
            note_warning_db(dir, db, 'NO-BASE', 'no base backup found?')
        else:
            age = time.time() - os.stat(backup_state[db]['newest-base']).st_mtime
            if age > config['warn-age']['base']:
                note_warning_db(dir, db, 'BASE-IS-OLD', 'latest base backup is too old')

        if not 'newest-wal-file' in backup_state[db]:
            note_warning_db(dir, db, 'NO-WAL', 'no WAL files found?')
        else:
            age = time.time() - os.stat(backup_state[db]['newest-wal-file']).st_mtime
            if age > config['warn-age']['wal']:
                note_warning_db(dir, db, 'WAL-IS-OLD', 'latest wal file is too old')

    # Only schedule files for expiry for databases without any warning.
    for db in backup_state:
        if len(backup_state[db]['expires']) > 0:
            if dir in problems_per_db and db in problems_per_db[dir] and problems_per_db[dir][db]:
                note_warning_db(dir, db, 'NOT-EXPIRING-DUE-TO-WARNINGS', 'have seen warnings, will not expire anything')
            else:
                backup_state[db]['expires'].reverse()
                for f in backup_state[db]['expires']:
                    global_expires.append(f)

    # NOTE: the checks below are disabled and kept for reference only.
    #if not db in backup_state:
    #    note_warning('BASE-WITHOUT-WAL', ffn)
    #    ignored_dbs[db] = True
    #    continue

    #age = time.time() - os.stat(ffn).st_mtime
    #if age > config['warn-age']['wal']:
    #    note_warning('OLD-WAL', backup_state[db]['newest-wal-file'])
    #    ignored_dbs[db] = True
    #    continue
318
319
# Report: problems first, then informational notices, one per line.
# print is written in its parenthesised single-argument form, which behaves
# the same as the print statement on Python 2 and is valid on Python 3.
for p in problems_seq:
    print(p)
for p in notices_seq:
    print(p)

# Expire mode: remove the files collected in global_expires, unless this is
# a dry run.
if options.expire:
    for f in global_expires:
        if options.verbose:
            print("Expiring %s"%(f,))
        if not options.dry_run:
            os.unlink(f)

# Any recorded problem makes the check exit with status 1.
if len(problems_seq) > 0:
    sys.exit(1)

# In expire mode the OK line is only printed when verbose was requested.
if not options.expire or options.verbose:
    print("OK: no problems detected")
sys.exit(0)
336
337 # vim:set et:
338 # vim:set ts=4:
339 # vim:set shiftwidth=4: