Merge branch 'master' of ssh://db.debian.org/git/dsa-nagios
[mirror/dsa-nagios.git] / dsa-nagios-checks / checks / dsa-check-backuppg
1 #!/usr/bin/python
2
3 # Copyright 2010 Peter Palfrader
4 #
5 # Permission is hereby granted, free of charge, to any person obtaining
6 # a copy of this software and associated documentation files (the
7 # "Software"), to deal in the Software without restriction, including
8 # without limitation the rights to use, copy, modify, merge, publish,
9 # distribute, sublicense, and/or sell copies of the Software, and to
10 # permit persons to whom the Software is furnished to do so, subject to
11 # the following conditions:
12 #
13 # The above copyright notice and this permission notice shall be
14 # included in all copies or substantial portions of the Software.
15 #
16 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
24 # Checks if the WAL backups for several postgres clusters from
25 # different hosts are current.  Might not catch all error instances.
26 #
27 # If called with -e will expire WALs and BASE backups no longer required.
28 #
29 # Needs files laid out like so:
30 # beethoven:/srv/pgbackup/pg# ls -l ries/ | head
31 # total 6794956
32 # -rw------- 1 debbackup debbackup 378099591 May  1 02:26 dak.BASE.20100501-ries-dak-8.4-backup-F_37000020.tar.gz
33 # -rw------- 1 debbackup debbackup 382267407 May  8 02:25 dak.BASE.20100508-ries-dak-8.4-backup-F_A2000020.tar.gz
34 # -rw------- 1 debbackup debbackup 384054069 May 15 02:24 dak.BASE.20100515-ries-dak-8.4-backup-F_FB000020.tar.gz
35 # -rw------- 1 debbackup debbackup 386407500 May 22 02:27 dak.BASE.20100522-ries-dak-8.4-backup-10_58000020.tar.gz
36 # -rw------- 1 debbackup debbackup  16777216 May  1 02:26 dak.WAL.000000010000000F00000037
37 # -rw------- 1 debbackup debbackup       264 May  1 02:27 dak.WAL.000000010000000F00000037.00000020.backup
38 # -rw------- 1 debbackup debbackup  16777216 May  1 03:25 dak.WAL.000000010000000F00000038
39 # -rw------- 1 debbackup debbackup  16777216 May  1 09:11 dak.WAL.000000010000000F00000039
40 # -rw------- 1 debbackup debbackup  16777216 May  1 09:45 dak.WAL.000000010000000F0000003A
41 # ...
42 #
43 # needs write privileges to at least the .backup files
44
45
46 import copy
47 import time
48 import re
49 import os
50 import sys
51 import yaml
52 import optparse
53
def load_conf(cf):
    """Load the YAML configuration of this check.

    The file to read is chosen in this order:
      1. `cf`, if it is not None (the -c/--config command line value),
      2. the DSA_CHECK_BACKUPPG_CONF environment variable, if set,
      3. /etc/nagios/dsa-check-backuppg.conf.

    Returns the deserialized YAML document.  Errors raised while opening
    or parsing the file propagate to the caller.
    """
    if cf is not None:
        configfile = cf
    elif 'DSA_CHECK_BACKUPPG_CONF' in os.environ:
        configfile = os.environ['DSA_CHECK_BACKUPPG_CONF']
    else:
        configfile = '/etc/nagios/dsa-check-backuppg.conf'

    # safe_load instead of a bare yaml.load(): the configuration is plain
    # data, and yaml.load() without an explicit Loader is deprecated (newer
    # PyYAML releases refuse it).  The with-block closes the file even when
    # parsing raises.
    with open(configfile) as f:
        return yaml.safe_load(f)
66
67
# Module-wide report state; filled while scanning, consumed at the end of
# the script.
problems_seq = []       # warning lines, in the order they were recorded
problems_per_db = {}    # problems_per_db[host][db] is True once db has a warning
notices_seq = []        # informational lines, in the order they were recorded
global_expires = []     # paths queued for removal when running in expire mode
72 #def note_warning(key, host, db, value):
73 #    global problems_seq
74 #    problems_seq.append("[%s, %s]: %s: %s"%(host, db, key, value))
75 #
76 #    global problems_per_db
77 #    if not host in problems_per_db: problems_per_db[host] = {}
78 #    problems_per_db[host][db] = True
def note_info(key, value, pre=None):
    """Record an informational line in notices_seq.

    The line reads "key: value"; when `pre` is given it is put in front
    in square brackets: "[pre] key: value".
    """
    if pre is None:
        line = "%s: %s"%(key, value)
    else:
        line = "[%s] %s: %s"%(pre, key, value)
    notices_seq.append(line)
85
def note_warning(key, value, pre=None):
    """Record a warning line in problems_seq.

    The line reads "key: value"; when `pre` is given it is put in front
    in square brackets: "[pre] key: value".
    """
    if pre is None:
        line = "%s: %s"%(key, value)
    else:
        line = "[%s] %s: %s"%(pre, key, value)
    problems_seq.append(line)
def note_warning_db(host, db, key, value):
    """Record a warning tagged "[host, db]" and flag that db as problematic.

    Besides adding the line via note_warning(), this sets
    problems_per_db[host][db] to True.
    """
    note_warning(key, value, "%s, %s"%(host, db))
    problems_per_db.setdefault(host, {})[db] = True
97
98
def wal_pre(w, last_segment=0xFE):
    """Return the WAL position that immediately precedes `w`.

    `w` is a pair of ints (log, segment), i.e. the second and third
    8-hex-digit groups of a WAL file name.  The predecessor of segment 0
    is segment `last_segment` of the previous log.

    `last_segment` defaults to 0xFE, the value this script has always
    used: PostgreSQL releases before 9.3 never wrote segment FF.  Pass
    0xFF for clusters whose WAL stream uses every segment number.

    No lower bound is enforced: the predecessor of (0, 0) has log -1.
    """
    (log, segment) = w
    if segment == 0:
        return (log - 1, last_segment)
    return (log, segment - 1)
108
def parse_pg_backup_info(fn):
    """Parse a "KEY: value" style backup info file into a dict.

    Each line is stripped and split at the first ': '.  Keys are
    lower-cased, values are kept verbatim, so a value may itself contain
    ': '.  Lines without a ': ' separator (blank lines, for instance) are
    skipped.  If a key occurs more than once the last occurrence wins.

    Errors from opening or reading `fn` propagate to the caller.
    """
    info = {}
    # with-block: the file is closed even if reading fails half way.
    with open(fn) as f:
        for line in f:
            # partition splits at the first separator only; the previous
            # split(': ', 2) could return three parts and break the
            # two-name unpacking.
            (key, sep, value) = line.strip().partition(': ')
            if not sep:
                continue
            info[key.lower()] = value
    return info
117
118
# Command line handling.  Without -e the script only reports; with -e it
# also removes the files queued for expiry (see the end of the script).
parser = optparse.OptionParser()
# optparse reads the value of a short option as "-c FILE" (or "-cFILE");
# "-c=FILE" would hand "=FILE" to the script, so the usage text must not
# advertise an equals sign for the short form.
parser.set_usage("%prog [-c <CONFFILE>]               (nagios mode)\n" +
          "Usage: %prog [-c <CONFFILE>] -e [-d] [-v]  (expire mode)")
parser.add_option("-c", "--config", dest="conffile", metavar="CONFFILE",
  help="Config file location.")
parser.add_option("-e", "--expire", dest="expire", action="store_true",
  help="Expire old files.")
parser.add_option("-d", "--dry-run", dest="dry_run", action="store_true",
  help="Do not really remove files.")
parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
  help="List files we are expiring.")
(options, args) = parser.parse_args()

# Positional arguments are not accepted.
if args:
    parser.print_help()
    sys.exit(1)
135
136
config = load_conf(options.conffile)

# File name patterns, compiled once.  Raw strings keep the backslashes
# literal without relying on unknown string escapes being passed through.
backupfile_re = re.compile(r'([a-z0-9-]+)\.(WAL|BASE)\..*')
walinfo_re = re.compile(r'[a-z0-9-]+\.WAL\.([0-9A-F]{8})([0-9A-F]{8})([0-9A-F]{8})\.[0-9A-F]{8}\.backup')
walsegment_re = re.compile(r'[a-z0-9-]+\.WAL\.([0-9A-F]{8})([0-9A-F]{8})([0-9A-F]{8})')
startwal_re = re.compile(r'\(file ([0-9A-F]{24})\)')

# Every entry below rootdir is expected to be a directory named after a
# configured host, holding that host's BASE and WAL files.
os.chdir(config['rootdir'])
for hostdir in os.listdir('.'):
    if hostdir.startswith('.'):
        note_info('IGNORED', hostdir)
        continue

    if not os.path.isdir(hostdir):
        # Non-directories touched within the last four hours are only
        # mentioned; older ones are reported as a problem.
        if min(os.path.getmtime(hostdir), os.path.getctime(hostdir)) + 3600*4 > time.time():
            note_info('IGNORED', hostdir)
        else:
            note_warning('NOT-A-DIR', hostdir)
        continue

    if hostdir not in config['backups']:
        note_warning('NOT-CONFIGURED', hostdir)
        continue

    files = os.listdir(hostdir)
    if not files:
        note_warning('EMPTY-DIR', hostdir)
        continue

    # Sorted ascending and consumed with pop(), i.e. walked from the
    # lexicographically largest name to the smallest.
    files.sort()

    ignored_dbs = {}
    backup_state = {}

    # Go over all the files in a directory and check for various things
    # - for a given cluster's backups we want the latest WAL file to be no
    #   older than a certain age,
    # - we want all consecutive WAL files, i.e. no holes
    # - we want a full backup at one point, and it shouldn't be too old
    # - If our retention period is say 2 weeks, then we look for the
    #   tar file that's older than that, and everything before that can
    #   be expired
    while files:
        fn = files.pop()
        ffn = os.path.join(hostdir, fn)

        r = backupfile_re.match(fn)
        if not r:
            note_warning('CANNOT-PARSE', ffn)
            continue

        (db, kind) = r.groups()
        if not isinstance(config['backups'][hostdir], list) or db not in config['backups'][hostdir]:
            # warn once per db, then skip all of its files
            if db not in ignored_dbs:
                note_warning_db(hostdir, db, 'NOT-CONFIGURED', '%s/%s'%(hostdir, db))
            ignored_dbs[db] = True
        if db in ignored_dbs:
            continue
        if db not in backup_state:
            backup_state[db] = {
                # can_expire_for_base_hit: We hit a BASE backup that is old
                #   enough so that once we hit all the required WAL files
                #   for this base backup to work we can start expiring
                #   everything older than that oldest WAL file
                'can_expire_for_base_hit': False,
                # can_expire_next: Can expire all files that we handle from
                #   now on
                'can_expire_next': False,
                'expires': [],
            }
        state = backup_state[db]

        # Apparently we have already seen a base backup and all its wal
        # files which we want to keep, so everything that we see now is
        # older than that and we can get rid of it
        if state['can_expire_next']:
            state['expires'].append(ffn)

        if kind == 'BASE':
            # should have been taken care of before
            # while handling a WAL.backup file
            note_warning_db(hostdir, db, 'STRAY-BASE', ffn)
            continue
        elif kind == 'WAL':
            # handle .backup files  -  they live near the WAL "file namespace" and reference
            # the corresponding full backup
            r = walinfo_re.match(fn)
            if r:
                info = parse_pg_backup_info(ffn)
                basefn = '%s.BASE.%s-%s.tar.gz'%(db, info['label'], info['start wal location'].split(' ',2)[0].replace('/', '_'))
                baseffn = os.path.join(hostdir, basefn)
                if basefn not in files:
                    note_warning_db(hostdir, db, 'MISSING-BASE', basefn)
                    continue
                # take the tarball out of the work list so that it is not
                # reported as STRAY-BASE later on
                files.remove(basefn)
                if state['can_expire_next']:
                    state['expires'].append(baseffn)

                if 'newest-base' not in state:
                    state['newest-base'] = baseffn
                state['oldest-base'] = baseffn

                startre = startwal_re.search(info['start wal location'])
                if not startre:
                    note_warning_db(hostdir, db, 'CANNOT-PARSE-START_WAL_LOCATION', ffn)
                    continue
                start_file = startre.group(1)
                walbase = '%s.WAL.%s'%(db, start_file)
                state['base_needs_wal_until'] = walbase

                start = time.mktime(time.strptime(info['start time'], '%Y-%m-%d %H:%M:%S %Z'))
                if start + config['retention'] < time.time():
                    state['can_expire_for_base_hit'] = True
                continue

            # handle WAL files
            r = walsegment_re.match(fn)
            if r:
                if 'base_needs_wal_until' in state:
                    if state['base_needs_wal_until'] == fn:
                        # this is the first segment the base backup needs;
                        # whatever comes after it in the walk is older
                        del state['base_needs_wal_until']
                        if state['can_expire_for_base_hit']:
                            state['can_expire_next'] = True

                (timeline, wal1, wal2) = [int(x, 16) for x in r.groups()]
                if not timeline == 1:
                    note_warning_db(hostdir, db, 'CANNOT-HANDLE-TIMELINES_NOT_1', ffn)
                    continue

                thissegment = (wal1, wal2)
                if 'newest-wal' not in state:
                    state['newest-wal'] = thissegment
                    state['newest-wal-file'] = ffn
                else:
                    # each segment has to be the direct predecessor of the
                    # one handled before it
                    if not wal_pre(state['oldest-wal']) == thissegment:
                        note_warning_db(hostdir, db, 'WAL-MISSING-AFTER', ffn)
                        ignored_dbs[db] = True
                        continue
                state['oldest-wal'] = thissegment

                continue

            note_warning_db(hostdir, db, 'CANNOT-PARSE-WAL', ffn)
        else:
            note_warning_db(hostdir, db, 'INVALID-TYPE', ffn)


    # The three passes below stay separate so that the warnings keep being
    # grouped by check rather than by db.
    for db in backup_state:
        if 'base_needs_wal_until' in backup_state[db]:
            note_warning_db(hostdir, db, 'MISSING_WAL_FOR_BASE', backup_state[db]['base_needs_wal_until'])

    for db in backup_state:
        state = backup_state[db]
        if 'newest-base' not in state:
            note_warning_db(hostdir, db, 'NO-BASE', 'no base backup found?')
        else:
            age = time.time() - os.stat(state['newest-base']).st_mtime
            if age > config['warn-age']['base']:
                note_warning_db(hostdir, db, 'BASE-IS-OLD', 'latest base backup is too old')

        if 'newest-wal-file' not in state:
            # used to be reported under the copy-pasted key 'NO-BASE'
            note_warning_db(hostdir, db, 'NO-WAL', 'no WAL files found?')
        else:
            age = time.time() - os.stat(state['newest-wal-file']).st_mtime
            if age > config['warn-age']['wal']:
                note_warning_db(hostdir, db, 'WAL-IS-OLD', 'latest wal file is too old')

    for db in backup_state:
        state = backup_state[db]
        if state['expires']:
            # never expire anything for a db that has produced a warning
            if hostdir in problems_per_db and db in problems_per_db[hostdir] and problems_per_db[hostdir][db]:
                note_warning_db(hostdir, db, 'NOT-EXPIRING-DUE-TO-WARNINGS', 'have seen warnings, will not expire anything')
            else:
                # queue in the reverse of the order the files were seen in
                state['expires'].reverse()
                global_expires.extend(state['expires'])
304
305     #if not db in backup_state:
306     #    note_warning('BASE-WITHOUT-WAL', ffn)
307     #    ignored_dbs[db] = True
308     #    continue
309
310     #age = time.time() - os.stat(ffn).st_mtime
311     #if age > config['warn-age']['wal']:
312     #    note_warning('OLD-WAL', backup_state[db]['newest-wal-file'])
313     #    ignored_dbs[db] = True
314     #    continue
315
316
# Report: problems first, then informational notices.  print is called
# with a single parenthesized argument throughout, which produces the same
# output as the former print statements and is also valid on Python 3.
for p in problems_seq:
    print(p)
for p in notices_seq:
    print(p)

# Expire mode: remove whatever the scan queued.  -d/--dry-run suppresses
# the unlink, -v/--verbose lists each file.
if options.expire:
    for f in global_expires:
        if options.verbose:
            print("Expiring %s"%(f))
        if not options.dry_run:
            os.unlink(f)

# Any recorded problem makes the check exit with status 1.
if len(problems_seq) > 0:
    sys.exit(1)

# Stay quiet in expire mode unless -v was given.
if not options.expire or options.verbose:
    print("OK: no problems detected")
sys.exit(0)
333
334 # vim:set et:
335 # vim:set ts=4:
336 # vim:set shiftwidth=4: