A postgres WAL backup check
[mirror/dsa-nagios.git] / dsa-nagios-checks / checks / dsa-check-backuppg
1 #!/usr/bin/python
2
3 # Copyright 2010 Peter Palfrader
4 #
5 # Permission is hereby granted, free of charge, to any person obtaining
6 # a copy of this software and associated documentation files (the
7 # "Software"), to deal in the Software without restriction, including
8 # without limitation the rights to use, copy, modify, merge, publish,
9 # distribute, sublicense, and/or sell copies of the Software, and to
10 # permit persons to whom the Software is furnished to do so, subject to
11 # the following conditions:
12 #
13 # The above copyright notice and this permission notice shall be
14 # included in all copies or substantial portions of the Software.
15 #
16 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
24 # Checks if the WAL backups for several postgres clusters from
25 # different hosts are current.  Might not catch all error instances.
26 #
27 # If called with -e will expire WALs and BASE backups no longer required.
28 #
29 # Needs files laid out like so:
30 # beethoven:/srv/pgbackup/pg# ls -l ries/ | head
31 # total 6794956
32 # -rw------- 1 debbackup debbackup 378099591 May  1 02:26 dak.BASE.20100501-ries-dak-8.4-backup-F_37000020.tar.gz
33 # -rw------- 1 debbackup debbackup 382267407 May  8 02:25 dak.BASE.20100508-ries-dak-8.4-backup-F_A2000020.tar.gz
34 # -rw------- 1 debbackup debbackup 384054069 May 15 02:24 dak.BASE.20100515-ries-dak-8.4-backup-F_FB000020.tar.gz
35 # -rw------- 1 debbackup debbackup 386407500 May 22 02:27 dak.BASE.20100522-ries-dak-8.4-backup-10_58000020.tar.gz
36 # -rw------- 1 debbackup debbackup  16777216 May  1 02:26 dak.WAL.000000010000000F00000037
37 # -rw------- 1 debbackup debbackup       264 May  1 02:27 dak.WAL.000000010000000F00000037.00000020.backup
38 # -rw------- 1 debbackup debbackup  16777216 May  1 03:25 dak.WAL.000000010000000F00000038
39 # -rw------- 1 debbackup debbackup  16777216 May  1 09:11 dak.WAL.000000010000000F00000039
40 # -rw------- 1 debbackup debbackup  16777216 May  1 09:45 dak.WAL.000000010000000F0000003A
41 # ...
42 #
43 # needs write privileges to at least the .backup files
44
45
46 import copy
47 import time
48 import re
49 import os
50 import sys
51 import yaml
52 import optparse
53
def load_conf(cf):
    """Load the YAML configuration and return the parsed object.

    The file is chosen in this order:
      1. `cf`, when it is not None (the -c/--config command line value),
      2. the DSA_CHECK_BACKUPPG_CONF environment variable, when set,
      3. /etc/nagios/dsa-check-backuppg.conf.

    Errors from open() or from the YAML parser are not caught here.
    """
    if cf is not None:
        configfile = cf
    elif 'DSA_CHECK_BACKUPPG_CONF' in os.environ:
        configfile = os.environ['DSA_CHECK_BACKUPPG_CONF']
    else:
        configfile = '/etc/nagios/dsa-check-backuppg.conf'

    f = open(configfile)
    try:
        # safe_load only builds plain data (dicts, lists, scalars); the
        # bare yaml.load() can instantiate arbitrary Python objects and is
        # refused without an explicit Loader by current PyYAML releases.
        config = yaml.safe_load(f.read())
    finally:
        # Close the handle even when reading or parsing fails.
        f.close()
    return config
66
67
# Formatted warning lines in the order they were noted; printed at the end
# of the run, and a non-empty list makes the check exit non-zero.
problems_seq = []
# host -> db -> True for every database that got a db-specific warning;
# consulted before expiring so that nothing is removed for such a database.
problems_per_db = {}
# Paths that may be removed when running in expire mode (-e).
global_expires = []
71 #def note_warning(key, host, db, value):
72 #    global problems_seq
73 #    problems_seq.append("[%s, %s]: %s: %s"%(host, db, key, value))
74 #
75 #    global problems_per_db
76 #    if not host in problems_per_db: problems_per_db[host] = {}
77 #    problems_per_db[host][db] = True
78
def note_warning(key, value, pre=None):
    """Record one problem line in the module-level problems_seq list.

    The line reads "KEY: VALUE"; when `pre` is given it is put in front
    in square brackets: "[PRE] KEY: VALUE".
    """
    message = "%s: %s" % (key, value)
    if pre is not None:
        message = "[%s] %s" % (pre, message)
    problems_seq.append(message)
def note_warning_db(host, db, key, value):
    """Record a problem that belongs to one database of one host.

    The problem line is prefixed with "[HOST, DB]" and the (host, db) pair
    is flagged in the module-level problems_per_db mapping.
    """
    note_warning(key, value, "%s, %s" % (host, db))
    # Create the per-host dict on first use, then flag the database.
    problems_per_db.setdefault(host, {})[db] = True
90
91
def wal_pre(w):
    """Return the (log, segment) pair that directly precedes `w`.

    `w` is a 2-tuple of integers.  Within a log the segment simply counts
    down; segment 0 is preceded by segment 0xFE of the previous log.
    NOTE(review): 0xFE presumably reflects PostgreSQL releases that never
    create a segment 0xFF - confirm for the server versions backed up.
    """
    (log_id, segment) = w
    if segment != 0:
        return (log_id, segment - 1)
    return (log_id - 1, 0xFE)
101
def parse_pg_backup_info(fn):
    """Parse a "KEY: VALUE" file (a WAL .backup file) into a dict.

    Keys are lower-cased; values are kept as found, minus surrounding
    whitespace.  Each line is split at the first ": " only, so a value
    may itself contain ": ".  Blank lines are skipped.

    Raises ValueError for a non-blank line that has no ": " separator.
    """
    info = {}
    f = open(fn)
    try:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # maxsplit=1: exactly two parts, whatever the value contains.
            (key, value) = line.split(': ', 1)
            info[key.lower()] = value
    finally:
        # Close the handle even when a malformed line raises.
        f.close()
    return info
110
111
# Command line handling: without -e the script only reports problems
# (nagios mode); with -e it also expires files (see the usage text).
parser = optparse.OptionParser()
parser.set_usage("%prog [-c=<CONFFILE>]               (nagios mode)\n" +
          "Usage: %prog [-c=<CONFFILE>] -e [-d] [-v]  (expire mode)")
parser.add_option("-c", "--config", dest="conffile", metavar="CONFFILE",
  help="Config file location.")
# The remaining options are plain boolean switches.
for (short_opt, long_opt, dest_name, help_text) in (
        ("-e", "--expire", "expire", "Expire old files."),
        ("-d", "--dry-run", "dry_run", "Do not really remove files."),
        ("-v", "--verbose", "verbose", "List files we are expiring.")):
    parser.add_option(short_opt, long_opt, dest=dest_name,
      action="store_true", help=help_text)
(options, args) = parser.parse_args()

# Positional arguments are not accepted.
if args:
    parser.print_help()
    sys.exit(1)


config = load_conf(options.conffile)
131
# File name patterns.  They are used with match()/search() and are not
# anchored at the end of the name.
backup_file_re = re.compile(r'([a-z0-9-]+)\.(WAL|BASE)\..*')
wal_backup_re = re.compile(r'[a-z0-9-]+\.WAL\.([0-9A-F]{8})([0-9A-F]{8})([0-9A-F]{8})\.[0-9A-F]{8}\.backup')
wal_segment_re = re.compile(r'[a-z0-9-]+\.WAL\.([0-9A-F]{8})([0-9A-F]{8})([0-9A-F]{8})')
start_wal_file_re = re.compile(r'\(file ([0-9A-F]{24})\)')

# Walk every entry below config['rootdir'].  Each directory named in
# config['backups'] is scanned; problems are recorded with note_warning()
# / note_warning_db() and expirable files are collected in global_expires.
os.chdir(config['rootdir'])
for hostdir in os.listdir('.'):
    if not os.path.isdir(hostdir):
        note_warning('NOT-A-DIR', hostdir)
        continue

    if hostdir not in config['backups']:
        note_warning('NOT-CONFIGURED', hostdir)
        continue

    files = os.listdir(hostdir)
    if not files:
        note_warning('EMPTY-DIR', hostdir)
        continue

    # Sorted ascending and then consumed with pop(), i.e. from the end:
    # names are handled in reverse order, so WAL entries come before BASE
    # entries and, within the WAL files, the highest segment name first.
    files.sort()

    configured_dbs = config['backups'][hostdir]
    ignored_dbs = {}
    backup_state = {}

    # Go over all the files in a directory and check for various things
    # - for a given cluster's backups we want the latest WAL file to be no
    #   older than a certain age,
    # - we want all consecutive WAL files, i.e. no holes
    # - we want a full backup at one point, and it shouldn't be too old
    # - If our retention period is say 2 weeks, then we look for the
    #   tar file that's older than that, and everything before that can
    #   be expired
    while files:
        fn = files.pop()
        ffn = os.path.join(hostdir, fn)

        r = backup_file_re.match(fn)
        if not r:
            note_warning('CANNOT-PARSE', ffn)
            continue

        (db, kind) = r.groups()
        if not isinstance(configured_dbs, list) or db not in configured_dbs:
            # Warn only once per unconfigured database.
            if db not in ignored_dbs:
                note_warning_db(hostdir, db, 'NOT-CONFIGURED', '%s/%s' % (hostdir, db))
            ignored_dbs[db] = True
        if db in ignored_dbs:
            continue
        if db not in backup_state:
            backup_state[db] = {
                # can_expire_for_base_hit: We hit a BASE backup that is old
                #   enough so that once we hit all the required WAL files
                #   for this base backup to work we can start expiring
                #   everything older than that oldest WAL file
                'can_expire_for_base_hit': False,
                # can_expire_next: Can expire all files that we handle
                #   from now on
                'can_expire_next': False,
                'expires': [],
            }
        state = backup_state[db]

        # Apparently we have already seen a base backup and all the WAL
        # files it needs, so everything we see from now on is older than
        # that and we can get rid of it
        if state['can_expire_next']:
            state['expires'].append(ffn)

        if kind == 'BASE':
            # should have been taken care of before
            # while handling a WAL.backup file
            note_warning_db(hostdir, db, 'STRAY-BASE', ffn)
            continue
        elif kind == 'WAL':
            # handle .backup files  -  they live near the WAL "file namespace" and reference
            # the corresponding full backup
            r = wal_backup_re.match(fn)
            if r:
                info = parse_pg_backup_info(ffn)
                start_location = info['start wal location']
                basefn = '%s.BASE.%s-%s.tar.gz' % (db, info['label'], start_location.split(' ', 2)[0].replace('/', '_'))
                baseffn = os.path.join(hostdir, basefn)
                if basefn not in files:
                    note_warning_db(hostdir, db, 'MISSING-BASE', basefn)
                    continue
                # The base tarball is accounted for here; take it off the
                # work list so it is not reported as STRAY-BASE later.
                files.remove(basefn)
                if state['can_expire_next']:
                    state['expires'].append(baseffn)

                if 'newest-base' not in state:
                    state['newest-base'] = baseffn
                state['oldest-base'] = baseffn

                startre = start_wal_file_re.search(start_location)
                if not startre:
                    note_warning_db(hostdir, db, 'CANNOT-PARSE-START_WAL_LOCATION', ffn)
                    continue
                # Name of the WAL file that must still show up for this
                # base backup to be usable.
                state['base_needs_wal_until'] = '%s.WAL.%s' % (db, startre.group(1))

                # NOTE(review): mktime() interprets the parsed time as
                # local time - confirm that matches the .backup files.
                start = time.mktime(time.strptime(info['start time'], '%Y-%m-%d %H:%M:%S %Z'))
                if start + config['retention'] < time.time():
                    state['can_expire_for_base_hit'] = True
                continue

            # handle WAL files
            r = wal_segment_re.match(fn)
            if r:
                if state.get('base_needs_wal_until') == fn:
                    del state['base_needs_wal_until']
                    if state['can_expire_for_base_hit']:
                        state['can_expire_next'] = True

                (timeline, wal1, wal2) = [int(g, 16) for g in r.groups()]
                if timeline != 1:
                    note_warning_db(hostdir, db, 'CANNOT-HANDLE-TIMELINES_NOT_1', ffn)
                    continue

                thissegment = (wal1, wal2)
                if 'newest-wal' not in state:
                    state['newest-wal'] = thissegment
                    state['newest-wal-file'] = ffn
                elif wal_pre(state['oldest-wal']) != thissegment:
                    # Hole in the WAL sequence: stop looking at this db.
                    note_warning_db(hostdir, db, 'WAL-MISSING-AFTER', ffn)
                    ignored_dbs[db] = True
                    continue
                state['oldest-wal'] = thissegment

                continue

            note_warning_db(hostdir, db, 'CANNOT-PARSE-WAL', ffn)
        else:
            note_warning_db(hostdir, db, 'INVALID-TYPE', ffn)

    # A base backup whose starting WAL file never showed up.
    for db in backup_state:
        if 'base_needs_wal_until' in backup_state[db]:
            note_warning_db(hostdir, db, 'MISSING_WAL_FOR_BASE', backup_state[db]['base_needs_wal_until'])

    # Presence and age of the newest base backup and the newest WAL file.
    # These go through note_warning_db() (host, db, key, value) so they
    # are attributed to the database and also block expiry for it.
    for db in backup_state:
        state = backup_state[db]
        if 'newest-base' not in state:
            note_warning_db(hostdir, db, 'NO-BASE', 'no base backup found?')
        else:
            age = time.time() - os.stat(state['newest-base']).st_mtime
            if age > config['warn-age']['base']:
                note_warning_db(hostdir, db, 'BASE-IS-OLD', 'latest base backup is too old')

        if 'newest-wal-file' not in state:
            note_warning_db(hostdir, db, 'NO-WAL', 'no WAL files found?')
        else:
            age = time.time() - os.stat(state['newest-wal-file']).st_mtime
            if age > config['warn-age']['wal']:
                note_warning_db(hostdir, db, 'WAL-IS-OLD', 'latest wal file is too old')

    # Hand the expirable files over, but only for databases without any
    # warning.
    for db in backup_state:
        expires = backup_state[db]['expires']
        if not expires:
            continue
        if problems_per_db.get(hostdir, {}).get(db):
            note_warning_db(hostdir, db, 'NOT-EXPIRING-DUE-TO-WARNINGS', 'have seen warnings, will not expire anything')
        else:
            # Files were collected in reverse name order; flip them back.
            expires.reverse()
            global_expires.extend(expires)
301
302
# Report every problem noted during the scan, one per line.
# print(...) with a single argument behaves the same as the print
# statement on Python 2 and is also valid Python 3.
for p in problems_seq:
    print(p)

# Expire mode: remove the collected files; -d (dry run) skips the actual
# removal and -v lists each file.
if options.expire:
    for f in global_expires:
        if options.verbose:
            print("Expiring %s" % (f))
        if not options.dry_run:
            os.unlink(f)

# Any recorded problem makes the check exit with status 1.
if problems_seq:
    sys.exit(1)

print("OK: no problems detected")
sys.exit(0)
316
317 # vim:set et:
318 # vim:set ts=4:
319 # vim:set shiftwidth=4: