#!/usr/bin/perl -w
$ID = q$Id: check_bos,v 1.7 2006/03/17 23:06:54 quanah Exp $;
#
# check_bos -- Monitor AFS bos output for problems in Nagios.
#
# Written by Russ Allbery
# Based on an earlier script by Neil Crellin
# Copyright 2003, 2004 Board of Trustees, Leland Stanford Jr. University
#
# This program is free software; you may redistribute it and/or modify it
# under the same terms as Perl itself.
#
# Given an AFS server (file or VLDB), runs bos status against it.  Checks to
# see if there is a communication failure, and also checks to see if anything
# in the output looks unusual or wrong.  If either of these conditions is
# true, print that information to STDOUT.  Suitable for being run inside
# Nagios.

##############################################################################
# Site configuration
##############################################################################

# The full path to bos.  Make sure that this is on local disk so that
# monitoring doesn't have an AFS dependency.  The first executable candidate
# wins; if none is executable, fall back to /usr/bin/bos so that the failure
# is reported when we try to run it.
($BOS) = grep { -x $_ } qw(/usr/bin/bos /usr/local/bin/bos);
$BOS ||= '/usr/bin/bos';

# The default timeout in seconds (implemented by alarm) for bos.
$TIMEOUT = 10;

# The list of regular expressions matching expected output.  You may need to
# customize this for what you're running at your site.  Any output from bos
# that doesn't match one of these regular expressions will throw a critical
# error.
@OKAY = (
    qr/^\s*$/,
    qr/^Instance\ \S+,\ \(type\ is\ \S+\)(\ has\ core\ file,)?
       \ currently\ running\ normally\.$/x,
    qr/^\s*Auxiliary status is: file server running\.$/,
    qr/^\s*Process last started at /,
    qr/^\s*Last exit at /,
    qr/^\s*Last error exit at /,
    qr/^\s*Command \d+ is /
);

##############################################################################
# Modules and declarations
##############################################################################

require 5.005;

use strict;
use vars qw($BOS $ID @OKAY $TIMEOUT);

use Getopt::Long qw(GetOptions);

##############################################################################
# Implementation
##############################################################################

# Parse command line options.
my ($help, $host, $version);
Getopt::Long::config ('bundling', 'no_ignore_case');
GetOptions ('hostname|H=s' => \$host,
            'help|h'       => \$help,
            'timeout|t=i'  => \$TIMEOUT,
            'version|V'    => \$version) or exit 3;
if ($help) {
    print "Feeding myself to perldoc, please wait....\n";
    exec ('perldoc', '-t', $0) or die "Cannot fork: $!\n";
} elsif ($version) {
    # Turn the RCS Id string into "check_bos 1.7 (2006-03-17)".
    my $version = join (' ', (split (' ', $ID))[1..3]);
    $version =~ s/,v\b//;
    $version =~ s/(\S+)$/($1)/;
    $version =~ tr%/%-%;
    print $version, "\n";
    exit 0;
}

# Extra arguments are a usage error, and so is a missing -H: without a host
# there is nothing to check and bos would be run with a mangled command line.
# The message goes to both STDOUT (for Nagios) and STDERR (for humans).
if (@ARGV || !defined ($host) || $host eq '') {
    print "Usage: $0 [-hV] [-t <timeout>] -H <host>\n";
    warn "Usage: $0 [-hV] [-t <timeout>] -H <host>\n";
    exit 3;
}

# Set up the alarm.  If bos is already running when the timeout fires, kill
# it so that hung bos processes don't pile up on the monitoring host.
my $pid;
$SIG{ALRM} = sub {
    kill ('TERM', $pid) if $pid;
    print "BOS CRITICAL - network timeout after $TIMEOUT seconds\n";
    exit 2;
};
alarm ($TIMEOUT);

# Collect the bos output into a variable.  Use a forking open and the list
# form of exec rather than a shell pipeline so that the host name given on the
# command line is never interpreted by a shell.
$pid = open (BOS, '-|');
unless (defined $pid) {
    print "BOS UNKNOWN - cannot run bos\n";
    exit 3;
}
if ($pid == 0) {
    # Child.  Send standard error down the pipe as well so that the parent
    # sees error messages from bos, then become bos.  A failed exec results in
    # a non-zero exit status, which the parent reports as a failure.
    open (STDERR, '>&STDOUT') or exit 127;
    exec ($BOS, 'status', $host, '-noauth', '-long') or exit 127;
}
my @bos = <BOS>;
close BOS;
alarm (0);

# Make sure that bos was successful.  Note that it generally does return
# success even if it can't contact the bos server.
if ($? != 0) {
    print "BOS CRITICAL - bos status failed\n";
    exit 2;
}

# Scan the output.  If we see anything that we don't expect, immediately
# report it as a fatal error.
for my $line (@bos) {
    my $okay = 0;
    for my $regex (@OKAY) {
        if ($line =~ /$regex/) {
            $okay = 1;
            last;
        }
    }
    unless ($okay) {
        $line =~ s/^\s+//;
        $line =~ s/\s+$//;
        print "BOS CRITICAL - $line\n";
        exit 2;
    }
}
print "BOS OK\n";
exit 0;

##############################################################################
# Documentation
##############################################################################

=head1 NAME

check_bos - Monitor AFS bos output for problems in Nagios

=head1 SYNOPSIS

check_bos [B<-hV>] [B<-t> I<timeout>] B<-H> I<host>

=head1 DESCRIPTION

B<check_bos> is a Nagios plugin for querying the AFS bosserver for process
status and reporting an alert if there are any unexpected lines in the bos
output.  The acceptable lines of output from B<bos> are configured at the top
of this script; they should be generally suitable for most sites, but may
require some customization.

B<check_bos> will always print out a single line of output.  If there is a
line that isn't matched by any regexes identifying acceptable lines, it will
output the first non-matching line prefixed by C<BOS CRITICAL>.  Otherwise,
it will output C<BOS OK>.

Note that this monitoring may not catch such things as a service being
constantly restarted if it happens to be up and running normally each time
the probe runs; it doesn't pay any attention to the last start time, the last
error exit status, the presence of core files, and the like.  It mostly just
looks for the "running normally" part of the B<bos> output and makes sure the
auxiliary status is also "running normally" for a file server process.

=head1 OPTIONS

=over 4

=item B<-H> I<host>, B<--hostname>=I<host>

The AFS server whose B<bos> status B<check_bos> should check.  This option is
required.

=item B<-h>, B<--help>

Print out this documentation (which is done simply by feeding the script to
C<perldoc -t>).

=item B<-t> I<timeout>, B<--timeout>=I<timeout>

Change the timeout for the B<bos> command.  The default timeout is 10
seconds.

=item B<-V>, B<--version>

Print out the version of B<check_bos> and quit.

=back

=head1 EXIT STATUS

B<check_bos> follows the standard Nagios exit status requirements.
This means that it will exit with status 0 if there are no problems or with
status 2 if there is a problem detected.  For other errors, such as invalid
syntax, B<check_bos> will exit with status 3.

=head1 BUGS

The standard B<-v> verbose Nagios plugin option is not supported.  It should
display the complete bos status output.

The usage message for invalid options and for the B<-h> option doesn't
conform to Nagios standards.

=head1 CAVEATS

This script does not use the Nagios util library or any of the defaults that
it provides, which makes it somewhat deficient as a Nagios plugin.  This is
intentional, though, since this script can be used with other monitoring
systems as well.  It's not clear what a good solution to this would be.

=head1 SEE ALSO

The current version of this and other AFS monitoring plugins for Nagios are
available from the AFS monitoring tools page at
L<http://www.eyrie.org/~eagle/software/afs-monitor/>.

=head1 AUTHORS

The original idea behind this script was from Neil Crellin.  Russ Allbery
updated it to work with Nagios and stripped out some rather neat but now
unnecessary code to look for any changes in the bos output, instead just
scanning it for acceptable lines.

=head1 COPYRIGHT AND LICENSE

Copyright 2003, 2004 Board of Trustees, Leland Stanford Jr. University.

This program is free software; you may redistribute it and/or modify it under
the same terms as Perl itself.

=cut