--- /dev/null
+#!/usr/bin/perl -w
+$ID = q$Id: check_bos,v 1.7 2006/03/17 23:06:54 quanah Exp $;
+#
+# check_bos -- Monitor AFS bos output for problems in Nagios.
+#
+# Written by Russ Allbery <rra@stanford.edu>
+# Based on an earlier script by Neil Crellin <neilc@stanford.edu>
+# Copyright 2003, 2004 Board of Trustees, Leland Stanford Jr. University
+#
+# This program is free software; you may redistribute it and/or modify it
+# under the same terms as Perl itself.
+#
+# Given an AFS server (file or VLDB), runs bos status on each one. Checks to
+# see if there is a communication failure, and also checks to see if anything
+# in the output looks unusual or wrong. If either of these conditions is
+# true, print that information to STDOUT. Suitable for being run inside
+# Nagios.
+
+##############################################################################
+# Site configuration
+##############################################################################
+
+# Where to find bos. Keep this on local disk so that monitoring doesn't
+# itself have an AFS dependency. The first candidate that is executable is
+# used; if none of them is, fall back to /usr/bin/bos.
+$BOS = (grep { -x $_ } qw(/usr/bin/bos /usr/local/bin/bos))[0]
+    || '/usr/bin/bos';
+
+# The default timeout in seconds (implemented by alarm) for bos.
+$TIMEOUT = 10;
+
+# Regular expressions describing bos output that is considered normal. Tune
+# this list for what you run at your site: any line of bos output that fails
+# to match at least one of these patterns is reported as a critical error.
+@OKAY = (
+    qr/^\s*$/,
+    qr/^Instance\ \S+,\ \(type\ is\ \S+\)(\ has\ core\ file,)?
+       \ currently\ running\ normally\.$/x,
+    qr/^\s*Auxiliary status is: file server running\.$/,
+    qr/^\s*Process last started at /,
+    qr/^\s*Last exit at /,
+    qr/^\s*Last error exit at /,
+    qr/^\s*Command \d+ is /
+);
+
+##############################################################################
+# Modules and declarations
+##############################################################################
+
+require 5.005;
+
+use strict;
+use vars qw($BOS $ID @OKAY $TIMEOUT);
+
+use Getopt::Long qw(GetOptions);
+
+##############################################################################
+# Implementation
+##############################################################################
+
+# Parse command line options. A failure inside GetOptions exits with status
+# 3, the status this plugin uses for usage errors.
+my ($help, $host, $version);
+Getopt::Long::config ('bundling', 'no_ignore_case');
+GetOptions ('hostname|H=s' => \$host,
+            'help|h'       => \$help,
+            'timeout|t=i'  => \$TIMEOUT,
+            'version|V'    => \$version) or exit 3;
+if ($help) {
+    print "Feeding myself to perldoc, please wait....\n";
+    exec ('perldoc', '-t', $0) or die "Cannot fork: $!\n";
+} elsif ($version) {
+    # Reduce the RCS Id keyword to "<name> <revision> (<yyyy-mm-dd>)": keep
+    # fields 1-3, drop the ",v" suffix, parenthesize the date, and turn the
+    # slashes in the date into dashes.
+    my $version = join (' ', (split (' ', $ID))[1..3]);
+    $version =~ s/,v\b//;
+    $version =~ s/(\S+)$/($1)/;
+    $version =~ tr%/%-%;
+    print $version, "\n";
+    exit 0;
+}
+
+# Stray arguments are a usage error, and so is a missing -H: the host is
+# interpolated into the bos command line later, so without it bos would be
+# run with no server argument at all. The usage text goes to both stdout and
+# stderr, as it did before.
+if (@ARGV || !defined ($host)) {
+    print "Usage: $0 [-hV] [-t <timeout>] -H <host>\n";
+    warn "Usage: $0 [-hV] [-t <timeout>] -H <host>\n";
+    exit 3;
+}
+
+# Arm the timeout. If SIGALRM fires before we finish, report a critical
+# network timeout and exit with status 2.
+$SIG{ALRM} = sub {
+    print "BOS CRITICAL - network timeout after $TIMEOUT seconds\n";
+    exit 2;
+};
+alarm ($TIMEOUT);
+
+# Run bos status with stderr folded into stdout and slurp everything it
+# prints.
+if (!open (BOS, "$BOS status $host -noauth -long 2>&1 |")) {
+    print "BOS UNKNOWN - cannot run bos\n";
+    exit 3;
+}
+my @bos = <BOS>;
+close BOS;
+
+# A non-zero exit status from bos is critical. Note that bos generally does
+# return success even if it can't contact the bos server, so this alone is
+# not a sufficient check.
+if ($? != 0) {
+    print "BOS CRITICAL - bos status failed\n";
+    exit 2;
+}
+
+# Walk the output in order. The first line that none of the @OKAY patterns
+# accepts is reported (with surrounding whitespace trimmed) as a critical
+# error; if every line is accepted, all is well.
+for my $line (@bos) {
+    next if grep { $line =~ /$_/ } @OKAY;
+    my $problem = $line;
+    $problem =~ s/^\s+//;
+    $problem =~ s/\s+$//;
+    print "BOS CRITICAL - $problem\n";
+    exit 2;
+}
+print "BOS OK\n";
+exit 0;
+
+##############################################################################
+# Documentation
+##############################################################################
+
+=head1 NAME
+
+check_bos - Monitor AFS bos output for problems in Nagios
+
+=head1 SYNOPSIS
+
+check_bos [B<-hV>] [B<-t> I<timeout>] B<-H> I<host>
+
+=head1 DESCRIPTION
+
+B<check_bos> is a Nagios plugin for querying the AFS bosserver for process
+status and reporting an alert if there are any unexpected lines in the bos
+output. The acceptable lines of output from B<bos> are configured at the
+top of this script; they should be generally suitable for most sites, but
+may require some customization.
+
+B<check_bos> will always print out a single line of output. If there is a
+line that isn't matched by any regexes identifying acceptable lines, it will
+output the first non-matching line prefixed by C<BOS CRITICAL>. Otherwise,
+it will output B<BOS OK>. Note that this monitoring may not catch such
+things as a service being constantly restarted if it happens to be up and
+running normally each time the probe runs; it doesn't pay any attention to
+the last start time, the last error exit status, the presence of core files,
+and the like. It mostly just looks for the "running normally" part of the
+B<bos> output and makes sure the auxiliary status is also "running
+normally" for a file server process.
+
+=head1 OPTIONS
+
+=over 4
+
+=item B<-H> I<host>, B<--hostname>=I<host>
+
+The AFS server whose B<bos> status B<check_bos> should check. This option
+is required.
+
+=item B<-h>, B<--help>
+
+Print out this documentation (which is done simply by feeding the script
+to C<perldoc -t>).
+
+=item B<-t> I<timeout>, B<--timeout>=I<timeout>
+
+Change the timeout for the B<bos> command. The default timeout is 10
+seconds.
+
+=item B<-V>, B<--version>
+
+Print out the version of B<check_bos> and quit.
+
+=back
+
+=head1 EXIT STATUS
+
+B<check_bos> follows the standard Nagios exit status requirements. This
+means that it will exit with status 0 if there are no problems or with
+status 2 if there is a problem detected. For other errors, such as invalid
+syntax, B<check_bos> will exit with status 3.
+
+=head1 BUGS
+
+The standard B<-v> verbose Nagios plugin option is not supported. It should
+display the complete bos status output.
+
+The usage message for invalid options and for the B<-h> option doesn't
+conform to Nagios standards.
+
+=head1 CAVEATS
+
+This script does not use the Nagios util library or any of the defaults that
+it provides, which makes it somewhat deficient as a Nagios plugin. This is
+intentional, though, since this script can be used with other monitoring
+systems as well. It's not clear what a good solution to this would be.
+
+=head1 SEE ALSO
+
+The current version of this and other AFS monitoring plugins for Nagios are
+available from the AFS monitoring tools page at
+L<http://www.eyrie.org/~eagle/software/afs-monitor/>.
+
+=head1 AUTHORS
+
+The original idea behind this script was from Neil Crellin. Russ Allbery
+<rra@stanford.edu> updated it to work with Nagios and stripped out some
+rather neat but now unnecessary code to look for any changes in the bos
+output, instead just scanning it for acceptable lines.
+
+=head1 COPYRIGHT AND LICENSE
+
+Copyright 2003, 2004 Board of Trustees, Leland Stanford Jr. University.
+
+This program is free software; you may redistribute it and/or modify it
+under the same terms as Perl itself.
+
+=cut
--- /dev/null
+#!/usr/bin/perl -w
+$ID = q$Id: check_rxdebug,v 1.11 2006/03/17 23:06:54 quanah Exp $;
+#
+# check_rxdebug -- Nagios AFS server check for waiting connections.
+#
+# Written by Quanah Gibson-Mount based on work by Neil Crellin
+# Updated by Russ Allbery <rra@stanford.edu>
+# Copyright 2003, 2004, 2005 Board of Trustees, Leland Stanford Jr. University
+#
+# This program is free software; you may redistribute it and/or modify it
+# under the same terms as Perl itself.
+#
+# Expects a file server with the -H option and runs rxdebug against that file
+# server, looking for any connections that are waiting for a thread. Exits
+# with status 1 if there are two or more connections in that state (a
+# warning) and with status 2 if there are eight or more connections in that
+# state. The thresholds can be overridden from the command line.
+
+##############################################################################
+# Site configuration
+##############################################################################
+
+# Default blocked-connection counts at which a warning or a critical alert
+# is raised. Both can be overridden with the -w and -c command-line options.
+$WARNINGS = 2;
+$CRITICAL = 8;
+
+# The default timeout in seconds (implemented by alarm) for rxdebug.
+$TIMEOUT = 60;
+
+# Where to find rxdebug. Keep this on local disk so that monitoring doesn't
+# itself have an AFS dependency. The first candidate that is executable is
+# used; if none of them is, fall back to /usr/bin/rxdebug.
+$RXDEBUG = (grep { -x $_ } qw(/usr/bin/rxdebug /usr/local/bin/rxdebug))[0]
+    || '/usr/bin/rxdebug';
+
+##############################################################################
+# Modules and declarations
+##############################################################################
+
+require 5.003;
+
+use strict;
+use vars qw($CRITICAL $ID $RXDEBUG $TIMEOUT $WARNINGS);
+
+use Getopt::Long qw(GetOptions);
+
+##############################################################################
+# Implementation
+##############################################################################
+
+# Parse command line options. A failure inside GetOptions exits with status
+# 3, the status this plugin uses for usage errors.
+my ($help, $host, $version);
+Getopt::Long::config ('bundling', 'no_ignore_case');
+GetOptions ('critical|c=i' => \$CRITICAL,
+            'hostname|H=s' => \$host,
+            'help|h'       => \$help,
+            'timeout|t=i'  => \$TIMEOUT,
+            'version|V'    => \$version,
+            'warning|w=i'  => \$WARNINGS) or exit 3;
+if ($help) {
+    print "Feeding myself to perldoc, please wait....\n";
+    exec ('perldoc', '-t', $0) or die "Cannot fork: $!\n";
+} elsif ($version) {
+    # Reduce the RCS Id keyword to "<name> <revision> (<yyyy-mm-dd>)": keep
+    # fields 1-3, drop the ",v" suffix, parenthesize the date, and turn the
+    # slashes in the date into dashes.
+    my $version = join (' ', (split (' ', $ID))[1..3]);
+    $version =~ s/,v\b//;
+    $version =~ s/(\S+)$/($1)/;
+    $version =~ tr%/%-%;
+    print $version, "\n";
+    exit 0;
+}
+
+# Stray arguments are a usage error, and so is a missing -H: the host is
+# interpolated into the rxdebug command line later, so without it rxdebug
+# would be run with no server argument at all.
+if (@ARGV || !defined ($host)) {
+    warn "Usage: $0 [-hV] [-c <level>] [-w <level>] [-t <timeout>]"
+        . " -H <host>\n";
+    exit 3;
+}
+
+# A warning threshold above the critical threshold makes no sense, since the
+# critical comparison is done first and the warning could then never fire.
+if ($WARNINGS > $CRITICAL) {
+    warn "$0: warning level $WARNINGS greater than critical level $CRITICAL\n";
+    exit 3;
+}
+
+# Set up the alarm. If SIGALRM fires before we finish, report a critical
+# network timeout and exit with status 2.
+$SIG{ALRM} = sub {
+    print "AFS CRITICAL - network timeout after $TIMEOUT seconds\n";
+    exit 2;
+};
+alarm ($TIMEOUT);
+
+# Run rxdebug and parse the output, picking up the count from the first
+# "calls waiting for a thread" line.
+unless (open (RXDEBUG, "$RXDEBUG $host -noconn |")) {
+    warn "$0: cannot run rxdebug\n";
+    exit 3;
+}
+my $blocked;
+while (<RXDEBUG>) {
+    # Keep reading to end of file after the match instead of leaving the
+    # loop early. Closing the pipe while rxdebug still has output to write
+    # could kill it with SIGPIPE, and the resulting non-zero $? would then be
+    # misreported below as a failure to contact the server.
+    if (!defined ($blocked) && /^(\d+) calls waiting for a thread/) {
+        $blocked = $1;
+    }
+}
+close RXDEBUG;
+
+# close on a pipe sets $? to the exit status of rxdebug.
+if ($? != 0) {
+    print "AFS CRITICAL - cannot contact server\n";
+    exit 2;
+}
+unless (defined $blocked) {
+    print "AFS CRITICAL - cannot parse rxdebug output\n";
+    exit 2;
+}
+
+# Check the connection count against our limits. A count equal to a
+# threshold already triggers that level.
+if ($blocked >= $CRITICAL) {
+    print "AFS CRITICAL - $blocked blocked connections\n";
+    exit 2;
+} elsif ($blocked >= $WARNINGS) {
+    print "AFS WARNING - $blocked blocked connections\n";
+    exit 1;
+} else {
+    print "AFS OK - $blocked blocked connections\n";
+    exit 0;
+}
+
+##############################################################################
+# Documentation
+##############################################################################
+
+=head1 NAME
+
+check_rxdebug - Check AFS servers for blocked connections in Nagios
+
+=head1 SYNOPSIS
+
+check_rxdebug [B<-hV>] [B<-c> I<threshold>] [B<-w> I<threshold>]
+[B<-t> I<timeout>] B<-H> I<host>
+
+=head1 DESCRIPTION
+
+B<check_rxdebug> is a Nagios plugin for checking AFS file servers to see if
+there are client connections waiting for a free thread. If there are more
+than a few of these, AFS performance tends to be very slow; this is a fairly
+reliable way to catch overloaded file servers. By default, B<check_rxdebug>
+returns a critical error if there are eight or more connections waiting
+for a free thread and a warning if there are two or more. These
+thresholds can be changed with the B<-c> and B<-w> options.
+
+B<check_rxdebug> will always print out a single line of output including the
+number of blocked connections, displaying whether this is critical, a
+warning, or okay.
+
+=head1 OPTIONS
+
+=over 4
+
+=item B<-c> I<threshold>, B<--critical>=I<threshold>
+
+Change the critical blocked connection count threshold to I<threshold>,
+which should be an integer. The default is 8.
+
+=item B<-H> I<host>, B<--hostname>=I<host>
+
+The AFS file server whose connections B<check_rxdebug> should check. This
+option is required.
+
+=item B<-h>, B<--help>
+
+Print out this documentation (which is done simply by feeding the script
+to C<perldoc -t>).
+
+=item B<-t> I<timeout>, B<--timeout>=I<timeout>
+
+Change the timeout for the B<rxdebug> command. The default timeout is 60
+seconds.
+
+=item B<-V>, B<--version>
+
+Print out the version of B<check_rxdebug> and quit.
+
+=item B<-w> I<threshold>, B<--warning>=I<threshold>
+
+Change the warning blocked connection threshold to I<threshold>, which
+should be an integer. The default is 2.
+
+=back
+
+=head1 EXIT STATUS
+
+B<check_rxdebug> follows the standard Nagios exit status requirements. This
+means that it will exit with status 0 if there are no problems, with status
+1 if there is a warning, and with status 2 if there is a critical problem.
+For other errors, such as invalid syntax, B<check_rxdebug> will exit with
+status 3.
+
+=head1 BUGS
+
+The standard B<-v> verbose Nagios plugin option is not supported, although
+it's not entirely clear what it would add.
+
+The usage message for invalid options and for the B<-h> option doesn't
+conform to Nagios standards.
+
+=head1 CAVEATS
+
+This script does not use the Nagios util library or any of the defaults that
+it provides, which makes it somewhat deficient as a Nagios plugin. This is
+intentional, though, since this script can be used with other monitoring
+systems as well. It's not clear what a good solution to this would be.
+
+=head1 SEE ALSO
+
+The current version of this and other AFS monitoring plugins for Nagios are
+available from the AFS monitoring tools page at
+L<http://www.eyrie.org/~eagle/software/afs-monitor/>.
+
+=head1 AUTHORS
+
+The original idea behind this script was from Neil Crellin. It was updated
+by Quanah Gibson-Mount to work with Nagios, and then further updated by Russ
+Allbery <rra@stanford.edu> to support more standard options and to use a
+more uniform coding style.
+
+=head1 COPYRIGHT AND LICENSE
+
+Copyright 2003, 2004, 2005 Board of Trustees, Leland Stanford Jr. University.
+
+This program is free software; you may redistribute it and/or modify it
+under the same terms as Perl itself.
+
+=cut
--- /dev/null
+#!/usr/bin/perl -w
+$ID = q$Id: check_afsspace,v 1.16 2006/03/17 23:06:54 quanah Exp $;
+#
+# check_afsspace -- Monitor AFS disk space usage under Nagios.
+#
+# Written by Susan Feng <sfeng@stanford.edu>
+# Updated by Russ Allbery <rra@stanford.edu>
+# Copyright 2003, 2004 Board of Trustees, Leland Stanford Jr. University
+#
+# This program is free software; you may redistribute it and/or modify it
+# under the same terms as Perl itself.
+#
+# Expects a host with the -H option and checks the partition usage with
+# vos partinfo. Exits with status 1 if the percentage of used space is at or
+# above a warning threshold and with status 2 if it is at or above a critical
+# threshold (this works with the Nagios check architecture).
+
+##############################################################################
+# Site configuration
+##############################################################################
+
+# Default percentage-full levels at which a warning or a critical alert is
+# raised. Both can be overridden with the -w and -c command-line options.
+$WARNINGS = 85;
+$CRITICAL = 90;
+
+# The default timeout in seconds (implemented by alarm) for vos partinfo.
+$TIMEOUT = 300;
+
+# Where to find vos. Keep this on local disk so that monitoring doesn't
+# itself have an AFS dependency. The first candidate that is executable is
+# used; if none of them is, fall back to /usr/bin/vos.
+$VOS = (grep { -x $_ } qw(/usr/bin/vos /usr/local/bin/vos))[0]
+    || '/usr/bin/vos';
+
+##############################################################################
+# Modules and declarations
+##############################################################################
+
+require 5.003;
+
+use strict;
+use vars qw($CRITICAL $ID $TIMEOUT $VOS $WARNINGS);
+
+use Getopt::Long qw(GetOptions);
+
+##############################################################################
+# Implementation
+##############################################################################
+
+# Parse command line options. A failure inside GetOptions exits with status
+# 3, the status this plugin uses for usage errors.
+my ($help, $host, $version);
+Getopt::Long::config ('bundling', 'no_ignore_case');
+GetOptions ('critical|c=i' => \$CRITICAL,
+            'hostname|H=s' => \$host,
+            'help|h'       => \$help,
+            'timeout|t=i'  => \$TIMEOUT,
+            'version|V'    => \$version,
+            'warning|w=i'  => \$WARNINGS) or exit 3;
+if ($help) {
+    print "Feeding myself to perldoc, please wait....\n";
+    exec ('perldoc', '-t', $0) or die "Cannot fork: $!\n";
+} elsif ($version) {
+    # Reduce the RCS Id keyword to "<name> <revision> (<yyyy-mm-dd>)": keep
+    # fields 1-3, drop the ",v" suffix, parenthesize the date, and turn the
+    # slashes in the date into dashes.
+    my $version = join (' ', (split (' ', $ID))[1..3]);
+    $version =~ s/,v\b//;
+    $version =~ s/(\S+)$/($1)/;
+    $version =~ tr%/%-%;
+    print $version, "\n";
+    exit 0;
+}
+
+# Stray arguments are a usage error, and so is a missing -H: the host is
+# interpolated into the vos command line later, so without it vos partinfo
+# would be run against an empty server name.
+if (@ARGV || !defined ($host)) {
+    warn "Usage: $0 [-hV] [-c <level>] [-w <level>] [-t <timeout>]"
+        . " -H <host>\n";
+    exit 3;
+}
+
+# A warning threshold above the critical threshold makes no sense, since the
+# critical comparison is done first and the warning could then never fire.
+if ($WARNINGS > $CRITICAL) {
+    warn "$0: warning level $WARNINGS greater than critical level $CRITICAL\n";
+    exit 3;
+}
+
+# Set up the alarm. If SIGALRM fires before we finish, report a critical
+# network timeout and exit with status 2.
+$SIG{ALRM} = sub {
+    print "AFS CRITICAL - network timeout after $TIMEOUT seconds\n";
+    exit 2;
+};
+alarm ($TIMEOUT);
+
+# Get the partinfo information and calculate the percentage used for each
+# partition. Accumulate critical messages in @critical and warnings in
+# @warnings. Accumulate all percentages in @all.
+my (@critical, @warnings, @all);
+my @data = `$VOS partinfo '$host' 2> /dev/null`;
+if ($? != 0) {
+    print "AFS CRITICAL - cannot contact server\n";
+    exit 2;
+}
+for (@data) {
+    # Ignore blank lines rather than trying to do arithmetic on them.
+    next unless /\S/;
+
+    # Whitespace-separated fields 4, 5, and 11 of each line are taken to be
+    # the partition name, the free block count, and the total block count.
+    my ($partition, $free, $total) = (split)[4,5,11];
+
+    # Refuse to compute a percentage from a line that is too short, has
+    # non-numeric counts, or reports a zero total. Previously such a line
+    # made the script die (undef arithmetic or division by zero) instead of
+    # producing a plugin result.
+    unless (defined ($total) && $free =~ /^-?\d+$/ && $total =~ /^\d+$/
+            && $total > 0) {
+        print "AFS CRITICAL - cannot parse vos partinfo output\n";
+        exit 2;
+    }
+
+    my $percent = int ((($total - $free) / $total) * 100);
+    if ($percent >= $CRITICAL) {
+        push (@critical, "$partition$percent% (free $free)");
+    } elsif ($percent >= $WARNINGS) {
+        push (@warnings, "$partition$percent% (free $free)");
+    }
+    push (@all, "$partition$percent%");
+}
+
+# Exit with the appropriate error messages. Critical partitions take
+# precedence over warnings, which take precedence over the full listing.
+if (@critical) {
+    print "AFS CRITICAL - @critical\n";
+    exit 2;
+} elsif (@warnings) {
+    print "AFS WARNING - @warnings\n";
+    exit 1;
+} else {
+    print "AFS OK - @all\n";
+    exit 0;
+}
+
+##############################################################################
+# Documentation
+##############################################################################
+
+=head1 NAME
+
+check_afsspace - Monitor AFS disk space usage under Nagios
+
+=head1 SYNOPSIS
+
+check_afsspace [B<-hV>] [B<-c> I<threshold>] [B<-w> I<threshold>]
+[B<-t> I<timeout>] B<-H> I<host>
+
+=head1 DESCRIPTION
+
+B<check_afsspace> is a Nagios plugin for checking free space on AFS server
+partitions. It uses C<vos partinfo> to obtain the free space on the
+partitions on an AFS server and will return an alert if the percentage of
+used space reaches a threshold. By default, it returns a critical error if
+the used space is 90% or more and a warning if it is 85% or more
+(changeable with the B<-c> and B<-w> options).
+
+B<check_afsspace> will always print out a single line of output, giving the
+critical errors if any, otherwise giving the warnings if any, otherwise
+listing in an abbreviated form the percentage of used space for all
+partitions.
+
+=head1 OPTIONS
+
+=over 4
+
+=item B<-c> I<threshold>, B<--critical>=I<threshold>
+
+Change the critical percentage threshold to I<threshold>, which should be an
+integer percentage. The default is 90.
+
+=item B<-H> I<host>, B<--hostname>=I<host>
+
+The AFS file server whose free space B<check_afsspace> should check. This
+option is required.
+
+=item B<-h>, B<--help>
+
+Print out this documentation (which is done simply by feeding the script
+to C<perldoc -t>).
+
+=item B<-t> I<timeout>, B<--timeout>=I<timeout>
+
+Change the timeout for the C<vos partinfo> command. The default timeout is
+300 seconds.
+
+=item B<-V>, B<--version>
+
+Print out the version of B<check_afsspace> and quit.
+
+=item B<-w> I<threshold>, B<--warning>=I<threshold>
+
+Change the warning percentage threshold to I<threshold>, which should be an
+integer percentage. The default is 85.
+
+=back
+
+=head1 EXIT STATUS
+
+B<check_afsspace> follows the standard Nagios exit status requirements.
+This means that it will exit with status 0 if there are no problems, with
+status 2 if there is at least one critical partition for that server, and
+with status 1 if there are no critical partitions but at least one warning
+partition. For other errors, such as invalid syntax, B<check_afsspace> will
+exit with status 3.
+
+=head1 BUGS
+
+The standard B<-v> verbose Nagios plugin option is not supported and should
+be. (For example, under B<-vv> we would want to show the actual total,
+free, and used byte counts, not just the percentages.)
+
+The usage message for invalid options and for the B<-h> option doesn't
+conform to Nagios standards.
+
+=head1 CAVEATS
+
+This script does not use the Nagios util library or any of the defaults that
+it provides, which makes it somewhat deficient as a Nagios plugin. This is
+intentional, though, since this script can be used with other monitoring
+systems as well. It's not clear what a good solution to this would be.
+
+=head1 SEE ALSO
+
+vos(1)
+
+The current version of this and other AFS monitoring plugins for Nagios are
+available from the AFS monitoring tools page at
+L<http://www.eyrie.org/~eagle/software/afs-monitor/>.
+
+=head1 AUTHORS
+
+Originally written by Susan Feng for use with mon. Updated by Quanah
+Gibson-Mount to work with Nagios, and then further updated by Russ Allbery
+<rra@stanford.edu> to support more standard options and to use a more
+uniform coding style.
+
+=head1 COPYRIGHT AND LICENSE
+
+Copyright 2003, 2004 Board of Trustees, Leland Stanford Jr. University.
+
+This program is free software; you may redistribute it and/or modify it
+under the same terms as Perl itself.
+
+=cut
--- /dev/null
+#!/usr/bin/perl -w
+$ID = q$Id: check_udebug,v 1.3 2006/03/17 23:06:54 quanah Exp $;
+#
+# check_udebug -- Check AFS database servers using udebug for Nagios.
+#
+# Written by Russ Allbery <rra@stanford.edu>
+# Copyright 2004 Board of Trustees, Leland Stanford Jr. University
+#
+# This program is free software; you may redistribute it and/or modify it
+# under the same terms as Perl itself.
+#
+# Takes a hostname and a port number and checks the udebug output for that
+# host and port. Reports an error if the recovery state is not 1f on the sync
+# site (ensuring that it considers all of the other servers up-to-date) or if
+# any of the servers don't believe there is a sync site.
+
+##############################################################################
+# Site configuration
+##############################################################################
+
+# The default timeout in seconds (implemented by alarm) for udebug.
+$TIMEOUT = 10;
+
+# Where to find udebug. Keep this on local disk so that monitoring doesn't
+# itself have an AFS dependency. The first candidate that is executable is
+# used; if none of them is, fall back to /usr/bin/udebug.
+$UDEBUG = (grep { -x $_ } qw(/usr/bin/udebug /usr/local/bin/udebug))[0]
+    || '/usr/bin/udebug';
+
+##############################################################################
+# Modules and declarations
+##############################################################################
+
+require 5.003;
+
+use strict;
+use vars qw($ID $TIMEOUT $UDEBUG);
+
+use Getopt::Long qw(GetOptions);
+
+##############################################################################
+# Implementation
+##############################################################################
+
+# Parse command line options. A failure inside GetOptions exits with status
+# 3, the status this plugin uses for usage errors.
+my ($help, $host, $port, $version);
+Getopt::Long::config ('bundling', 'no_ignore_case');
+GetOptions ('hostname|H=s' => \$host,
+            'help|h'       => \$help,
+            'port|p=i'     => \$port,
+            'timeout|t=i'  => \$TIMEOUT,
+            'version|V'    => \$version) or exit 3;
+if ($help) {
+    print "Feeding myself to perldoc, please wait....\n";
+    exec ('perldoc', '-t', $0) or die "Cannot fork: $!\n";
+} elsif ($version) {
+    # Reduce the RCS Id keyword to "<name> <revision> (<yyyy-mm-dd>)": keep
+    # fields 1-3, drop the ",v" suffix, parenthesize the date, and turn the
+    # slashes in the date into dashes.
+    my $version = join (' ', (split (' ', $ID))[1..3]);
+    $version =~ s/,v\b//;
+    $version =~ s/(\S+)$/($1)/;
+    $version =~ tr%/%-%;
+    print $version, "\n";
+    exit 0;
+}
+
+# Both -H and -p are required, and stray arguments are a usage error. The
+# usage text names -V (not -v), which is the version option actually
+# registered above.
+if (@ARGV || !(defined ($host) && defined ($port))) {
+    warn "Usage: $0 [-hV] [-t <timeout>] -H <host> -p <port>\n";
+    exit 3;
+}
+
+# Arm the timeout. If SIGALRM fires before we finish, report a critical
+# network timeout and exit with status 2.
+$SIG{ALRM} = sub {
+    print "UBIK CRITICAL - network timeout after $TIMEOUT seconds\n";
+    exit 2;
+};
+alarm ($TIMEOUT);
+
+# Run udebug and note three facts from its output: whether this host claims
+# to be the sync site, whether the recovery state is 1f, and whether a sync
+# host has been set. Each pattern is anchored to a different line prefix, so
+# a given line can set at most one of the flags.
+open (UDEBUG, "$UDEBUG $host $port |") or do {
+    warn "$0: cannot run udebug\n";
+    exit 3;
+};
+my ($issync, $recovery, $synchost);
+while (defined (my $line = <UDEBUG>)) {
+    if ($line =~ /^I am sync site /) {
+        $issync = 1;
+    } elsif ($line =~ /^Recovery state 1f/) {
+        $recovery = 1;
+    } elsif ($line =~ /^Sync host \d+(\.\d+){3} was set /) {
+        $synchost = 1;
+    }
+}
+close UDEBUG;
+if ($? != 0) {
+    print "UBIK CRITICAL - udebug failed\n";
+    exit 2;
+}
+
+# Evaluate the results. A sync site must be in recovery state 1f; any other
+# server must at least know of a sync host.
+if ($issync && !$recovery) {
+    print "UBIK CRITICAL - recovery state not 1f\n";
+    exit 2;
+}
+if (!$issync && !$synchost) {
+    print "UBIK CRITICAL - no sync site\n";
+    exit 2;
+}
+print "UBIK OK\n";
+exit 0;
+
+##############################################################################
+# Documentation
+##############################################################################
+
+=head1 NAME
+
+check_udebug - Check AFS database servers using udebug for Nagios
+
+=head1 SYNOPSIS
+
+check_udebug [B<-hV>] [B<-t> I<timeout>] B<-H> I<host> B<-p> I<port>
+
+=head1 DESCRIPTION
+
+B<check_udebug> is a Nagios plugin for checking AFS database servers to make
+sure the Ubik replication between the database servers is running correctly.
+B<udebug> is used to connect to the specified port, which should generally
+be one of 7002 (ptserver), 7003 (vlserver), or 7004 (kaserver), on the
+specified server. The resulting output is checked to make sure that the
+recovery state is 1f if that server is the sync site, or that a sync site is
+known if that server doesn't claim to be the sync site.
+
+B<check_udebug> will always print out a single line of output. That line
+will be C<UBIK OK> if everything is fine, or C<UBIK CRITICAL - > followed by
+an error message otherwise.
+
+=head1 OPTIONS
+
+=over 4
+
+=item B<-H> I<host>, B<--hostname>=I<host>
+
+The AFS database server whose Ubik status B<check_udebug> should check.
+This option is required.
+
+=item B<-h>, B<--help>
+
+Print out this documentation (which is done simply by feeding the script
+to C<perldoc -t>).
+
+=item B<-p> I<port>, B<--port>=I<port>
+
+The port to connect to on the AFS database server. This should generally be
+one of 7002 (ptserver), 7003 (vlserver), or 7004 (kaserver). This option is
+required.
+
+=item B<-t> I<timeout>, B<--timeout>=I<timeout>
+
+Change the timeout for the B<udebug> command. The default timeout is 10
+seconds.
+
+=item B<-V>, B<--version>
+
+Print out the version of B<check_udebug> and quit.
+
+=back
+
+=head1 EXIT STATUS
+
+B<check_udebug> follows the standard Nagios exit status requirements. This
+means that it will exit with status 0 if there are no problems or with
+status 2 if there are critical problems. For other errors, such as invalid
+syntax, B<check_udebug> will exit with status 3.
+
+=head1 BUGS
+
+The standard B<-v> verbose Nagios plugin option is not supported. It should
+print out the full B<udebug> output.
+
+The usage message for invalid options and for the B<-h> option doesn't
+conform to Nagios standards.
+
+=head1 CAVEATS
+
+This script does not use the Nagios util library or any of the defaults that
+it provides, which makes it somewhat deficient as a Nagios plugin. This is
+intentional, though, since this script can be used with other monitoring
+systems as well. It's not clear what a good solution to this would be.
+
+=head1 SEE ALSO
+
+The current version of this and other AFS monitoring plugins for Nagios are
+available from the AFS monitoring tools page at
+L<http://www.eyrie.org/~eagle/software/afs-monitor/>.
+
+=head1 AUTHORS
+
+Russ Allbery <rra@stanford.edu>
+
+=head1 COPYRIGHT AND LICENSE
+
+Copyright 2004 Board of Trustees, Leland Stanford Jr. University.
+
+This program is free software; you may redistribute it and/or modify it
+under the same terms as Perl itself.
+
+=cut