From 4dee8174f83d6970e8fa675c6924b9b94e36d031 Mon Sep 17 00:00:00 2001 From: Martin Zobel-Helas Date: Sat, 15 Sep 2012 14:00:30 +0200 Subject: [PATCH] add drbd nagios check Signed-off-by: Martin Zobel-Helas --- config/nagios-master.cfg | 17 +- dsa-nagios-checks/checks/dsa-check-drdb | 456 ++++++++++++++++++++++++ dsa-nagios-checks/debian/changelog | 5 +- 3 files changed, 473 insertions(+), 5 deletions(-) create mode 100755 dsa-nagios-checks/checks/dsa-check-drdb diff --git a/config/nagios-master.cfg b/config/nagios-master.cfg index 75aa2c7..1d7a814 100644 --- a/config/nagios-master.cfg +++ b/config/nagios-master.cfg @@ -665,7 +665,7 @@ servers: pasquini: address: 206.12.19.217 parents: sw-ubcece-kais - hostgroups: computers, bl460, acpid-hosts, service, squeeze + hostgroups: computers, bl460, acpid-hosts, service, squeeze, drbd-hosts luchesi: address: 206.12.19.214 parents: sw-ubcece-kais @@ -673,15 +673,15 @@ servers: tristano: address: 206.12.19.213 parents: sw-ubcece-kais - hostgroups: computers, bl460, acpid-hosts, service, squeeze + hostgroups: computers, bl460, acpid-hosts, service, squeeze, drbd-hosts boito: address: 206.12.19.216 parents: sw-ubcece-kais - hostgroups: computers, bl460, acpid-hosts, service, squeeze + hostgroups: computers, bl460, acpid-hosts, service, squeeze, drbd-hosts bertali: address: 206.12.19.212 parents: sw-ubcece-kais - hostgroups: computers, bl460, acpid-hosts, service, squeeze + hostgroups: computers, bl460, acpid-hosts, service, squeeze, drbd-hosts ganeti2: address: 206.12.19.23 parents: sw-ubcece-kais @@ -1028,6 +1028,9 @@ hostgroups: alias: Hosts that are KVM domains private: 1 + drbd-hosts: + alias: hosts running drbd + private: 1 postfix-hosts: alias: hosts running postfix instead of exim private: 1 @@ -2065,6 +2068,12 @@ services: servicegroups: raid nrpe: "/usr/lib/nagios/plugins/dsa-check-raid-megactl" hostgroups: megactl + ### + - + name: RAID - DRDB + normal_check_interval: 120 + nrpe: "/usr/lib/nagios/plugins/dsa-check-drbd -d All" + hostgroups: drbd ### - name: process - slapd diff --git a/dsa-nagios-checks/checks/dsa-check-drdb b/dsa-nagios-checks/checks/dsa-check-drdb new file mode 100755 index 0000000..5b06215 --- /dev/null +++ b/dsa-nagios-checks/checks/dsa-check-drdb @@ -0,0 +1,456 @@ +#!/usr/bin/perl -w + +#################################################### +# check_drbd v0.5.3 # +# by Brandon Lee Poyner bpoyner / CCAC.edu # +#################################################### + +use strict; +use File::Basename; +use Getopt::Long; + +my $drbd_proc='/proc/drbd'; +my $drbd_devices=0; +my ($drbd_expect, $drbd_role, $drbd_version, $debug_mode); +my (%options, %cs, %st, %ld, %ds, %check, %warning, %critical); + +my $prog_name=basename($0); +my $prog_revision='0.5.3'; + +my %errorcodes = ( + 'OK' => { 'retvalue' => 0 }, + 'WARNING' => { 'retvalue' => 1 }, + 'CRITICAL' => { 'retvalue' => 2 }, + 'UNKNOWN' => { 'retvalue' => 3 } +); + +# +# Define various states and default alarm values +# +my %state = ( + 'Primary' => { 'value' => 'OK', 'type' => 'st' }, + 'Secondary' => { 'value' => 'OK', 'type' => 'st' }, + 'Unknown' => { 'value' => 'CRITICAL', 'type' => 'st' }, + 'StandAlone' => { 'value' => 'WARNING', 'type' => 'cs' }, + 'Unconnected' => { 'value' => 'CRITICAL', 'type' => 'cs' }, + 'Timeout' => { 'value' => 'CRITICAL', 'type' => 'cs' }, + 'BrokenPipe' => { 'value' => 'CRITICAL', 'type' => 'cs' }, + 'WFConnection' => { 'value' => 'CRITICAL', 'type' => 'cs' }, + 'WFReportParams' => { 'value' => 'CRITICAL', 'type' => 'cs' }, + 'Connected' => { 'value' => 'OK', 'type' => 'cs' }, + 'Unconfigured' => { 'value' => 'OK', 'type' => 'cs' }, + # DRBD 0.6 + 'SyncingAll' => { 'value' => 'WARNING', 'type' => 'cs' }, + 'SyncingQuick' => { 'value' => 'WARNING', 'type' => 'cs' }, + 'SyncPaused' => { 'value' => 'CRITICAL', 'type' => 'cs' }, + # DRBD 0.7 + 'WFBitMapS' => { 'value' => 'CRITICAL', 'type' => 'cs' }, + 'WFBitMapT' => { 'value' => 'CRITICAL', 'type' => 'cs' }, + 'SyncSource' => { 'value' => 'WARNING', 'type' => 'cs' }, + 'SyncTarget' => { 'value' => 'WARNING', 'type' => 'cs' }, + 'PausedSyncS' => { 'value' => 'CRITICAL', 'type' => 'cs' }, + 'PausedSyncT' => { 'value' => 'CRITICAL', 'type' => 'cs' }, + 'NetworkFailure' => { 'value' => 'CRITICAL', 'type' => 'cs' }, + 'SkippedSyncS' => { 'value' => 'CRITICAL', 'type' => 'cs' }, + 'SkippedSyncT' => { 'value' => 'CRITICAL', 'type' => 'cs' }, + 'Consistent' => { 'value' => 'OK', 'type' => 'ld' }, + 'Inconsistent' => { 'value' => 'CRITICAL', 'type' => 'ld' }, + # DRBD 8.0 + 'UpToDate' => { 'value' => 'OK', 'type' => 'ds' }, + 'Consistent' => { 'value' => 'OK', 'type' => 'ds' }, + 'Negotiating' => { 'value' => 'WARNING', 'type' => 'ds' }, + 'Attaching' => { 'value' => 'WARNING', 'type' => 'ds' }, + 'Diskless' => { 'value' => 'CRITICAL', 'type' => 'ds' }, + 'Failed' => { 'value' => 'CRITICAL', 'type' => 'ds' }, + 'Outdated' => { 'value' => 'CRITICAL', 'type' => 'ds' }, + 'Inconsistent' => { 'value' => 'CRITICAL', 'type' => 'ds' }, + 'DUnknown' => { 'value' => 'CRITICAL', 'type' => 'ds' }, + # DRBD 8.2 + 'VerifyS' => { 'value' => 'WARNING', 'type' => 'cs' }, + 'VerifyT' => { 'value' => 'WARNING', 'type' => 'cs' }, + # DRBD 8.3 + 'Disconnecting' => { 'value' => 'WARNING', 'type' => 'cs' }, + 'ProtocolError' => { 'value' => 'CRITICAL', 'type' => 'cs' }, + 'TearDown' => { 'value' => 'WARNING', 'type' => 'cs' }, + 'StartingSyncS' => { 'value' => 'WARNING', 'type' => 'cs' }, + 'StartingSyncT' => { 'value' => 'WARNING', 'type' => 'cs' }, + 'WFSyncUUID' => { 'value' => 'WARNING', 'type' => 'cs' } +); + +&parse_options; +&parse_proc; +&parse_drbd_devices; +&check_drbd_state; +&report_status; +&myexit('UNKNOWN',"$prog_name should never reach here"); + +sub print_usage { + print <] [-e expect] [-p proc] [-r role] [-o states] [-w states] [-c states] [--debug] + Options: + -d STRING [default: $drbd_devices. Example: 0,1,2 ] + -p STRING [default: $drbd_proc. Use '-' for stdin] + -e STRING [Must be this connected state. Example: Connected] + -r STRING [Must be this node state. Example: Primary] + -o STRING [Change value to OK. Example: StandAlone] + -w STRING [Change value to WARNING. Example: SyncingAll] + -c STRING [Change value to CRITICAL. Example: Inconsistent,WFConnection] +EOF +} + +sub print_revision { + print <{'retvalue'}; +} + +sub parse_options { + my ($help, $version, $debug, $ok_string, $warning_string, + $critical_string); + # + # Get command line options + # + GetOptions("h|help" => \$help, + "V|version" => \$version, + "d|device|devices=s" => \$drbd_devices, + "e|expect=s" => \$drbd_expect, + "p|proc=s" => \$drbd_proc, + "r|role=s" => \$drbd_role, + "o|ok=s" => \$ok_string, + "w|warning=s" => \$warning_string, + "c|critical=s" => \$critical_string, + "debug" => \$debug); + if (defined($help) && ($help ne "")) { + &print_help; + exit $errorcodes{'UNKNOWN'}->{'retvalue'}; + } + if (defined($version) && ($version ne "")) { + &print_revision; + exit $errorcodes{'UNKNOWN'}->{'retvalue'}; + } + if (defined($drbd_expect) && ($drbd_expect ne "")) { + # User requested the connected state to be very specific + &change_values($drbd_expect,'cs','expect','connected state'); + } + if (defined($drbd_role) && ($drbd_role ne "")) { + # User requested the node state to be very specific + &change_values($drbd_role,'st','role','node state'); + } + if (defined($ok_string) && ($ok_string ne "")) { + # User requested certain values to be OK + &set_values($ok_string,'OK'); + } + if (defined($warning_string) && ($warning_string ne "")) { + # User requested certain values to be WARNING + &set_values($warning_string,'WARNING'); + } + if (defined($critical_string) && ($critical_string ne "")) { + # User requested certain values to be CRITICAL + &set_values($critical_string,'CRITICAL'); + } + if (defined($debug) && ($debug ne "")) { + # + # Debugging information + # + $debug_mode=1; + print STDERR "<$prog_name settings>\n"; + print STDERR "DRBD Devices: $drbd_devices\n"; + printf STDERR "DRBD Proc: %s\n", defined($drbd_proc)?$drbd_proc:""; + printf STDERR "DRBD Expect: %s\n", defined($drbd_expect)?$drbd_expect:""; + printf STDERR "DRBD Role: %s\n", defined($drbd_role)?$drbd_role:""; + my (@ok, @critical, @warning); + for my $key ( keys %state ) { + if ($state{$key}->{'value'} eq 'OK') { + push(@ok,$key); + } + if ($state{$key}->{'value'} eq 'WARNING') { + push(@warning,$key); + } + if ($state{$key}->{'value'} eq 'CRITICAL') { + push(@critical,$key); + } + } + printf STDERR "DRBD OK: %s\n", join(" ",sort(@ok)); + printf STDERR "DRBD WARNING: %s\n", join(" ",sort(@warning)); + printf STDERR "DRBD CRITICAL: %s\n", join(" ",sort(@critical)); + print STDERR "\n"; + } +} + +sub parse_proc { + # + # Read in contents of proc file, feed results into hashes + # + my $input; + if ( $drbd_proc ne "-" ) { + $input = "DRBD"; + if ( ! -e $drbd_proc ) { + &myexit('UNKNOWN',"No such file $drbd_proc"); + } + open(DRBD, "$drbd_proc") || + &myexit('UNKNOWN',"Could not open $drbd_proc"); + } else { + $input = "STDIN"; + } + while(<$input>) { + if (/^version: (\d+).(\d+)/) { + $drbd_version = "$1.$2"; + } + if (/^\s?(\d+):.* cs:(\w+)/) { + $cs{$1} = $2; + } + if (/^\s?(\d+):.* st:(\w+)\//) { + $st{$1} = $2; + } + if (/^\s?(\d+):.* ld:(\w+)/) { + $ld{$1} = $2; + } + if (/^\s?(\d+):.* ds:(\w+)/) { + $ds{$1} = $2; + } + } + if ( $drbd_proc ne "-" ) { + close(DRBD); + } + if (defined($debug_mode) && ($debug_mode == 1)) { + # + # Debugging information + # + print STDERR "<$prog_name devices found>\n"; + for my $key ( sort keys %cs ) { + printf STDERR "Found Device $key $cs{$key}%s%s%s\n", defined($st{$key})?" $st{$key}":"", defined($ld{$key})?" $ld{$key}":"", defined($ds{$key})?" $ds{$key}":""; + } + print STDERR "\n"; + } +} + +sub parse_drbd_devices { + # + # Determine which DRBD devices to monitor + # + my @devices; + if ($drbd_devices =~ /^all$/i) { + for my $device ( keys %cs ) { + push(@devices,$device); + } + } elsif ($drbd_devices =~ /^configured$/i) { + for my $device ( keys %cs ) { + next if ($cs{$device} eq "Unconfigured"); + push(@devices,$device); + } + } else { + @devices = split(/,/,$drbd_devices); + } + foreach my $device (@devices) { + if (!(defined($cs{$device}))) { + &myexit('OK',"Could not find device $device"); + } + $check{$device} = 1; + } + if (int(keys %check) == 0) { + &myexit('UNKNOWN',"No configured devices found"); + } + if (defined($debug_mode) && ($debug_mode == 1)) { + # + # Debugging information + # + print STDERR "<$prog_name devices to check>\n"; + for my $key ( sort keys %check ) { + printf STDERR "Checking enabled for device $key\n"; + } + print STDERR "\n"; + } +} + +sub check_drbd_state { + for my $drbd_device ( sort keys %check ) { + if ((defined($drbd_version)) && ($drbd_version >= '8.0')) { + # + # We're dealing with version 8.0 or greater + # Set data state + # + if ((defined($ds{$drbd_device})) && + (defined($state{$ds{$drbd_device}}))) { + $state{$ds{$drbd_device}}->{$drbd_device}->{'level'} = 1; + } elsif (defined($ds{$drbd_device})) { + &myexit('CRITICAL',"Data state unknown value '$ds{$drbd_device}' for device $drbd_device"); + } + } + if ((defined($drbd_version)) && ($drbd_version == '0.7')) { + # + # We're dealing with version 0.7 + # Set local data consistency + # + if ((defined($ld{$drbd_device})) && + (defined($state{$ld{$drbd_device}}))) { + $state{$ld{$drbd_device}}->{$drbd_device}->{'level'} = 1; + } elsif (defined($ld{$drbd_device})) { + &myexit('CRITICAL',"Local data consistency unknown value '$ld{$drbd_device}' for device $drbd_device"); + } + } + # + # Check for a state value (Primary, Secondary, etc) + # + if ((defined($st{$drbd_device})) && + (defined($state{$st{$drbd_device}}))) { + $state{$st{$drbd_device}}->{$drbd_device}->{'level'} = 1; + } elsif (defined($st{$drbd_device})) { + &myexit('CRITICAL',"Node state unknown value '$st{$drbd_device}' for device $drbd_device"); + } + # + # Check for a connected state value (Connected, StandAlone, etc) + # + if (defined($state{$cs{$drbd_device}})) { + $state{$cs{$drbd_device}}->{$drbd_device}->{'level'} = 1; + } else { + &myexit('CRITICAL',"Connection state unknown value '$cs{$drbd_device}' for device $drbd_device"); + } + # + # Debugging information + # + if (defined($debug_mode) && ($debug_mode == 1)) { + print STDERR "<$prog_name device $drbd_device status>\n"; + for my $key ( keys %state ) { + if (defined($state{$key}->{$drbd_device}->{'level'})) { + print STDERR "$key $state{$key}->{'value'}\n"; + } + } + print STDERR "\n"; + } + # + # Determine if any values are CRITICAL or WARNING + # + for my $key ( keys %state ) { + if (defined($state{$key}->{$drbd_device}->{'level'})) { + if ($state{$key}->{'value'} eq "CRITICAL") { + $critical{$drbd_device} = 1; + } + if ($state{$key}->{'value'} eq "WARNING") { + $warning{$drbd_device} = 1; + } + } + } + } +} + +sub report_status { + my $message; + my $critical_count=int(keys %critical); + my $warning_count=int(keys %warning); + if ($critical_count > 0) { + # + # We found a CRITICAL situation + # + my $i = 0; + for my $device (sort keys %critical) { + $message.=sprintf("Device %d%s $cs{$device}%s%s", $device,defined($st{$device})?" $st{$device}":"",defined($ld{$device})?" $ld{$device}":"",defined($ds{$device})?" $ds{$device}":""); + $i++; + if ($i != $critical_count) { + $message.=", "; + } + } + &myexit('CRITICAL',$message); + } elsif ($warning_count > 0) { + # + # We found a WARNING situation + # + my $i = 0; + for my $device (sort keys %warning) { + $message.=sprintf("Device %d%s $cs{$device}%s%s", $device,defined($st{$device})?" $st{$device}":"",defined($ld{$device})?" $ld{$device}":"",defined($ds{$device})?" $ds{$device}":""); + $i++; + if ($i != $warning_count) { + $message.=", "; + } + } + &myexit('WARNING',$message); + } else { + # + # Everything checks out OK + # + my $device_count=int(keys %check); + if ($device_count == 1) { + for my $device ( sort keys %check ) { + $message=sprintf("Device %d%s $cs{$device}%s%s", $device,defined($st{$device})?" $st{$device}":"",defined($ld{$device})?" $ld{$device}":"",defined($ds{$device})?" $ds{$device}":""); + } + } else { + my $i = 0; + for my $device ( sort keys %check ) { + $message.=sprintf("Dev %d %0.3s%0.3s%0.3s%0.3s", $device,defined($st{$device})?"$st{$device}":"",$cs{$device},defined($ld{$device})?"$ld{$device}":"",defined($ds{$device})?"$ds{$device}":""); + $i++; + if ($i != $device_count) { + $message.=", "; + } + } + } + &myexit('OK',$message); + } +} + +sub set_values { + # + # Set item to value requested + # + my ($items,$value) = @_; + my @items = split(/,/,$items); + foreach my $item (@items) { + if (defined($state{$item})) { + $state{$item}->{'value'} = "$value"; + } else { + print STDERR "State '$item' not found\n"; + } + } +} + +sub change_values { + # + # Look for all values of a given type, set requested value to OK + # and all other values to CRITICAL + # + my ($argument,$type,$error1,$error2) = @_; + if ((defined($state{$argument})) && + ($state{$argument}->{'type'} eq "$type")) { + for my $key ( keys %state ) { + if ($state{$key}->{'type'} eq "$type") { + if ($key eq $argument) { + &set_values($argument,'OK'); + } else { + &set_values($key,'CRITICAL'); + } + } + } + } else { + &myexit('UNKNOWN',"$error1 option only works for $error2"); + } +} + +sub myexit { + # + # Print error message and exit + # + my ($error, $message) = @_; + if (!(defined($errorcodes{$error}))) { + printf STDERR "Error code $error not known\n"; + print "DRBD UNKNOWN: $message\n"; + exit $errorcodes{'UNKNOWN'}->{'retvalue'}; + } + print "DRBD $error: $message\n"; + exit $errorcodes{$error}->{'retvalue'}; +} diff --git a/dsa-nagios-checks/debian/changelog b/dsa-nagios-checks/debian/changelog index 74505bd..43de6f3 100644 --- a/dsa-nagios-checks/debian/changelog +++ b/dsa-nagios-checks/debian/changelog @@ -25,7 +25,10 @@ dsa-nagios-checks (9X) Xnstable; urgency=low [ Uli Martens ] * dsa-check-soas: Add --no-soa-ns flag. - -- Peter Palfrader Sat, 25 Aug 2012 18:17:55 +0200 + [ Martin Zobel-Helas ] + * Add dsa-check-drbd, taken from http://code.google.com/p/ganeti/wiki/DrbdDevicesMonitoring + + -- Martin Zobel-Helas Sat, 15 Sep 2012 13:40:12 +0200 dsa-nagios-checks (92.1) unstable; urgency=low -- 2.20.1