#!/usr/bin/perl -w # check _physical_ disk status of disks on Smart Array controllers # requires hpssacli or ssacli # # does _not_ check raid status. use arrayprobe for that. # Copyright (c) 2008,2009,2010,2011 Peter Palfrader # # Permission is hereby granted, free of charge, to any person obtaining # a copy of this software and associated documentation files (the # "Software"), to deal in the Software without restriction, including # without limitation the rights to use, copy, modify, merge, publish, # distribute, sublicense, and/or sell copies of the Software, and to # permit persons to whom the Software is furnished to do so, subject to # the following conditions: # # The above copyright notice and this permission notice shall be # included in all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
use strict;
use warnings;
use English qw(-no_match_vars);
use Getopt::Long;

# nagios exit codes
my %CODE = (
    'OK'       => 0,
    'WARNING'  => 1,
    'CRITICAL' => 2,
    'UNKNOWN'  => 3
);

# Worst state recorded so far; only ever raised, via record().
my $EXITCODE = 'OK';

# Any die() ends the check with the message on stdout and the UNKNOWN
# exit code, which is what a nagios plugin is expected to do.
$SIG{'__DIE__'} = sub {
    # $^S is true while inside an eval; let such exceptions propagate to
    # whoever is trying to catch them instead of exiting here.
    return if $^S;
    print @_;
    exit $CODE{'UNKNOWN'};
};

# support both the older hpssacli and the newer ssacli
my $BIN;
if ($0 =~ /hpssacli/) {
    $BIN = "hpssacli";
} else {
    $BIN = "ssacli";
}

# runcmd($cmd)
#
# Runs "sudo $BIN $cmd" and returns a reference to the array of output
# lines (newlines still attached).  $cmd is a whitespace separated
# argument string; it is split into words and executed without a shell.
# Dies if the command cannot be started or prints nothing.
sub runcmd($) {
    my ($cmd) = @_;

    $cmd = "sudo $BIN $cmd";
    open(my $fh, '-|', split(' ', $cmd)) or die ("Cannot run $cmd: $!");
    my @lines = <$fh>;
    # The result of close is deliberately not checked: for a pipe it
    # reflects the exit status of the command, and the "Error: ..." replies
    # parsed further down are presumably delivered with a non-zero status.
    close $fh;

    die ("no results from $cmd\n") if (scalar @lines == 0);
    return \@lines;
}

# record($newexit)
#
# Raises the global exit state to $newexit if it is numerically higher
# than the current one.  Dies if $newexit is not a key of %CODE.
sub record($) {
    my ($newexit) = @_;
    die "code $newexit not defined\n" unless defined $CODE{$newexit};

    if ($CODE{$newexit} > $CODE{$EXITCODE}) {
        $EXITCODE = $newexit;
    }
}

my $usage = "$PROGRAM_NAME: Usage: $PROGRAM_NAME [--no-battery] [--ignore-cache] [--ignore-controller=<regex>] [--no-controller-ok] [--ignore-transfer-speed=<drive> [--ignore-transfer-speed=<drive> ...]]\n";

my $params = {};
Getopt::Long::Configure('bundling');
if (!GetOptions (
    '--help'                     => \$params->{'help'},
    '--no-battery'               => \$params->{'no-battery'},
    '--no-controller-ok'         => \$params->{'no-controller-ok'},
    '--ignore-cache'             => \$params->{'ignore-cache'},
    '--ignore-controller=s'      => \$params->{'ignore-controller'},
    '--ignore-transfer-speed=s@' => \$params->{'ignore-transfer-speed'},
    )) {
    die ($usage);
}
if ($params->{'help'}) {
    print $usage;
    exit (0);
}
die ($usage) unless (scalar @ARGV == 0);

# Compiled once; stays undef when the option is absent (or false).
my $ignore_controller_re;
if ($params->{'ignore-controller'}) {
    $ignore_controller_re = qr/$params->{'ignore-controller'}/;
}

#
# Pass 1: find the controllers and collect their controller / cache /
# battery state.  %controllers maps slot => list of status strings.
#
my $ctrlallshow = runcmd("controller all show detail");

my $slot;
my %controllers;
for my $line (@$ctrlallshow) {
    chomp $line;
    next if $line =~ /^$/;

    if (defined $ignore_controller_re && $line =~ $ignore_controller_re) {
        # When the ignored line is a controller header, forget the current
        # slot so that the detail lines that follow are not attributed to
        # the previously seen controller.
        undef $slot if $line =~ /in Slot [0-9a-z]+/;
        next;
    }

    if ($line =~ /in Slot ([0-9a-z]+)/) {
        $slot = $1;
        $controllers{$slot} = [];
    } elsif (!defined $slot) {
        # detail line without a (non-ignored) controller to attach it to
        next;
    } elsif ($line =~ /^ *(Controller|Cache|Battery\/Capacitor) Status: (.*)$/) {
        my $system = $1;
        my $state  = $2;

        if ($system eq 'Cache') {
            # Can be:
            #  - 'OK'
            #  - 'Not Configured' (for e.g. HP SSD Smart Path)
            #  - 'Permanently Disabled'
            #  - ...?
            next if $state =~ /^(OK|Not Configured)$/;
            if ($params->{'ignore-cache'}) {
                push @{$controllers{$slot}}, "$system: $state (ignored)";
                next;
            }
        }

        push @{$controllers{$slot}}, "$system: $state";
        if ($state ne 'OK') {
            next if ($params->{'no-battery'} && $system eq 'Battery/Capacitor');
            record('WARNING');
        }
    } elsif ($line =~ /^ *(Cache Status Details): (Cable Error)/) {
        push @{$controllers{$slot}}, $2;
        record('CRITICAL');
    } elsif ($line =~ /^ *(Battery\/Capacitor Count): (.*)/) {
        my $count = $2;
        next if $params->{'no-battery'} || int($count) > 0;
        push @{$controllers{$slot}}, "Battery count: $count";
        record('CRITICAL');
    }
}

if (scalar(keys %controllers) == 0) {
    if ($params->{'no-controller-ok'}) {
        print "No Smart Array controllers found with $BIN\n";
        exit $CODE{'OK'}
    } else {
        print "UNKNOWN: No Smart Array controllers found with $BIN\n";
        exit $CODE{'UNKNOWN'}
    }
}

#
# Pass 2: per controller, check the logical drives and then the physical
# drives.  One summary string per slot ends up in @resultstr.
#
my @resultstr;

for my $slot (sort keys %controllers) {
    my $nodrives = 0;
    my %status;    # state name => list of drives / items in that state

    # check logicaldrives
    my $logicaldrive;
    my @logicaldrives;
    my $lds = runcmd("controller slot=$slot ld all show detail");
    for my $line (@$lds) {
        chomp $line;
        next if $line =~ /^$/;

        if ($line =~ /Logical Drive: ([0-9a-z]+)/) {
            $logicaldrive = $1;
            push @logicaldrives, $logicaldrive;
        } elsif ($line =~ /^Error: The specified device does not have any logical drives.$/) {
            $nodrives = 1;
        } elsif ($line =~ /^ *Parity Initialization Status: (Initialization Completed|Initialization Failed|Rebuilding)$/) {
            my $parity = $1;
            if ($parity eq 'Initialization Completed') {
                push @{$status{'OK'}}, "Parity LD$logicaldrive";
            } elsif ($parity eq 'Rebuilding') {
                push @{$status{'Failed'}}, "Parity LD$logicaldrive";
                record('WARNING');
            } elsif ($parity eq 'Initialization Failed') {
                push @{$status{'Failed'}}, "Parity LD$logicaldrive";
                record('CRITICAL');
            } else {
                # Cannot happen with the pattern above; kept as a safety net
                # in case the pattern is ever extended.
                record('UNKNOWN');
            }
        } elsif ($line =~ /^ *LD Acceleration Method: (.*)$/) {
            my $method = $1;
            # can at least be "Controller Cache" or "HP SSD Smart Path", both OK
            if ($method eq 'All disabled') {
                push @{$status{'Acceleration method'}}, "LD$logicaldrive disabled";
                record('WARNING');
            }
        }
    }

    if (!$nodrives && scalar(@logicaldrives) == 0) {
        # note: no "next" here, the physical drives are still checked
        push @resultstr, "Slot $slot: unexpectedly, found no logical drives in list.";
        record('UNKNOWN');
    } elsif ($nodrives && scalar(keys %status) > 0) {
        push @resultstr, "Slot $slot: have no logical drives but status results?";
        record('UNKNOWN');
        next;
    } elsif ($nodrives) {
        push @resultstr, "Slot $slot: no logical drives";
    }

    # collect the "Key: Value" details of every physical drive
    my $pds = runcmd("controller slot=$slot pd all show detail");
    my $drive;
    my %drives;    # drive id => { detail key => detail value }
    for my $line (@$pds) {
        chomp $line;
        next if $line =~ /^$/;
        next if ($line =~ /^\S.*in Slot $slot/);
        next if $line =~ /^ *Array [A-Z]$/i;
        next if $line =~ /^ *unassigned/;
        if ($line =~ /^ *HBA Drives/) {
            # HBA mode implies no logical drives, thus reset the "drives found" check and proceed with
            # checking physical drives.
            $nodrives = 0;
            next;
        }

        if ($line =~ /^ *(Array [A-Z]) \(Failed\)$/i) {
            record('CRITICAL');
            push @{$status{'Failed'}}, $1;
        } elsif ($line =~ /^Error: The specified controller does not have any physical drives on it.$/) {
            $nodrives = 1;
        } elsif ($line =~ /^ *physicaldrive (\S+)/) {
            $drive = $1;
            $drives{$drive} = {};
        } elsif (defined $drive && $line =~ m/^\s*(.*?):\s*(.*?)\s*$/) {
            $drives{$drive}{$1} = $2;
        } else {
            die ("Cannot read line '$line' gotten from $BIN controller slot=$slot pd all show\n");
        }
    }

    # Check that all drives have the proper transfer speed.
    # sometimes stuff breaks and they fall back to 10mb/sec.
    for my $drive (sort keys %drives) {
        my $value = $drives{$drive};
        my $state = $value->{'Status'};
        push @{$status{$state}}, $drive;

        if ($state eq 'Failed') {
            record('CRITICAL');
            # skip drives that are known to have failed
            next;
        } elsif ($state eq 'Predictive Failure' || $state eq 'Rebuilding') {
            record('WARNING');
        } elsif ($state ne 'OK') {
            record('UNKNOWN');
        }

        # the shape of the drive id tells us what kind of bus it sits on
        my $type;
        if ($drive =~ /^[0-9]+:[0-9]+$/) {                     # scsi drives
            $type = 'SCSI';
        } elsif ($drive =~ /^[0-9]+[EI]:[0-9]+:[0-9]+$/) {     # SAS
            $type = 'SAS';
        } elsif ($drive =~ /^[0-9]+[C]:[0-9]+:[0-9]+$/) {      # New 6GBPS SAS
            $type = 'SAS+';
        } else {
            warn ("Unknown diskdrive ID $drive\n");
            next;
        }

        my $key;         # name of the detail that holds the actual speed
        my $expected;    # array ref of acceptable values (compared case-insensitively)
        if ($type eq 'SCSI') {
            $key = 'Transfer Speed';
            my $mode = $value->{'Transfer Mode'};
            if (!defined $mode) {
                record('WARNING');
                push @{$status{'unknown transfer mode'}}, $drive;
                next;
            } elsif ($mode eq 'Ultra 3 Wide') {
                $expected = [ '160 MB/Sec' ];
            } elsif ($mode eq 'Ultra 320 Wide') {
                $expected = [ '320 MB/Sec' ];
            } else {
                record('WARNING');
                push @{$status{'unknown transfer mode'}}, $drive."(".$mode.")";
                next;
            }
        } elsif ($type eq 'SAS' || $type eq 'SAS+') {
            $key = 'PHY Transfer Rate';
            my $interface = defined $value->{'Interface Type'} ? $value->{'Interface Type'} : '';
            my $phy_count = defined $value->{'PHY Count'}      ? $value->{'PHY Count'}      : '';
            if ($interface eq 'SATA') {
                $expected = [ '1.5Gbps', '3.0Gbps', '6.0Gbps' ];
            } elsif ($phy_count eq '2') {
                if (defined($value->{'Redundant Path(s)'})) {
                    $expected = [
                        '3.0GBPS, 3.0GBPS',
                        '6.0GBPS, 6.0GBPS',
                        '12.0GBPS, 12.0GBPS'
                    ];
                } else {
                    $expected = [
                        '3.0GBPS, Unknown',  'Unknown, 3.0GBPS',
                        '6.0GBPS, Unknown',  'Unknown, 6.0GBPS',
                        '12.0GBPS, Unknown', 'Unknown, 12.0GBPS'
                    ];
                }
            } else {
                $expected = [ '3.0GBPS', '6.0GBPS', '12.0GBPS' ];
            }
        } else {
            warn "Should not be here. Do not know what to do with type '$type'\n";
            next;
        }

        my $actual = $value->{$key};

        if ($params->{'ignore-transfer-speed'}) {
            if (grep { $drive eq $_ } @{$params->{'ignore-transfer-speed'}}) {
                push @{$status{'ignored transfer speed'}}, $drive."(".(defined $actual ? $actual : '').")";
                next;
            }
        }

        if (!defined $actual) {
            record('WARNING');
            push @{$status{'unknown transfer speed'}}, $drive;
        } elsif (scalar(grep { uc($actual) eq uc($_) } @$expected) == 0) {
            record('WARNING');
            push @{$status{'bad transfer speed'}}, $drive."(".$actual.")";
        }
    }

    if ($nodrives && scalar(keys %status) > 0) {
        push @resultstr, "Slot $slot: have no drives but status results?";
        record('UNKNOWN');
        next;
    } elsif ($nodrives) {
        push @resultstr, "Slot $slot: no drives";
        next;
    }

    # States are sorted so that the plugin output is stable between runs.
    my $summary = join(" - ",
        (map { $_.": ".join(", ", @{$status{$_}}) } sort keys %status),
        @{$controllers{$slot}});
    push @resultstr, "Slot $slot: $summary";
}

print "$EXITCODE: ", join(" --- ", @resultstr), "\n";
exit $CODE{$EXITCODE};