#!/usr/bin/perl -w
# ======================================================================================
# $Id$
# check_lsi_raid: Nagios/Icinga plugin to check LSI Raid Controller status
# --------------------------------------------------------------------------------------
# Created as part of a semester project at the University of Applied Sciences Hagenberg
# (http://www.fh-ooe.at/en/hagenberg-campus/)
#
# Copyright (c) 2013:
# Grubhofer Martin (s1110239013@students.fh-hagenberg.at)
# Scheipner Alexander (s1110239032@students.fh-hagenberg.at)
# Werner Sebastian (s1110239038@students.fh-hagenberg.at)
# Georg Schoenberger (gschoenberger@thomas-krenn.com)
# Jonas Meurer (jmeurer@inet.de)
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation; either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program; if not, see <http://www.gnu.org/licenses/>.
# ======================================================================================

use strict;
use warnings;
use Getopt::Long qw(:config no_ignore_case);

our $VERBOSITY = 0;
our $VERSION = "1.1";
our $NAME = "check_lsi_raid: Nagios/Icinga plugin to check LSI Raid Controller status";
our $C_TEMP_WARNING = 80;
our $C_TEMP_CRITICAL = 90;
our $PD_TEMP_WARNING = 40;
our $PD_TEMP_CRITICAL = 45;
our ($ignerr_m, $ignerr_o, $ignerr_p, $ignerr_s) = (0, 0, 0, 0);

use constant {
    STATE_OK => 0,
    STATE_WARNING => 1,
    STATE_CRITICAL => 2,
    STATE_UNKNOWN => 3,
};

# Always return the highest state level.
#   $_[0]  previous state
#   $_[1]  upcoming state
# Returns STATE_UNKNOWN if either state is undefined, otherwise the larger of
# the two states (never less than STATE_OK).
sub getExitState {
    my ($previous, $upcoming) = @_;

    # Return early so that undef is never used in a numeric comparison.
    if (!defined($previous) || !defined($upcoming)) {
        return STATE_UNKNOWN;
    }

    my $returnState = STATE_OK;
    foreach my $state ($previous, $upcoming) {
        if ($state > $returnState) {
            $returnState = $state;
        }
    }
    return $returnState;
}

# Private helper: picks the message variant matching the global $VERBOSITY.
#   $_[0]  text for verbosity 0
#   $_[1]  text for verbosity 1
#   $_[2]  text for verbosity 2 and above
# Returns an empty string for any other verbosity value.
sub _verboseMessage {
    my ($terse, $normal, $detailed) = @_;

    return $terse    if $VERBOSITY == 0;
    return $normal   if $VERBOSITY == 1;
    return $detailed if $VERBOSITY >= 2;
    return '';
}

# Private helper: tests a value against one threshold triple as produced by
# getThresholds, i.e. ("in" or "out", range from, range to).
# Returns 1 if the value triggers the threshold, 0 otherwise.
sub _thresholdMatches {
    my ($temp, $mode, $from, $to) = @_;

    if ($mode eq "in") {
        return (($temp >= $from) && ($temp <= $to)) ? 1 : 0;
    }
    return (($temp < $from) || ($temp > $to)) ? 1 : 0;
}

# Private helper: maps a temperature to a plugin state.
#   $_[0]  temperature
#   $_[1]  reference to the warning threshold triple
#   $_[2]  reference to the critical threshold triple
# The critical range is only evaluated when the warning range has matched.
sub _temperatureState {
    my ($temp, $warn, $crit) = @_;

    return STATE_OK       unless _thresholdMatches($temp, @{$warn});
    return STATE_CRITICAL if _thresholdMatches($temp, @{$crit});
    return STATE_WARNING;
}

# Private helper: builds the controller/ROC temperature message.
#   $_[0]  STATE_WARNING or STATE_CRITICAL
#   $_[1]  temperature
#   $_[2]  short sensor name ("Ctrl." or "ROC")
#   $_[3]  long sensor name ("Controller" or "ROC")
sub _controllerTemperatureMessage {
    my ($state, $temp, $short, $long) = @_;

    if ($state == STATE_CRITICAL) {
        return _verboseMessage(
            "$short temp. critical, ",
            "$short temp. is critical (${temp}C), ",
            "$long temperature is critical (${temp}C), "
        );
    }
    return _verboseMessage(
        "$short temp. warning, ",
        "$short temp. warning (${temp}C), ",
        "$long temperature warning (${temp}C), "
    );
}

# Private helper: builds a physical drive message.
#   $_[0]  drive (slot) number
#   $_[1]  enclosure number
#   $_[2]  text used at verbosity 0 (without trailing ", ")
#   $_[3]  detail text appended to the drive location at verbosity >= 1
sub _physDriveMessage {
    my ($dev, $enc, $terse, $detail) = @_;

    return _verboseMessage(
        "$terse, ",
        "Physical drive $dev: $detail, ",
        "Physical drive $dev in enclosure $enc: $detail, "
    );
}

# Explains the Usage of the plugin, also which options take which values
sub displayUsage {
    print "Usage: \n";
    print " [ -h | --help ]\n Display this help page\n";
    print " [ -v | -vv | -vvv | --verbose ]\n Sets the verbosity level.\n no -v is the normal single line output for Nagios/Icinga\n -v is a more detailed version but still usable in Nagios.\n -vv is a multiline output for debugging configuration errors or more detailed information.\n -vvv is for plugin problem diagnosis.\n For further information please visit: http://nagiosplug.sourceforge.net/developer-guidelines.html#AEN39\n";
    print " [ -V --version ]\n Displays the Version of the tk-lsi-plugin and the version of StorCLI\n";
    print " [ -C | --controller ]\n Specifies a Controller number, defaults to 0\n";
    print " [ -EID | --enclosure ]\n Specifies one or more Enclosures, defaults to all\n Takes either an integer as additional argument (>=0) or a comma seperated list(0,1,2,3,...)\n";
    print " [ -LD | --logicaldevice ]\n Specifies one or more Logical Devices, defaults to all\n Takes either an integer as additional argument (>=0) or a comma seperated list(0,1,2,3,...)\n";
    print " [ -PD | --physicaldevice ]\n Specifies one or more Physical Devices, defaults to all\n Takes either an integer as additional argument (>=0) or a comma seperated list(0,1,2,3,...)\n";
    print " [ -Tw | --temperature-warn ]\n Specifies the RAID-Controller temperature warning range, default is ${C_TEMP_WARNING}C or more\n";
    print " [ -Tc | --temperature-critical ]\n Specifies the RAID-Controller temperature critical error range, default is ${C_TEMP_CRITICAL}C or more. Requires -Tw | --temperature-warn to be set.\n";
    print " [ -PDTw | --physicaldevicetemperature-warn ]\n Specifies the disk temperature warning range, default is ${PD_TEMP_WARNING}C or more\n";
    print " [ -PDTc | --physicaldevicetemperature-critical ]\n Specifies the disk temperature critical error range, default is ${PD_TEMP_CRITICAL}C or more. Requires -PDTw | --physicaldevicetemperature-warn to be set.\n";
    print " [ -Im | --ignore-media-errors ]\n Specifies the warning threshold for media errors per disk, default is $ignerr_m.\n";
    print " [ -Io | --ignore-other-errors ]\n Specifies the warning threshold for other errors per disk, default is $ignerr_o.\n";
    print " [ -Ip | --ignore-predictive-fail-count ]\n Specifies the warhing threshold for predictive fail count per disk, default is $ignerr_p.\n";
    print " [ -Is | --ignore-shield-counter ]\n Specifies the warning threshold for shield counter per disk, default is $ignerr_s.\n";
    print " [ -p | --path ]\n Specifies the path to StorCLI, default is /usr/bin/storcli or C:\\Programme\\...\\storcli.exe\n";
    print " [ -b <0/1> | --BBU <0/1> ]\n Check if a BBU or a CacheVault module is present. One must be present unless '-b 0' is defined. This ensures that for a given controller a BBU/CV must be present per default.\n";
}

# Displays a short Help text for the user, then exits with STATE_OK
sub displayHelp {
    print $NAME . " Version: " . $VERSION ."\n";
    print "Copyright (C) 2013 Thomas-Krenn.AG\n";
    print "Current updates available at http://git.thomas-krenn.com/check_lsi_raid.git\n";
    print "This Nagios/Icinga Plugin checks LSI RAID-Controllers for Controller, \nPhysical-Device and Logical Device warnings and errors.\n";
    print "In order for this plugin to work properly you need to add the \nnagios-user to your sudoers file (or create a new one in /etc/sudoers.d/)\n";
    displayUsage();
    print "Further information about this plugin can be found at: http://www.thomas-krenn.com/de/wiki/LSI_RAID_Monitoring_Plugin and http://www.thomas-krenn.com/de/wiki/LSI_RAID_Monitoring_Plugin Please send an email to the tk-monitoring plugin-user mailing list: tk-monitoring-plugins-user\@lists.thomas-krenn.com if you have questions regarding use of this software, to submit patches, or suggest improvements. The mailing list archive is available at: http://lists.thomas-krenn.com/pipermail/tk-monitoring-plugins-user\n";
    exit(STATE_OK);
}

# Prints the Name, Version of the Plugin
# Also Prints the version of StorCLI (the line of "storcli -v" starting with
# "Storage"), then exits with STATE_OK.
#   $_[0]  sudo command (may be empty)
#   $_[1]  path to StorCLI
sub displayVersion {
    my ($sudo, $storcli) = @_;
    my @storcliVersion = `$sudo $storcli -v`;

    print $NAME . "\nVersion: ". $VERSION . "\n\n";
    foreach my $line (@storcliVersion) {
        next unless $line =~ /^\s*Storage/;
        $line =~ s/^\s+|\s+$//g;
        print $line;
    }
    print "\n";
    exit(STATE_OK);
}

# Checks whether a StorCLI call succeeded.
#   $_[0]  reference to the array of output lines
# Returns 1 if the first line starting with "Status" is exactly
# "Status = Success", 0 otherwise (including when no such line exists).
sub checkCommandStatus {
    my @output = @{(shift)};

    foreach my $line (@output) {
        # Skip everything in front of the status line instead of giving up on
        # the very first output line.
        next unless $line =~ /^Status/;
        return ($line eq "Status = Success\n") ? 1 : 0;
    }
    return 0;
}

# Returns information about:
# - Controller status and controller temperature
#   $_[0]  sudo command, $_[1] path to StorCLI, $_[2] controller number
#   $_[3]  reference to the warning threshold triple (see getThresholds)
#   $_[4]  reference to the critical threshold triple
# Returns (status, statusMessage); exits with STATE_UNKNOWN if StorCLI does
# not report success.
sub getControllerStatus {
    my ($sudo, $storcli, $controller) = @_[0 .. 2];
    my @temperature_w = @{($_[3])};
    my @temperature_c = @{($_[4])};
    my $command = "$sudo $storcli /c$controller show all";
    my $status = 0;          # Return Status
    my $statusMessage = '';  # Return String

    my @output = `$command`;
    unless (checkCommandStatus(\@output)) {
        print "Invalid StorCLI command!\n";
        exit(STATE_UNKNOWN);
    }

    foreach my $line (@output) {
        # First word, last word and the word after the first whitespace run.
        # The first two patterns can match the empty string, so they always
        # succeed.
        my ($first) = $line =~ /^([a-zA-Z0-9]*)/;
        my ($last)  = $line =~ /([a-zA-Z0-9]*)$/;
        my $second  = ($line =~ /\s+([a-zA-Z0-9]*)/) ? $1 : '';

        if ($first eq "Controller") {
            if ($second eq "Status") {
                if ($last ne "OK") {
                    $status = getExitState($status, STATE_WARNING);
                    $statusMessage .= "Ctrl. status not optimal, ";
                }
            }
            elsif ($second eq "must") {
                if ($last ne "No") {
                    $status = getExitState($status, STATE_CRITICAL);
                    $statusMessage .= "Ctrl. needs reboot, ";
                }
            }
            elsif ($second eq "has") {
                if ($last ne "No") {
                    $status = getExitState($status, STATE_WARNING);
                    $statusMessage .= "Ctrl. booted in safe mode, ";
                }
            }
            elsif ($second eq "temperature") {
                my $state = _temperatureState($last, \@temperature_w, \@temperature_c);
                if ($state != STATE_OK) {
                    $status = getExitState($status, $state);
                    $statusMessage .= _controllerTemperatureMessage($state, $last, "Ctrl.", "Controller");
                }
            }
        }
        elsif ($first eq "ROC") {
            if ($second eq "temperature") {
                my $state = _temperatureState($last, \@temperature_w, \@temperature_c);
                if ($state != STATE_OK) {
                    $status = getExitState($status, $state);
                    $statusMessage .= _controllerTemperatureMessage($state, $last, "ROC", "ROC");
                }
            }
        }
        elsif ($first eq "Memory") {
            # The word is captured without its leading whitespace; with the
            # whitespace inside the capture group these comparisons could
            # never succeed.
            if ($second eq "Correctable") {
                if ($last ne "0") {
                    $status = getExitState($status, STATE_WARNING);
                    $statusMessage .= "Memory correctable errors detected, ";
                }
            }
            elsif ($second eq "Uncorrectable") {
                if ($last ne "0") {
                    $status = getExitState($status, STATE_CRITICAL);
                    $statusMessage .= "Memory uncorrectable errors detected, ";
                }
            }
        }
        elsif ($first eq "Failed") {
            if ($last ne "No") {
                $status = getExitState($status, STATE_WARNING);
                $statusMessage .= "Failed to get lock key on bootup, ";
            }
        }
        #TODO Improve rollback detection
        elsif ($first eq "A") {
            if ($last ne "No") {
                $status = getExitState($status, STATE_WARNING);
                $statusMessage .= "A rollback operation is in progress, ";
            }
        }
    }
    return ($status, $statusMessage);
}

# Returns information about:
# - Logical device status
#   $_[0]  sudo command, $_[1] path to StorCLI, $_[2] controller number
#   $_[3]  reference to the list of logical devices (empty list = all)
#   $_[4]  StorCLI "show" action; "all" and "init" are evaluated
# Returns (status, statusMessage); exits with STATE_UNKNOWN if StorCLI does
# not report success.
sub getLogicalDeviceStatus {
    my ($sudo, $storcli, $controller) = @_[0 .. 2];
    my @logDevices = @{($_[3])};
    my $action = $_[4];
    my $command = "$sudo $storcli /c$controller";
    my $status = 0;          # Return Status
    my $statusMessage = '';  # Return String

    if (scalar(@logDevices) == 0) {
        $command .= "/vall";
    }
    else {
        $command .= "/v" . join(",", @logDevices);
    }
    $command .= " show $action";

    my @output = `$command`;
    unless (checkCommandStatus(\@output)) {
        print "Invalid StorCLI command!\n";
        exit(STATE_UNKNOWN);
    }

    if ($action eq "all") {
        my $output_dev = -1;
        my $flag = -1;
        # check for lines with /c0/vX and then go 6 lines forward
        foreach my $line (@output) {
            if ($flag > 0) {
                $flag--;
            }
            elsif ($line =~ /^\/c$controller\/v([0-9]*)/) {
                $output_dev = $1;
                $flag = 6;
            }
            next unless $flag == 0;

            $flag = -1;
            my @values = split(' ', $line);
            if ($values[2] ne "Optl") {
                $status = getExitState($status, STATE_CRITICAL);
                if ($values[1] ne 'Cac1') {
                    # check for consistency
                    if ($values[4] eq 'Yes') {
                        my $detail = "Virtual disk $output_dev state not optimal ($values[2]), ";
                        $statusMessage .= _verboseMessage("Virtual disk state not optimal, ", $detail, $detail);
                    }
                    else {
                        my $detail = "Virtual disk $output_dev is not optimal - not consistant (state $values[2]), ";
                        $statusMessage .= _verboseMessage("Virtual disk state not optimal - not consistant, ", $detail, $detail);
                    }
                }
                else {
                    my $detail = "Virtual cachecade disk $output_dev state not optimal ($values[2]), ";
                    $statusMessage .= _verboseMessage("Virtual cachecade disk state not optimal, ", $detail, $detail);
                }
            }
            elsif ($values[4] ne "Yes" && $values[1] ne "Cac1") {
                $status = getExitState($status, STATE_WARNING);
                my $detail = "Virtual disk $output_dev state optimal - but not consistent, ";
                $statusMessage .= _verboseMessage("Virtual disk state optimal - but not consistent, ", $detail, $detail);
            }
        }
    }
    elsif ($action eq "init") {
        my $flag = -1;
        foreach my $line (@output) {
            if ($flag > 0) {
                $flag--;
            }
            elsif ($line =~ /^VD\ Operation\ Status\ :/) {
                $flag = 6;
            }
            next unless $flag == 0;

            # $flag stays 0 from here on, so every following row is checked
            # until the first row that does not start with a device number.
            my @values = split(' ', $line);
            # Require at least one digit so that a blank row ends the table
            # instead of being reported as a running initialization.
            last unless defined($values[0]) && $values[0] =~ /^[0-9]+$/;

            if ($values[3] ne "Not") {
                $status = getExitState($status, STATE_WARNING);
                my $detail = "Virtual disk $values[0] initialization in progress ($values[2] %), ";
                $statusMessage .= _verboseMessage("Virtual disk initialization in progress, ", $detail, $detail);
            }
        }
    }
    return ($status, $statusMessage);
}

# Returns information about:
# - Physical device status
#   $_[0]  sudo command, $_[1] path to StorCLI, $_[2] controller number
#   $_[3]  reference to the list of enclosures (empty list = all)
#   $_[4]  reference to the list of physical devices (empty list = all)
#   $_[5]  reference to the disk temperature warning threshold triple
#   $_[6]  reference to the disk temperature critical threshold triple
#   $_[7]  StorCLI "show" action; "all" checks state, error counters and
#          temperature, any other action is parsed as a progress listing
# Returns (status, statusMessage); exits with STATE_UNKNOWN if StorCLI does
# not report success.
sub getPhysDeviceStatus {
    my ($sudo, $storcli, $controller) = @_[0 .. 2];
    my @enclosures = @{($_[3])};
    my @physDevices = @{($_[4])};
    my @physicalDeviceTemperature_w = @{($_[5])};
    my @physicalDeviceTemperature_c = @{($_[6])};
    my $action = $_[7];
    my $command = "$sudo $storcli /c$controller";
    my $status = 0;
    my $statusMessage = '';

    if (scalar(@enclosures) == 0) {
        $command .= "/eall";
    }
    else {
        $command .= "/e" . join(",", @enclosures);
    }
    if (scalar(@physDevices) == 0) {
        $command .= "/sall";
    }
    else {
        $command .= "/s" . join(",", @physDevices);
    }
    $command .= " show $action";

    my @output = `$command`;
    unless (checkCommandStatus(\@output)) {
        print "Invalid StorCLI command!\n";
        exit(STATE_UNKNOWN);
    }

    if ($action eq "all") {
        # Error counters: first word of the row => [ index of the value,
        # ignore threshold, text for verbosity 0, text for verbosity >= 1 ].
        my %counterOf = (
            "Shield"     => [3, $ignerr_s, "Shield counter (phys. drive)", "shield counter"],
            "Media"      => [4, $ignerr_m, "Media error count (phys. drive)", "media error count"],
            "Other"      => [4, $ignerr_o, "Other error count (phys. drive)", "other error count"],
            "Predictive" => [4, $ignerr_p, "Phys. drive Predictive Fail Count", "Predictive Fail Count"],
        );
        my $output_enc = -1;
        my $output_dev = -1;
        my $flag = -1;

        foreach my $line (@output) {
            if ($flag > 0) {
                $flag--;
            }
            elsif ($line =~ /^Drive\s\/c$controller\/e([0-9]*)\/s([0-9]*)\s\:/) {
                # Check the overall drive state
                $output_enc = $1;
                $output_dev = $2;
                $flag = 6;
            }
            elsif ($line =~ /^Drive\s\/c$controller\/e([0-9]*)\/s([0-9]*)\sState\s\:/) {
                # Check the drive state in block Detailed information
                $output_enc = $1;
                $output_dev = $2;
                $flag = 1;
            }
            elsif ($flag == 0 && index($line, " ") == -1) {
                # Detect the end of the block which was last checked
                $flag = -1;
            }
            elsif ($flag == 0) {
                my @values = split(' ', $line);
                # A whitespace-only row carries no information.
                next unless @values;

                if ($values[0] =~ /^[0-9]*:[0-9]*/) {
                    # Check the overall drive state
                    if ($values[2] eq "Offln") {
                        $status = getExitState($status, STATE_WARNING);
                        my $detail = "Physical drive $output_dev in enclosure $output_enc is offline, ";
                        $statusMessage .= _verboseMessage("Physical drive is offline, ", $detail, $detail);
                    }
                    elsif ($values[2] eq "UBad") {
                        $status = getExitState($status, STATE_CRITICAL);
                        my $detail = "Physical drive $output_dev state in enclosure $output_enc is Unconfigured Bad, ";
                        $statusMessage .= _verboseMessage("Physical drive state is Unconfigured Bad, ", $detail, $detail);
                    }
                }
                # Every other row is treated as part of the block Detailed
                # information. (This used to be guarded by /^[a-zA-Z\.]*/,
                # which matches any string.)
                elsif (exists $counterOf{$values[0]}) {
                    my ($index, $limit, $label, $name) = @{ $counterOf{$values[0]} };
                    if ($values[$index] > $limit) {
                        $status = getExitState($status, STATE_WARNING);
                        $statusMessage .= _physDriveMessage(
                            $output_dev, $output_enc,
                            "$label $values[$index] (>$limit)",
                            "$name $values[$index] (>$limit)"
                        );
                    }
                }
                elsif ($values[0] eq "Drive") {
                    # Strip the last character of the value (the unit letter
                    # when the value is a temperature such as "30C").
                    chop($values[3]);
                    # check if temp is really a number, could be N/A also
                    if ($values[3] =~ /^[-+]?[0-9]*\.?[0-9]+$/) {
                        my $temp = $values[3];
                        my $state = _temperatureState($temp, \@physicalDeviceTemperature_w, \@physicalDeviceTemperature_c);
                        if ($state != STATE_OK) {
                            $status = getExitState($status, $state);
                            my $terse = ($state == STATE_CRITICAL)
                                ? "Phys. drive temperature critical"
                                : "Phys. drive temperature warning";
                            $statusMessage .= _physDriveMessage(
                                $output_dev, $output_enc,
                                $terse, "temperature is ${temp}C"
                            );
                        }
                    }
                }
                elsif ($values[0] eq "S.M.A.R.T") {
                    if ($values[6] ne "No") {
                        $status = getExitState($status, STATE_WARNING);
                        $statusMessage .= _physDriveMessage(
                            $output_dev, $output_enc,
                            "S.M.A.R.T alert flagged by drive", "S.M.A.R.T alert flagged"
                        );
                    }
                }
            }
        }
    }
    else {
        # Check initialization or rebuild output
        my $output_enc = -1;
        my $output_dev = -1;
        foreach my $line (@output) {
            if ($line =~ /^\/c$controller\/e([0-9]*)\/s([0-9]*)/) {
                $output_enc = $1;
                $output_dev = $2;
            }
            next unless ($output_enc ne -1) && ($output_dev ne -1);

            # grep for status and floating point number
            if ($line =~ /^\/c$controller\/e$output_enc\/s$output_dev\s*([\-]{1}|[0-9]*\.?[0-9]*)\s*([\w\s]*)$/) {
                my $inProgress = $1;
                my $state = $2;
                if ($state =~ m/^In progress/) {
                    $status = getExitState($status, STATE_WARNING);
                    $statusMessage .= _verboseMessage(
                        "Phys. drive: $action in progress, ",
                        "Physical drive $output_dev: $action in progress, ",
                        "Physical drive $output_dev in enclosure $output_enc: $action in progress (percentage: $inProgress), "
                    );
                }
            }
        }
    }
    return ($status, $statusMessage);
}

# Returns information about:
# - Battery Backup Unit status
# - Temperature, Battery status, voltage
#   $_[0]  sudo command, $_[1] path to StorCLI, $_[2] controller number
# Returns (status, statusMessage); exits with STATE_UNKNOWN if StorCLI does
# not report success.
sub getBBUStatus {
    my ($sudo, $storcli, $controller) = @_;
    my $command = "$sudo $storcli /c$controller/bbu show status";
    my $status = 0;
    my $statusMessage = '';

    my @output = `$command`;
    unless (checkCommandStatus(\@output)) {
        print "Invalid StorCLI command!\n";
        exit(STATE_UNKNOWN);
    }

    my $blockid = 0;
    foreach my $line (@output) {
        # Both patterns can match the empty string, so they always succeed.
        my ($first) = $line =~ /^([a-zA-Z0-9]*)/;
        my ($last)  = $line =~ /([a-zA-Z0-9]*)$/;
        my $second  = ($line =~ /\s+([a-zA-Z0-9]*)/) ? $1 : '';

        # NOTE(review): $first cannot contain an underscore (the character
        # class above excludes it), so it never equals 'BBU_Info' or
        # 'BBU_Firmware_Status'; only 'GasGaugeStatus' can advance $blockid.
        # The block numbering below therefore does not line up with the
        # section names. The parsing is left as it was because the exact
        # StorCLI table layout needs to be confirmed before these checks are
        # switched on.
        if ($first eq 'BBU_Info' || $first eq 'BBU_Firmware_Status' || $first eq 'GasGaugeStatus') {
            $blockid++;
        }

        # Check BBU_Info block
        if ($blockid == 1) {
            if ($first eq "Battery" && $second eq "State" && $last ne "Optimal") {
                $status = getExitState($status, STATE_WARNING);
                $statusMessage .= "BBU state not optimal, ";
            }
        }
        # Check BBU_Firmware_Status
        if ($blockid == 2) {
            if ($first eq "Temperature") {
                if ($last ne "OK") {
                    $status = getExitState($status, STATE_CRITICAL);
                    $statusMessage .= _verboseMessage("BBU temp. critical, ", "BBU Temperature critical, ", "BBU Temperature critical, ");
                }
            }
            elsif ($first eq "Battery") {
                if ($second eq "State") {
                    if ($last ne "Optimal") {
                        $status = getExitState($status, STATE_WARNING);
                        $statusMessage .= "BBU state not optimal, ";
                    }
                }
                elsif ($second eq "Pack") {
                    if ($last ne "No") {
                        $status = getExitState($status, STATE_CRITICAL);
                        $statusMessage .= "BBU pack missing, ";
                    }
                }
                elsif ($second eq "Replacement") {
                    if ($last ne "No") {
                        $status = getExitState($status, STATE_WARNING);
                        $statusMessage .= "BBU replacement required, ";
                    }
                }
            }
            elsif ($first eq "Voltage") {
                if ($last ne "OK") {
                    $status = getExitState($status, STATE_CRITICAL);
                    $statusMessage .= "BBU voltage not ok, ";
                }
            }
            elsif ($first eq "Learn") {
                if ($last ne "OK") {
                    $status = getExitState($status, STATE_WARNING);
                    $statusMessage .= "BBU learn cycle status not ok, ";
                }
            }
            elsif ($first eq "I2C") {
                if ($last ne "No") {
                    $status = getExitState($status, STATE_CRITICAL);
                    $statusMessage .= "BBU I2C errors, ";
                }
            }
            elsif ($first eq "Remaining") {
                if ($last ne "No") {
                    $status = getExitState($status, STATE_WARNING);
                    $statusMessage .= _verboseMessage("BBU capacity low, ", "BBU remaining capacity is low, ", "BBU remaining capacity is low, ");
                }
            }
        }
        # Check GasGaugeStatus
        if ($blockid == 3) {
            if ($first eq "Over") {
                if ($second eq "Temperature") {
                    if ($last ne "No") {
                        $status = getExitState($status, STATE_CRITICAL);
                        $statusMessage .= _verboseMessage("BBU temp. critical, ", "BBU Temperature critical, ", "BBU Temperature critical, ");
                    }
                }
                elsif ($second eq "Charged") {
                    if ($last ne "No") {
                        $status = getExitState($status, STATE_CRITICAL);
                        $statusMessage .= "BBU over charged, ";
                    }
                }
            }
        }
    }
    return ($status, $statusMessage);
}

# Returns information about:
# - Cache Vault module status
# - If CacheVault must be replaced
#   $_[0]  sudo command, $_[1] path to StorCLI, $_[2] controller number
# Returns (status, statusMessage); exits with STATE_UNKNOWN if StorCLI does
# not report success.
sub getCVStatus {
    my ($sudo, $storcli, $controller) = @_;
    my $command = "$sudo $storcli /c$controller/cv show status";
    my $status = 0;
    my $statusMessage = '';

    my @output = `$command`;
    unless (checkCommandStatus(\@output)) {
        print "Invalid StorCLI command!\n";
        exit(STATE_UNKNOWN);
    }

    my $currBlock;
    foreach my $line (@output) {
        if ($line =~ /^(Cachevault_Info|Firmware_Status)/) {
            $currBlock = $1;
            next;
        }
        next unless defined($currBlock);

        $line =~ s/^\s+|\s+$//g; # trim line
        if ($currBlock eq 'Cachevault_Info' && $line =~ /^State/) {
            # Columns are separated by at least two whitespace characters.
            my @vals = split('\s{2,}', $line);
            if ($vals[1] ne "Optimal") {
                $status = getExitState($status, STATE_WARNING);
                # Report the parsed state value; $1 is not set at this point
                # because /^State/ has no capture group.
                my $detail = "Cachevault Info state $vals[1], ";
                $statusMessage .= _verboseMessage("CacheVault state not optimal, ", $detail, $detail);
            }
        }
        elsif ($currBlock eq 'Firmware_Status' && $line =~ /^Replacement required/) {
            my ($last) = $line =~ /([a-zA-Z0-9]*)$/;
            if ($last ne "No") {
                $status = getExitState($status, STATE_WARNING);
                $statusMessage .= "CacheVault replacement required, ";
            }
        }
    }
    return ($status, $statusMessage);
}

# Checks whether a BBU or a CV is present
# - One of the two show commands must return 'Success'
#   $_[0]  sudo command, $_[1] path to StorCLI, $_[2] controller number
# Returns (bbu, cv), each 1 if the respective show command succeeded, else 0.
sub checkBBUorCVIsPresent {
    my ($sudo, $storcli, $controller) = @_;

    my @bbuOutput = `$sudo $storcli /c$controller/bbu show`;
    my $bbu = checkCommandStatus(\@bbuOutput) ? 1 : 0;

    my @cvOutput = `$sudo $storcli /c$controller/cv show`;
    my $cv = checkCommandStatus(\@cvOutput) ? 1 : 0;

    return ($bbu, $cv);
}

# Nagios development guidelines: temperature threshold scheme
# http://nagiosplug.sourceforge.net/developer-guidelines.html#THRESHOLDFORMAT
# Returns a temperature range (array) in or out which a temperature should be
# Array content: ("in" or "out", range from, range to)
# Example ranges:
#              Generate an alert if x...
# -Tw 10       < 0 or > 10, (outside the range of {0 .. 10})
# -Tw 10:      < 10, (outside {10 .. inf})
# -Tw ~:10     > 10, (outside the range of {-inf .. 10})
# -Tw 10:20    < 10 or > 20, (outside the range of {10 .. 20})
# -Tw @10:20   >= 10 and <= 20, (inside the range of {10 .. 20})
#   $_[0]  reference to the list of threshold options (only the first entry
#          is evaluated)
#   $_[1]  default upper bound, used when no threshold option was given
# Prints an error and exits with STATE_UNKNOWN on a malformed threshold.
sub getThresholds {
    my @thresholds = @{($_[0])};
    my $default = $_[1];

    if (scalar(@thresholds) == 0) {
        return ("out", -273, $default);
    }

    my $spec = $thresholds[0];
    my $leading = substr($spec, 0, 1);
    my $valid = 1;

    if ($leading eq "@") {
        if ($spec =~ /^\@([0-9]*)\:([0-9]*)$/) {
            @thresholds = ("in", $1, $2);
        }
        else {
            $valid = 0;
        }
    }
    elsif ($leading eq "~") {
        if ($spec =~ /^\~\:([0-9]*)$/) {
            @thresholds = ("out", -273, $1);
        }
        else {
            $valid = 0;
        }
    }
    elsif (index($spec, ":") != -1) {
        if ($spec =~ /^([0-9]*)\:([0-9]{1,3})$/) {
            @thresholds = ("out", $1, $2);
        }
        elsif ($spec =~ /^([0-9]*)\:$/) {
            # "N:" alerts below N, expressed as the inside of {-273 .. N-1}.
            @thresholds = ("in", -273, ($1 - 1));
        }
        else {
            $valid = 0;
        }
    }
    else {
        @thresholds = ("out", 0, $spec);
    }

    if ($valid && ($thresholds[1] =~ /^(-?[0-9]*)$/) && ($thresholds[2] =~ /^(-?[0-9]*)$/)) {
        return @thresholds;
    }
    print "Invalid temperature parameter!\n";
    exit(STATE_UNKNOWN);
}

MAIN: {
    my $sudo = '';
    my $storcli = '';
    my $controller = 0;
    my @enclosures;
    my @logDevices;
    my @physDevices;
    my @temperature_w;
    my @temperature_c;
    my @physicalDeviceTemperature_w;
    my @physicalDeviceTemperature_c;
    my $bbu = 1;
    my $platform = $^O;
    my $statusMessage = '';
    if ( !(GetOptions(
'h|help' => sub {displayHelp();}, 'v|verbose' => sub {$VERBOSITY = 1 }, 'vv' => sub {$VERBOSITY = 2}, 'vvv' => sub {$VERBOSITY = 3}, 'V|version' => sub {displayVersion($sudo, $storcli);}, 'C|controller=i' => \$controller, 'EID|enclosure=s' => \@enclosures, 'LD|logicaldevice=s' => \@logDevices, 'PD|physicaldevice=s' => \@physDevices, 'Tw|temperature-warn=s' => \@temperature_w, 'Tc|temperature-critical=s' => \@temperature_c, 'PDTw|physicaldevicetemperature-warn=s' => \@physicalDeviceTemperature_w, 'PDTc|physicaldevicetemperature-critical=s' => \@physicalDeviceTemperature_c, 'Im|ignore-media-errors=i' => \$ignerr_m, 'Io|ignore-other-errors=i' => \$ignerr_o, 'Ip|ignore-predictive-fail-count=i' => \$ignerr_p, 'Is|ignore-shield-counter=i' => \$ignerr_s, 'p|path=s' => \$storcli, 'b|BBU=i' => \$bbu ))) { print $NAME . " Version: " . $VERSION ."\n"; displayUsage(); exit(STATE_UNKNOWN); } # Check platform if ($platform eq 'linux') { chomp($sudo = `which sudo`); if ($storcli eq '') { if ( -e '/opt/MegaRAID/storcli/storcli64') { $storcli = '/opt/MegaRAID/storcli/storcli64' } elsif ( -e '/opt/MegaRAID/storcli/storcli') { $storcli = '/opt/MegaRAID/storcli/storcli' } else { chomp($storcli= `which storcli`); } } unless ( -e $storcli && -x $sudo ) { print "No sudo rights or StorCLI not found!\n"; exit(STATE_UNKNOWN); } } else { eval('use File::Which'); $sudo = ''; if ($storcli eq '') { if (defined(which( 'storcli64.exe' )) ) { $storcli = which( 'storcli64.exe' ); } elsif (defined(which( 'storcli.exe' ))) { $storcli = which( 'storcli.exe' ); } } unless ( -e $storcli ) { print "StorCLI not found!\n"; exit(STATE_UNKNOWN); } } # Input validation my @controllerVersion = `$sudo $storcli /c$controller show all`; if($controllerVersion[2] eq "Description = Controller $controller not found\n") { print "Invalid controller number, device not found!\n"; exit(STATE_UNKNOWN); } if(($bbu != 1) && ($bbu != 0)) { print "Invalid BBU/CV parameter, must be 0 or 1!\n"; exit(STATE_UNKNOWN); } @enclosures 
= split(/,/,join(',', @enclosures)); @logDevices = split(/,/,join(',', @logDevices)); @physDevices = split(/,/,join(',', @physDevices)); # check given thresholds if(@temperature_c && !@temperature_w){ print "Please also specify warning threshold!\n"; displayUsage(); exit(STATE_UNKNOWN); } @temperature_w = getThresholds(\@temperature_w, $C_TEMP_WARNING); @temperature_c = getThresholds(\@temperature_c, $C_TEMP_CRITICAL); if(@physicalDeviceTemperature_c && !@physicalDeviceTemperature_w){ print "Please also specify PD warning threshold!\n"; displayUsage(); exit(STATE_UNKNOWN); } @physicalDeviceTemperature_w = getThresholds(\@physicalDeviceTemperature_w, $PD_TEMP_WARNING); @physicalDeviceTemperature_c = getThresholds(\@physicalDeviceTemperature_c, $PD_TEMP_CRITICAL); # Set exit status my $exitstatus = 0; my $newexitstatus = 0; my $newstatusMessage = ''; ($newexitstatus, $statusMessage) = getControllerStatus($sudo, $storcli, $controller, \@temperature_w, \@temperature_c); $newstatusMessage .= $statusMessage; $exitstatus = getExitState($newexitstatus, $exitstatus); my ($bbuPresent,$cvPresent) = (0,0); if($bbu == 1){ ($bbuPresent,$cvPresent) = checkBBUorCVIsPresent($sudo, $storcli, $controller); if($bbuPresent == 0 && $cvPresent == 0){ $exitstatus = getExitState(STATE_CRITICAL, $exitstatus); $newstatusMessage .= "No BBU or CV found, "; } } if($bbuPresent == 1){ ($newexitstatus, $statusMessage) = getBBUStatus($sudo, $storcli, $controller); $newstatusMessage .= $statusMessage; $exitstatus = getExitState($newexitstatus, $exitstatus); } if($cvPresent == 1){ ($newexitstatus, $statusMessage) = getCVStatus($sudo, $storcli, $controller); $newstatusMessage .= $statusMessage; $exitstatus = getExitState($newexitstatus, $exitstatus); } ($newexitstatus, $statusMessage) = getLogicalDeviceStatus($sudo, $storcli, $controller, \@logDevices, "init"); $newstatusMessage .= $statusMessage; $exitstatus = getExitState($newexitstatus, $exitstatus); ($newexitstatus, $statusMessage) = 
getLogicalDeviceStatus($sudo, $storcli, $controller, \@logDevices, "all"); $newstatusMessage .= $statusMessage; $exitstatus = getExitState($newexitstatus, $exitstatus); ($newexitstatus, $statusMessage) = getPhysDeviceStatus($sudo, $storcli, $controller, \@enclosures, \@physDevices, \@physicalDeviceTemperature_w, \@physicalDeviceTemperature_c, "initialization"); $newstatusMessage .= $statusMessage; $exitstatus = getExitState($newexitstatus, $exitstatus); ($newexitstatus, $statusMessage) = getPhysDeviceStatus($sudo, $storcli, $controller, \@enclosures, \@physDevices, \@physicalDeviceTemperature_w, \@physicalDeviceTemperature_c, "rebuild"); $newstatusMessage .= $statusMessage; $exitstatus = getExitState($newexitstatus, $exitstatus); ($newexitstatus, $statusMessage) = getPhysDeviceStatus($sudo, $storcli, $controller, \@enclosures, \@physDevices, \@physicalDeviceTemperature_w, \@physicalDeviceTemperature_c, "all"); $newstatusMessage .= $statusMessage; $exitstatus = getExitState($newexitstatus, $exitstatus); if($exitstatus == 0) { print "LSIRAID OK (Ctrl #$controller) | STATUS=$exitstatus\n"; } elsif($exitstatus == 1) { chop($newstatusMessage); chop($newstatusMessage); print "LSIRAID WARNING (Ctrl #$controller): [$newstatusMessage] | STATUS=$exitstatus\n"; } elsif($exitstatus == 2) { chop($newstatusMessage); chop($newstatusMessage); print "LSIRAID CRITICAL (Ctrl #$controller): [$newstatusMessage] | STATUS=$exitstatus\n"; } exit($exitstatus); }