diff --git a/check_lsi_raid b/check_lsi_raid index a21d566..7e7a9fb 100644 --- a/check_lsi_raid +++ b/check_lsi_raid @@ -27,7 +27,6 @@ use strict; use warnings; use Getopt::Long qw(:config no_ignore_case); -use Data::Dumper; use feature qw/switch/; #später durch nachfolgende Zeile ersetzen, da wir noch mit Perl 5 arbeiten! #use Switch 'Perl6'; @@ -79,11 +78,10 @@ sub displayUsage { } # Displays a short Help text for the user -# TODO: FH Copyright, ADD URL and Mailing List +# TODO: ADD URL and Mailing List sub displayHelp { print $NAME . " Version: " . $VERSION ."\n"; print "Copyright (C) 2009-2013 Thomas-Krenn.AG\n"; - #FH Copyright print "Current updates available at http://www.thomas-krenn.com/en/oss//\n"; print "This Nagios/Icinga Plugin checks LSI RAID-Controllers for Controller, \nPhysical-Device and Logical Device warnings and errors.\n"; print "In order for this plugin to work properly you need to add the \nnagios-user to your sudoers file (or create a new one in /etc/sudoers.d/)\n"; @@ -114,8 +112,7 @@ sub displayVersion { } # Returns information about: -# - Controller status -# - Temperature, ... +# - Controller status and controller temperature sub getControllerStatus { my $sudo = $_[0]; my $storcli = $_[1]; @@ -128,14 +125,9 @@ sub getControllerStatus { my $statusMessage = ''; # Return String my @output = `$command`; - #my @output = ("", "Status = Success\n"); - #command successful? if($output[1] eq "Status = Success\n") { foreach my $line (@output) { - #/^([a-zA-Z0-9]*)/ erstes wort - #/(\s+[a-zA-Z0-9]*)/ zweites wort - #/([a-zA-Z0-9]*)$/ letztes wort my $first; my $last; if($line =~ /^([a-zA-Z0-9]*)/) { @@ -164,6 +156,134 @@ sub getControllerStatus { $statusMessage .= "Ctrl. 
booted in safe mode, "; } } + when("temperature") { + $temp = $last; + if($temperature_w[0] eq "in") { + if(($temp >= $temperature_w[1]) && ($temp <= $temperature_w[2])) { + # is in warn range, so also check if in critical error range + if($temperature_c[0] eq "in") { + if(($temp >= $temperature_c[1]) && ($temp <= $temperature_c[2])) { + # critical error + $crit = 1; + $status = getExitState($status, STATE_CRITICAL); + if ($VERBOSITY == 0) {$statusMessage .= "Ctrl. temp. critical, "; } + if ($VERBOSITY == 1) {$statusMessage .= "Ctrl. temp. is critical (${temp}C), "; } + if ($VERBOSITY >= 2) {$statusMessage .= "Controller temperature is critical (${temp}C), "; } + } + } else { + if(($temp < $temperature_c[1]) || ($temp > $temperature_c[2])) { + # critical error + $crit = 1; + $status = getExitState($status, STATE_CRITICAL); + if ($VERBOSITY == 0) {$statusMessage .= "Ctrl. temp. critical, "; } + if ($VERBOSITY == 1) {$statusMessage .= "Ctrl. temp. is critical (${temp}C), "; } + if ($VERBOSITY >= 2) {$statusMessage .= "Controller temperature is critical (${temp}C), "; } + } + } + if($crit eq 0) { # only warn if not already given a critical error + $status = getExitState($status, STATE_WARNING); + if ($VERBOSITY == 0) {$statusMessage .= "Ctrl. temp. warning, "; } + if ($VERBOSITY == 1) {$statusMessage .= "Ctrl. temp. warning (${temp}C), "; } + if ($VERBOSITY >= 2) {$statusMessage .= "Controller temperature warning (${temp}C), "; } + } + } + } else { + if(($temp < $temperature_w[1]) || ($temp > $temperature_w[2])) { + # is in warn range, so also check if in critical error range + if($temperature_c[0] eq "in") { + if(($temp >= $temperature_c[1]) && ($temp <= $temperature_c[2])) { + # critical error + $crit = 1; + $status = getExitState($status, STATE_CRITICAL); + if ($VERBOSITY == 0) {$statusMessage .= "Ctrl. temp. critical, "; } + if ($VERBOSITY == 1) {$statusMessage .= "Ctrl. temp. 
is critical (${temp}C), "; } + if ($VERBOSITY >= 2) {$statusMessage .= "Controller temperature is critical (${temp}C), "; } + } + } else { + if(($temp < $temperature_c[1]) || ($temp > $temperature_c[2])) { + # critical error + $crit = 1; + $status = getExitState($status, STATE_CRITICAL); + if ($VERBOSITY == 0) {$statusMessage .= "Ctrl. temp. critical, "; } + if ($VERBOSITY == 1) {$statusMessage .= "Ctrl. temp. is critical (${temp}C), "; } + if ($VERBOSITY >= 2) {$statusMessage .= "Controller temperature is critical (${temp}C), "; } + } + } + if($crit eq 0) { # only warn if not already given a critical error + $status = getExitState($status, STATE_WARNING); + if ($VERBOSITY == 0) {$statusMessage .= "Ctrl. temp. warning, "; } + if ($VERBOSITY == 1) {$statusMessage .= "Ctrl. temp. warning (${temp}C), "; } + if ($VERBOSITY >= 2) {$statusMessage .= "Controller temperature warning (${temp}C), "; } + } + } + } + } + } + } + } + when("ROC") { + if($line =~ /(\s+[a-zA-Z0-9]*)/) { + if($1 eq "temperature") { + $temp = $last; + if($temperature_w[0] eq "in") { + if(($temp >= $temperature_w[1]) && ($temp <= $temperature_w[2])) { + # is in warn range, so also check if in critical error range + if($temperature_c[0] eq "in") { + if(($temp >= $temperature_c[1]) && ($temp <= $temperature_c[2])) { + # critical error + $crit = 1; + $status = getExitState($status, STATE_CRITICAL); + if ($VERBOSITY == 0) {$statusMessage .= "ROC temp. critical, "; } + if ($VERBOSITY == 1) {$statusMessage .= "ROC temp. is critical (${temp}C), "; } + if ($VERBOSITY >= 2) {$statusMessage .= "ROC temperature is critical (${temp}C), "; } + } + } else { + if(($temp < $temperature_c[1]) || ($temp > $temperature_c[2])) { + # critical error + $crit = 1; + $status = getExitState($status, STATE_CRITICAL); + if ($VERBOSITY == 0) {$statusMessage .= "ROC temp. critical, "; } + if ($VERBOSITY == 1) {$statusMessage .= "ROC temp. 
is critical (${temp}C), "; } + if ($VERBOSITY >= 2) {$statusMessage .= "ROC temperature is critical (${temp}C), "; } + } + } + if($crit eq 0) { # only warn if not already given a critical error + $status = getExitState($status, STATE_WARNING); + if ($VERBOSITY == 0) {$statusMessage .= "ROC temp. warning, "; } + if ($VERBOSITY == 1) {$statusMessage .= "ROC temp. warning (${temp}C), "; } + if ($VERBOSITY >= 2) {$statusMessage .= "ROC temperature warning (${temp}C), "; } + } + } + } else { + if(($temp < $temperature_w[1]) || ($temp > $temperature_w[2])) { + # is in warn range, so also check if in critical error range + if($temperature_c[0] eq "in") { + if(($temp >= $temperature_c[1]) && ($temp <= $temperature_c[2])) { + # critical error + $crit = 1; + $status = getExitState($status, STATE_CRITICAL); + if ($VERBOSITY == 0) {$statusMessage .= "ROC temp. critical, "; } + if ($VERBOSITY == 1) {$statusMessage .= "ROC temp. is critical (${temp}C), "; } + if ($VERBOSITY >= 2) {$statusMessage .= "ROC temperature is critical (${temp}C), "; } + } + } else { + if(($temp < $temperature_c[1]) || ($temp > $temperature_c[2])) { + # critical error + $crit = 1; + $status = getExitState($status, STATE_CRITICAL); + if ($VERBOSITY == 0) {$statusMessage .= "ROC temp. critical, "; } + if ($VERBOSITY == 1) {$statusMessage .= "ROC temp. is critical (${temp}C), "; } + if ($VERBOSITY >= 2) {$statusMessage .= "ROC temperature is critical (${temp}C), "; } + } + } + if($crit eq 0) { # only warn if not already given a critical error + $status = getExitState($status, STATE_WARNING); + if ($VERBOSITY == 0) {$statusMessage .= "ROC temp. warning, "; } + if ($VERBOSITY == 1) {$statusMessage .= "ROC temp. 
warning (${temp}C), "; } + if ($VERBOSITY >= 2) {$statusMessage .= "ROC temperature warning (${temp}C), "; } + } + } + } } } } @@ -208,6 +328,8 @@ sub getControllerStatus { } } +# Returns information about: +# - Logical device status sub getLogicalDeviceStatus { my $sudo = $_[0]; my $storcli = $_[1]; @@ -226,9 +348,7 @@ sub getLogicalDeviceStatus { } $command .= " show $action"; my @output = `$command`; - #my @output = ("", "Status = Success\n", "/c0/v1 :", "1/1 RAID5 Optl RW Yes RWBD - 930.0 GB"); - #command successful? if($output[1] eq "Status = Success\n") { if($action eq "all") { my $output_dev = -1; @@ -321,7 +441,6 @@ sub getPhysDeviceStatus { } $command .= " show $action"; my @output = `$command`; - #my @output = ("", "Status = Success\n", "Drive /c0/e252/s0 State :", "=======================", "Shield Counter = 0", "Media Error Count = 0", "Other Error Count = 0", "Drive Temperature = 31C (87.80 F)", "Predictive Failure Count = 0", "S.M.A.R.T alert flagged by drive = No"); if($output[1] eq "Status = Success\n") { if($action eq "all") { @@ -512,7 +631,6 @@ sub getBBUStatus { my $statusMessage = ''; my @output = `$command`; - #my @output = ("", "Status = Success\n"); if($output[1] eq "Status = Success\n") { my $blockid = 0; @@ -760,7 +878,6 @@ MAIN: { # Input validation my @controllerVersion = `$sudo $storcli /c$controller show all`; - #my @controllerVersion = ("", "", "Description = None\n"); if($controllerVersion[2] eq "Description = Controller $controller not found\n") { print "Invalid controller number, device not found!\n"; exit(STATE_UNKNOWN); @@ -799,8 +916,8 @@ MAIN: { @enclosures = split(/,/,join(',', @enclosures)); @logDevices = split(/,/,join(',', @logDevices)); @physDevices = split(/,/,join(',', @physDevices)); - @temperature_w = getThresholds(\@temperature_w, 40); # 40 = default value - @temperature_c = getThresholds(\@temperature_c, 50); + @temperature_w = getThresholds(\@temperature_w, 50); # 50 = default value + @temperature_c = 
getThresholds(\@temperature_c, 60); @physicalDeviceTemperature_w = getThresholds(\@physicalDeviceTemperature_w, 40); @physicalDeviceTemperature_c = getThresholds(\@physicalDeviceTemperature_c, 45); diff --git a/check_lsi_raid.POD b/check_lsi_raid.POD new file mode 100644 index 0000000..0238bc0 --- /dev/null +++ b/check_lsi_raid.POD @@ -0,0 +1,170 @@ +=head1 NAME + +check_lsi_raid - nagios/icinga plugin to check lsi raid-controllers + +=head1 SYNOPSIS + +Example: + + ./check_lsi_raid -Tw 40 -Tc 50 -LD 0,1 -PD 1 -b 0 + LSIRAID OK (Ctrl #0) + +=head1 VERSION + +This document describes check_lsi_raid version 0.2 + +=head1 DESCRIPTION + +check_lsi_raid is a Nagios/Icinga Plugin to check LSI raid-controllers for errors/warnings. +This plugin makes heavy use of 'StorCLI' which can be obtained from the LSI homepage. +It checks the controller, the physical devices and the logical devices separately for errors or warnings +(for more detailed information about what is being monitored see METHODS). + +=head1 DEPENDENCIES + +- This plugin requires a running Nagios or Icinga. (either local or NRPE) + +- sudo + +- storcli/storcli64 (Can be obtained from the LSI homepage) + +- The following Perl Modules: + + use strict; + use warnings; + use Getopt::Long qw(:config no_ignore_case); + use Switch 'Perl6'; + +=head1 METHODS + +=head2 getExitState + +Returns the correct exit code after each test. +Can only increment the exit-code (if necessary) since the exit-code may not change from Warning to OK. + +=head2 displayUsage + +Explains the usage of the plugin and which parameters are available. + +=head2 displayHelp + +Prints a short help text and quick information where to find additional help. + +=head2 displayVersion + +Prints the plugin version and the StorCLI version. + +=head2 getControllerStatus + +Checks the LSI raid-controller for controller errors. (uses storcli /c show all) + +- Overall status + +- Reboot necessary? + +- Booted in safe mode? 
+ +- Memory errors + +- Failed to get lock key? + +- Rollback operation in progress? + +- Temperature of controller and/or ROC (if present) + +The controller/ROC temperature can be set to a warning (parameter: -Tw|--temperature-warn) and a critical level (parameter: -Tc|--temperature-critical). See getThresholds for more information. + +=head2 getLogicalDeviceStatus + +Returns information about logical devices: + +- General logical devices status + +- Initialization in progress? + +=head2 getPhysDeviceStatus + +Returns information about physical devices attached to the LSI raid-controller: + +- Disk status + +- Several error counts + +- Device temperature + +- Predictive fail count + +- S.M.A.R.T. status + +The device temperature can be set to a warning (parameter: -PDTw|--physicaldevicetemperature-warn) and a critical level (parameter: -PDTc|--physicaldevicetemperature-critical). See getThresholds for more information. + +=head2 getBBUStatus + +Returns information about the battery backup unit (BBU) status + +- Overall status + +- Temperature status + +- Bad voltage + +- Learn cycle status + +- I2C errors + +- Capacity + +=head2 getThresholds + +Uses the -Tw --temperature-warn and -Tc --temperature-critical parameters and parses them + +Uses correct Nagios Threshold implementation: L + +Returns an array describing a temperature range and whether an alert is raised for temperatures inside or outside of it + +Array content: ("in" or "out", range from, range to) + +=head3 Examples + +Generate an alert if x... +-Tw 10 < 0 or > 10, (outside the range of {0 .. 10}) +-Tw 10: < 10, (outside {10 .. inf}) +-Tw ~:10 > 10, (outside the range of {-inf .. 10}) +-Tw 10:20 < 10 or > 20, (outside the range of {10 .. 20}) +-Tw @10:20 >= 10 and <= 20, (inside the range of {10 .. 20}) + +=head1 LICENSE AND COPYRIGHT + +Copyright (c) 2013, +Martin Grubhofer C<< >>, C<< >>. +Scheipner Alexander C<< >>. +Werner Sebastian C<< >>. +All rights reserved. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; either version 3 of the License, or (at your option) any later +version. + +=head1 DISCLAIMER OF WARRANTY + +BECAUSE THIS SOFTWARE IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE SOFTWARE, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE SOFTWARE "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER +EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE +ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE SOFTWARE IS WITH +YOU. SHOULD THE SOFTWARE PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL +NECESSARY SERVICING, REPAIR, OR CORRECTION. + +IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE SOFTWARE AS PERMITTED BY THE ABOVE LICENCE, BE +LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL, +OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE +THE SOFTWARE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE SOFTWARE TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. \ No newline at end of file