- Added perldoc (check_lsi_raid.POD)

- Added controller/ROC temperature check (getControllerStatus, not tested yet!)
- Added missing function descriptions (short)
- Removed debug comments and Dumper
- Removed todo (FH copyright)
This commit is contained in:
Grubhofer Martin
2013-06-11 14:48:13 +02:00
parent 535361ea4f
commit 03eccc0f49
2 changed files with 304 additions and 17 deletions

View File

@@ -27,7 +27,6 @@
use strict;
use warnings;
use Getopt::Long qw(:config no_ignore_case);
use Data::Dumper;
use feature qw/switch/; #später durch nachfolgende Zeile ersetzen, da wir noch mit Perl 5 arbeiten!
#use Switch 'Perl6';
@@ -79,11 +78,10 @@ sub displayUsage {
}
# Displays a short Help text for the user
# TODO: FH Copyright, ADD URL and Mailing List
# TODO: ADD URL and Mailing List
sub displayHelp {
print $NAME . " Version: " . $VERSION ."\n";
print "Copyright (C) 2009-2013 Thomas-Krenn.AG\n";
#FH Copyright
print "Current updates available at http://www.thomas-krenn.com/en/oss/<NOT HERE YET!!!!!!>/\n";
print "This Nagios/Icinga Plugin checks LSI RAID-Controllers for Controller, \nPhysical-Device and Logical Device warnings and errors.\n";
print "In order for this plugin to work properly you need to add the \nnagios-user to your sudoers file (or create a new one in /etc/sudoers.d/)\n";
@@ -114,8 +112,7 @@ sub displayVersion {
}
# Returns information about:
# - Controller status
# - Temperature, ...
# - Controller status and controller temperature
sub getControllerStatus {
my $sudo = $_[0];
my $storcli = $_[1];
@@ -128,14 +125,9 @@ sub getControllerStatus {
my $statusMessage = ''; # Return String
my @output = `$command`;
#my @output = ("", "Status = Success\n");
#command successful?
if($output[1] eq "Status = Success\n") {
foreach my $line (@output) {
#/^([a-zA-Z0-9]*)/ erstes wort
#/(\s+[a-zA-Z0-9]*)/ zweites wort
#/([a-zA-Z0-9]*)$/ letztes wort
my $first;
my $last;
if($line =~ /^([a-zA-Z0-9]*)/) {
@@ -164,6 +156,134 @@ sub getControllerStatus {
$statusMessage .= "Ctrl. booted in safe mode, ";
}
}
when("temperature") {
$temp = $last;
if($temperature_w[0] eq "in") {
if(($temp >= $temperature_w[1]) && ($temp <= $temperature_w[2])) {
# is in warn range, so also check if in critical error range
if($temperature_c[0] eq "in") {
if(($temp >= $temperature_c[1]) && ($temp <= $temperature_c[2])) {
# critical error
$crit = 1;
$status = getExitState($status, STATE_CRITICAL);
if ($VERBOSITY == 0) {$statusMessage .= "Ctrl. temp. critical, "; }
if ($VERBOSITY == 1) {$statusMessage .= "Ctrl. temp. is critical (${temp}C), "; }
if ($VERBOSITY >= 2) {$statusMessage .= "Controller temperature is critical (${temp}C), "; }
}
} else {
if(($temp < $temperature_c[1]) || ($temp > $temperature_c[2])) {
# critical error
$crit = 1;
$status = getExitState($status, STATE_CRITICAL);
if ($VERBOSITY == 0) {$statusMessage .= "Ctrl. temp. critical, "; }
if ($VERBOSITY == 1) {$statusMessage .= "Ctrl. temp. is critical (${temp}C), "; }
if ($VERBOSITY >= 2) {$statusMessage .= "Controller temperature is critical (${temp}C), "; }
}
}
if($crit eq 0) { # only warn if not already given a critical error
$status = getExitState($status, STATE_WARNING);
if ($VERBOSITY == 0) {$statusMessage .= "Ctrl. temp. warning, "; }
if ($VERBOSITY == 1) {$statusMessage .= "Ctrl. temp. warning (${temp}C), "; }
if ($VERBOSITY >= 2) {$statusMessage .= "Controller temperature warning (${temp}C), "; }
}
}
} else {
if(($temp < $temperature_w[1]) || ($temp > $temperature_w[2])) {
# is in warn range, so also check if in critical error range
if($temperature_c[0] eq "in") {
if(($temp >= $temperature_c[1]) && ($temp <= $temperature_c[2])) {
# critical error
$crit = 1;
$status = getExitState($status, STATE_CRITICAL);
if ($VERBOSITY == 0) {$statusMessage .= "Ctrl. temp. critical, "; }
if ($VERBOSITY == 1) {$statusMessage .= "Ctrl. temp. is critical (${temp}C), "; }
if ($VERBOSITY >= 2) {$statusMessage .= "Controller temperature is critical (${temp}C), "; }
}
} else {
if(($temp < $temperature_c[1]) || ($temp > $temperature_c[2])) {
# critical error
$crit = 1;
$status = getExitState($status, STATE_CRITICAL);
if ($VERBOSITY == 0) {$statusMessage .= "Ctrl. temp. critical, "; }
if ($VERBOSITY == 1) {$statusMessage .= "Ctrl. temp. is critical (${temp}C), "; }
if ($VERBOSITY >= 2) {$statusMessage .= "Controller temperature is critical (${temp}C), "; }
}
}
if($crit eq 0) { # only warn if not already given a critical error
$status = getExitState($status, STATE_WARNING);
if ($VERBOSITY == 0) {$statusMessage .= "Ctrl. temp. warning, "; }
if ($VERBOSITY == 1) {$statusMessage .= "Ctrl. temp. warning (${temp}C), "; }
if ($VERBOSITY >= 2) {$statusMessage .= "Controller temperature warning (${temp}C), "; }
}
}
}
}
}
}
}
when("ROC") {
if($line =~ /(\s+[a-zA-Z0-9]*)/) {
if($1 eq "temperature") {
$temp = $last;
if($temperature_w[0] eq "in") {
if(($temp >= $temperature_w[1]) && ($temp <= $temperature_w[2])) {
# is in warn range, so also check if in critical error range
if($temperature_c[0] eq "in") {
if(($temp >= $temperature_c[1]) && ($temp <= $temperature_c[2])) {
# critical error
$crit = 1;
$status = getExitState($status, STATE_CRITICAL);
if ($VERBOSITY == 0) {$statusMessage .= "ROC temp. critical, "; }
if ($VERBOSITY == 1) {$statusMessage .= "ROC temp. is critical (${temp}C), "; }
if ($VERBOSITY >= 2) {$statusMessage .= "ROC temperature is critical (${temp}C), "; }
}
} else {
if(($temp < $temperature_c[1]) || ($temp > $temperature_c[2])) {
# critical error
$crit = 1;
$status = getExitState($status, STATE_CRITICAL);
if ($VERBOSITY == 0) {$statusMessage .= "ROC temp. critical, "; }
if ($VERBOSITY == 1) {$statusMessage .= "ROC temp. is critical (${temp}C), "; }
if ($VERBOSITY >= 2) {$statusMessage .= "ROC temperature is critical (${temp}C), "; }
}
}
if($crit eq 0) { # only warn if not already given a critical error
$status = getExitState($status, STATE_WARNING);
if ($VERBOSITY == 0) {$statusMessage .= "ROC temp. warning, "; }
if ($VERBOSITY == 1) {$statusMessage .= "ROC temp. warning (${temp}C), "; }
if ($VERBOSITY >= 2) {$statusMessage .= "ROC temperature warning (${temp}C), "; }
}
}
} else {
if(($temp < $temperature_w[1]) || ($temp > $temperature_w[2])) {
# is in warn range, so also check if in critical error range
if($temperature_c[0] eq "in") {
if(($temp >= $temperature_c[1]) && ($temp <= $temperature_c[2])) {
# critical error
$crit = 1;
$status = getExitState($status, STATE_CRITICAL);
if ($VERBOSITY == 0) {$statusMessage .= "ROC temp. critical, "; }
if ($VERBOSITY == 1) {$statusMessage .= "ROC temp. is critical (${temp}C), "; }
if ($VERBOSITY >= 2) {$statusMessage .= "ROC temperature is critical (${temp}C), "; }
}
} else {
if(($temp < $temperature_c[1]) || ($temp > $temperature_c[2])) {
# critical error
$crit = 1;
$status = getExitState($status, STATE_CRITICAL);
if ($VERBOSITY == 0) {$statusMessage .= "ROC temp. critical, "; }
if ($VERBOSITY == 1) {$statusMessage .= "ROC temp. is critical (${temp}C), "; }
if ($VERBOSITY >= 2) {$statusMessage .= "ROC temperature is critical (${temp}C), "; }
}
}
if($crit eq 0) { # only warn if not already given a critical error
$status = getExitState($status, STATE_WARNING);
if ($VERBOSITY == 0) {$statusMessage .= "ROC temp. warning, "; }
if ($VERBOSITY == 1) {$statusMessage .= "ROC temp. warning (${temp}C), "; }
if ($VERBOSITY >= 2) {$statusMessage .= "ROC temperature warning (${temp}C), "; }
}
}
}
}
}
}
@@ -208,6 +328,8 @@ sub getControllerStatus {
}
}
# Returns information about:
# - Logical device status
sub getLogicalDeviceStatus {
my $sudo = $_[0];
my $storcli = $_[1];
@@ -226,9 +348,7 @@ sub getLogicalDeviceStatus {
}
$command .= " show $action";
my @output = `$command`;
#my @output = ("", "Status = Success\n", "/c0/v1 :", "1/1 RAID5 Optl RW Yes RWBD - 930.0 GB");
#command successful?
if($output[1] eq "Status = Success\n") {
if($action eq "all") {
my $output_dev = -1;
@@ -321,7 +441,6 @@ sub getPhysDeviceStatus {
}
$command .= " show $action";
my @output = `$command`;
#my @output = ("", "Status = Success\n", "Drive /c0/e252/s0 State :", "=======================", "Shield Counter = 0", "Media Error Count = 0", "Other Error Count = 0", "Drive Temperature = 31C (87.80 F)", "Predictive Failure Count = 0", "S.M.A.R.T alert flagged by drive = No");
if($output[1] eq "Status = Success\n") {
if($action eq "all") {
@@ -512,7 +631,6 @@ sub getBBUStatus {
my $statusMessage = '';
my @output = `$command`;
#my @output = ("", "Status = Success\n");
if($output[1] eq "Status = Success\n") {
my $blockid = 0;
@@ -760,7 +878,6 @@ MAIN: {
# Input validation
my @controllerVersion = `$sudo $storcli /c$controller show all`;
#my @controllerVersion = ("", "", "Description = None\n");
if($controllerVersion[2] eq "Description = Controller $controller not found\n") {
print "Invalid controller number, device not found!\n";
exit(STATE_UNKNOWN);
@@ -799,8 +916,8 @@ MAIN: {
@enclosures = split(/,/,join(',', @enclosures));
@logDevices = split(/,/,join(',', @logDevices));
@physDevices = split(/,/,join(',', @physDevices));
@temperature_w = getThresholds(\@temperature_w, 40); # 40 = default value
@temperature_c = getThresholds(\@temperature_c, 50);
@temperature_w = getThresholds(\@temperature_w, 50); # 50 = default value
@temperature_c = getThresholds(\@temperature_c, 60);
@physicalDeviceTemperature_w = getThresholds(\@physicalDeviceTemperature_w, 40);
@physicalDeviceTemperature_c = getThresholds(\@physicalDeviceTemperature_c, 45);

170
check_lsi_raid.POD Normal file
View File

@@ -0,0 +1,170 @@
=head1 NAME
check_lsi_raid - Nagios/Icinga plugin to check LSI RAID controllers
=head1 SYNOPSIS
Example:
./check_lsi_raid -Tw 40 -Tc 50 -LD 0,1 -PD 1 -b 0
LSIRAID OK (Ctrl #0)
=head1 VERSION
This document describes check_lsi_raid version 0.2
=head1 DESCRIPTION
check_lsi_raid is a Nagios/Icinga Plugin to check LSI raid-controllers for errors/warnings.
This plugin makes heavy use of 'StorCLI' which can be obtained from the LSI homepage.
It checks the controller, the physical devices and the logical devices separately for errors or warnings
(for more detailed information about what is being monitored see METHODS).
=head1 DEPENDENCIES
- This plugin requires a running Nagios or Icinga. (either local or NRPE)
- sudo
- storcli/storcli64 (Can be obtained from the LSI homepage)
- The following Perl Modules:
use strict;
use warnings;
use Getopt::Long qw(:config no_ignore_case);
use Switch 'Perl6';
=head1 METHODS
=head2 getExitState
Returns the correct exit code after each test.
Can only increment the exit-code (if necessary) since the exit-code may not change from Warning to OK.
=head2 displayUsage
Explains the usage of the plugin and which parameters are available.
=head2 displayHelp
Prints a short help text and quick information where to find additional help.
=head2 displayVersion
Prints the plugin version and the StorCLI version.
=head2 getControllerStatus
Checks the LSI raid-controller for controller errors. (uses storcli /c<num> show all)
- Overall status
- Reboot necessary?
- Booted in safe mode?
- Memory errors
- Failed to get lock key?
- Rollback operation in progress?
- Temperature of controller and/or ROC (if present)
Warning and critical thresholds for the controller/ROC temperature can be set with the parameters -Tw|--temperature-warn (warning level) and -Tc|--temperature-critical (critical level). See getThresholds for more information.
=head2 getLogicalDeviceStatus
Returns information about logical devices:
- General logical devices status
- Initialization in progress?
=head2 getPhysDeviceStatus
Returns information about physical devices attached to the LSI raid-controller:
- Disk status
- Several error counts
- Device temperature
- Predictive fail count
- S.M.A.R.T. status
Warning and critical thresholds for the device temperature can be set with the parameters -PDTw|--physicaldevicetemperature-warn (warning level) and -PDTc|--physicaldevicetemperature-critical (critical level). See getThresholds for more information.
=head2 getBBUStatus
Returns information about the battery backup unit (BBU) status
- Overall status
- Temperature status
- Bad voltage
- Learn cycle status
- I2C errors
- Capacity
=head2 getThresholds
Parses the threshold parameters (e.g. -Tw|--temperature-warn and -Tc|--temperature-critical).
Follows the Nagios threshold format: L<http://nagiosplug.sourceforge.net/developer-guidelines.html#THRESHOLDFORMAT>
Returns an array describing the range inside of which ("in") or outside of which ("out") a temperature triggers an alert.
Array content: ("in" or "out", range from, range to)
=head3 Examples
Generate an alert if x...
-Tw 10 < 0 or > 10, (outside the range of {0 .. 10})
-Tw 10: < 10, (outside {10 .. inf})
-Tw ~:10 > 10, (outside the range of {-inf .. 10})
-Tw 10:20 < 10 or > 20, (outside the range of {10 .. 20})
-Tw @10:20 >= 10 and <= 20, (inside the range of {10 .. 20})
=head1 LICENSE AND COPYRIGHT
Copyright (c) 2013,
Martin Grubhofer C<< <mgrubhofer@thomas-krenn.com> >>, C<< <s1110239013@students.fh-hagenberg.at> >>.
Scheipner Alexander C<< <s1110239032@students.fh-hagenberg.at> >>.
Werner Sebastian C<< <s1110239038@students.fh-hagenberg.at> >>.
All rights reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.
=head1 DISCLAIMER OF WARRANTY
BECAUSE THIS SOFTWARE IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
FOR THE SOFTWARE, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
PROVIDE THE SOFTWARE "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER
EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE
ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE SOFTWARE IS WITH
YOU. SHOULD THE SOFTWARE PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL
NECESSARY SERVICING, REPAIR, OR CORRECTION.
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
REDISTRIBUTE THE SOFTWARE AS PERMITTED BY THE ABOVE LICENCE, BE
LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL,
OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE
THE SOFTWARE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
FAILURE OF THE SOFTWARE TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.