mirror of
https://github.com/thomas-krenn/check_lsi_raid.git
synced 2026-02-25 05:08:48 +01:00
- Added perldoc (check_lsi_raid.POD)
- Added controller/ROC temperature check (getControllerStatus, not tested yet!) - Added missing function descriptions (short) - Removed debug comments and Dumper - Removed todo (FH copyright)
This commit is contained in:
151
check_lsi_raid
151
check_lsi_raid
@@ -27,7 +27,6 @@
|
||||
use strict;
|
||||
use warnings;
|
||||
use Getopt::Long qw(:config no_ignore_case);
|
||||
use Data::Dumper;
|
||||
use feature qw/switch/; #später durch nachfolgende Zeile ersetzen, da wir noch mit Perl 5 arbeiten!
|
||||
#use Switch 'Perl6';
|
||||
|
||||
@@ -79,11 +78,10 @@ sub displayUsage {
|
||||
}
|
||||
|
||||
# Displays a short Help text for the user
|
||||
# TODO: FH Copyright, ADD URL and Mailing List
|
||||
# TODO: ADD URL and Mailing List
|
||||
sub displayHelp {
|
||||
print $NAME . " Version: " . $VERSION ."\n";
|
||||
print "Copyright (C) 2009-2013 Thomas-Krenn.AG\n";
|
||||
#FH Copyright
|
||||
print "Current updates available at http://www.thomas-krenn.com/en/oss/<NOT HERE YET!!!!!!>/\n";
|
||||
print "This Nagios/Icinga Plugin checks LSI RAID-Controllers for Controller, \nPhysical-Device and Logical Device warnings and errors.\n";
|
||||
print "In order for this plugin to work properly you need to add the \nnagios-user to your sudoers file (or create a new one in /etc/sudoers.d/)\n";
|
||||
@@ -114,8 +112,7 @@ sub displayVersion {
|
||||
}
|
||||
|
||||
# Returns information about:
|
||||
# - Controller status
|
||||
# - Temperature, ...
|
||||
# - Controller status and controller temperature
|
||||
sub getControllerStatus {
|
||||
my $sudo = $_[0];
|
||||
my $storcli = $_[1];
|
||||
@@ -128,14 +125,9 @@ sub getControllerStatus {
|
||||
my $statusMessage = ''; # Return String
|
||||
|
||||
my @output = `$command`;
|
||||
#my @output = ("", "Status = Success\n");
|
||||
|
||||
#command successful?
|
||||
if($output[1] eq "Status = Success\n") {
|
||||
foreach my $line (@output) {
|
||||
#/^([a-zA-Z0-9]*)/ erstes wort
|
||||
#/(\s+[a-zA-Z0-9]*)/ zweites wort
|
||||
#/([a-zA-Z0-9]*)$/ letztes wort
|
||||
my $first;
|
||||
my $last;
|
||||
if($line =~ /^([a-zA-Z0-9]*)/) {
|
||||
@@ -164,6 +156,134 @@ sub getControllerStatus {
|
||||
$statusMessage .= "Ctrl. booted in safe mode, ";
|
||||
}
|
||||
}
|
||||
when("temperature") {
|
||||
$temp = $last;
|
||||
if($temperature_w[0] eq "in") {
|
||||
if(($temp >= $temperature_w[1]) && ($temp <= $temperature_w[2])) {
|
||||
# is in warn range, so also check if in critical error range
|
||||
if($temperature_c[0] eq "in") {
|
||||
if(($temp >= $temperature_c[1]) && ($temp <= $temperature_c[2])) {
|
||||
# critical error
|
||||
$crit = 1;
|
||||
$status = getExitState($status, STATE_CRITICAL);
|
||||
if ($VERBOSITY == 0) {$statusMessage .= "Ctrl. temp. critical, "; }
|
||||
if ($VERBOSITY == 1) {$statusMessage .= "Ctrl. temp. is critical (${temp}C), "; }
|
||||
if ($VERBOSITY >= 2) {$statusMessage .= "Controller temperature is critical (${temp}C), "; }
|
||||
}
|
||||
} else {
|
||||
if(($temp < $temperature_c[1]) || ($temp > $temperature_c[2])) {
|
||||
# critical error
|
||||
$crit = 1;
|
||||
$status = getExitState($status, STATE_CRITICAL);
|
||||
if ($VERBOSITY == 0) {$statusMessage .= "Ctrl. temp. critical, "; }
|
||||
if ($VERBOSITY == 1) {$statusMessage .= "Ctrl. temp. is critical (${temp}C), "; }
|
||||
if ($VERBOSITY >= 2) {$statusMessage .= "Controller temperature is critical (${temp}C), "; }
|
||||
}
|
||||
}
|
||||
if($crit eq 0) { # only warn if not already given a critical error
|
||||
$status = getExitState($status, STATE_WARNING);
|
||||
if ($VERBOSITY == 0) {$statusMessage .= "Ctrl. temp. warning, "; }
|
||||
if ($VERBOSITY == 1) {$statusMessage .= "Ctrl. temp. warning (${temp}C), "; }
|
||||
if ($VERBOSITY >= 2) {$statusMessage .= "Controller temperature warning (${temp}C), "; }
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if(($temp < $temperature_w[1]) || ($temp > $temperature_w[2])) {
|
||||
# is in warn range, so also check if in critical error range
|
||||
if($temperature_c[0] eq "in") {
|
||||
if(($temp >= $temperature_c[1]) && ($temp <= $temperature_c[2])) {
|
||||
# critical error
|
||||
$crit = 1;
|
||||
$status = getExitState($status, STATE_CRITICAL);
|
||||
if ($VERBOSITY == 0) {$statusMessage .= "Ctrl. temp. critical, "; }
|
||||
if ($VERBOSITY == 1) {$statusMessage .= "Ctrl. temp. is critical (${temp}C), "; }
|
||||
if ($VERBOSITY >= 2) {$statusMessage .= "Controller temperature is critical (${temp}C), "; }
|
||||
}
|
||||
} else {
|
||||
if(($temp < $temperature_c[1]) || ($temp > $temperature_c[2])) {
|
||||
# critical error
|
||||
$crit = 1;
|
||||
$status = getExitState($status, STATE_CRITICAL);
|
||||
if ($VERBOSITY == 0) {$statusMessage .= "Ctrl. temp. critical, "; }
|
||||
if ($VERBOSITY == 1) {$statusMessage .= "Ctrl. temp. is critical (${temp}C), "; }
|
||||
if ($VERBOSITY >= 2) {$statusMessage .= "Controller temperature is critical (${temp}C), "; }
|
||||
}
|
||||
}
|
||||
if($crit eq 0) { # only warn if not already given a critical error
|
||||
$status = getExitState($status, STATE_WARNING);
|
||||
if ($VERBOSITY == 0) {$statusMessage .= "Ctrl. temp. warning, "; }
|
||||
if ($VERBOSITY == 1) {$statusMessage .= "Ctrl. temp. warning (${temp}C), "; }
|
||||
if ($VERBOSITY >= 2) {$statusMessage .= "Controller temperature warning (${temp}C), "; }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
when("ROC") {
|
||||
if($line =~ /(\s+[a-zA-Z0-9]*)/) {
|
||||
if($1 eq "temperature") {
|
||||
$temp = $last;
|
||||
if($temperature_w[0] eq "in") {
|
||||
if(($temp >= $temperature_w[1]) && ($temp <= $temperature_w[2])) {
|
||||
# is in warn range, so also check if in critical error range
|
||||
if($temperature_c[0] eq "in") {
|
||||
if(($temp >= $temperature_c[1]) && ($temp <= $temperature_c[2])) {
|
||||
# critical error
|
||||
$crit = 1;
|
||||
$status = getExitState($status, STATE_CRITICAL);
|
||||
if ($VERBOSITY == 0) {$statusMessage .= "ROC temp. critical, "; }
|
||||
if ($VERBOSITY == 1) {$statusMessage .= "ROC temp. is critical (${temp}C), "; }
|
||||
if ($VERBOSITY >= 2) {$statusMessage .= "ROC temperature is critical (${temp}C), "; }
|
||||
}
|
||||
} else {
|
||||
if(($temp < $temperature_c[1]) || ($temp > $temperature_c[2])) {
|
||||
# critical error
|
||||
$crit = 1;
|
||||
$status = getExitState($status, STATE_CRITICAL);
|
||||
if ($VERBOSITY == 0) {$statusMessage .= "ROC temp. critical, "; }
|
||||
if ($VERBOSITY == 1) {$statusMessage .= "ROC temp. is critical (${temp}C), "; }
|
||||
if ($VERBOSITY >= 2) {$statusMessage .= "ROC temperature is critical (${temp}C), "; }
|
||||
}
|
||||
}
|
||||
if($crit eq 0) { # only warn if not already given a critical error
|
||||
$status = getExitState($status, STATE_WARNING);
|
||||
if ($VERBOSITY == 0) {$statusMessage .= "ROC temp. warning, "; }
|
||||
if ($VERBOSITY == 1) {$statusMessage .= "ROC temp. warning (${temp}C), "; }
|
||||
if ($VERBOSITY >= 2) {$statusMessage .= "ROC temperature warning (${temp}C), "; }
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if(($temp < $temperature_w[1]) || ($temp > $temperature_w[2])) {
|
||||
# is in warn range, so also check if in critical error range
|
||||
if($temperature_c[0] eq "in") {
|
||||
if(($temp >= $temperature_c[1]) && ($temp <= $temperature_c[2])) {
|
||||
# critical error
|
||||
$crit = 1;
|
||||
$status = getExitState($status, STATE_CRITICAL);
|
||||
if ($VERBOSITY == 0) {$statusMessage .= "ROC temp. critical, "; }
|
||||
if ($VERBOSITY == 1) {$statusMessage .= "ROC temp. is critical (${temp}C), "; }
|
||||
if ($VERBOSITY >= 2) {$statusMessage .= "ROC temperature is critical (${temp}C), "; }
|
||||
}
|
||||
} else {
|
||||
if(($temp < $temperature_c[1]) || ($temp > $temperature_c[2])) {
|
||||
# critical error
|
||||
$crit = 1;
|
||||
$status = getExitState($status, STATE_CRITICAL);
|
||||
if ($VERBOSITY == 0) {$statusMessage .= "ROC temp. critical, "; }
|
||||
if ($VERBOSITY == 1) {$statusMessage .= "ROC temp. is critical (${temp}C), "; }
|
||||
if ($VERBOSITY >= 2) {$statusMessage .= "ROC temperature is critical (${temp}C), "; }
|
||||
}
|
||||
}
|
||||
if($crit eq 0) { # only warn if not already given a critical error
|
||||
$status = getExitState($status, STATE_WARNING);
|
||||
if ($VERBOSITY == 0) {$statusMessage .= "ROC temp. warning, "; }
|
||||
if ($VERBOSITY == 1) {$statusMessage .= "ROC temp. warning (${temp}C), "; }
|
||||
if ($VERBOSITY >= 2) {$statusMessage .= "ROC temperature warning (${temp}C), "; }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -208,6 +328,8 @@ sub getControllerStatus {
|
||||
}
|
||||
}
|
||||
|
||||
# Returns information about:
|
||||
# - Logical device status
|
||||
sub getLogicalDeviceStatus {
|
||||
my $sudo = $_[0];
|
||||
my $storcli = $_[1];
|
||||
@@ -226,9 +348,7 @@ sub getLogicalDeviceStatus {
|
||||
}
|
||||
$command .= " show $action";
|
||||
my @output = `$command`;
|
||||
#my @output = ("", "Status = Success\n", "/c0/v1 :", "1/1 RAID5 Optl RW Yes RWBD - 930.0 GB");
|
||||
|
||||
#command successful?
|
||||
if($output[1] eq "Status = Success\n") {
|
||||
if($action eq "all") {
|
||||
my $output_dev = -1;
|
||||
@@ -321,7 +441,6 @@ sub getPhysDeviceStatus {
|
||||
}
|
||||
$command .= " show $action";
|
||||
my @output = `$command`;
|
||||
#my @output = ("", "Status = Success\n", "Drive /c0/e252/s0 State :", "=======================", "Shield Counter = 0", "Media Error Count = 0", "Other Error Count = 0", "Drive Temperature = 31C (87.80 F)", "Predictive Failure Count = 0", "S.M.A.R.T alert flagged by drive = No");
|
||||
|
||||
if($output[1] eq "Status = Success\n") {
|
||||
if($action eq "all") {
|
||||
@@ -512,7 +631,6 @@ sub getBBUStatus {
|
||||
my $statusMessage = '';
|
||||
|
||||
my @output = `$command`;
|
||||
#my @output = ("", "Status = Success\n");
|
||||
|
||||
if($output[1] eq "Status = Success\n") {
|
||||
my $blockid = 0;
|
||||
@@ -760,7 +878,6 @@ MAIN: {
|
||||
|
||||
# Input validation
|
||||
my @controllerVersion = `$sudo $storcli /c$controller show all`;
|
||||
#my @controllerVersion = ("", "", "Description = None\n");
|
||||
if($controllerVersion[2] eq "Description = Controller $controller not found\n") {
|
||||
print "Invalid controller number, device not found!\n";
|
||||
exit(STATE_UNKNOWN);
|
||||
@@ -799,8 +916,8 @@ MAIN: {
|
||||
@enclosures = split(/,/,join(',', @enclosures));
|
||||
@logDevices = split(/,/,join(',', @logDevices));
|
||||
@physDevices = split(/,/,join(',', @physDevices));
|
||||
@temperature_w = getThresholds(\@temperature_w, 40); # 40 = default value
|
||||
@temperature_c = getThresholds(\@temperature_c, 50);
|
||||
@temperature_w = getThresholds(\@temperature_w, 50); # 50 = default value
|
||||
@temperature_c = getThresholds(\@temperature_c, 60);
|
||||
@physicalDeviceTemperature_w = getThresholds(\@physicalDeviceTemperature_w, 40);
|
||||
@physicalDeviceTemperature_c = getThresholds(\@physicalDeviceTemperature_c, 45);
|
||||
|
||||
|
||||
170
check_lsi_raid.POD
Normal file
170
check_lsi_raid.POD
Normal file
@@ -0,0 +1,170 @@
|
||||
=head1 NAME
|
||||
|
||||
check_lsi_raid - Nagios/Icinga plugin to check LSI RAID controllers
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
Example:
|
||||
|
||||
./check_lsi_raid -Tw 40 -Tc 50 -LD 0,1 -PD 1 -b 0
|
||||
LSIRAID OK (Ctrl #0)
|
||||
|
||||
=head1 VERSION
|
||||
|
||||
This document describes check_lsi_raid version 0.2
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
check_lsi_raid is a Nagios/Icinga Plugin to check LSI raid-controllers for errors/warnings.
|
||||
This plugin makes heavy use of 'StorCLI' which can be obtained from the LSI homepage.
|
||||
It checks the controller, the physical devices and the logical devices separately for errors or warnings
|
||||
(for more detailed information about what is being monitored, see METHODS).
|
||||
|
||||
=head1 DEPENDENCIES
|
||||
|
||||
- This plugin requires a running Nagios or Icinga. (either local or NRPE)
|
||||
|
||||
- sudo
|
||||
|
||||
- storcli/storcli64 (Can be obtained from the LSI homepage)
|
||||
|
||||
- The following Perl Modules:
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
use Getopt::Long qw(:config no_ignore_case);
|
||||
use Switch 'Perl6';
|
||||
|
||||
=head1 METHODS
|
||||
|
||||
=head2 getExitState
|
||||
|
||||
Returns the correct exit code after each test.
|
||||
Can only increment the exit-code (if necessary) since the exit-code may not change from Warning to OK.
|
||||
|
||||
=head2 displayUsage
|
||||
|
||||
Explains the usage of the plugin and which parameters are available.
|
||||
|
||||
=head2 displayHelp
|
||||
|
||||
Prints a short help text and quick information where to find additional help.
|
||||
|
||||
=head2 displayVersion
|
||||
|
||||
Prints the plugin version and the StorCLI version.
|
||||
|
||||
=head2 getControllerStatus
|
||||
|
||||
Checks the LSI raid-controller for controller errors. (uses storcli /c<num> show all)
|
||||
|
||||
- Overall status
|
||||
|
||||
- Reboot necessary?
|
||||
|
||||
- Booted in safe mode?
|
||||
|
||||
- Memory errors
|
||||
|
||||
- Failed to get lock key?
|
||||
|
||||
- Rollback operation in progress?
|
||||
|
||||
- Temperature of controller and/or ROC (if present)
|
||||
|
||||
The controller/ROC temperature can be set to a warning (parameter: -Tw|--temperature-warn) and a critical level (parameter: -Tc|--temperature-critical). See getThresholds for more information.
|
||||
|
||||
=head2 getLogicalDeviceStatus
|
||||
|
||||
Returns information about logical devices:
|
||||
|
||||
- General logical devices status
|
||||
|
||||
- Initialization in progress?
|
||||
|
||||
=head2 getPhysDeviceStatus
|
||||
|
||||
Returns information about physical devices attached to the LSI raid-controller:
|
||||
|
||||
- Disk status
|
||||
|
||||
- Several error counts
|
||||
|
||||
- Device temperature
|
||||
|
||||
- Predictive fail count
|
||||
|
||||
- S.M.A.R.T. status
|
||||
|
||||
The device temperature can be set to a warning (parameter: -PDTw|--physicaldevicetemperature-warn) and a critical level (parameter: -PDTc|--physicaldevicetemperature-critical). See getThresholds for more information.
|
||||
|
||||
=head2 getBBUStatus
|
||||
|
||||
Returns information about the battery backup unit (BBU) status:
|
||||
|
||||
- Overall status
|
||||
|
||||
- Temperature status
|
||||
|
||||
- Bad voltage
|
||||
|
||||
- Learn cycle status
|
||||
|
||||
- I2C errors
|
||||
|
||||
- Capacity
|
||||
|
||||
=head2 getThresholds
|
||||
|
||||
Parses the threshold parameters, e.g. -Tw|--temperature-warn and -Tc|--temperature-critical (the physical device temperature thresholds are parsed the same way).
|
||||
|
||||
Uses correct Nagios Threshold implementation: L<http://nagiosplug.sourceforge.net/developer-guidelines.html#THRESHOLDFORMAT>
|
||||
|
||||
Returns a temperature range (array) that a measured temperature is checked against, together with whether an alert is raised inside or outside of that range.
|
||||
|
||||
Array content: ("in" or "out", range from, range to)
|
||||
|
||||
=head3 Examples
|
||||
|
||||
Generate an alert if x...
|
||||
-Tw 10 < 0 or > 10, (outside the range of {0 .. 10})
|
||||
-Tw 10: < 10, (outside {10 .. inf})
|
||||
-Tw ~:10 > 10, (outside the range of {-inf .. 10})
|
||||
-Tw 10:20 < 10 or > 20, (outside the range of {10 .. 20})
|
||||
-Tw @10:20 >= 10 and <= 20, (inside the range of {10 .. 20})
|
||||
|
||||
=head1 LICENSE AND COPYRIGHT
|
||||
|
||||
Copyright (c) 2013,
|
||||
Martin Grubhofer C<< <mgrubhofer@thomas-krenn.com> >>, C<< <s1110239013@students.fh-hagenberg.at> >>.
|
||||
Scheipner Alexander C<< <s1110239032@students.fh-hagenberg.at> >>.
|
||||
Werner Sebastian C<< <s1110239038@students.fh-hagenberg.at> >>.
|
||||
All rights reserved.
|
||||
|
||||
This program is free software; you can redistribute it and/or modify it under
|
||||
the terms of the GNU General Public License as published by the Free Software
|
||||
Foundation; either version 3 of the License, or (at your option) any later
|
||||
version.
|
||||
|
||||
=head1 DISCLAIMER OF WARRANTY
|
||||
|
||||
BECAUSE THIS SOFTWARE IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
|
||||
FOR THE SOFTWARE, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
|
||||
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
|
||||
PROVIDE THE SOFTWARE "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER
|
||||
EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE
|
||||
ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE SOFTWARE IS WITH
|
||||
YOU. SHOULD THE SOFTWARE PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL
|
||||
NECESSARY SERVICING, REPAIR, OR CORRECTION.
|
||||
|
||||
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
||||
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
|
||||
REDISTRIBUTE THE SOFTWARE AS PERMITTED BY THE ABOVE LICENCE, BE
|
||||
LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL,
|
||||
OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE
|
||||
THE SOFTWARE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
|
||||
RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
|
||||
FAILURE OF THE SOFTWARE TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
|
||||
SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
|
||||
SUCH DAMAGES.
|
||||
Reference in New Issue
Block a user