From 4862a1e322d51b6f7352e6915caf81f333cf765e Mon Sep 17 00:00:00 2001 From: Peter Palfrader Date: Mon, 20 Mar 2017 09:26:05 +0100 Subject: [PATCH] Add dsa-check_puppet_agent --- .../checks/dsa-check_puppet_agent | 343 ++++++++++++++++++ dsa-nagios-checks/debian/changelog | 1 + 2 files changed, 344 insertions(+) create mode 100644 dsa-nagios-checks/checks/dsa-check_puppet_agent diff --git a/dsa-nagios-checks/checks/dsa-check_puppet_agent b/dsa-nagios-checks/checks/dsa-check_puppet_agent new file mode 100644 index 0000000..ccb285c --- /dev/null +++ b/dsa-nagios-checks/checks/dsa-check_puppet_agent @@ -0,0 +1,343 @@ +#!/bin/sh + +# downloaded from +# https://raw.githubusercontent.com/aswen/nagios-plugins/master/check_puppet_agent +# by Peter Palfrader, Mon, 20 Mar 2017 09:25:09 +0100 + +# Nagios plugin to monitor Puppet agent state +# +# Copyright (c) 2011 Alexander Swen +# +# Permission to use, copy, modify, and distribute this software for any +# purpose with or without fee is hereby granted, provided that the above +# copyright notice and this permission notice appear in all copies. +# +# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +# +# +# Example configuration +# +# Typically this check is placed on a client and runs via nrpe.
+# So add this to nrpe.cfg: +# command[check_puppet_agent]=/usr/lib/nagios/plugins/check_puppet +# or if you want to specify options (rather than have the script calculate key values and facts) then something like +# command[check_puppet_agent]=/usr/lib/nagios/plugins/check_puppet -w 3600 -c 7200 -s /var/lib/puppet/state/last_run_summary.yaml -d 0 +# This should warn when the agent hasn't run for an hour and go critical after two hours +# if you have dont_blame_nrpe=1 set you can choose to +# command[check_puppet_agent]=/usr/lib/nagios/plugins/check_puppet -w $ARG1$ -c $ARG2$ -s $ARG3$ -d $ARG4$ +# +# define service { +# use generic-service +# service_description Puppet agent +# check_command check_nrpe!check_puppet_agent +# or +# check_command check_nrpe!check_puppet_agent!3600!7200 +#} +# +# Sudo required. +# The user running this script must be allowed to use sudo to run puppet config print, e.g. in /etc/sudoers include the 3 lines +# User_Alias NAGIOS=nagios +# Cmnd_Alias PUPPETCHECK=/usr/bin/puppet config print --section agent runinterval,\ +# /usr/bin/puppet config print --section agent splay,\ +# /usr/bin/puppet config print --section agent splaylimit,\ +# /usr/bin/puppet config print --section agent agent_disabled_lockfile,\ +# /usr/bin/puppet config print --section agent lastrunfile,\ +# /usr/bin/puppet config print --section agent lastrunreport,\ +# /usr/bin/puppet config print --section agent pidfile +# NAGIOS ALL=NOPASSWD:PUPPETCHECK +# +# CHANGELOG: +# 20120126 A.Swen created. +# 20120214 trey85stang Modified, added getopts, usage, defaults. +# 20120220 A.Swen lastrunfile can be overridden. +# 20130717 A.Swen Moved finding lastrunfile to after getopts and made it conditional to param -s. +# Added option to tell script if puppet agent is started from cron or as a daemon (-d). +# Switched to use awk to filter values from lastrunfile and set them as params. +# Updated some comments.
+# Removed bug in search for process (that would previously always find something because grep finds its own process line). +# "puppet agent --configprint lastrunfile" has to be run as root. As normal user it yields ~/.puppet/var/state. +# Based on feedback Михайло Масик updated: +# - Puppet --configprint => puppet agent --configprint (version 3 has new way of printing config). +# - Added new pattern to search for process. +# - Added test kill -0 to see if process is still there. +# 20130725 A.Swen Based on feedback Михайло Масик updated a test (removed ! from test). +# 20130725 A.Swen Added sudo to puppet config print pidfile. +# 20131209 Mark Ruys Issue warning when last_run_report.yaml contains errors. +# 20141015 A.Swen Add show disabled status. +# 20141127 KissT Remove requirement to have sudo custom rule. +# 20150917 A.Swen Based on an idea of Daniel Lawrence check for major version to decide how to print config. +# Based on idea of D.Stirling switched to sh. +# Find out puppet executable location using which. +# Based on an idea of D.Stirling updated daemon check. +# Based on an idea of D.Stirling made BSD compatible. +# Based on an idea of BTriller fix the getopts command to parse the agent_disabled_lockfile option. +# 20151201 Akomakom Add perf data option. +# More reliable yaml parsing. +# If $HOME not set: set it. +# Fix PS command for Suse. +# 20151218 K.A. Gillow Calculate warn/crit based on runinterval and splay setting rather than use fixed settings. +# Check system has been up longer than crit/warn time otherwise don't yet trigger normally relevant fault levels. +# We never generally want puppet disabled so change to warning. +# 20151229 A.Swen Fix bug in PERF_DATA (replace compset by set). +# Prettify $PERF_DATA output. +# 20160201 S. Sams Changes to PERF_DATA output format to increase compatibility with Nagios Plugin guidelines. +# Add compatibility with Puppet 4.x +# 20160315 J. Yaworski Add -v, allowing to pass a version to compare +# 20160815 L.
Buriola Add -E to show first error on output + +# FUNCTIONS +result () { + case $1 in + 0) echo "OK: Puppet agent $version running catalogversion $config, and executed at $last_run_human for last time. $PERF_DATA";rc=0 ;; + 1) echo "UNKNOWN: last_run_summary.yaml not found, not readable or incomplete";rc=3 ;; + 2) echo "WARNING: Last run was $time_since_last seconds ago. Warn is $WARN. $PERF_DATA";rc=1 ;; + 3) echo "CRITICAL: Last run was $time_since_last seconds ago. Crit is $CRIT. $PERF_DATA";rc=2 ;; + 4) echo "CRITICAL: Puppet daemon not running or something wrong with process";rc=2 ;; + 5) echo "UNKNOWN: no WARN or CRIT parameters were sent to this check";rc=3 ;; + 6) echo "CRITICAL: Last run had 1 or more errors. Check the logs. $FIRST_ERROR $PERF_DATA";rc=2 ;; + 7) echo "DISABLED: Reason: $(sed -e 's/{"disabled_message":"//' -e 's/"}//' $agent_disabled_lockfile). $PERF_DATA";rc=1 ;; + 8) echo "UNKNOWN: No Puppet executable found";rc=3 ;; + 9) echo "UNKNOWN: Internal error: $2"; rc=3 ;; + 10) echo "OK (PROBABLY): Puppet agent last successful run $last_run_human (runinterval $runinterval, splay $splay, splaylimit $splay limit) but system has not been up long enough to guarantee a fresh puppet run should have occurred";rc=0 ;; + 11) echo "INFO: Puppet agent is version $version, but should be $wanted_version. $PERF_DATA";rc=0 ;; + 12) echo "UNKNOWN: last_run_report.yaml not found, not readable or incomplete";rc=3 ;; + esac + exit $rc +} + +usage () { + echo "" + echo "USAGE: " + echo " $0 [-c 7200] [-w 3600] [-d 0] [-l agent_disabled_lockfile] [-s lastrunfile] [-r lastrunreport] [-v wanted_version] [-PEh]" + echo " -c Critical threshold (default 7200 seconds)" + echo " -w Warning threshold (default 3600 seconds)" + echo " -d 0|1: puppet agent should be a daemon(1) or not (0).(default 1)" + echo " -h Show this help." 
+ echo " -l Agent_disabled_lockfile (default: /var/lib/puppet/state/agent_disabled.lock)" + echo " -s Lastrunfile (default: /var/lib/puppet/state/last_run_summary.yaml)" + echo " -r Lastrunreport (default: /var/lib/puppet/state/last_run_report.yaml)" + echo " -P Enable perf_data in the output" + echo " -E Show first error in the output" + echo " -v The version of puppet that should be running" + echo "" + exit 1 +} + +# Get a flat representation of yaml without relying on external tools. +parse_yaml () { + local prefix=$2 + local s='[[:space:]]*' w='[a-zA-Z0-9_]*' fs=$(echo @|tr @ '\034') + sed -ne "s|^\($s\):|\1|" \ + -e "s|^\($s\)\($w\)$s:$s[\"']\(.*\)[\"']$s\$|\1$fs\2$fs\3|p" \ + -e "s|^\($s\)\($w\)$s:$s\(.*\)$s\$|\1$fs\2$fs\3|p" $1 | + awk -F$fs '{ + indent = length($1)/2; + vname[indent] = $2; + for (i in vname) {if (i > indent) {delete vname[i]}} + if (length($3) > 0) { + vn=""; for (i=0; i&1)" + +# Set Puppet configprint syntax. +case $puppet_major_version in + 2) + puppet_config_print="sudo $PUPPET --configprint" + ;; + 3) + puppet_config_print="sudo $PUPPET config print" + ;; + *) + puppet_config_print="sudo $PUPPET config print --section agent" + ;; +esac + +# construct WARN and CRIT times based on runinterval plus a safety buffer +# if they have not already been explicitly set +runinterval=$($puppet_config_print runinterval) +splaylimit=0 +splay=$($puppet_config_print splay) +[ "$splay" != "false" ] && splaylimit=$($puppet_config_print splaylimit) +[ -z "$WARN" ] && WARN=$(($runinterval + $splaylimit)) +[ -z "$CRIT" ] && CRIT=$(($WARN + $runinterval)) +#now check we finally have some sensible settings +[ -z "$WARN" -o $WARN -lt 30 ] && result 5 +[ -z "$CRIT" -o $CRIT -lt 60 ] && result 5 + +# If the disabled lockfile is not given as a param try to find it ourselves. +[ -z "$agent_disabled_lockfile" ] && agent_disabled_lockfile=$($puppet_config_print agent_disabled_lockfile) +# If there's a disabled.lock file don't look any further. 
# A present lockfile means the agent is disabled: report that and stop.
[ -f "$agent_disabled_lockfile" ] && result 7

# If the lastrunfile is not given as a param try to find it ourselves.
[ -z "$lastrunfile" ] && lastrunfile=$($puppet_config_print lastrunfile)
# The summary file must exist, be non-empty and be readable.
if [ ! -s "$lastrunfile" ] || [ ! -r "$lastrunfile" ]; then
  result 1
fi

# If the lastrunreport is not given as a param try to find it ourselves.
[ -z "$lastrunreport" ] && lastrunreport=$($puppet_config_print lastrunreport)
# The report is only required when SHOW_ERROR is set. This must be a plain
# `if`, not a ( ... ) subshell: `result` ends with `exit`, and inside a
# subshell that would only leave the subshell and let the script carry on.
if [ -n "$SHOW_ERROR" ]; then
  if [ ! -s "$lastrunreport" ] || [ ! -r "$lastrunreport" ]; then
    result 12
  fi
fi

# Check if daemonized was set, else set default to 1.
[ -n "$daemonized" ] || daemonized=1
# If Puppet agent runs as a daemon there should be a process. We can't check
# so much when it is triggered by cron.
if [ "$daemonized" -eq 1 ]; then
  # Puppet version 4 changed several paths, determine correct ones
  if [ "$puppet_major_version" -ge 4 ]; then
    puppet_daemon_rundir="puppetlabs"
    puppet_daemon_regex="/opt/puppetlabs/puppet/bin/ruby /opt/puppetlabs/puppet/bin/puppet"
  else
    puppet_daemon_rundir="puppet"
    puppet_daemon_regex="/usr(/local)?/bin/ruby[^ ]* /usr(/local)?/s?bin/puppetd?"
  fi

  # Check puppet daemon (the trailing grep drops the egrep process itself):
  [ "$(ps axfww|egrep "$puppet_daemon_regex"|grep -v egrep)" ] || result 4

  # The default pidfile location differs between BSD and everything else.
  if uname -a | grep -q BSD; then
    default_pidfile=/var/$puppet_daemon_rundir/run/agent.pid
  else
    default_pidfile=/var/run/$puppet_daemon_rundir/agent.pid
  fi
  # Fall back to asking puppet when the default location does not exist.
  if [ -e "$default_pidfile" ]; then
    pidfile=$default_pidfile
  else
    pidfile=$($puppet_config_print pidfile)
  fi

  # If there is a pidfile tell me the pid, else fail. Quoting matters here:
  # with an empty $pidfile an unquoted `[ -f ]` is true and `cat` would then
  # read stdin.
  [ -f "$pidfile" ] || result 4
  pid=$(cat "$pidfile") || result 4

  # See if the process is running.
  ps -p "$pid" > /dev/null || result 4

  # On Linux test if the pid we found in the pidfile is puppet:
  if uname -a | grep -q Linux; then
    grep -q puppet "/proc/$pid/cmdline" || result 4
  fi
fi

# parse last run file
# puppet version 4 files have less indentation, add prefix to match parsed
# variables from older versions
[ "$puppet_major_version" -ge 4 ] && yaml_prefix="_"
# Quoted so the generated assignments are evaluated line by line without
# word splitting or globbing of the values.
# NOTE(review): this evals text derived from the state file; it relies on
# that file being writable only by a trusted user - confirm.
eval "$(parse_yaml "$lastrunfile" "$yaml_prefix")"
# this flattens the hierarchy to single-level name/value variables, eg:
# _events_total="14"
# _version_config="1448907293"

# Construct perf data using anything that starts with "_resources_ or _time_total"
if [ -n "$PERF" ] ; then
  # Word splitting of the `set` output is intentional: one NAME='value' per word.
  for V in $(set | grep "^_resources_\|^_time_total") ; do
    PERF_DATA="$(echo $V | sed 's/^_//' | sed "s/='/=/" | sed "s/'$//") $PERF_DATA"
  done
  PERF_DATA="| $PERF_DATA"
fi

# Construct FIRST_ERROR using last_run_report.yaml
if [ -n "$SHOW_ERROR" ] ; then
  FIRST_ERROR=$(get_first_error)
fi

# Check when last run happened.
last_run=$_time_last_run
# Without a timestamp neither `date` nor the arithmetic below can work, so
# report the summary as incomplete before touching them.
[ -n "$last_run" ] || result 1
last_run_human=$(date -d "@$last_run" +%c)
now=$(date +%s)

# Check how long system been up in seconds (left empty when /proc/uptime is
# not available).
uptime=$(cut -f1 -d' ' /proc/uptime 2>/dev/null | cut -f1 -d.)

# Assess last run time relative to warn/crit values and system uptime.
# A stale run is only excused (result 10) when the uptime is known and is
# shorter than the threshold; an unknown uptime no longer hides staleness.
time_since_last=$((now-last_run))
if [ "$time_since_last" -ge "$CRIT" ]; then
  if [ -n "$uptime" ] && [ "$uptime" -lt "$CRIT" ]; then
    result 10
  fi
  result 3
fi
if [ "$time_since_last" -ge "$WARN" ]; then
  if [ -n "$uptime" ] && [ "$uptime" -lt "$WARN" ]; then
    result 10
  fi
  result 2
fi

# Get some more info from the yaml file.
config=$_version_config
version=$_version_puppet
failed=$_resources_failed
failure=$_events_failure
failed_to_restart=$_resources_failed_to_restart

# If any of the values above doesn't return raise an error.
for required in "$last_run" "$config" "$version" "$failed" "$failure" "$failed_to_restart"; do
  [ -n "$required" ] || result 1
done
# If anything went wrong last run => crit.
+[ $failed -gt 0 -o $failure -gt 0 -o $failed_to_restart -gt 0 ] && result 6 + +# If $wanted_version is set, compare it to the running version +if [ -n "$wanted_version" -a -n "$version" ]; then + [ "$wanted_version" != "$version" ] && result 11 +fi + +# If we reached here all is ok. +result 0 + +# END diff --git a/dsa-nagios-checks/debian/changelog b/dsa-nagios-checks/debian/changelog index d343e41..dc80b42 100644 --- a/dsa-nagios-checks/debian/changelog +++ b/dsa-nagios-checks/debian/changelog @@ -7,6 +7,7 @@ dsa-nagios-checks (110) UNRELEASED; urgency=medium * dsa-check-file_age: support multiple files. * dsa-check-running-kernel: meta package version check, also work on 4.x kernels and later. + * dsa-check_puppet_agent: add from Alexander Swen's github. -- Peter Palfrader Mon, 23 Jan 2017 14:14:06 +0100 -- 2.20.1