4 # https://raw.githubusercontent.com/aswen/nagios-plugins/master/check_puppet_agent
5 # by Peter Palfrader, Mon, 20 Mar 2017 09:25:09 +0100
7 # Nagios plugin to monitor Puppet agent state
9 # Copyright (c) 2011 Alexander Swen <a@swen.nu>
11 # Permission to use, copy, modify, and distribute this software for any
12 # purpose with or without fee is hereby granted, provided that the above
13 # copyright notice and this permission notice appear in all copies.
15 # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
16 # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
17 # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
18 # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
20 # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
21 # OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
24 # Example configuration
26 # Typically this check is placed on a client and runs via nrpe.
27 # So add this to nrpe.cfg:
28 # command[check_puppet_agent]=/usr/lib/nagios/plugins/check_puppet
29 # or if you want to specify options (rather than have the script calculate key values and facts) then something like
30 # command[check_puppet_agent]=/usr/lib/nagios/plugins/check_puppet -w 3600 -c 7200 -s /var/lib/puppet/state/last_run_summary.yaml -d 0
31 # This should warn when the agent hasn't run for an hour and go critical after two hours.
32 # If you have dont_blame_nrpe=1 set, you can choose to pass the options as arguments instead:
33 # command[check_puppet_agent]=/usr/lib/nagios/plugins/check_puppet -w $ARG1$ -c $ARG2$ -s $ARG3$ -d $ARG4$
37 # service_description Puppet agent
38 # check_command check_nrpe!check_puppet_agent
40 # check_command check_nrpe!check_puppet_agent!3600!7200
44 # The user running this script must be allowed to run "puppet config print" via sudo, e.g. by including the following 3 entries in /etc/sudoers:
45 # User_Alias NAGIOS=nagios
46 # Cmnd_Alias PUPPETCHECK=/usr/bin/puppet config print --section agent runinterval,\
47 # /usr/bin/puppet config print --section agent splay,\
48 # /usr/bin/puppet config print --section agent splaylimit,\
49 # /usr/bin/puppet config print --section agent agent_disabled_lockfile,\
50 # /usr/bin/puppet config print --section agent lastrunfile,\
51 # /usr/bin/puppet config print --section agent lastrunreport,\
52 # /usr/bin/puppet config print --section agent pidfile
53 # NAGIOS ALL=NOPASSWD:PUPPETCHECK
56 # 20120126 A.Swen created.
57 # 20120214 trey85stang Modified, added getopts, usage, defaults.
58 # 20120220 A.Swen lastrunfile can be overridden.
59 # 20130717 A.Swen Moved finding lastrunfile to after getopts and made it conditional to param -s.
60 # Added option to tell script if puppet agent is started from cron or as a daemon (-d).
61 # Switched to use awk to filter values from lastrunfile and set them as params.
62 # Updated some comments.
63 # Removed bug in search for process (that would previously always find something because grep finds its own process line).
64 # "puppet agent --configprint lastrunfile" has to be run as root. As normal user it yields ~/.puppet/var/state.
65 # Based on feedback Михайло Масик updated:
66 # - Puppet --configprint => puppet agent --configprint (version 3 has new way of printing config).
67 # - Added new pattern to search for process.
68 # - Added test kill -0 to see if process is still there.
69 # 20130725 A.Swen Based on feedback Михайло Масик updated a test (removed ! from test).
70 # 20130725 A.Swen Added sudo to puppet config print pidfile.
71 # 20131209 Mark Ruys Issue warning when last_run_report.yaml contain errors.
72 # 20141015 A.Swen Add show disabled status.
73 # 20141127 KissT Remove requirement to have sudo custom rule.
74 # 20150917 A.Swen Based on an idea of Daniel Lawrence check for major version to decide how to print config.
75 # Based on idea of D.Stirling switched to sh.
76 # Find out puppet executable location using which.
77 # Based on an idea of D.Stirling updated daemon check.
78 # Based on an idea of D.Stirling made BSD compatible.
79 # Based on an idea of BTriller fix the getopts command to parse the agent_disabled_lockfile option.
80 # 20151201 Akomakom Add perf data option.
81 # More reliable yaml parsing.
82 # If $HOME not set: set it.
83 # Fix PS command for Suse.
84 # 20151218 K.A. Gillow Calculate warn/crit based on runinterval and splay setting rather than use fixed settings.
85 # Check system has been up longer than crit/warn time otherwise don't yet trigger normally relevant fault levels.
86 # We never generally want puppet disabled so change to warning.
87 # 20151229 A.Swen Fix bug in PERF_DATA (replace compset by set).
88 # Prettify $PERF_DATA output.
89 # 20160201 S. Sams Changes to PERF_DATA output format to increase compatibility with Nagios Plugin guidelines.
90 # Add compatibility with Puppet 4.x
91 # 20160315 J. Yaworski Add -v, allowing to pass a version to compare
92 # 20160815 L. Buriola Add -E to show first error on output
# Status table: each arm prints the status line for one internal result code and sets rc
# to the matching plugin exit status (0 for OK/INFO, 1 for WARNING/DISABLED, 2 for CRITICAL, 3 for UNKNOWN).
# Code 9 takes its detail text from the second argument.
97 0) echo "OK: Puppet agent $version running catalogversion $config, and executed at $last_run_human for last time. $PERF_DATA";rc=0 ;;
98 1) echo "UNKNOWN: last_run_summary.yaml not found, not readable or incomplete";rc=3 ;;
99 2) echo "WARNING: Last run was $time_since_last seconds ago. Warn is $WARN. $PERF_DATA";rc=1 ;;
100 3) echo "CRITICAL: Last run was $time_since_last seconds ago. Crit is $CRIT. $PERF_DATA";rc=2 ;;
101 4) echo "CRITICAL: Puppet daemon not running or something wrong with process";rc=2 ;;
102 5) echo "UNKNOWN: no WARN or CRIT parameters were sent to this check";rc=3 ;;
103 6) echo "CRITICAL: Last run had 1 or more errors. Check the logs. $FIRST_ERROR $PERF_DATA";rc=2 ;;
104 7) echo "DISABLED: Reason: $(sed -e 's/{"disabled_message":"//' -e 's/"}//' "$agent_disabled_lockfile"). $PERF_DATA";rc=1 ;;
105 8) echo "UNKNOWN: No Puppet executable found";rc=3 ;;
106 9) echo "UNKNOWN: Internal error: $2"; rc=3 ;;
107 10) echo "OK (PROBABLY): Puppet agent last successful run $last_run_human (runinterval $runinterval, splay $splay, splaylimit $splaylimit) but system has not been up long enough to guarantee a fresh puppet run should have occurred";rc=0 ;;
108 11) echo "INFO: Puppet agent is version $version, but should be $wanted_version. $PERF_DATA";rc=0 ;;
109 12) echo "UNKNOWN: last_run_report.yaml not found, not readable or incomplete";rc=3 ;;
# Option summary shown to the user. When -w/-c are not given the script derives them from
# the agent's runinterval and splaylimit, so the help text states that instead of fixed values.
117 echo " $0 [-c 7200] [-w 3600] [-d 0] [-l agent_disabled_lockfile] [-s lastrunfile] [-r lastrunreport] [-v wanted_version] [-PEh]"
118 echo " -c Critical threshold in seconds (default: warning threshold + runinterval)"
119 echo " -w Warning threshold in seconds (default: runinterval + splaylimit)"
120 echo " -d 0|1: puppet agent should be a daemon(1) or not (0).(default 1)"
121 echo " -h Show this help."
122 echo " -l Agent_disabled_lockfile (default: /var/lib/puppet/state/agent_disabled.lock)"
123 echo " -s Lastrunfile (default: /var/lib/puppet/state/last_run_summary.yaml)"
124 echo " -r Lastrunreport (default: /var/lib/puppet/state/last_run_report.yaml)"
125 echo " -P Enable perf_data in the output"
126 echo " -E Show first error in the output"
127 echo " -v The version of puppet that should be running"
132 # Get a flat representation of yaml without relying on external tools.
# Reads the YAML file named by $1 and prints one name="value" line per scalar entry,
# in a form the caller can pass to eval.
# The sed stage rewrites each "key: value" line (value optionally wrapped in quotes) as
# indentation, key and value separated by the ASCII FS character (octal 034).
# The following stage takes the nesting depth as the indentation length divided by two
# (so two-space indentation is assumed) and builds each variable name from a prefix, the
# parent keys joined with "_", and the key itself.
# NOTE(review): the prefix variable and the vname[] bookkeeping are set on lines that are
# not shown here; the prefix is presumably the second argument - confirm.
135 local s='[[:space:]]*' w='[a-zA-Z0-9_]*' fs=$(echo @|tr @ '\034')
136 sed -ne "s|^\($s\):|\1|" \
137 -e "s|^\($s\)\($w\)$s:$s[\"']\(.*\)[\"']$s\$|\1$fs\2$fs\3|p" \
138 -e "s|^\($s\)\($w\)$s:$s\(.*\)$s\$|\1$fs\2$fs\3|p" $1 |
140 indent = length($1)/2;
142 for (i in vname) {if (i > indent) {delete vname[i]}}
143 if (length($3) > 0) {
144 vn=""; for (i=0; i<indent; i++) {vn=(vn)(vname[i])("_")}
145 printf("%s%s%s=\"%s\"\n", "'$prefix'",vn, $2, $3);
150 # Get first error from last_run_report.yaml
# Finds the lowest-sorting "time:" line near a "status: failure" entry in $lastrunreport,
# then prints the first "message:" text found near that timestamp as "FIRST_ERROR (<message>)".
# The report path is quoted because it may be supplied by the user and contain spaces or glob characters.
152 grep_cmd="/bin/grep -B 3 -A 1"
153 first_error_time=$($grep_cmd "status: failure" "$lastrunreport" | grep "time: " | sort -n | head -1)
154 first_error=$($grep_cmd "$first_error_time" "$lastrunreport" | grep "message: " | sed 's/.*message: //' | head -1)
155 echo "FIRST_ERROR ($first_error)"
# Command line parsing. Numeric options are accepted only when the argument is non-empty and
# contains no ASCII letters; $OPTARG is quoted so that empty values or values containing
# whitespace or glob characters cannot change the shape of the tests.
159 while getopts "c:d:l:s:r:w:v:PEh" opt; do
162 if ! echo "$OPTARG" | grep -q "[A-Za-z]" && [ -n "$OPTARG" ]
170 # argument should be 0 or 1
171 if [ "$OPTARG" -eq 0 ] || [ "$OPTARG" -eq 1 ];then
178 l) agent_disabled_lockfile=$OPTARG ;;
179 s) lastrunfile=$OPTARG ;;
180 r) lastrunreport=$OPTARG ;;
182 if ! echo "$OPTARG" | grep -q "[A-Za-z]" && [ -n "$OPTARG" ]
196 wanted_version=$OPTARG
204 [ -z "$HOME" ] && export HOME=$(getent passwd "$(whoami)" | cut -d: -f6) # Some clean environment situations make puppet -V fail.
206 # Ensure installation directory of puppet 4 is included in PATH
207 PATH="$PATH:/opt/puppetlabs/bin"
208 # Find location of puppet executable.
# command -v is the POSIX way to resolve a command and fails reliably when it is missing.
209 PUPPET=$(command -v puppet) || result 8
211 # Find out Puppet major version to determine configprint syntax.
212 puppet_major_version=$("$PUPPET" -V|cut -d. -f1)
214 [ -z "$puppet_major_version" ] && result 9 "Puppet version unknown from $("$PUPPET" -V 2>&1)"
216 # Set Puppet configprint syntax.
# puppet_config_print holds a whole command line and is deliberately expanded unquoted later on.
217 case $puppet_major_version in
219 puppet_config_print="sudo $PUPPET --configprint"
222 puppet_config_print="sudo $PUPPET config print"
225 puppet_config_print="sudo $PUPPET config print --section agent"
229 # construct WARN and CRIT times based on runinterval plus a safety buffer
230 # if they have not already been explicitly set
231 runinterval=$($puppet_config_print runinterval)
233 splay=$($puppet_config_print splay)
234 [ "$splay" != "false" ] && splaylimit=$($puppet_config_print splaylimit)
235 [ -z "$WARN" ] && WARN=$(($runinterval + $splaylimit))
236 [ -z "$CRIT" ] && CRIT=$(($WARN + $runinterval))
237 #now check we finally have some sensible settings
238 [ -z "$WARN" -o $WARN -lt 30 ] && result 5
239 [ -z "$CRIT" -o $CRIT -lt 60 ] && result 5
241 # If the disabled lockfile is not given as a param try to find it ourselves.
242 [ -z "$agent_disabled_lockfile" ] && agent_disabled_lockfile=$($puppet_config_print agent_disabled_lockfile)
243 # If there's a disabled.lock file don't look any further.
244 [ -f "$agent_disabled_lockfile" ] && result 7
246 # If the lastrunfile is not given as a param try to find it ourselves.
247 [ -z "$lastrunfile" ] && lastrunfile=$($puppet_config_print lastrunfile)
248 # Check if state file exists.
249 [ -s $lastrunfile -a -r $lastrunfile ] || result 1
251 # If the lastrunreport is not given as a param try to find it ourselves.
252 [ -z "$lastrunreport" ] && lastrunreport=$($puppet_config_print lastrunreport)
253 # Check if state file exists.
254 [ -n "$SHOW_ERROR" ] && ( [ -s $lastrunreport -a -r $lastrunreport ] || result 12 )
256 # Check if daemonized was set, else set default to 1.
257 [ -n "$daemonized" ] || daemonized=1
258 # If Puppet agent runs as a daemon there should be a process. We can't check so much when it is triggered by cron.
259 if [ $daemonized -eq 1 ];then
260 # Puppet version 4 changed several paths, determine correct ones
261 if [ $puppet_major_version -ge 4 ];then
262 puppet_daemon_rundir="puppetlabs"
263 puppet_daemon_regex="/opt/puppetlabs/puppet/bin/ruby /opt/puppetlabs/puppet/bin/puppet"
265 puppet_daemon_rundir="puppet"
266 puppet_daemon_regex="/usr(/local)?/bin/ruby[^ ]* /usr(/local)?/s?bin/puppetd?"
269 # Check puppet daemon: the process list must contain a line matching the regex above
# (the egrep process itself is filtered out); otherwise report code 4.
270 [ "$(ps axfww|egrep "$puppet_daemon_regex"|grep -v egrep)" ] || result 4
# Conventional pidfile location: /var/<rundir>/run on BSD, /var/run/<rundir> elsewhere.
272 uname -a|grep -q BSD && default_pidfile=/var/$puppet_daemon_rundir/run/agent.pid || default_pidfile=/var/run/$puppet_daemon_rundir/agent.pid
# Use the conventional pidfile when it exists, else ask puppet for the configured one.
273 [ -e $default_pidfile ] && pidfile=$default_pidfile || pidfile=$($puppet_config_print pidfile)
275 # If there is a pidfile tell me the pid, else fail.
276 [ -f $pidfile ]&&pid=$(cat $pidfile)||result 4
278 # See if the process is running.
279 ps -p $pid > /dev/null || result 4
281 # On Linux test if the pid we found in the pidfile is puppet:
282 if uname -a|grep -q Linux;then
283 grep -q puppet /proc/$pid/cmdline ||result 4
287 # parse last run file
288 # puppet version 4 files have less indentation, add prefix to match parsed variables from older versions
289 [ $puppet_major_version -ge 4 ] && yaml_prefix="_"
# The parser prints name="value" assignments; eval turns them into shell variables.
290 eval $(parse_yaml $lastrunfile $yaml_prefix)
291 # this flattens the hierarchy to single-level name/value variables, eg:
293 # _version_config="1448907293"
295 # Construct perf data using anything that starts with "_resources_" or "_time_total"
296 if [ -n "$PERF" ] ; then
297 for V in $(set | grep "^_resources_\|^_time_total") ; do
# Strip the leading underscore and any single quotes around the value, then prepend the
# resulting name=value pair to the list collected so far.
298 PERF_DATA="$(echo $V | sed 's/^_//' | sed "s/='/=/" | sed "s/'$//") $PERF_DATA"
# The leading "|" separates the perf data from the status text in the output line.
300 PERF_DATA="| $PERF_DATA"
303 # Construct FIRST_ERROR using last_run_report.yaml
304 if [ -n "$SHOW_ERROR" ] ; then
305 FIRST_ERROR=$(get_first_error)
308 # Check when last run happened.
309 last_run=$_time_last_run
# A missing timestamp would count as 0 in the arithmetic below and be reported as a very
# stale run, so report the summary file as incomplete before the value is used.
[ -z "$last_run" ] && result 1
310 last_run_human=$(date -d "@$last_run" +%c)
313 # Check how long system has been up in seconds
314 uptime=$(cut -f1 -d' ' /proc/uptime | cut -f1 -d.)
316 # Assess last run time relative to warn/crit values and system uptime.
# A stale run is only reported as CRITICAL/WARNING once the system itself has been up for at
# least the same threshold; with a shorter uptime code 10 is reported instead.
# NOTE(review): "now" is assigned on a line not shown here - presumably the current epoch time.
317 time_since_last=$((now-last_run))
318 [ $time_since_last -ge $CRIT -a $uptime -ge $CRIT ] && result 3
319 [ $time_since_last -ge $CRIT -a $uptime -lt $CRIT ] && result 10
320 [ $time_since_last -ge $WARN -a $uptime -ge $WARN ] && result 2
321 [ $time_since_last -ge $WARN -a $uptime -lt $WARN ] && result 10
323 # Get some more info from the yaml file.
324 config=$_version_config
325 version=$_version_puppet
326 failed=$_resources_failed
327 failure=$_events_failure
328 failed_to_restart=$_resources_failed_to_restart
330 # If any of the values above doesn't return raise an error.
331 [ -z "$last_run" -o -z "$config" -o -z "$version" -o -z "$failed" -o -z "$failure" -o -z "$failed_to_restart" ] && result 1
332 # If anything went wrong last run => crit.
333 [ "$failed" -gt 0 -o "$failure" -gt 0 -o "$failed_to_restart" -gt 0 ] && result 6
335 # If $wanted_version is set, compare it to the running version
336 if [ -n "$wanted_version" -a -n "$version" ]; then
337 [ "$wanted_version" != "$version" ] && result 11
340 # If we reached here all is ok.