conova-node01:
address: 217.196.149.227
parents: gw-conova
- hostgroups: computers, stretch, service, sw-raid
+ hostgroups: computers, stretch, service, sw-raid, drbd-hosts
conova-node02:
address: 217.196.149.228
parents: gw-conova
- hostgroups: computers, stretch, service, sw-raid
+ hostgroups: computers, stretch, service, sw-raid, drbd-hosts
ganeti-conova:
address: 217.196.149.235
parents: gw-conova
handel:
address: 82.195.75.104
parents: ganeti3
- hostgroups: computers, service, kvmdomains, apache2-hosts, stretch, postgres96-hosts
+ hostgroups: computers, service, kvmdomains, apache2-hosts, stretch, postgres96-hosts, hassrvfs
kaufmann:
address: 82.195.75.107
parents: ganeti3
address: 140.211.166.196
parents: pieta
hostgroups: computers, hassrvfs, buildd, stretch
- powerpc-osuosl-01:
- address: 140.211.166.197
- parents: pieta
- hostgroups: computers, hassrvfs, buildd, jessie
# }}}
# {{{ gw-sanger
sallinen:
parents: gw-sanger
hostgroups: computers, postgres94-hosts, service, apache2-hosts, sw-raid, jessie, rsyncd-hosts, hasvarlogfs, multipath-hosts, nfs-server, varnish-hosts
contacts: tjrc1, dave
- smetana:
- address: 193.62.202.29
- parents: gw-sanger
- hostgroups: computers, sw-raid, sparc, wheezy, no-bacula
- contacts: tjrc1, dave
# }}}
# {{{ gw-scanplus
lobos:
address: 143.106.167.124
parents: gw-unicamp
hostgroups: computers, stretch, service, manyprocesses
- powerpc-unicamp-01:
- address: 143.106.167.120
- parents: prokofiev
- hostgroups: computers, hassrvfs, buildd, jessie
ppc64el-unicamp-01:
address: 143.106.167.121
parents: prokofiev
armhf:
alias: armhf
private: 1
- sparc:
- alias: sparc
- private: 1
porterbox:
alias: developer accessible porter machines
alias: Dell PowerEdge 1950 hosts
private: 1
- wheezy:
- alias: Hosts running wheezy
jessie:
alias: Hosts running jessie
stretch:
high-RTT:
alias: machines with high round trip times
private: 1
- alioth:
- alias: machines that just are just awkward
- private: 1
#openstack-compute:
# alias: nodes that run OpenStack compute
# private: 1
nrpe: "/usr/lib/nagios/plugins/dsa-check-ipv6-default-gw"
hostgroups: computers
check_interval: 60
- excludehostgroups: alioth
# }}}
# {{{ ### disk usage
-
nrpe: "/usr/lib/nagios/plugins/dsa-check-config"
hostgroups: computers
check_interval: 60
- excludehostgroups: alioth
-
name: setup - local hostname etc-hosts
nrpe: 'if getent ahosts `hostname` | grep -q 127.0; then echo "Warning: local hostname resolves to 127/8 address"; exit 1; else echo "OK: Hostname resolves to non-127/8 address."; exit 0; fi'
name: free memory - percent
nrpe: "/usr/lib/nagios/plugins/dsa-check-memory -m pct"
hostgroups: computers
- -
- name: process - getty
- nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:8 -c 1: -u root -C getty -a /sbin/getty"
- hostgroups: computers
- excludehosts: zelenka, zandonai
- excludehostgroups: jessie, stretch
-
name: process - getty
nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:8 -c 1: -u root -C agetty -a /sbin/agetty"
- hostgroups: jessie, stretch
+ hostgroups: computers
-
name: processes - zombies
servicegroups: backup
nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u bacula -C bacula-fd -a '/usr/sbin/bacula-fd -c /etc/bacula/bacula-fd.conf'"
hostgroups: computers
- excludehostgroups: alioth
-
name: network backup status - draghi
-
name: process - ulogd
nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u ulog -C ulogd -a '/usr/sbin/ulogd --daemon --uid ulog'"
- hostgroups: jessie, stretch
- -
- name: unexpected process - ulogd
- nrpe: "/usr/lib/nagios/plugins/check_procs -w 0:0 -C ulogd"
- hostgroups: sparc
+ hostgroups: computers
####
-
name: process - samhain
excludehostgroups: brokensamhain
# }}}
# {{{ logging
- -
- name: process - syslog-ng
- nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u root -C syslog-ng -a '/sbin/syslog-ng -p /var/run/syslog-ng.pid'"
- hostgroups: computers
- excludehostgroups: jessie, stretch
-
name: process - syslog-ng
nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u root -C syslog-ng -a '/sbin/syslog-ng -F'"
- hostgroups: jessie, stretch
+ hostgroups: computers
-
name: remote logging on lotti
remotecheck: "/usr/lib/nagios/plugins/dsa-check-log-age-loghost $HOSTNAME$"
runfrom: lotti
hostgroups: computers
- excludehostgroups: alioth
-
name: remote logging on lully
remotecheck: "/usr/lib/nagios/plugins/dsa-check-log-age-loghost $HOSTNAME$"
runfrom: lully
hostgroups: computers
- excludehostgroups: alioth
-
name: remote logging on loghost-grnet-01
remotecheck: "/usr/lib/nagios/plugins/dsa-check-log-age-loghost $HOSTNAME$"
runfrom: loghost-grnet-01
hostgroups: computers
- excludehostgroups: alioth
# }}}
# {{{ base service
-
name: system time synced
nrpe: "/usr/lib/nagios/plugins/dsa-check-timedatectl -s"
hostgroups: computers
- excludehostgroups: systemd-timesyncd, wheezy
+ excludehostgroups: systemd-timesyncd
servicegroups: time
-
name: system time synced
name: process - irqbalance
nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u root -C irqbalance -a '/usr/sbin/irqbalance'"
hostgroups: computers
- excludehosts: harris, smetana
+ excludehosts: harris
###
-
name: process - cron
name: process - ud-replicated
nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1:1 -u root -C ud-replicated -a '/usr/bin/python /usr/bin/ud-replicated'"
hostgroups: computers
- excludehostgroups: alioth
- ###
- -
- name: process - monit
- nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1:1 -u root -C monit -a '/usr/bin/monit -d 300 -I -c /etc/monit/monitrc -s /var/lib/monit/monit.state'"
- hostgroups: computers
- excludehostgroups: alioth, jessie, stretch
###
-
name: MQ connection on rainier
hostgroups: computers
check_interval: 60
retry_interval: 15
- excludehostgroups: alioth, broken_mq
+ excludehostgroups: broken_mq
-
name: MQ connection on rapoport
servicegroups: MQ
hostgroups: computers
check_interval: 60
retry_interval: 15
- excludehostgroups: alioth, broken_mq
+ excludehostgroups: broken_mq
###
-
name: local resolver
name: process - unbound
nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u unbound -C unbound -a '/usr/sbin/unbound'"
hostgroups: computers
- excludehostgroups: alioth
- ###
-
- name: process - uptimed
- nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u daemon -C uptimed -a '/usr/sbin/uptimed'"
+ name: unbound trust anchors
+ nrpe: "/usr/lib/nagios/plugins/dsa-check-unbound-anchors"
hostgroups: computers
+ check_interval: 60
###
-
- name: process - udevd
- nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u root -p 1 -C udevd -a 'udevd'"
+ name: process - uptimed
+ nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u daemon -C uptimed -a '/usr/sbin/uptimed'"
hostgroups: computers
- excludehostgroups: jessie, stretch
-
name: process - udevd
nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u root -p 1 -C systemd-udevd -a '/lib/systemd/systemd-udevd'"
- hostgroups: jessie, stretch
+ hostgroups: computers
###
-
name: unexpected process - acpid
-
name: process - stunnel4 - puppet-ekeyd
nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:6 -c 1: -u stunnel4 -C stunnel4 -a '/usr/bin/stunnel4 /etc/stunnel/puppet-ekeyd.conf'"
- hostgroups: wheezy, jessie, stretch
- excludehostgroups: alioth
+ hostgroups: computers
-
name: process - stunnel4 - puppet-ekeyd is crazy
nrpe: "sudo /usr/lib/nagios/plugins/dsa-check-stunnel-sanity"
hostgroups: computers
- excludehostgroups: alioth
excludehosts: czerny, grnet-node01, storace
# }}}
# {{{ anti-services
runfrom: handel
# }}}
# {{{ HW health/raid
- -
- name: process - mdadm monitor
- servicegroups: raid
- nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1:1 -u root -C mdadm -a '/sbin/mdadm --monitor --pid-file /run/mdadm/monitor.pid --daemonise --scan'"
- hostgroups: sw-raid
- excludehostgroups: jessie, stretch
-
name: process - mdadm monitor
servicegroups: raid
nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1:1 -u root -C mdadm -a '/sbin/mdadm --monitor --scan'"
hostgroups: sw-raid
- excludehostgroups: wheezy
-
name: RAID - sw raid
servicegroups: raid
name: mail queue
nrpe: "/usr/lib/nagios/plugins/check_mailq -M exim -w 1000 -c 2000"
hostgroups: heavy-exim
+ -
+ name: process - fail2ban
+ nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -C fail2ban-server"
+ hostgroups: heavy-exim, heavy-postfix
+ -
+ name: unwanted process - fail2ban
+ nrpe: "/usr/lib/nagios/plugins/check_procs -w 0:0 -C fail2ban-server"
+ hostgroups: computers
+ excludehostgroups: heavy-exim, heavy-postfix
# }}}
# {{{ clamav
-
-
name: process - weightd - master
nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u polw -a 'policyd-weight (master)'"
- hostgroups: heavy-postfix, alioth
+ hostgroups: heavy-postfix
-
name: process - weightd - cache
nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u polw -a 'policyd-weight (cache)'"
- hostgroups: heavy-postfix, alioth
+ hostgroups: heavy-postfix
depends: process - weightd - master
-
name: process - weightd - child
nrpe: "/usr/lib/nagios/plugins/check_procs -w 2:50 -c 1: -u polw -a 'policyd-weight (child)'"
- hostgroups: heavy-postfix, alioth
+ hostgroups: heavy-postfix
depends: process - weightd - master
###
-
name: unwanted process - policyd-weight
nrpe: "/usr/lib/nagios/plugins/check_procs -w 0:0 -C policyd-weight"
hostgroups: computers
- excludehostgroups: heavy-postfix, alioth
+ excludehostgroups: heavy-postfix
# }}}
# {{{ postfix
###
-
name: system - all services running
nrpe: "/usr/bin/sudo /usr/lib/nagios/plugins/dsa-check-systemd-services"
- hostgroups: jessie, stretch
+ hostgroups: computers
###
-
name: process - slapd
hostgroups: computers
check_interval: 60
retry_interval: 15
+ ####
+ -
+ name: ping peer on mgmt network
+ nrpe: "/usr/lib/nagios/plugins/check_ping -H 172.29.184.12 -w 50,10% -c 200,30%"
+ hosts: conova-node01
+ check_interval: 5
+ max_check_attempts: 4
+ retry_interval: 1
+ -
+ name: ping peer on mgmt network
+ nrpe: "/usr/lib/nagios/plugins/check_ping -H 172.29.184.11 -w 50,10% -c 200,30%"
+ hosts: conova-node02
+ check_interval: 5
+ max_check_attempts: 4
+ retry_interval: 1
# }}}
# }}}