X-Git-Url: https://git.adam-barratt.org.uk/?a=blobdiff_plain;f=config%2Fnagios-master.cfg;h=d3b85e365ef9fe70ec143ae9d487b6e6012eef0e;hb=e5b63a271a6def4e2b3a3e23857edde90880d2e0;hp=e9c134c388d58a68387cca25a9f7c56ef217981d;hpb=cc33b9c07688eabb479083349004b2720599f379;p=mirror%2Fdsa-nagios.git diff --git a/config/nagios-master.cfg b/config/nagios-master.cfg index e9c134c..d3b85e3 100644 --- a/config/nagios-master.cfg +++ b/config/nagios-master.cfg @@ -515,11 +515,11 @@ servers: conova-node01: address: 217.196.149.227 parents: gw-conova - hostgroups: computers, stretch, service, sw-raid + hostgroups: computers, stretch, service, sw-raid, drbd-hosts conova-node02: address: 217.196.149.228 parents: gw-conova - hostgroups: computers, stretch, service, sw-raid + hostgroups: computers, stretch, service, sw-raid, drbd-hosts ganeti-conova: address: 217.196.149.235 parents: gw-conova @@ -725,6 +725,14 @@ servers: address: 82.195.75.103 parents: gw-manda hostgroups: computers, service, dl380, acpid-hosts, stretch, drbd-hosts, manyprocesses + manda-node03: + address: 82.195.75.69 + parents: gw-manda + hostgroups: computers, service, stretch, r540 + manda-node04: + address: 82.195.75.70 + parents: gw-manda + hostgroups: computers, service, stretch, r540 bendel: address: 82.195.75.100 parents: ganeti3 @@ -752,7 +760,7 @@ servers: handel: address: 82.195.75.104 parents: ganeti3 - hostgroups: computers, service, kvmdomains, apache2-hosts, stretch, postgres96-hosts + hostgroups: computers, service, kvmdomains, apache2-hosts, stretch, postgres96-hosts, hassrvfs kaufmann: address: 82.195.75.107 parents: ganeti3 @@ -830,10 +838,6 @@ servers: address: 140.211.166.196 parents: pieta hostgroups: computers, hassrvfs, buildd, stretch - powerpc-osuosl-01: - address: 140.211.166.197 - parents: pieta - hostgroups: computers, hassrvfs, buildd, jessie # }}} # {{{ gw-sanger sallinen: @@ -847,7 +851,7 @@ servers: sibelius: address: 193.62.202.28 parents: gw-sanger - hostgroups: computers, postgres94-hosts, service, apache2-hosts, sw-raid, jessie, rsyncd-hosts, hasvarlogfs, multipath-hosts, nfs-server, varnish-hosts + hostgroups: computers, service, apache2-hosts, sw-raid, jessie, rsyncd-hosts, hasvarlogfs, multipath-hosts, nfs-server, varnish-hosts contacts: tjrc1, dave # }}} # {{{ gw-scanplus @@ -1023,10 +1027,6 @@ servers: address: 143.106.167.124 parents: gw-unicamp hostgroups: computers, stretch, service, manyprocesses - powerpc-unicamp-01: - address: 143.106.167.120 - parents: prokofiev - hostgroups: computers, hassrvfs, buildd, jessie ppc64el-unicamp-01: address: 143.106.167.121 parents: prokofiev @@ -1140,6 +1140,9 @@ hostgroups: pe1950: alias: Dell PowerEdge 1950 hosts private: 1 + r540: + alias: Dell PowerEdge R540 hosts + private: 1 jessie: alias: Hosts running jessie @@ -1184,9 +1187,6 @@ hostgroups: xinetd-hosts: alias: hosts providing services via xinetd private: 1 - postgres94-hosts: - alias: hosts running postgres94 - private: 1 postgres96-hosts: alias: hosts running postgres96 private: 1 @@ -1278,9 +1278,6 @@ hostgroups: high-RTT: alias: machines with high round trip times private: 1 - alioth: - alias: machines that just are just awkward - private: 1 #openstack-compute: # alias: nodes that run OpenStack compute # private: 1 @@ -1352,7 +1349,6 @@ services: nrpe: "/usr/lib/nagios/plugins/dsa-check-ipv6-default-gw" hostgroups: computers check_interval: 60 - excludehostgroups: alioth # }}} # {{{ ### disk usage - @@ -1537,7 +1533,6 @@ services: nrpe: "/usr/lib/nagios/plugins/dsa-check-config" hostgroups: computers check_interval: 60 - excludehostgroups: alioth - name: setup - local hostname etc-hosts nrpe: 'if getent ahosts `hostname` | grep -q 127.0; then echo "Warning: local hostname resolves to 127/8 address"; exit 1; else echo "OK: Hostname resolves to non-127/8 address."; exit 0; fi' @@ -1651,7 +1646,6 @@ services: servicegroups: backup nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u bacula -C bacula-fd -a '/usr/sbin/bacula-fd -c /etc/bacula/bacula-fd.conf'" hostgroups: computers - excludehostgroups: alioth - name: network backup status - draghi @@ -1738,19 +1732,16 @@ services: remotecheck: "/usr/lib/nagios/plugins/dsa-check-log-age-loghost $HOSTNAME$" runfrom: lotti hostgroups: computers - excludehostgroups: alioth - name: remote logging on lully remotecheck: "/usr/lib/nagios/plugins/dsa-check-log-age-loghost $HOSTNAME$" runfrom: lully hostgroups: computers - excludehostgroups: alioth - name: remote logging on loghost-grnet-01 remotecheck: "/usr/lib/nagios/plugins/dsa-check-log-age-loghost $HOSTNAME$" runfrom: loghost-grnet-01 hostgroups: computers - excludehostgroups: alioth # }}} # {{{ base service - @@ -1840,7 +1831,6 @@ services: name: process - ud-replicated nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1:1 -u root -C ud-replicated -a '/usr/bin/python /usr/bin/ud-replicated'" hostgroups: computers - excludehostgroups: alioth ### - name: MQ connection on rainier @@ -1850,7 +1840,7 @@ services: hostgroups: computers check_interval: 60 retry_interval: 15 - excludehostgroups: alioth, broken_mq + excludehostgroups: broken_mq - name: MQ connection on rapoport servicegroups: MQ @@ -1859,7 +1849,7 @@ services: hostgroups: computers check_interval: 60 retry_interval: 15 - excludehostgroups: alioth, broken_mq + excludehostgroups: broken_mq ### - name: local resolver @@ -1870,7 +1860,11 @@ services: name: process - unbound nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u unbound -C unbound -a '/usr/sbin/unbound'" hostgroups: computers - excludehostgroups: alioth + - + name: unbound trust anchors + nrpe: "/usr/lib/nagios/plugins/dsa-check-unbound-anchors" + hostgroups: computers + check_interval: 60 ### - name: process - uptimed @@ -1901,12 +1895,10 @@ services: name: process - stunnel4 - puppet-ekeyd nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:6 -c 1: -u stunnel4 -C stunnel4 -a '/usr/bin/stunnel4 /etc/stunnel/puppet-ekeyd.conf'" hostgroups: computers - excludehostgroups: alioth - name: process - stunnel4 - puppet-ekeyd is crazy nrpe: "sudo /usr/lib/nagios/plugins/dsa-check-stunnel-sanity" hostgroups: computers - excludehostgroups: alioth excludehosts: czerny, grnet-node01, storace # }}} # {{{ anti-services @@ -2049,7 +2041,7 @@ services: - name: HW - OpenManage status nrpe: "/usr/bin/sudo /usr/lib/nagios/plugins/dsa-check-openmanage" - hostgroups: pe1950 + hostgroups: pe1950, r540 # }}} # }}} # {{{ ### mail stuff @@ -2078,6 +2070,15 @@ services: name: mail queue nrpe: "/usr/lib/nagios/plugins/check_mailq -M exim -w 1000 -c 2000" hostgroups: heavy-exim + - + name: process - fail2ban + nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -C fail2ban-server" + hostgroups: heavy-exim, heavy-postfix + - + name: unwanted process - fail2ban + nrpe: "/usr/lib/nagios/plugins/check_procs -w 0:0 -C fail2ban-server" + hostgroups: computers + excludehostgroups: heavy-exim, heavy-postfix # }}} # {{{ clamav - @@ -2172,23 +2173,23 @@ services: - name: process - weightd - master nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u polw -a 'policyd-weight (master)'" - hostgroups: heavy-postfix, alioth + hostgroups: heavy-postfix - name: process - weightd - cache nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u polw -a 'policyd-weight (cache)'" - hostgroups: heavy-postfix, alioth + hostgroups: heavy-postfix depends: process - weightd - master - name: process - weightd - child nrpe: "/usr/lib/nagios/plugins/check_procs -w 2:50 -c 1: -u polw -a 'policyd-weight (child)'" - hostgroups: heavy-postfix, alioth + hostgroups: heavy-postfix depends: process - weightd - master ### - name: unwanted process - policyd-weight nrpe: "/usr/lib/nagios/plugins/check_procs -w 0:0 -C policyd-weight" hostgroups: computers - excludehostgroups: heavy-postfix, alioth + excludehostgroups: heavy-postfix # }}} # {{{ postfix ### @@ -2398,15 +2399,11 @@ services: name: unwanted process - postgresql nrpe: "/usr/lib/nagios/plugins/check_procs -w 0 -C postgres" hostgroups: computers - excludehostgroups: postgres94-hosts, postgres96-hosts + excludehostgroups: postgres96-hosts - name: unwanted process - postgresql 9.0 nrpe: "/usr/lib/nagios/plugins/check_procs -w 0 -C postgres -a '9.0/bin/postgres'" hostgroups: computers - - - name: process - postgresql94 - master - nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:10 -c 1: -u postgres -C postgres -a '/usr/lib/postgresql/9.4/bin/postgres'" - hostgroups: postgres94-hosts - name: process - postgresql96 - master nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:10 -c 1: -u postgres -C postgres -a '/usr/lib/postgresql/9.6/bin/postgres'" @@ -2898,6 +2895,37 @@ services: hostgroups: computers check_interval: 60 retry_interval: 15 + #### + - + name: ping peer on mgmt network + nrpe: "/usr/lib/nagios/plugins/check_ping -H 172.29.184.12 -w 50,10% -c 200,30%" + hosts: conova-node01 + check_interval: 5 + max_check_attempts: 4 + retry_interval: 1 + - + name: ping peer on mgmt network + nrpe: "/usr/lib/nagios/plugins/check_ping -H 172.29.184.11 -w 50,10% -c 200,30%" + hosts: conova-node02 + check_interval: 5 + max_check_attempts: 4 + retry_interval: 1 + + - + name: ping peer on mgmt network + nrpe: "/usr/lib/nagios/plugins/check_ping -H 172.29.182.14 -w 50,10% -c 200,30%" + hosts: manda-node03 + check_interval: 5 + max_check_attempts: 4 + retry_interval: 1 + - + name: ping peer on mgmt network + nrpe: "/usr/lib/nagios/plugins/check_ping -H 172.29.182.13 -w 50,10% -c 200,30%" + hosts: manda-node04 + check_interval: 5 + max_check_attempts: 4 + retry_interval: 1 + # }}} # }}} # }}}