X-Git-Url: https://git.adam-barratt.org.uk/?a=blobdiff_plain;ds=sidebyside;f=config%2Fnagios-master.cfg;h=e22a9781722c06d34b5c3c39a5fc14f13c4c0271;hb=e4b50b7ffb43e15e014e00939e785a64227b6687;hp=7278b5df29bcd5fe740a42f4e3514624f4377dc5;hpb=141315704546e1677b902059a430bb05cc169f41;p=mirror%2Fdsa-nagios.git diff --git a/config/nagios-master.cfg b/config/nagios-master.cfg index 7278b5d..e22a978 100644 --- a/config/nagios-master.cfg +++ b/config/nagios-master.cfg @@ -515,11 +515,11 @@ servers: conova-node01: address: 217.196.149.227 parents: gw-conova - hostgroups: computers, stretch, service, sw-raid + hostgroups: computers, stretch, service, sw-raid, drbd-hosts conova-node02: address: 217.196.149.228 parents: gw-conova - hostgroups: computers, stretch, service, sw-raid + hostgroups: computers, stretch, service, sw-raid, drbd-hosts ganeti-conova: address: 217.196.149.235 parents: gw-conova @@ -725,6 +725,14 @@ servers: address: 82.195.75.103 parents: gw-manda hostgroups: computers, service, dl380, acpid-hosts, stretch, drbd-hosts, manyprocesses + manda-node03: + address: 82.195.75.69 + parents: gw-manda + hostgroups: computers, service, stretch, r540, drbd-hosts, manyprocesses + manda-node04: + address: 82.195.75.70 + parents: gw-manda + hostgroups: computers, service, stretch, r540, drbd-hosts, manyprocesses bendel: address: 82.195.75.100 parents: ganeti3 @@ -744,7 +752,7 @@ servers: draghi: address: 82.195.75.106 parents: ganeti3 - hostgroups: computers, service, hasbootfs, hassrvfs, apache2-hosts, spamd, heavy-exim, kvmdomains, xinetd-hosts, apache-https, stretch + hostgroups: computers, service, hassrvfs, apache2-hosts, spamd, heavy-exim, kvmdomains, xinetd-hosts, apache-https, stretch geo1: address: 82.195.75.105 parents: ganeti3 @@ -752,7 +760,7 @@ servers: handel: address: 82.195.75.104 parents: ganeti3 - hostgroups: computers, service, kvmdomains, apache2-hosts, stretch, postgres96-hosts + hostgroups: computers, service, kvmdomains, apache2-hosts, stretch, postgres96-hosts, hassrvfs kaufmann: address: 82.195.75.107 parents: ganeti3 @@ -822,6 +830,10 @@ servers: parents: byrd hostgroups: computers, service, kvmdomains, stretch, apache2-hosts, hassrvfs, rsyncd-hosts, apache-https + pijper: + address: 140.211.166.194 + parents: gw-osuosl + hostgroups: computers, stretch, service, manyprocesses pieta: address: 140.211.166.195 parents: gw-osuosl @@ -830,10 +842,6 @@ servers: address: 140.211.166.196 parents: pieta hostgroups: computers, hassrvfs, buildd, stretch - powerpc-osuosl-01: - address: 140.211.166.197 - parents: pieta - hostgroups: computers, hassrvfs, buildd, jessie # }}} # {{{ gw-sanger sallinen: @@ -847,7 +855,7 @@ servers: sibelius: address: 193.62.202.28 parents: gw-sanger - hostgroups: computers, postgres94-hosts, service, apache2-hosts, sw-raid, jessie, rsyncd-hosts, hasvarlogfs, multipath-hosts, nfs-server, varnish-hosts + hostgroups: computers, service, apache2-hosts, sw-raid, jessie, rsyncd-hosts, hasvarlogfs, multipath-hosts, nfs-server, varnish-hosts contacts: tjrc1, dave # }}} # {{{ gw-scanplus @@ -1023,10 +1031,6 @@ servers: address: 143.106.167.124 parents: gw-unicamp hostgroups: computers, stretch, service, manyprocesses - powerpc-unicamp-01: - address: 143.106.167.120 - parents: prokofiev - hostgroups: computers, hassrvfs, buildd, jessie ppc64el-unicamp-01: address: 143.106.167.121 parents: prokofiev @@ -1098,9 +1102,6 @@ hostgroups: armhf: alias: armhf private: 1 - sparc: - alias: sparc - private: 1 porterbox: alias: developer accessible porter machines @@ -1143,6 +1144,9 @@ hostgroups: pe1950: alias: Dell PowerEdge 1950 hosts private: 1 + r540: + alias: Dell PowerEdge R540 hosts + private: 1 jessie: alias: Hosts running jessie @@ -1187,9 +1191,6 @@ hostgroups: xinetd-hosts: alias: hosts providing services via xinetd private: 1 - postgres94-hosts: - alias: hosts running postgres94 - private: 1 postgres96-hosts: alias: hosts running postgres96 private: 1 @@ -1281,9 +1282,6 @@ hostgroups: high-RTT: alias: machines with high round trip times private: 1 - alioth: - alias: machines that just are just awkward - private: 1 #openstack-compute: # alias: nodes that run OpenStack compute # private: 1 @@ -1355,7 +1353,6 @@ services: nrpe: "/usr/lib/nagios/plugins/dsa-check-ipv6-default-gw" hostgroups: computers check_interval: 60 - excludehostgroups: alioth # }}} # {{{ ### disk usage - @@ -1540,7 +1537,6 @@ services: nrpe: "/usr/lib/nagios/plugins/dsa-check-config" hostgroups: computers check_interval: 60 - excludehostgroups: alioth - name: setup - local hostname etc-hosts nrpe: 'if getent ahosts `hostname` | grep -q 127.0; then echo "Warning: local hostname resolves to 127/8 address"; exit 1; else echo "OK: Hostname resolves to non-127/8 address."; exit 0; fi' @@ -1654,7 +1650,6 @@ services: servicegroups: backup nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u bacula -C bacula-fd -a '/usr/sbin/bacula-fd -c /etc/bacula/bacula-fd.conf'" hostgroups: computers - excludehostgroups: alioth - name: network backup status - draghi @@ -1708,10 +1703,6 @@ services: name: process - ulogd nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u ulog -C ulogd -a '/usr/sbin/ulogd --daemon --uid ulog'" hostgroups: computers - - - name: unexpected process - ulogd - nrpe: "/usr/lib/nagios/plugins/check_procs -w 0:0 -C ulogd" - hostgroups: sparc #### - name: process - samhain @@ -1745,19 +1736,16 @@ services: remotecheck: "/usr/lib/nagios/plugins/dsa-check-log-age-loghost $HOSTNAME$" runfrom: lotti hostgroups: computers - excludehostgroups: alioth - name: remote logging on lully remotecheck: "/usr/lib/nagios/plugins/dsa-check-log-age-loghost $HOSTNAME$" runfrom: lully hostgroups: computers - excludehostgroups: alioth - name: remote logging on loghost-grnet-01 remotecheck: "/usr/lib/nagios/plugins/dsa-check-log-age-loghost $HOSTNAME$" runfrom: loghost-grnet-01 hostgroups: computers - excludehostgroups: alioth # }}} # {{{ base service - @@ -1847,7 +1835,6 @@ services: name: process - ud-replicated nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1:1 -u root -C ud-replicated -a '/usr/bin/python /usr/bin/ud-replicated'" hostgroups: computers - excludehostgroups: alioth ### - name: MQ connection on rainier @@ -1857,7 +1844,7 @@ services: hostgroups: computers check_interval: 60 retry_interval: 15 - excludehostgroups: alioth, broken_mq + excludehostgroups: broken_mq - name: MQ connection on rapoport servicegroups: MQ @@ -1866,7 +1853,7 @@ services: hostgroups: computers check_interval: 60 retry_interval: 15 - excludehostgroups: alioth, broken_mq + excludehostgroups: broken_mq ### - name: local resolver @@ -1877,7 +1864,11 @@ services: name: process - unbound nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u unbound -C unbound -a '/usr/sbin/unbound'" hostgroups: computers - excludehostgroups: alioth + - + name: unbound trust anchors + nrpe: "/usr/lib/nagios/plugins/dsa-check-unbound-anchors" + hostgroups: computers + check_interval: 60 ### - name: process - uptimed @@ -1908,12 +1899,10 @@ services: name: process - stunnel4 - puppet-ekeyd nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:6 -c 1: -u stunnel4 -C stunnel4 -a '/usr/bin/stunnel4 /etc/stunnel/puppet-ekeyd.conf'" hostgroups: computers - excludehostgroups: alioth - name: process - stunnel4 - puppet-ekeyd is crazy nrpe: "sudo /usr/lib/nagios/plugins/dsa-check-stunnel-sanity" hostgroups: computers - excludehostgroups: alioth excludehosts: czerny, grnet-node01, storace # }}} # {{{ anti-services @@ -2056,7 +2045,12 @@ services: - name: HW - OpenManage status nrpe: "/usr/bin/sudo /usr/lib/nagios/plugins/dsa-check-openmanage" - hostgroups: pe1950 + hostgroups: pe1950, r540 + excludehosts: wieck, schumann + - + name: HW - OpenManage status + nrpe: "/usr/bin/sudo /usr/lib/nagios/plugins/dsa-check-openmanage -b bp=0" + hosts: wieck, schumann # }}} # }}} # {{{ ### mail stuff @@ -2085,6 +2079,15 @@ services: name: mail queue nrpe: "/usr/lib/nagios/plugins/check_mailq -M exim -w 1000 -c 2000" hostgroups: heavy-exim + - + name: process - fail2ban + nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -C fail2ban-server" + hostgroups: heavy-exim, heavy-postfix + - + name: unwanted process - fail2ban + nrpe: "/usr/lib/nagios/plugins/check_procs -w 0:0 -C fail2ban-server" + hostgroups: computers + excludehostgroups: heavy-exim, heavy-postfix # }}} # {{{ clamav - @@ -2179,23 +2182,23 @@ services: - name: process - weightd - master nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u polw -a 'policyd-weight (master)'" - hostgroups: heavy-postfix, alioth + hostgroups: heavy-postfix - name: process - weightd - cache nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u polw -a 'policyd-weight (cache)'" - hostgroups: heavy-postfix, alioth + hostgroups: heavy-postfix depends: process - weightd - master - name: process - weightd - child nrpe: "/usr/lib/nagios/plugins/check_procs -w 2:50 -c 1: -u polw -a 'policyd-weight (child)'" - hostgroups: heavy-postfix, alioth + hostgroups: heavy-postfix depends: process - weightd - master ### - name: unwanted process - policyd-weight nrpe: "/usr/lib/nagios/plugins/check_procs -w 0:0 -C policyd-weight" hostgroups: computers - excludehostgroups: heavy-postfix, alioth + excludehostgroups: heavy-postfix # }}} # {{{ postfix ### @@ -2405,15 +2408,11 @@ services: name: unwanted process - postgresql nrpe: "/usr/lib/nagios/plugins/check_procs -w 0 -C postgres" hostgroups: computers - excludehostgroups: postgres94-hosts, postgres96-hosts + excludehostgroups: postgres96-hosts - name: unwanted process - postgresql 9.0 nrpe: "/usr/lib/nagios/plugins/check_procs -w 0 -C postgres -a '9.0/bin/postgres'" hostgroups: computers - - - name: process - postgresql94 - master - nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:10 -c 1: -u postgres -C postgres -a '/usr/lib/postgresql/9.4/bin/postgres'" - hostgroups: postgres94-hosts - name: process - postgresql96 - master nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:10 -c 1: -u postgres -C postgres -a '/usr/lib/postgresql/9.6/bin/postgres'" @@ -2905,6 +2904,37 @@ services: hostgroups: computers check_interval: 60 retry_interval: 15 + #### + - + name: ping peer on mgmt network + nrpe: "/usr/lib/nagios/plugins/check_ping -H 172.29.184.12 -w 50,10% -c 200,30%" + hosts: conova-node01 + check_interval: 5 + max_check_attempts: 4 + retry_interval: 1 + - + name: ping peer on mgmt network + nrpe: "/usr/lib/nagios/plugins/check_ping -H 172.29.184.11 -w 50,10% -c 200,30%" + hosts: conova-node02 + check_interval: 5 + max_check_attempts: 4 + retry_interval: 1 + + - + name: ping peer on mgmt network + nrpe: "/usr/lib/nagios/plugins/check_ping -H 172.29.182.14 -w 50,10% -c 200,30%" + hosts: manda-node03 + check_interval: 5 + max_check_attempts: 4 + retry_interval: 1 + - + name: ping peer on mgmt network + nrpe: "/usr/lib/nagios/plugins/check_ping -H 172.29.182.13 -w 50,10% -c 200,30%" + hosts: manda-node04 + check_interval: 5 + max_check_attempts: 4 + retry_interval: 1 + # }}} # }}} # }}}