sibelius no longer runs postgresql
[mirror/dsa-nagios.git] / config / nagios-master.cfg
index 7278b5d..06bfb67 100644 (file)
@@ -515,11 +515,11 @@ servers:
   conova-node01:
     address: 217.196.149.227
     parents: gw-conova
-    hostgroups: computers, stretch, service, sw-raid
+    hostgroups: computers, stretch, service, sw-raid, drbd-hosts
   conova-node02:
     address: 217.196.149.228
     parents: gw-conova
-    hostgroups: computers, stretch, service, sw-raid
+    hostgroups: computers, stretch, service, sw-raid, drbd-hosts
   ganeti-conova:
     address: 217.196.149.235
     parents: gw-conova
@@ -752,7 +752,7 @@ servers:
   handel:
     address: 82.195.75.104
     parents: ganeti3
-    hostgroups: computers, service, kvmdomains, apache2-hosts, stretch, postgres96-hosts
+    hostgroups: computers, service, kvmdomains, apache2-hosts, stretch, postgres96-hosts, hassrvfs
   kaufmann:
     address: 82.195.75.107
     parents: ganeti3
@@ -830,10 +830,6 @@ servers:
     address: 140.211.166.196
     parents: pieta
     hostgroups: computers, hassrvfs, buildd, stretch
-  powerpc-osuosl-01:
-    address: 140.211.166.197
-    parents: pieta
-    hostgroups: computers, hassrvfs, buildd, jessie
   # }}}
   # {{{ gw-sanger
   sallinen:
@@ -847,7 +843,7 @@ servers:
   sibelius:
     address: 193.62.202.28
     parents: gw-sanger
-    hostgroups: computers, postgres94-hosts, service, apache2-hosts, sw-raid, jessie, rsyncd-hosts, hasvarlogfs, multipath-hosts, nfs-server, varnish-hosts
+    hostgroups: computers, service, apache2-hosts, sw-raid, jessie, rsyncd-hosts, hasvarlogfs, multipath-hosts, nfs-server, varnish-hosts
     contacts: tjrc1, dave
   # }}}
   # {{{ gw-scanplus
@@ -1023,10 +1019,6 @@ servers:
     address: 143.106.167.124
     parents: gw-unicamp
     hostgroups: computers, stretch, service, manyprocesses
-  powerpc-unicamp-01:
-    address: 143.106.167.120
-    parents: prokofiev
-    hostgroups: computers, hassrvfs, buildd, jessie
   ppc64el-unicamp-01:
     address: 143.106.167.121
     parents: prokofiev
@@ -1098,9 +1090,6 @@ hostgroups:
   armhf:
     alias: armhf
     private: 1
-  sparc:
-    alias: sparc
-    private: 1
 
   porterbox:
     alias: developer accessible porter machines
@@ -1187,9 +1176,6 @@ hostgroups:
   xinetd-hosts:
     alias: hosts providing services via xinetd
     private: 1
-  postgres94-hosts:
-    alias: hosts running postgres94
-    private: 1
   postgres96-hosts:
     alias: hosts running postgres96
     private: 1
@@ -1281,9 +1267,6 @@ hostgroups:
   high-RTT:
     alias: machines with high round trip times
     private: 1
-  alioth:
-    alias: machines that just are just awkward
-    private: 1
   #openstack-compute:
   #  alias: nodes that run OpenStack compute
   #  private: 1
@@ -1355,7 +1338,6 @@ services:
     nrpe: "/usr/lib/nagios/plugins/dsa-check-ipv6-default-gw"
     hostgroups: computers
     check_interval: 60
-    excludehostgroups: alioth
   # }}}
   # {{{ ### disk usage
   -
@@ -1540,7 +1522,6 @@ services:
     nrpe: "/usr/lib/nagios/plugins/dsa-check-config"
     hostgroups: computers
     check_interval: 60
-    excludehostgroups: alioth
   -
     name: setup - local hostname etc-hosts
     nrpe: 'if getent ahosts `hostname` | grep -q 127.0; then echo "Warning: local hostname resolves to 127/8 address"; exit 1; else echo "OK: Hostname resolves to non-127/8 address."; exit 0; fi'
@@ -1654,7 +1635,6 @@ services:
     servicegroups: backup
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u bacula -C bacula-fd -a '/usr/sbin/bacula-fd -c /etc/bacula/bacula-fd.conf'"
     hostgroups: computers
-    excludehostgroups: alioth
 
   -
     name: network backup status - draghi
@@ -1708,10 +1688,6 @@ services:
     name: process - ulogd
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u ulog -C ulogd -a '/usr/sbin/ulogd --daemon --uid ulog'"
     hostgroups: computers
-  -
-    name: unexpected process - ulogd
-    nrpe: "/usr/lib/nagios/plugins/check_procs -w 0:0 -C ulogd"
-    hostgroups: sparc
   ####
   -
     name: process - samhain
@@ -1745,19 +1721,16 @@ services:
     remotecheck: "/usr/lib/nagios/plugins/dsa-check-log-age-loghost $HOSTNAME$"
     runfrom: lotti
     hostgroups: computers
-    excludehostgroups: alioth
   -
     name: remote logging on lully
     remotecheck: "/usr/lib/nagios/plugins/dsa-check-log-age-loghost $HOSTNAME$"
     runfrom: lully
     hostgroups: computers
-    excludehostgroups: alioth
   -
     name: remote logging on loghost-grnet-01
     remotecheck: "/usr/lib/nagios/plugins/dsa-check-log-age-loghost $HOSTNAME$"
     runfrom: loghost-grnet-01
     hostgroups: computers
-    excludehostgroups: alioth
   # }}}
   # {{{ base service
   -
@@ -1847,7 +1820,6 @@ services:
     name: process - ud-replicated
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1:1 -u root -C ud-replicated -a '/usr/bin/python /usr/bin/ud-replicated'"
     hostgroups: computers
-    excludehostgroups: alioth
   ###
   -
     name: MQ connection on rainier
@@ -1857,7 +1829,7 @@ services:
     hostgroups: computers
     check_interval:  60
     retry_interval: 15
-    excludehostgroups: alioth, broken_mq
+    excludehostgroups: broken_mq
   -
     name: MQ connection on rapoport
     servicegroups: MQ
@@ -1866,7 +1838,7 @@ services:
     hostgroups: computers
     check_interval:  60
     retry_interval: 15
-    excludehostgroups: alioth, broken_mq
+    excludehostgroups: broken_mq
   ###
   -
     name: local resolver
@@ -1877,7 +1849,11 @@ services:
     name: process - unbound
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u unbound -C unbound -a '/usr/sbin/unbound'"
     hostgroups: computers
-    excludehostgroups: alioth
+  -
+    name: unbound trust anchors
+    nrpe: "/usr/lib/nagios/plugins/dsa-check-unbound-anchors"
+    hostgroups: computers
+    check_interval: 60
   ###
   -
     name: process - uptimed
@@ -1908,12 +1884,10 @@ services:
     name: process - stunnel4 - puppet-ekeyd
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:6 -c 1: -u stunnel4 -C stunnel4 -a '/usr/bin/stunnel4 /etc/stunnel/puppet-ekeyd.conf'"
     hostgroups: computers
-    excludehostgroups: alioth
   -
     name: process - stunnel4 - puppet-ekeyd is crazy
     nrpe: "sudo /usr/lib/nagios/plugins/dsa-check-stunnel-sanity"
     hostgroups: computers
-    excludehostgroups: alioth
     excludehosts: czerny, grnet-node01, storace
   # }}}
   # {{{ anti-services
@@ -2085,6 +2059,15 @@ services:
     name: mail queue
     nrpe: "/usr/lib/nagios/plugins/check_mailq -M exim -w 1000 -c 2000"
     hostgroups: heavy-exim
+  -
+    name: process - fail2ban
+    nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -C fail2ban-server"
+    hostgroups: heavy-exim, heavy-postfix
+  -
+    name: unwanted process - fail2ban
+    nrpe: "/usr/lib/nagios/plugins/check_procs -w 0:0 -C fail2ban-server"
+    hostgroups: computers
+    excludehostgroups: heavy-exim, heavy-postfix
   # }}}
   # {{{ clamav
   -
@@ -2179,23 +2162,23 @@ services:
   -
     name: process - weightd - master
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u polw -a 'policyd-weight (master)'"
-    hostgroups: heavy-postfix, alioth
+    hostgroups: heavy-postfix
   -
     name: process - weightd - cache
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u polw -a 'policyd-weight (cache)'"
-    hostgroups: heavy-postfix, alioth
+    hostgroups: heavy-postfix
     depends: process - weightd - master
   -
     name: process - weightd - child
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 2:50 -c 1: -u polw -a 'policyd-weight (child)'"
-    hostgroups: heavy-postfix, alioth
+    hostgroups: heavy-postfix
     depends: process - weightd - master
   ###
   -
     name: unwanted process - policyd-weight
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 0:0 -C policyd-weight"
     hostgroups: computers
-    excludehostgroups: heavy-postfix, alioth
+    excludehostgroups: heavy-postfix
   # }}}
   # {{{ postfix
   ###
@@ -2405,15 +2388,11 @@ services:
     name: unwanted process - postgresql
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 0 -C postgres"
     hostgroups: computers
-    excludehostgroups: postgres94-hosts, postgres96-hosts
+    excludehostgroups: postgres96-hosts
   -
     name: unwanted process - postgresql 9.0
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 0 -C postgres -a '9.0/bin/postgres'"
     hostgroups: computers
-  -
-    name: process - postgresql94 - master
-    nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:10 -c 1: -u postgres -C postgres -a '/usr/lib/postgresql/9.4/bin/postgres'"
-    hostgroups: postgres94-hosts
   -
     name: process - postgresql96 - master
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:10 -c 1: -u postgres -C postgres -a '/usr/lib/postgresql/9.6/bin/postgres'"
@@ -2905,6 +2884,21 @@ services:
     hostgroups: computers
     check_interval:  60
     retry_interval: 15
+  ####
+  -
+    name: ping peer on mgmt network
+    nrpe: "/usr/lib/nagios/plugins/check_ping -H 172.29.184.12 -w 50,10% -c 200,30%"
+    hosts: conova-node01
+    check_interval: 5
+    max_check_attempts: 4
+    retry_interval: 1
+  -
+    name: ping peer on mgmt network
+    nrpe: "/usr/lib/nagios/plugins/check_ping -H 172.29.184.11 -w 50,10% -c 200,30%"
+    hosts: conova-node02
+    check_interval: 5
+    max_check_attempts: 4
+    retry_interval: 1
   # }}}
 # }}}