Decommission kantuser (RT#7583)
[mirror/dsa-nagios.git] / config / nagios-master.cfg
index f844998..cf058f8 100644 (file)
@@ -139,7 +139,7 @@ servers:
     parents: gw-ubcece
     hostgroups: layer3-infrastructure
   gw-unicamp:
-    address: 177.220.10.129
+    address: 143.106.167.113
     parents: gw-ubcece
     hostgroups: layer3-infrastructure
   gw-utwente:
@@ -238,15 +238,15 @@ servers:
   arm-arm-01:
     address: 217.140.96.58
     parents: gw-arm
-    hostgroups: computers, hassrvfs, buildd, stretch, broken_mq
+    hostgroups: computers, hassrvfs, buildd, stretch, broken_mq, sw-raid
   arm-arm-03:
     address: 217.140.96.60
     parents: gw-arm
-    hostgroups: computers, hassrvfs, buildd, stretch, broken_mq
+    hostgroups: computers, hassrvfs, buildd, stretch, broken_mq, sw-raid
   arm-arm-04:
     address: 217.140.96.61
     parents: gw-arm
-    hostgroups: computers, hassrvfs, buildd, stretch, broken_mq
+    hostgroups: computers, hassrvfs, buildd, stretch, broken_mq, sw-raid
   harris:
     address: 217.140.96.66
     parents: gw-arm
@@ -402,10 +402,6 @@ servers:
     address: 5.153.231.20
     parents: ganeti-bytemark
     hostgroups: computers, general, kvmdomains, stretch, nfs-client, autofs, systemd-timesyncd
-  moszumanska:
-    address: 5.153.231.21
-    parents: ganeti-bytemark
-    hostgroups: secondary-IPs
   dillon:
     address: 5.153.231.22
     parents: ganeti-bytemark
@@ -515,11 +511,11 @@ servers:
   conova-node01:
     address: 217.196.149.227
     parents: gw-conova
-    hostgroups: computers, stretch, service, sw-raid
+    hostgroups: computers, stretch, service, sw-raid, drbd-hosts
   conova-node02:
     address: 217.196.149.228
     parents: gw-conova
-    hostgroups: computers, stretch, service, sw-raid
+    hostgroups: computers, stretch, service, sw-raid, drbd-hosts
   ganeti-conova:
     address: 217.196.149.235
     parents: gw-conova
@@ -725,6 +721,14 @@ servers:
     address: 82.195.75.103
     parents: gw-manda
     hostgroups: computers, service, dl380, acpid-hosts, stretch, drbd-hosts, manyprocesses
+  manda-node03:
+    address: 82.195.75.69
+    parents: gw-manda
+    hostgroups: computers, service, stretch, r540, drbd-hosts, manyprocesses
+  manda-node04:
+    address: 82.195.75.70
+    parents: gw-manda
+    hostgroups: computers, service, stretch, r540, drbd-hosts, manyprocesses
   bendel:
     address: 82.195.75.100
     parents: ganeti3
@@ -744,7 +748,7 @@ servers:
   draghi:
     address: 82.195.75.106
     parents: ganeti3
-    hostgroups: computers, service, hasbootfs, hassrvfs, apache2-hosts, spamd, heavy-exim, kvmdomains, xinetd-hosts, apache-https, stretch
+    hostgroups: computers, service, hassrvfs, apache2-hosts, spamd, heavy-exim, kvmdomains, xinetd-hosts, apache-https, stretch
   geo1:
     address: 82.195.75.105
     parents: ganeti3
@@ -752,7 +756,7 @@ servers:
   handel:
     address: 82.195.75.104
     parents: ganeti3
-    hostgroups: computers, service, kvmdomains, apache2-hosts, stretch, postgres96-hosts
+    hostgroups: computers, service, kvmdomains, apache2-hosts, stretch, postgres96-hosts, hassrvfs
   kaufmann:
     address: 82.195.75.107
     parents: ganeti3
@@ -822,22 +826,23 @@ servers:
     parents: byrd
     hostgroups: computers, service, kvmdomains, stretch, apache2-hosts, hassrvfs, rsyncd-hosts, apache-https
 
+  pijper:
+    address: 140.211.166.194
+    parents: gw-osuosl
+    hostgroups: computers, stretch, service, manyprocesses
+  loghost-osuosl-01:
+    address: 140.211.166.202
+    parents: pijper
+    hostgroups: computers, service, kvmdomains, stretch, hassrvfs, systemd-timesyncd
+
   pieta:
     address: 140.211.166.195
     parents: gw-osuosl
     hostgroups: computers, stretch, service, manyprocesses
   ppc64el-osuosl-01:
     address: 140.211.166.196
-    parents: pieta
+    parents: pijper
     hostgroups: computers, hassrvfs, buildd, stretch
-  powerpc-osuosl-01:
-    address: 140.211.166.197
-    parents: pieta
-    hostgroups: computers, hassrvfs, buildd, jessie
-  partch:
-    address: 140.211.15.152
-    parents: gw-osuosl
-    hostgroups: computers, jessie, hassrvfs, porterbox, sw-raid
   # }}}
   # {{{ gw-sanger
   sallinen:
@@ -851,12 +856,7 @@ servers:
   sibelius:
     address: 193.62.202.28
     parents: gw-sanger
-    hostgroups: computers, postgres94-hosts, service, apache2-hosts, sw-raid, jessie, rsyncd-hosts, hasvarlogfs, multipath-hosts, nfs-server, varnish-hosts
-    contacts: tjrc1, dave
-  smetana:
-    address: 193.62.202.29
-    parents: gw-sanger
-    hostgroups: computers, sw-raid, sparc, wheezy, no-bacula
+    hostgroups: computers, service, apache2-hosts, sw-raid, jessie, rsyncd-hosts, hasvarlogfs, multipath-hosts, nfs-server, varnish-hosts
     contacts: tjrc1, dave
   # }}}
   # {{{ gw-scanplus
@@ -996,10 +996,6 @@ servers:
     address: 209.87.16.46
     parents: ubc-gateway
     hostgroups: computers, service, kvmdomains, stretch, systemd-timesyncd, apache2-hosts, apache-https, broken_https_default_vhost
-  kantuser:
-    address: 209.87.16.47
-    parents: ubc-gateway
-    hostgroups: computers, service, kvmdomains, stretch, systemd-timesyncd, apache2-hosts
   grabbe:
     address: 209.87.16.48
     parents: ubc-gateway
@@ -1029,19 +1025,15 @@ servers:
   # }}}
   # {{{ gw-unicamp
   prokofiev:
-    address: 177.220.10.140
+    address: 143.106.167.124
     parents: gw-unicamp
     hostgroups: computers, stretch, service, manyprocesses
-  powerpc-unicamp-01:
-    address: 177.220.10.141
-    parents: prokofiev
-    hostgroups: computers, hassrvfs, buildd, jessie
   ppc64el-unicamp-01:
-    address: 177.220.10.142
+    address: 143.106.167.121
     parents: prokofiev
     hostgroups: computers, hassrvfs, buildd, stretch
   plummer:
-    address: 177.220.10.143
+    address: 143.106.167.122
     parents: prokofiev
     hostgroups: computers, porterbox, hassrvfs, stretch
   # }}}
@@ -1107,9 +1099,6 @@ hostgroups:
   armhf:
     alias: armhf
     private: 1
-  sparc:
-    alias: sparc
-    private: 1
 
   porterbox:
     alias: developer accessible porter machines
@@ -1152,9 +1141,10 @@ hostgroups:
   pe1950:
     alias: Dell PowerEdge 1950 hosts
     private: 1
+  r540:
+    alias: Dell PowerEdge R540 hosts
+    private: 1
 
-  wheezy:
-    alias: Hosts running wheezy
   jessie:
     alias: Hosts running jessie
   stretch:
@@ -1198,9 +1188,6 @@ hostgroups:
   xinetd-hosts:
     alias: hosts providing services via xinetd
     private: 1
-  postgres94-hosts:
-    alias: hosts running postgres94
-    private: 1
   postgres96-hosts:
     alias: hosts running postgres96
     private: 1
@@ -1292,9 +1279,6 @@ hostgroups:
   high-RTT:
     alias: machines with high round trip times
     private: 1
-  alioth:
-    alias: machines that just are just awkward
-    private: 1
   #openstack-compute:
   #  alias: nodes that run OpenStack compute
   #  private: 1
@@ -1366,7 +1350,6 @@ services:
     nrpe: "/usr/lib/nagios/plugins/dsa-check-ipv6-default-gw"
     hostgroups: computers
     check_interval: 60
-    excludehostgroups: alioth
   # }}}
   # {{{ ### disk usage
   -
@@ -1551,7 +1534,6 @@ services:
     nrpe: "/usr/lib/nagios/plugins/dsa-check-config"
     hostgroups: computers
     check_interval: 60
-    excludehostgroups: alioth
   -
     name: setup - local hostname etc-hosts
     nrpe: 'if getent ahosts `hostname` | grep -q 127.0; then echo "Warning: local hostname resolves to 127/8 address"; exit 1; else echo "OK: Hostname resolves to non-127/8 address."; exit 0; fi'
@@ -1598,16 +1580,10 @@ services:
     name: free memory - percent
     nrpe: "/usr/lib/nagios/plugins/dsa-check-memory -m pct"
     hostgroups: computers
-  -
-    name: process - getty
-    nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:8 -c 1: -u root -C getty -a /sbin/getty"
-    hostgroups: computers
-    excludehosts: zelenka, zandonai
-    excludehostgroups: jessie, stretch
   -
     name: process - getty
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:8 -c 1: -u root -C agetty -a /sbin/agetty"
-    hostgroups: jessie, stretch
+    hostgroups: computers
 
   -
     name: processes - zombies
@@ -1671,7 +1647,6 @@ services:
     servicegroups: backup
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u bacula -C bacula-fd -a '/usr/sbin/bacula-fd -c /etc/bacula/bacula-fd.conf'"
     hostgroups: computers
-    excludehostgroups: alioth
 
   -
     name: network backup status - draghi
@@ -1724,11 +1699,7 @@ services:
   -
     name: process - ulogd
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u ulog -C ulogd -a '/usr/sbin/ulogd --daemon --uid ulog'"
-    hostgroups: jessie, stretch
-  -
-    name: unexpected process - ulogd
-    nrpe: "/usr/lib/nagios/plugins/check_procs -w 0:0 -C ulogd"
-    hostgroups: sparc
+    hostgroups: computers
   ####
   -
     name: process - samhain
@@ -1752,34 +1723,31 @@ services:
     excludehostgroups: brokensamhain
   # }}}
   # {{{ logging
-  -
-    name: process - syslog-ng
-    nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u root -C syslog-ng  -a '/sbin/syslog-ng -p /var/run/syslog-ng.pid'"
-    hostgroups: computers
-    excludehostgroups: jessie, stretch
   -
     name: process - syslog-ng
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u root -C syslog-ng  -a '/sbin/syslog-ng -F'"
-    hostgroups: jessie, stretch
+    hostgroups: computers
 
   -
     name: remote logging on lotti
     remotecheck: "/usr/lib/nagios/plugins/dsa-check-log-age-loghost $HOSTNAME$"
     runfrom: lotti
     hostgroups: computers
-    excludehostgroups: alioth
   -
     name: remote logging on lully
     remotecheck: "/usr/lib/nagios/plugins/dsa-check-log-age-loghost $HOSTNAME$"
     runfrom: lully
     hostgroups: computers
-    excludehostgroups: alioth
   -
     name: remote logging on loghost-grnet-01
     remotecheck: "/usr/lib/nagios/plugins/dsa-check-log-age-loghost $HOSTNAME$"
     runfrom: loghost-grnet-01
     hostgroups: computers
-    excludehostgroups: alioth
+  -
+    name: remote logging on loghost-osuosl-01
+    remotecheck: "/usr/lib/nagios/plugins/dsa-check-log-age-loghost $HOSTNAME$"
+    runfrom: loghost-osuosl-01
+    hostgroups: computers
   # }}}
   # {{{ base service
   -
@@ -1841,7 +1809,7 @@ services:
     name: system time synced
     nrpe: "/usr/lib/nagios/plugins/dsa-check-timedatectl -s"
     hostgroups: computers
-    excludehostgroups: systemd-timesyncd, wheezy
+    excludehostgroups: systemd-timesyncd
     servicegroups: time
   -
     name: system time synced
@@ -1858,7 +1826,7 @@ services:
     name: process - irqbalance
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u root -C irqbalance -a '/usr/sbin/irqbalance'"
     hostgroups: computers
-    excludehosts: harris, smetana
+    excludehosts: harris
   ###
   -
     name: process - cron
@@ -1869,13 +1837,6 @@ services:
     name: process - ud-replicated
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1:1 -u root -C ud-replicated -a '/usr/bin/python /usr/bin/ud-replicated'"
     hostgroups: computers
-    excludehostgroups: alioth
-  ###
-  -
-    name: process - monit
-    nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1:1 -u root -C monit -a '/usr/bin/monit -d 300 -I -c /etc/monit/monitrc -s /var/lib/monit/monit.state'"
-    hostgroups: computers
-    excludehostgroups: alioth, jessie, stretch
   ###
   -
     name: MQ connection on rainier
@@ -1885,7 +1846,7 @@ services:
     hostgroups: computers
     check_interval:  60
     retry_interval: 15
-    excludehostgroups: alioth, broken_mq
+    excludehostgroups: broken_mq
   -
     name: MQ connection on rapoport
     servicegroups: MQ
@@ -1894,7 +1855,7 @@ services:
     hostgroups: computers
     check_interval:  60
     retry_interval: 15
-    excludehostgroups: alioth, broken_mq
+    excludehostgroups: broken_mq
   ###
   -
     name: local resolver
@@ -1905,22 +1866,20 @@ services:
     name: process - unbound
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u unbound -C unbound -a '/usr/sbin/unbound'"
     hostgroups: computers
-    excludehostgroups: alioth
-  ###
   -
-    name: process - uptimed
-    nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u daemon -C uptimed -a '/usr/sbin/uptimed'"
+    name: unbound trust anchors
+    nrpe: "/usr/lib/nagios/plugins/dsa-check-unbound-anchors"
     hostgroups: computers
+    check_interval: 60
   ###
   -
-    name: process - udevd
-    nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u root -p 1 -C udevd -a 'udevd'"
+    name: process - uptimed
+    nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u daemon -C uptimed -a '/usr/sbin/uptimed'"
     hostgroups: computers
-    excludehostgroups: jessie, stretch
   -
     name: process - udevd
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u root -p 1 -C systemd-udevd -a '/lib/systemd/systemd-udevd'"
-    hostgroups: jessie, stretch
+    hostgroups: computers
   ###
   -
     name: unexpected process - acpid
@@ -1941,13 +1900,11 @@ services:
   -
     name: process - stunnel4 - puppet-ekeyd
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:6 -c 1: -u stunnel4 -C stunnel4 -a '/usr/bin/stunnel4 /etc/stunnel/puppet-ekeyd.conf'"
-    hostgroups: wheezy, jessie, stretch
-    excludehostgroups: alioth
+    hostgroups: computers
   -
     name: process - stunnel4 - puppet-ekeyd is crazy
     nrpe: "sudo /usr/lib/nagios/plugins/dsa-check-stunnel-sanity"
     hostgroups: computers
-    excludehostgroups: alioth
     excludehosts: czerny, grnet-node01, storace
   # }}}
   # {{{ anti-services
@@ -2008,18 +1965,11 @@ services:
     runfrom: handel
   # }}}
   # {{{ HW health/raid
-  -
-    name: process - mdadm monitor
-    servicegroups: raid
-    nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1:1 -u root -C mdadm -a '/sbin/mdadm --monitor --pid-file /run/mdadm/monitor.pid --daemonise --scan'"
-    hostgroups: sw-raid
-    excludehostgroups: jessie, stretch
   -
     name: process - mdadm monitor
     servicegroups: raid
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1:1 -u root -C mdadm -a '/sbin/mdadm --monitor --scan'"
     hostgroups: sw-raid
-    excludehostgroups: wheezy
   -
     name: RAID - sw raid
     servicegroups: raid
@@ -2097,7 +2047,12 @@ services:
   -
     name: HW - OpenManage status
     nrpe: "/usr/bin/sudo /usr/lib/nagios/plugins/dsa-check-openmanage"
-    hostgroups: pe1950
+    hostgroups: pe1950, r540
+    excludehosts: wieck, schumann
+  -
+    name: HW - OpenManage status
+    nrpe: "/usr/bin/sudo /usr/lib/nagios/plugins/dsa-check-openmanage -b bp=0 -b bat_charge=0:0"
+    hosts: wieck, schumann
   # }}}
   # }}}
   # {{{ ### mail stuff
@@ -2126,6 +2081,15 @@ services:
     name: mail queue
     nrpe: "/usr/lib/nagios/plugins/check_mailq -M exim -w 1000 -c 2000"
     hostgroups: heavy-exim
+  -
+    name: process - fail2ban
+    nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -C fail2ban-server"
+    hostgroups: heavy-exim, heavy-postfix
+  -
+    name: unwanted process - fail2ban
+    nrpe: "/usr/lib/nagios/plugins/check_procs -w 0:0 -C fail2ban-server"
+    hostgroups: computers
+    excludehostgroups: heavy-exim, heavy-postfix
   # }}}
   # {{{ clamav
   -
@@ -2220,23 +2184,23 @@ services:
   -
     name: process - weightd - master
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u polw -a 'policyd-weight (master)'"
-    hostgroups: heavy-postfix, alioth
+    hostgroups: heavy-postfix
   -
     name: process - weightd - cache
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u polw -a 'policyd-weight (cache)'"
-    hostgroups: heavy-postfix, alioth
+    hostgroups: heavy-postfix
     depends: process - weightd - master
   -
     name: process - weightd - child
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 2:50 -c 1: -u polw -a 'policyd-weight (child)'"
-    hostgroups: heavy-postfix, alioth
+    hostgroups: heavy-postfix
     depends: process - weightd - master
   ###
   -
     name: unwanted process - policyd-weight
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 0:0 -C policyd-weight"
     hostgroups: computers
-    excludehostgroups: heavy-postfix, alioth
+    excludehostgroups: heavy-postfix
   # }}}
   # {{{ postfix
   ###
@@ -2446,15 +2410,11 @@ services:
     name: unwanted process - postgresql
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 0 -C postgres"
     hostgroups: computers
-    excludehostgroups: postgres94-hosts, postgres96-hosts
+    excludehostgroups: postgres96-hosts
   -
     name: unwanted process - postgresql 9.0
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 0 -C postgres -a '9.0/bin/postgres'"
     hostgroups: computers
-  -
-    name: process - postgresql94 - master
-    nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:10 -c 1: -u postgres -C postgres -a '/usr/lib/postgresql/9.4/bin/postgres'"
-    hostgroups: postgres94-hosts
   -
     name: process - postgresql96 - master
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:10 -c 1: -u postgres -C postgres -a '/usr/lib/postgresql/9.6/bin/postgres'"
@@ -2737,6 +2697,16 @@ services:
     check: "dsa_check_staticsync!miniconf10.debconf.org"
     hosts: global
     servicegroups: mirror
+  -
+    name: mirror static sync - wiki
+    check: "dsa_check_staticsync!wiki.debconf.org"
+    hosts: global
+    servicegroups: mirror
+  -
+    name: mirror static sync - www
+    check: "dsa_check_staticsync!www.debconf.org"
+    hosts: global
+    servicegroups: mirror
   # }}}
   # {{{ DNS
   -
@@ -2877,7 +2847,7 @@ services:
   -
     name: system - all services running
     nrpe: "/usr/bin/sudo /usr/lib/nagios/plugins/dsa-check-systemd-services"
-    hostgroups: jessie, stretch
+    hostgroups: computers
   ###
   -
     name: process - slapd
@@ -2946,6 +2916,37 @@ services:
     hostgroups: computers
     check_interval:  60
     retry_interval: 15
+  ####
+  -
+    name: ping peer on mgmt network
+    nrpe: "/usr/lib/nagios/plugins/check_ping -H 172.29.184.12 -w 50,10% -c 200,30%"
+    hosts: conova-node01
+    check_interval: 5
+    max_check_attempts: 4
+    retry_interval: 1
+  -
+    name: ping peer on mgmt network
+    nrpe: "/usr/lib/nagios/plugins/check_ping -H 172.29.184.11 -w 50,10% -c 200,30%"
+    hosts: conova-node02
+    check_interval: 5
+    max_check_attempts: 4
+    retry_interval: 1
+
+  -
+    name: ping peer on mgmt network
+    nrpe: "/usr/lib/nagios/plugins/check_ping -H 172.29.182.14 -w 50,10% -c 200,30%"
+    hosts: manda-node03
+    check_interval: 5
+    max_check_attempts: 4
+    retry_interval: 1
+  -
+    name: ping peer on mgmt network
+    nrpe: "/usr/lib/nagios/plugins/check_ping -H 172.29.182.13 -w 50,10% -c 200,30%"
+    hosts: manda-node04
+    check_interval: 5
+    max_check_attempts: 4
+    retry_interval: 1
+  # }}}
   # }}}
 # }}}