both pkgmirror-csail and sibelius run varnish
[mirror/dsa-nagios.git] / config / nagios-master.cfg
index be0938b..aefb247 100644 (file)
@@ -85,10 +85,6 @@ servers:
     address: 72.52.94.70
     parents: gw-ubcece
     hostgroups: layer3-infrastructure
-  gw-karlsruhe:
-    address: 129.143.59.214
-    parents: gw-ubcece
-    hostgroups: layer3-infrastructure
   gw-leaseweb:
     address: 185.17.185.190
     parents: gw-ubcece
@@ -181,6 +177,8 @@ servers:
     address: 130.239.18.123
     parents: gw-accumu
     hostgroups: computers, hasbootfs, aacraid, nfs-client, service, apache2-hosts, stretch, autofs, sw-raid
+    contacts: zobel, tfheen, lfilipoz, zumbi, jcristau, pabs, aurel32, dsa-nsa
+    contact_groups: ""
   mirror-accumu:
     address: 130.242.6.199
     parents: gw-accumu2
@@ -445,7 +443,7 @@ servers:
   x86-bm-01:
     address: 5.153.231.32
     parents: ganeti-bytemark
-    hostgroups: computers, kvmdomains, stretch, no-bacula, systemd-timesyncd
+    hostgroups: computers, pybuildd, hassrvfs, kvmdomains, stretch, systemd-timesyncd
   tate:
     address: 5.153.231.33
     parents: ganeti-bytemark
@@ -487,6 +485,8 @@ servers:
     address: 5.153.231.41
     parents: gw-bytemark
     hostgroups: computers, service, stretch, hassrvfs, dl380, manyprocesses, apache2-hosts
+    contacts: zobel, tfheen, lfilipoz, zumbi, jcristau, pabs, aurel32, dsa-nsa
+    contact_groups: ""
   # }}}
   # {{{ gw-c3sl
   santoro:
@@ -593,7 +593,7 @@ servers:
   pkgmirror-csail:
     address: 128.31.0.51
     parents: ganeti-csail
-    hostgroups: computers, service, kvmdomains, stretch, apache2-hosts, no-bacula, apache-https, hassrvfs, systemd-timesyncd
+    hostgroups: computers, service, kvmdomains, stretch, apache2-hosts, no-bacula, apache-https, hassrvfs, systemd-timesyncd, varnish-hosts
   usper:
     address: 128.31.0.69
     parents: ganeti-csail
@@ -639,7 +639,7 @@ servers:
   x86-grnet-01:
     address: 194.177.211.203
     parents: ganeti-grnet
-    hostgroups: computers, buildd, hassrvfs, kvmdomains, stretch, systemd-timesyncd
+    hostgroups: computers, pybuildd, hassrvfs, kvmdomains, stretch, systemd-timesyncd
   vittoria:
     address: 194.177.211.205
     parents: ganeti-grnet
@@ -683,27 +683,32 @@ servers:
   lw01:
     address: 185.17.185.177
     parents: gw-leaseweb
-    hostgroups: computers, service, jessie, dl180, nfs-server, rsyncd-hosts
+    hostgroups: computers, service, stretch, dl180, nfs-server, rsyncd-hosts
   lw02:
     address: 185.17.185.178
     parents: gw-leaseweb
-    hostgroups: computers, service, jessie, dl180, nfs-server, rsyncd-hosts
+    hostgroups: computers, service, stretch, dl180, nfs-server, rsyncd-hosts
   lw03:
     address: 185.17.185.179
     parents: gw-leaseweb
-    hostgroups: computers, service, jessie, dl180, nfs-server, rsyncd-hosts
+    hostgroups: computers, service, stretch, dl180, nfs-server, rsyncd-hosts
   lw04:
     address: 185.17.185.180
     parents: gw-leaseweb
-    hostgroups: computers, service, jessie, dl180, nfs-server, rsyncd-hosts
+    hostgroups: computers, service, stretch, dl180, nfs-server, rsyncd-hosts
   lw07:
     address: 185.17.185.187
     parents: gw-leaseweb
-    hostgroups: computers, service, jessie, dl180, nfs-client, autofs, hassrvfs, postgres94-hosts, apache2-hosts
+    hostgroups: computers, service, stretch, dl180, nfs-client, autofs, hassrvfs, postgres96-hosts, apache2-hosts, haproxy-hosts, haproxy-https-host, varnish-hosts
+  lw07-2:
+    address: 185.17.185.185
+    parents: lw07
+    hostgroups: secondary-IPs, https-service
+
   lw08:
     address: 185.17.185.189
     parents: gw-leaseweb
-    hostgroups: computers, service, jessie, dl180, nfs-client, autofs, hassrvfs, apache2-hosts
+    hostgroups: computers, service, stretch, dl180, nfs-client, autofs, hassrvfs, apache2-hosts
   lw09:
     address: 185.17.185.181
     parents: gw-leaseweb
@@ -713,13 +718,6 @@ servers:
     parents: gw-leaseweb
     hostgroups: computers, service, stretch, dl180
   # }}}
-  # {{{ gw-karlsruhe
-  zemlinsky:
-    address: 129.143.160.6
-    parents: gw-karlsruhe
-    hostgroups: computers, buildd, stretch
-    contacts: pkern
-  # }}}
   # {{{ gw-manda
   czerny:
     address: 82.195.75.109
@@ -814,7 +812,7 @@ servers:
   zani:
     address: 148.100.88.22
     parents: gw-marist
-    hostgroups: computers, buildd, hassrvfs, stretch, incomingmailrelayed
+    hostgroups: computers, pybuildd, hassrvfs, stretch, incomingmailrelayed
   # }}}
   # {{{ gw-osuosl
   byrd:
@@ -847,11 +845,15 @@ servers:
   sallinen:
     address: 193.62.202.26
     parents: gw-sanger
-    hostgroups: computers, service, stretch, dl380, nfs-client, autofs, postgres96-hosts
+    hostgroups: computers, service, stretch, dl380, nfs-client, autofs, postgres96-hosts, apache2-hosts, haproxy-hosts, haproxy-https-host, varnish-hosts
+  sallinen-2:
+    address: 193.62.202.27
+    parents: sallinen
+    hostgroups: secondary-IPs, https-service
   sibelius:
     address: 193.62.202.28
     parents: gw-sanger
-    hostgroups: computers, postgres94-hosts, service, apache2-hosts, sw-raid, jessie, rsyncd-hosts, hasvarlogfs, multipath-hosts, nfs-server
+    hostgroups: computers, postgres94-hosts, service, apache2-hosts, sw-raid, jessie, rsyncd-hosts, hasvarlogfs, multipath-hosts, nfs-server, varnish-hosts
     contacts: tjrc1, dave
   smetana:
     address: 193.62.202.29
@@ -991,11 +993,19 @@ servers:
   godard:
     address: 209.87.16.44
     parents: ubc-gateway
-    hostgroups: computers, service, kvmdomains, stretch, hassrvfs, apache2-hosts, apache-https, systemd-timesyncd, postfix-hosts, postgres96-hosts, manyprocesses
+    hostgroups: computers, service, kvmdomains, stretch, hassrvfs, apache2-hosts, apache-https, systemd-timesyncd, postfix-hosts, postgres96-hosts, crazymanyprocesses
   debussy:
     address: 209.87.16.46
     parents: ubc-gateway
     hostgroups: computers, service, kvmdomains, stretch, systemd-timesyncd, apache2-hosts, apache-https, broken_https_default_vhost
+  kantuser:
+    address: 209.87.16.47
+    parents: ubc-gateway
+    hostgroups: computers, service, kvmdomains, stretch, systemd-timesyncd, apache2-hosts
+  grabbe:
+    address: 209.87.16.48
+    parents: ubc-gateway
+    hostgroups: computers, service, kvmdomains, stretch, systemd-timesyncd, apache2-hosts, apache-https
   # }}}
   # {{{ gw-umn
   #saens:
@@ -1091,13 +1101,8 @@ hostgroups:
   computers:
     alias: computers
     private: 1
-    extinfo-icon_image: base/debian.png
-    extinfo-icon_image_alt: Debian GNU/Linux
-    extinfo-notes_url: https://db.debian.org/machines.cgi?host=%s
   layer3-infrastructure:
     alias: Layer 3 Devices
-    extinfo-icon_image: base/switch40.png
-    extinfo-icon_image_alt: router
   notacomputer:
     alias: Systems that are not really systems.  Yeah :)
     private: 1
@@ -1114,6 +1119,8 @@ hostgroups:
     alias: machines running services
   buildd:
     alias: buildd systems
+  pybuildd:
+    alias: buildd systems running pybuildd
   general:
     alias: general purpose developer accessible machines
 
@@ -1228,6 +1235,17 @@ hostgroups:
     private: 1
   manyprocesses:
     alias: hosts with lots and lots of (kernel) processes
+  crazymanyprocesses:
+    alias: hosts with stupidly lots of processes
+  varnish-hosts:
+    alias: hosts running varnish
+    private: 1
+  haproxy-hosts:
+    alias: hosts running haproxy
+    private: 1
+  haproxy-https-host:
+    alias: "host providing https on the standard port via haproxy"
+    private: 1
 
   no-bacula:
     alias: hosts which are not being backed up with bacula
@@ -1348,6 +1366,12 @@ services:
     check_interval: 5
     max_check_attempts: 4
     retry_interval: 1
+  -
+    name: network - v6 gw
+    nrpe: "/usr/lib/nagios/plugins/dsa-check-ipv6-default-gw"
+    hostgroups: computers
+    check_interval: 60
+    excludehostgroups: alioth
   # }}}
   # {{{ ### disk usage
   -
@@ -1405,42 +1429,42 @@ services:
   -
     name: disk usage on /srv/farm-snapshot/farm-2017-0
     servicegroups: diskspace
-    nrpe: "/usr/lib/nagios/plugins/check_disk 97 95 /srv/farm-snapshot/farm-2017-0"
+    nrpe: "/usr/lib/nagios/plugins/check_disk 95 97 /srv/farm-snapshot/farm-2017-0"
     hosts: sibelius
   -
     name: disk usage on /srv/farm-snapshot/farm-2017-1
     servicegroups: diskspace
-    nrpe: "/usr/lib/nagios/plugins/check_disk 97 95 /srv/farm-snapshot/farm-2017-1"
+    nrpe: "/usr/lib/nagios/plugins/check_disk 95 97 /srv/farm-snapshot/farm-2017-1"
     hosts: sibelius
   -
     name: disk usage on /srv/farm-snapshot/farm-2017-2
     servicegroups: diskspace
-    nrpe: "/usr/lib/nagios/plugins/check_disk 97 95 /srv/farm-snapshot/farm-2017-2"
+    nrpe: "/usr/lib/nagios/plugins/check_disk 95 97 /srv/farm-snapshot/farm-2017-2"
     hosts: sibelius
   -
     name: disk usage on /srv/farm-snapshot/farm-2017-3
     servicegroups: diskspace
-    nrpe: "/usr/lib/nagios/plugins/check_disk 97 95 /srv/farm-snapshot/farm-2017-3"
+    nrpe: "/usr/lib/nagios/plugins/check_disk 95 97 /srv/farm-snapshot/farm-2017-3"
     hosts: sibelius
   -
     name: disk usage on /srv/farm-snapshot/farm-2017-4
     servicegroups: diskspace
-    nrpe: "/usr/lib/nagios/plugins/check_disk 97 95 /srv/farm-snapshot/farm-2017-4"
+    nrpe: "/usr/lib/nagios/plugins/check_disk 95 97 /srv/farm-snapshot/farm-2017-4"
     hosts: sibelius
   -
     name: disk usage on /srv/farm-snapshot/farm-2017-5
     servicegroups: diskspace
-    nrpe: "/usr/lib/nagios/plugins/check_disk 97 95 /srv/farm-snapshot/farm-2017-5"
+    nrpe: "/usr/lib/nagios/plugins/check_disk 95 97 /srv/farm-snapshot/farm-2017-5"
     hosts: sibelius
   -
     name: disk usage on /srv/farm-snapshot/farm-2017-6
     servicegroups: diskspace
-    nrpe: "/usr/lib/nagios/plugins/check_disk 97 95 /srv/farm-snapshot/farm-2017-6"
+    nrpe: "/usr/lib/nagios/plugins/check_disk 95 97 /srv/farm-snapshot/farm-2017-6"
     hosts: sibelius
   -
     name: disk usage on /srv/farm-snapshot/farm-2017-7
     servicegroups: diskspace
-    nrpe: "/usr/lib/nagios/plugins/check_disk 97 95 /srv/farm-snapshot/farm-2017-7"
+    nrpe: "/usr/lib/nagios/plugins/check_disk 95 97 /srv/farm-snapshot/farm-2017-7"
     hosts: sibelius
   -
     name: disk usage on /srv/ftp-master.debian.org
@@ -1450,37 +1474,47 @@ services:
   -
     name: disk usage on /storage/snapshot-farm-1
     servicegroups: diskspace
-    nrpe: "/usr/lib/nagios/plugins/check_disk 98 92 /storage/snapshot-farm-1"
+    nrpe: "/usr/lib/nagios/plugins/check_disk 95 97 /storage/snapshot-farm-1"
     hosts: lw01
   -
     name: disk usage on /storage/snapshot-farm-2
     servicegroups: diskspace
-    nrpe: "/usr/lib/nagios/plugins/check_disk 98 92 /storage/snapshot-farm-2"
+    nrpe: "/usr/lib/nagios/plugins/check_disk 95 97 /storage/snapshot-farm-2"
     hosts: lw02
   -
     name: disk usage on /storage/snapshot-farm-3
     servicegroups: diskspace
-    nrpe: "/usr/lib/nagios/plugins/check_disk 98 92 /storage/snapshot-farm-3"
+    nrpe: "/usr/lib/nagios/plugins/check_disk 95 97 /storage/snapshot-farm-3"
     hosts: lw03
   -
     name: disk usage on /storage/snapshot-farm-4
     servicegroups: diskspace
-    nrpe: "/usr/lib/nagios/plugins/check_disk 98 92 /storage/snapshot-farm-4"
+    nrpe: "/usr/lib/nagios/plugins/check_disk 95 97 /storage/snapshot-farm-4"
     hosts: lw04
+  -  # snapshot farm volume on lw09; the service name mirrors the path checked below
+    name: disk usage on /storage/snapshot-farm-09
+    servicegroups: diskspace
+    nrpe: "/usr/lib/nagios/plugins/check_disk 95 97 /storage/snapshot-farm-09"
+    hosts: lw09
+  -
+    name: disk usage on /storage/snapshot-farm-10
+    servicegroups: diskspace
+    nrpe: "/usr/lib/nagios/plugins/check_disk 95 97 /storage/snapshot-farm-10"
+    hosts: lw10
   -
     name: disk usage on /srv/morgue.debian.org/
     servicegroups: diskspace
-    nrpe: "/usr/lib/nagios/plugins/check_disk 95 90 /srv/morgue.debian.org"
+    nrpe: "/usr/lib/nagios/plugins/check_disk 90 95 /srv/morgue.debian.org"
     hosts: lw03
   -
     name: disk usage on /srv/QNAP-big/
     servicegroups: diskspace
-    nrpe: "/usr/lib/nagios/plugins/check_disk 90 80 /srv/QNAP-big"
+    nrpe: "/usr/lib/nagios/plugins/check_disk 80 90 /srv/QNAP-big"
     hosts: storace
   -
     name: disk usage on /srv/QNAP-tiny
     servicegroups: diskspace
-    nrpe: "/usr/lib/nagios/plugins/check_disk 90 80 /srv/QNAP-tiny"
+    nrpe: "/usr/lib/nagios/plugins/check_disk 90 95 /srv/QNAP-tiny"
     hosts: storace
   # }}}
   # {{{ ### system
@@ -1520,11 +1554,15 @@ services:
     name: processes - total
     nrpe: "/usr/lib/nagios/plugins/check_procs 620 700"
     hostgroups: computers
-    excludehostgroups: manyprocesses
+    excludehostgroups: manyprocesses, crazymanyprocesses
   -
     name: processes - total
     hostgroups: manyprocesses
     nrpe: "/usr/lib/nagios/plugins/check_procs 1500 1700"
+  -
+    name: processes - total
+    hostgroups: crazymanyprocesses
+    nrpe: "/usr/lib/nagios/plugins/check_procs 15000 25000"
   -
     name: free memory - mb
     nrpe: "/usr/lib/nagios/plugins/dsa-check-memory -m mb"
@@ -1584,7 +1622,7 @@ services:
     remotecheck: "/usr/lib/nagios/plugins/dsa-check-bacula $HOSTNAME$.debian.org"
     runfrom: dinis
     hostgroups: computers
-    excludehostgroups: buildd, porterbox, no-bacula
+    excludehostgroups: buildd, pybuildd, porterbox, no-bacula
     check_interval:  60
     retry_interval: 15
   -
@@ -1593,7 +1631,7 @@ services:
     remotecheck: "/usr/lib/nagios/plugins/dsa-check-bacula -w 1080 -c 1560 $HOSTNAME$.debian.org F"
     runfrom: dinis
     hostgroups: computers
-    excludehostgroups: buildd, porterbox, no-bacula
+    excludehostgroups: buildd, pybuildd, porterbox, no-bacula
     check_interval:  60
     retry_interval: 15
   -
@@ -1946,11 +1984,6 @@ services:
     name: "sso CRL"
     nrpe: "if [ -e /var/lib/dsa/sso/ca.crl ]; then /usr/lib/nagios/plugins/dsa-check-crl-expire -w 129600 -c 86400 /var/lib/dsa/sso/ca.crl; else echo 'No sso/ca.crl on this host.'; fi"
     hostgroups: computers
-  -
-    name: SSL certs - puppet
-    hosts: global
-    remotecheck: "/usr/lib/nagios/plugins/dsa-check-cert-expire-dir /etc/puppet/modules/ssl/files/servicecerts"
-    runfrom: handel
   -
     name: SSL certs - LE
     hosts: global
@@ -2357,15 +2390,51 @@ services:
   -
     name: network service - https cert
     check: dsa_check_cert!443
-    hostgroups: apache-https, https-service
+    hostgroups: apache-https, https-service, haproxy-https-host
     depends: network service - https
     check_interval: 60
   -
     name: unwanted network service - https
     check: dsa_check_port_closed!443
     hostgroups: apache2-hosts
-    excludehostgroups: apache-https
+    excludehostgroups: apache-https, haproxy-https-host
     check_interval: 60
+
+  ### haproxy
+  -
+    name: process - haproxy - master
+    nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1:1 -u root -a '/usr/sbin/haproxy-systemd-wrapper'"
+    hostgroups: haproxy-hosts
+  -
+    name: process - haproxy - worker
+    nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:6 -c 1:15 -u haproxy -a '/usr/sbin/haproxy '"
+    hostgroups: haproxy-hosts
+    depends: process - haproxy - master
+  -
+    name: network service - https
+    check: check_https
+    hostgroups: haproxy-https-host
+    depends: "process - haproxy - master"
+    check_interval: 120
+
+  -
+    name: unwanted process - haproxy
+    nrpe: "/usr/lib/nagios/plugins/check_procs -w 0 -C haproxy"
+    hostgroups: computers
+    excludehostgroups: haproxy-hosts
+
+  ### varnish
+  -
+    name: process - varnish
+    nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:2 -c 1:15 -u vcache -a '/usr/sbin/varnishd -j unix,user=vcache -F -a '"
+    hostgroups: varnish-hosts
+  -
+    name: unwanted process - varnish
+    nrpe: "/usr/lib/nagios/plugins/check_procs -w 0 -C varnishd"
+    hostgroups: computers
+    excludehostgroups: varnish-hosts
+
+
   # }}}
   # {{{ FTP
   -
@@ -2407,19 +2476,20 @@ services:
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:5 -u buildd -C buildd -a '/usr/bin/buildd'"
     hostgroups: buildd
     contact_groups: buildd
+  -
+    name: process - buildd
+    servicegroups: buildd
+    nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:5 -u buildd -C python3 -a 'buildd.py'"
+    hostgroups: pybuildd
+    contact_groups: buildd
   -
     name: processes - zombie schroot
     nrpe: "(/usr/lib/nagios/plugins/check_procs -a schroot -s Zs -c 0 > /dev/null || /usr/lib/nagios/plugins/check_procs -a schroot -s Zs -c 0) && /usr/lib/nagios/plugins/check_procs -a schroot -s ZNs -c 0"
-    hostgroups: buildd
+    hostgroups: buildd, pybuildd
     contact_groups: +buildd
     check_interval: 5
     max_check_attempts: 24
     retry_interval: 5
-  -
-    name: processes - lvcreate
-    nrpe: "/usr/lib/nagios/plugins/check_procs -m 'ELAPSED' -c 500 -C lvcreate -u root -a 'lvcreate'"
-    hostgroups: buildd
-    contact_groups: +buildd
   # }}}
   # {{{ NFS Stuff
   -
@@ -2705,10 +2775,6 @@ services:
     name: DNS SOA sync - 144-28.118.59.86.in-addr.arpa
     check: "dsa_check_soas_add!denis.debian.org!144-28.118.59.86.in-addr.arpa"
     hosts: global
-  -
-    name: DNS SOA sync - alioth.debian.org
-    check: "dsa_check_soas_add!denis.debian.org!alioth.debian.org"
-    hosts: global
   -
     name: DNS SOA sync - debconf.net
     check: "dsa_check_soas_add!denis.debian.org!debconf.net"
@@ -2866,6 +2932,11 @@ services:
     check: dsa_check_cert!5061
     check_interval: 60
     hosts: vogler
+  -
+    name: freeradius process
+    nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1:1 -u freerad -C freeradius -a '/usr/sbin/freeradius -xx'"
+    check_interval: 60
+    hosts: vogler
   ####
   -
     name: puppetmaster cert