Remove hildegard and alain (arm* buildds)
[mirror/dsa-nagios.git] / config / nagios-master.cfg
index f1a4ccf..e70bdce 100644 (file)
@@ -90,6 +90,10 @@ servers:
     address: 129.143.57.177
     parents: gw-ubcece
     hostgroups: layer3-infrastructure
+  gw-leaseweb:
+    address: 185.17.185.190
+    parents: gw-ubcece
+    hostgroups: layer3-infrastructure
   gw-man-da:
     address: 82.195.75.126
     parents: gw-ubcece
@@ -159,7 +163,7 @@ servers:
   pkgmirror-1and1:
     address: 213.165.95.4
     parents: powell
-    hostgroups: computers, service, kvmdomains, wheezy, apache2-hosts, no-bacula
+    hostgroups: computers, service, kvmdomains, wheezy, apache2-hosts, no-bacula, apache-https
   babin:
     address: 213.165.95.6
     parents: powell
@@ -202,10 +206,6 @@ servers:
     address: 217.140.96.56
     parents: gw-arm
     hostgroups: computers, hasbootfs, hassrvfs, porterbox, wheezy, deadslow
-  alain:
-    address: 217.140.96.58
-    parents: gw-arm
-    hostgroups: computers, hasbootfs, hassrvfs, buildd, wheezy, deadslow
   alwyn:
     address: 217.140.96.59
     parents: gw-arm
@@ -290,7 +290,7 @@ servers:
   picconi:
     address: 5.153.231.3
     parents: gw-bytemark
-    hostgroups: computers, service, kvmdomains, wheezy, apache2-hosts, nfs-client, autofs, heavy-exim, spamd
+    hostgroups: computers, service, kvmdomains, wheezy, apache2-hosts, nfs-client, autofs, heavy-exim, spamd, apache-https
   senfter:
     address: 5.153.231.4
     parents: gw-bytemark
@@ -365,10 +365,16 @@ servers:
     address: 5.153.231.20
     parents: ganeti-bytemark
     hostgroups: computers, general, kvmdomains, wheezy, nfs-client, autofs
+  moszumanska:
+    address: 5.153.231.21
+    parents: ganeti-bytemark
+    contact_groups: alioth-admins
+    hostgroups: computers, general, wheezy, postgres91-hosts, apache2-hosts, acpid-hosts, apache-https, brokensamhain, no-bacula, bind9-hosts, xinetd-hosts, alioth, heavy-exim, spamd
+    no-servicegroups: true
   dillon:
     address: 5.153.231.22
     parents: ganeti-bytemark
-    hostgroups: computers, general, kvmdomains, wheezy, nfs-client, autofs
+    hostgroups: computers, general, kvmdomains, wheezy, nfs-client, autofs, hassrvfs
   ticharich:
     address: 5.153.231.23
     parents: ganeti-bytemark
@@ -428,18 +434,18 @@ servers:
     hostgroups: computers, sw-raid, hassrvfs, wheezy
   # }}}
   # {{{ gw-ftcollins
-  alkman:
-    address: 192.25.206.63
-    parents: gw-ftcollins
-    hostgroups: computers, buildd, acpid-hosts, wheezy
-  merulo:
-    address: 192.25.206.58
-    parents: gw-ftcollins
-    hostgroups: computers, porterbox, hasusrfs, wheezy
-  mundy:
-    address: 192.25.206.62
-    parents: gw-ftcollins
-    hostgroups: computers, buildd, hassrvfs, sw-raid, acpid-hosts, wheezy
+  #alkman:
+  #  address: 192.25.206.63
+  #  parents: gw-ftcollins
+  #  hostgroups: computers, buildd, acpid-hosts, wheezy
+  #merulo:
+  #  address: 192.25.206.58
+  #  parents: gw-ftcollins
+  #  hostgroups: computers, porterbox, hasusrfs, wheezy
+  #mundy:
+  #  address: 192.25.206.62
+  #  parents: gw-ftcollins
+  #  hostgroups: computers, buildd, hassrvfs, sw-raid, acpid-hosts, wheezy
   spohr:
     address: 192.25.206.33
     parents: gw-ftcollins
@@ -477,6 +483,32 @@ servers:
     parents: gw-isc
     hostgroups: computers, service, apache2-hosts, rsyncd-hosts, acpid-hosts, dl360, hasorgfs, xinetd-hosts, wheezy, security_mirror, no-bacula
   # }}}
+  # {{{ gw-leaseweb
+  lw01:
+    address: 185.17.185.177
+    parents: gw-leaseweb
+    hostgroups: computers, service, acpid-hosts, wheezy, dl180
+  lw02:
+    address: 185.17.185.178
+    parents: gw-leaseweb
+    hostgroups: computers, service, acpid-hosts, wheezy, dl180
+  lw03:
+    address: 185.17.185.179
+    parents: gw-leaseweb
+    hostgroups: computers, service, acpid-hosts, wheezy, dl180
+  lw04:
+    address: 185.17.185.180
+    parents: gw-leaseweb
+    hostgroups: computers, service, acpid-hosts, wheezy, dl180
+  lw05:
+    address: 185.17.185.181
+    parents: gw-leaseweb
+    hostgroups: computers, service, acpid-hosts, wheezy, dl120, sw-raid
+  lw06:
+    address: 185.17.185.182
+    parents: gw-leaseweb
+    hostgroups: computers, service, acpid-hosts, wheezy, dl120, sw-raid
+  # }}}
   # {{{ gw-karlsruhe
   zemlinsky:
     address: 129.143.160.6
@@ -581,7 +613,7 @@ servers:
   zani:
     address: 148.100.88.22
     parents: gw-marist
-    hostgroups: computers, buildd, hassrvfs, wheezy, incomingmailrelayed
+    hostgroups: computers, buildd, hassrvfs, wheezy, incomingmailrelayed, ping-suckers
   # }}}
   # {{{ gw-osuosl
   busoni:
@@ -665,6 +697,10 @@ servers:
     address: 86.59.118.152
     parents: gw-sil
     hostgroups: computers, buildd, wheezy
+  eberlin:
+    address: 86.59.118.155
+    parents: gw-sil
+    hostgroups: computers, buildd, wheezy
   # }}}
   # {{{ gw-ubcece
   sw-ubcece:
@@ -741,10 +777,6 @@ servers:
     address: 206.12.19.13
     parents: sw-ubcece-kais
     hostgroups: computers, hashomefs, sw-raid, rsyncd-hosts, apache2-hosts, xinetd-hosts, service, nfs-server, squeeze, hassrvfs
-  paganini:
-    address: 206.12.19.10
-    parents: sw-ubcece-kais
-    hostgroups: computers, hasbootfs, aacraid, hassrvfs, nfs-client, service, squeeze, autofs
   respighi:
     address: 206.12.19.11
     parents: sw-ubcece-kais
@@ -814,7 +846,7 @@ servers:
   nono:
     address: 206.12.19.123
     parents: traetta
-    hostgroups: computers, service, kvmdomains, wheezy, heavy-exim, xinetd-hosts, apache2-hosts, apache-https
+    hostgroups: computers, service, kvmdomains, wheezy, heavy-exim, xinetd-hosts, apache2-hosts, apache-https, broken_https_default_vhost
   reger:
     address: 206.12.19.124
     parents: ganeti2
@@ -854,7 +886,7 @@ servers:
   diabelli:
     address: 206.12.19.136
     parents: traetta
-    hostgroups: computers, service, hasbootfs, kvmdomains, wheezy, apache2-hosts, apache-https
+    hostgroups: computers, service, hasbootfs, kvmdomains, wheezy, apache2-hosts, apache-https, broken_https_default_vhost
   bizet:
     address: 206.12.19.137
     parents: ganeti2
@@ -883,10 +915,6 @@ servers:
     address: 206.12.19.143
     parents: ganeti2
     hostgroups: computers, service, kvmdomains, wheezy, hassrvfs, apache2-hosts, apache-https
-  stanley:
-    address: 206.12.19.145
-    parents: ganeti2
-    hostgroups: computers, service, kvmdomains, wheezy, hassrvfs, apache2-hosts, no-bacula
   muffat:
     address: 206.12.19.146
     parents: ganeti2
@@ -924,10 +952,6 @@ servers:
     hostgroups: secondary-IPs
   # }}}
   # {{{ gw-ynic
-  hildegard:
-    address: 144.32.168.74
-    parents: gw-ynic
-    hostgroups: computers, hasbootfs, hassrvfs, armhf, wheezy, deadslow, buildd
   howells:
     address: 144.32.168.75
     parents: gw-ynic
@@ -1030,6 +1054,12 @@ hostgroups:
   dl585:
     alias: HP DL385 hosts
     private: 1
+  dl180:
+    alias: HP DL180
+    private: 1
+  dl120:
+    alias: HP DL120
+    private: 1
   sw-raid:
     alias: Hosts with Linux software raid
     private: 1
@@ -1121,6 +1151,9 @@ hostgroups:
   apache-https:
     alias: hosts with https services
     private: 1
+  broken_https_default_vhost:
+    alias: https default vhost does not say 200 OK
+    private: 1
 
   no-bacula:
     alias: hosts which are not being backed up with bacula
@@ -1178,15 +1211,17 @@ hostgroups:
     # i.e. no port 25
     private: 1
 
-  ntpsuckers:
-    alias: "hosts who's ntp offset is often unknown"
-    private: 1
-
   brokensamhain:
     alias: machines that can not run samhain
     private: 1
   high-RTT:
-    alias: machines with hight round trip times
+    alias: machines with high round trip times
+    private: 1
+  ping-suckers:
+    alias: machines that just suck at icmp
+    private: 1
+  alioth:
+    alias: machines that are just awkward
     private: 1
 
   security_mirror:
@@ -1224,7 +1259,7 @@ services:
     name: PING
     check: "check_ping!350.0,20%!600.0,40%"
     hostgroups: pingable
-    excludehostgroups: layer3-infrastructure, high-RTT
+    excludehostgroups: layer3-infrastructure, high-RTT, ping-suckers
     normal_check_interval: 5
     max_check_attempts: 4
     retry_check_interval: 1
@@ -1235,6 +1270,13 @@ services:
     normal_check_interval: 5
     max_check_attempts: 4
     retry_check_interval: 1
+  -
+    name: PING
+    check: "check_ping!600.0,90%!900.0,95%"
+    hostgroups: ping-suckers
+    normal_check_interval: 5
+    max_check_attempts: 4
+    retry_check_interval: 1
   -
     name: PING
     check: "check_ping!2000.0,60%!3000.0,80%"
@@ -1348,9 +1390,9 @@ services:
     nrpe: "/usr/lib/nagios/plugins/check_disk 97 95 /srv/farm-snapshot/farm-misc"
     hosts: sibelius
   -
-    name: disk usage on /var/lib/postgresql/9.1/dak
+    name: disk usage on /var/lib/postgresql/9.1
     servicegroups: diskspace
-    nrpe: "/usr/lib/nagios/plugins/check_disk 75 85 /var/lib/postgresql/9.1/dak"
+    nrpe: "/usr/lib/nagios/plugins/check_disk 75 85 /var/lib/postgresql/9.1"
     hosts: franck
   -
     name: disk usage on /srv/ftp-master.debian.org
@@ -1433,7 +1475,7 @@ services:
     servicegroups: backup
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u bacula -C bacula-fd -a '/usr/sbin/bacula-fd -c /etc/bacula/bacula-fd.conf'"
     hostgroups: computers
-    excludehostgroups: freebsd
+    excludehostgroups: freebsd, alioth
   -
     name: process - bacula-fd
     servicegroups: backup
@@ -1540,6 +1582,12 @@ services:
     nrpe: "/usr/lib/nagios/plugins/dsa-check-uptime"
     hostgroups: computers
  ####
+  -
+    name: processes - samhain zombies
+    nrpe: "/usr/lib/nagios/plugins/check_procs 3 6 -s Z -u root -a samhain"
+    event_handler: dsa_event_handler_restart_samhain
+    hostgroups: computers
+    excludehostgroups: brokensamhain
   -
     name: processes - zombies
     nrpe: "/usr/lib/nagios/plugins/check_procs 5 10 -s Z"
@@ -1622,7 +1670,7 @@ services:
     hostgroups: computers
     depends: process - ntpd
     excludehosts: ancina
-    excludehostgroups: ntpsuckers, deadslow
+    excludehostgroups: deadslow
     servicegroups: time
   #
   -
@@ -1660,11 +1708,13 @@ services:
     remotecheck: "/usr/lib/nagios/plugins/dsa-check-log-age-loghost $HOSTNAME$"
     runfrom: lotti
     hostgroups: computers
+    excludehostgroups: alioth
   -
     name: remote logging on lully
     remotecheck: "/usr/lib/nagios/plugins/dsa-check-log-age-loghost $HOSTNAME$"
     runfrom: lully
     hostgroups: computers
+    excludehostgroups: alioth
   -
     name: MQ connection on rainier
     remotecheck: "/usr/lib/nagios/plugins/dsa-check-mq-connection $HOSTNAME$ ud dsa"
@@ -1672,6 +1722,7 @@ services:
     hostgroups: computers
     normal_check_interval:  60
     retry_check_interval: 15
+    excludehostgroups: alioth
   -
     name: MQ connection on rapoport
     remotecheck: "/usr/lib/nagios/plugins/dsa-check-mq-connection $HOSTNAME$ ud dsa"
@@ -1679,6 +1730,7 @@ services:
     hostgroups: computers
     normal_check_interval:  60
     retry_check_interval: 15
+    excludehostgroups: alioth
  ### MAIL STUFF
  ###
   -
@@ -1814,23 +1866,23 @@ services:
   -
     name: process - weightd - master
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u polw -a 'policyd-weight (master)'"
-    hostgroups: heavy-postfix
+    hostgroups: heavy-postfix, alioth
   -
     name: process - weightd - cache
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u polw -a 'policyd-weight (cache)'"
-    hostgroups: heavy-postfix
+    hostgroups: heavy-postfix, alioth
     depends: process - weightd - master
   -
     name: process - weightd - child
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 2:50 -c 1: -u polw -a 'policyd-weight (child)'"
-    hostgroups: heavy-postfix
+    hostgroups: heavy-postfix, alioth
     depends: process - weightd - master
  ###
   -
     name: unwanted process - policyd-weight
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 0:0 -C policyd-weight"
     hostgroups: computers
-    excludehostgroups: heavy-postfix, deadslow
+    excludehostgroups: heavy-postfix, deadslow, alioth
 
 
  ###
@@ -1921,15 +1973,12 @@ services:
     nrpe: "/usr/lib/nagios/plugins/dsa-check-config"
     hostgroups: computers
     normal_check_interval: 60
+    excludehostgroups: alioth
   -
     name: setup - local hostname etc-hosts
     nrpe: 'if getent ahosts `hostname` | grep -q 127.0; then echo "Warning: local hostname resolves to 127/8 address"; exit 1; else echo "OK: Hostname resolves to non-127/8 address."; exit 0; fi'
     hostgroups: computers
     normal_check_interval: 60
-  -
-    name: setup - ud-ldap freshness
-    nrpe: "/usr/lib/nagios/plugins/dsa-check-udldap-freshness"
-    hostgroups: computers
   -
     name: system - available entropy
     nrpe: "/usr/lib/nagios/plugins/dsa-check-entropy"
@@ -1952,6 +2001,7 @@ services:
     name: process - unbound
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u unbound -C unbound -a '/usr/sbin/unbound'"
     hostgroups: unbound-hosts, squeeze, wheezy
+    excludehostgroups: alioth
  ###
   -
     name: process - uptimed
@@ -2034,7 +2084,7 @@ services:
     name: process - ud-replicated
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1:1 -u root -C ud-replicated -a '/usr/bin/python /usr/bin/ud-replicated'"
     hostgroups: computers
-    excludehostgroups: squeeze,freebsd
+    excludehostgroups: squeeze, freebsd, alioth
   -
     name: process - ud-replicated
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1:1 -u root -C python2.7 -a '/usr/bin/python /usr/bin/ud-replicated'"
@@ -2047,13 +2097,13 @@ services:
     name: process - monit
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1:1 -u root -C monit -a '/usr/bin/monit -d 300 -I -c /etc/monit/monitrc -s /var/lib/monit/monit.state'"
     hostgroups: computers
-    excludehostgroups: squeeze
+    excludehostgroups: squeeze, alioth
   -
     name: HW - hpacucli status
     servicegroups: raid
     nrpe: "/usr/lib/nagios/plugins/dsa-check-hpacucli"
     normal_check_interval: 120
-    hostgroups: dl385, dl380, dl360, bl460
+    hostgroups: dl385, dl380, dl360, bl460, dl180
     excludehosts: schein, rietz
   -
     name: HW - hpacucli status
@@ -2080,6 +2130,13 @@ services:
     normal_check_interval: 120
     hostgroups: dl585
  ###
+  -
+    name: HW - edac status
+    nrpe: "/usr/lib/nagios/plugins/dsa-check-edac"
+    normal_check_interval: 120
+    hosts: lw05, lw06
+    #hostgroups: computers
+    #excludehosts: villa, lobos, senfl, schein
   -
     name: HW - hpasmcli status
     nrpe: "/usr/lib/nagios/plugins/dsa-check-hpasm"
@@ -2279,6 +2336,7 @@ services:
     check: check_https
     hostgroups: apache-https
     excludehosts: handel,menotti
+    excludehostgroups: broken_https_default_vhost
     depends: "process - apache2 - master"
     normal_check_interval: 120
   -
@@ -2287,6 +2345,12 @@ services:
     hosts: handel,menotti
     depends: "process - apache2 - master"
     normal_check_interval: 120
+  -
+    name: network service - https
+    check: dsa_check_https_any_status
+    hostgroups: broken_https_default_vhost
+    depends: "process - apache2 - master"
+    normal_check_interval: 120
   -
     name: network service - https cert
     check: dsa_check_cert!443
@@ -2390,7 +2454,7 @@ services:
     hostgroups: computers
   -
     name: process - postgresql91 - master
-    nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:4 -c 1: -u postgres -C postgres -a '/usr/lib/postgresql/9.1/bin/postgres'"
+    nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:10 -c 1: -u postgres -C postgres -a '/usr/lib/postgresql/9.1/bin/postgres'"
     hostgroups: postgres91-hosts
   -
     name: postgresql backups
@@ -2403,7 +2467,7 @@ services:
     name: process - stunnel4 - puppet-ekeyd
     nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:6 -c 1: -u stunnel4 -C stunnel4 -a '/usr/bin/stunnel4 /etc/stunnel/puppet-ekeyd.conf'"
     hostgroups: squeeze, wheezy
-    excludehostgroups: freebsd
+    excludehostgroups: freebsd, alioth
  ####
   -
     name: process - UPS - nut usbhid-ups - ups1
@@ -2503,6 +2567,13 @@ services:
 
  ############ MISC OTHER Stuff ############
  #####
+  -
+    name: puppetmaster cert
+    nrpe: "/usr/lib/nagios/plugins/dsa-check-cert-expire /var/lib/puppet/ssl/certs/ca.pem"
+    hosts: handel
+    normal_check_interval: 60
+    max_check_attempts: 2
+    retry_check_interval: 5
   -
     name: mirror sync - bugs
     check: "dsa_check_mirrorsync_skew!bugs.debian.org!project/trace/bugs-master.debian.org!120:600"
@@ -2589,5 +2660,12 @@ services:
     remotecheck: "/usr/lib/nagios/plugins/dsa-check-msa-eventlog --start=7778 $HOSTADDRESS$ public"
     runfrom: dijkstra
     hosts: giustini
+ ############
+  -
+    name: current chroots
+    nrpe: "/usr/lib/nagios/plugins/dsa-check-dchroots-current"
+    hostgroups: porterbox
+    normal_check_interval:  60
+    retry_check_interval: 15
 
 # vim: set ts=2 sw=2 et ai si fdm=marker: