X-Git-Url: https://git.adam-barratt.org.uk/?a=blobdiff_plain;f=config%2Fnagios-master.cfg;h=e9c134c388d58a68387cca25a9f7c56ef217981d;hb=cc33b9c07688eabb479083349004b2720599f379;hp=ba445cccf7eb848a56852f0cafc7904cd464b6b7;hpb=c4643a69f0b67bf2b2d503236b72d36a178e45e2;p=mirror%2Fdsa-nagios.git diff --git a/config/nagios-master.cfg b/config/nagios-master.cfg index ba445cc..e9c134c 100644 --- a/config/nagios-master.cfg +++ b/config/nagios-master.cfg @@ -139,7 +139,7 @@ servers: parents: gw-ubcece hostgroups: layer3-infrastructure gw-unicamp: - address: 177.220.10.129 + address: 143.106.167.113 parents: gw-ubcece hostgroups: layer3-infrastructure gw-utwente: @@ -238,15 +238,15 @@ servers: arm-arm-01: address: 217.140.96.58 parents: gw-arm - hostgroups: computers, hassrvfs, buildd, stretch, broken_mq + hostgroups: computers, hassrvfs, buildd, stretch, broken_mq, sw-raid arm-arm-03: address: 217.140.96.60 parents: gw-arm - hostgroups: computers, hassrvfs, buildd, stretch, broken_mq + hostgroups: computers, hassrvfs, buildd, stretch, broken_mq, sw-raid arm-arm-04: address: 217.140.96.61 parents: gw-arm - hostgroups: computers, hassrvfs, buildd, stretch, broken_mq + hostgroups: computers, hassrvfs, buildd, stretch, broken_mq, sw-raid harris: address: 217.140.96.66 parents: gw-arm @@ -405,9 +405,7 @@ servers: moszumanska: address: 5.153.231.21 parents: ganeti-bytemark - contact_groups: alioth-admins - hostgroups: computers, general, wheezy, postgres91-hosts, apache2-hosts, acpid-hosts, apache-https, brokensamhain, no-bacula, bind9-hosts, xinetd-hosts, alioth, heavy-exim, spamd - no-servicegroups: true + hostgroups: secondary-IPs dillon: address: 5.153.231.22 parents: ganeti-bytemark @@ -593,7 +591,7 @@ servers: pkgmirror-csail: address: 128.31.0.51 parents: ganeti-csail - hostgroups: computers, service, kvmdomains, stretch, apache2-hosts, no-bacula, apache-https, hassrvfs, systemd-timesyncd + hostgroups: computers, service, kvmdomains, stretch, apache2-hosts, no-bacula, apache-https, hassrvfs, systemd-timesyncd, varnish-hosts usper: address: 128.31.0.69 parents: ganeti-csail @@ -683,27 +681,32 @@ servers: lw01: address: 185.17.185.177 parents: gw-leaseweb - hostgroups: computers, service, jessie, dl180, nfs-server, rsyncd-hosts + hostgroups: computers, service, stretch, dl180, nfs-server, rsyncd-hosts lw02: address: 185.17.185.178 parents: gw-leaseweb - hostgroups: computers, service, jessie, dl180, nfs-server, rsyncd-hosts + hostgroups: computers, service, stretch, dl180, nfs-server, rsyncd-hosts lw03: address: 185.17.185.179 parents: gw-leaseweb - hostgroups: computers, service, jessie, dl180, nfs-server, rsyncd-hosts + hostgroups: computers, service, stretch, dl180, nfs-server, rsyncd-hosts lw04: address: 185.17.185.180 parents: gw-leaseweb - hostgroups: computers, service, jessie, dl180, nfs-server, rsyncd-hosts + hostgroups: computers, service, stretch, dl180, nfs-server, rsyncd-hosts lw07: address: 185.17.185.187 parents: gw-leaseweb - hostgroups: computers, service, jessie, dl180, nfs-client, autofs, hassrvfs, postgres94-hosts, apache2-hosts + hostgroups: computers, service, stretch, dl180, nfs-client, autofs, hassrvfs, postgres96-hosts, apache2-hosts, haproxy-hosts, haproxy-https-host, varnish-hosts + lw07-2: + address: 185.17.185.185 + parents: lw07 + hostgroups: secondary-IPs, https-service + lw08: address: 185.17.185.189 parents: gw-leaseweb - hostgroups: computers, service, jessie, dl180, nfs-client, autofs, hassrvfs, apache2-hosts + hostgroups: computers, service, stretch, dl180, nfs-client, autofs, hassrvfs, apache2-hosts lw09: address: 185.17.185.181 parents: gw-leaseweb @@ -831,25 +834,20 @@ servers: address: 140.211.166.197 parents: pieta hostgroups: computers, hassrvfs, buildd, jessie - partch: - address: 140.211.15.152 - parents: gw-osuosl - hostgroups: computers, jessie, hassrvfs, porterbox, sw-raid # }}} # {{{ gw-sanger sallinen: address: 193.62.202.26 parents: gw-sanger - hostgroups: computers, service, stretch, dl380, nfs-client, autofs, postgres96-hosts + hostgroups: computers, service, stretch, dl380, nfs-client, autofs, postgres96-hosts, apache2-hosts, haproxy-hosts, haproxy-https-host, varnish-hosts + sallinen-2: + address: 193.62.202.27 + parents: sallinen + hostgroups: secondary-IPs, https-service sibelius: address: 193.62.202.28 parents: gw-sanger - hostgroups: computers, postgres94-hosts, service, apache2-hosts, sw-raid, jessie, rsyncd-hosts, hasvarlogfs, multipath-hosts, nfs-server - contacts: tjrc1, dave - smetana: - address: 193.62.202.29 - parents: gw-sanger - hostgroups: computers, sw-raid, sparc, wheezy, no-bacula + hostgroups: computers, postgres94-hosts, service, apache2-hosts, sw-raid, jessie, rsyncd-hosts, hasvarlogfs, multipath-hosts, nfs-server, varnish-hosts contacts: tjrc1, dave # }}} # {{{ gw-scanplus @@ -989,6 +987,14 @@ servers: address: 209.87.16.46 parents: ubc-gateway hostgroups: computers, service, kvmdomains, stretch, systemd-timesyncd, apache2-hosts, apache-https, broken_https_default_vhost + kantuser: + address: 209.87.16.47 + parents: ubc-gateway + hostgroups: computers, service, kvmdomains, stretch, systemd-timesyncd, apache2-hosts + grabbe: + address: 209.87.16.48 + parents: ubc-gateway + hostgroups: computers, service, kvmdomains, stretch, systemd-timesyncd, apache2-hosts, apache-https # }}} # {{{ gw-umn #saens: @@ -1014,19 +1020,19 @@ servers: # }}} # {{{ gw-unicamp prokofiev: - address: 177.220.10.140 + address: 143.106.167.124 parents: gw-unicamp hostgroups: computers, stretch, service, manyprocesses powerpc-unicamp-01: - address: 177.220.10.141 + address: 143.106.167.120 parents: prokofiev hostgroups: computers, hassrvfs, buildd, jessie ppc64el-unicamp-01: - address: 177.220.10.142 + address: 143.106.167.121 parents: prokofiev hostgroups: computers, hassrvfs, buildd, stretch plummer: - address: 177.220.10.143 + address: 143.106.167.122 parents: prokofiev hostgroups: computers, porterbox, hassrvfs, stretch # }}} @@ -1092,9 +1098,6 @@ hostgroups: armhf: alias: armhf private: 1 - sparc: - alias: sparc - private: 1 porterbox: alias: developer accessible porter machines @@ -1138,8 +1141,6 @@ hostgroups: alias: Dell PowerEdge 1950 hosts private: 1 - wheezy: - alias: Hosts running wheezy jessie: alias: Hosts running jessie stretch: @@ -1183,9 +1184,6 @@ hostgroups: xinetd-hosts: alias: hosts providing services via xinetd private: 1 - postgres91-hosts: - alias: hosts running postgres91 - private: 1 postgres94-hosts: alias: hosts running postgres94 private: 1 @@ -1220,6 +1218,15 @@ hostgroups: alias: hosts with lots and lots of (kernel) processes crazymanyprocesses: alias: hosts with stupidly lots of processes + varnish-hosts: + alias: hosts running varnish + private: 1 + haproxy-hosts: + alias: hosts running haproxy + private: 1 + haproxy-https-host: + alias: "host providing https on the standard port via haproxy" + private: 1 no-bacula: alias: hosts which are not being backed up with bacula @@ -1475,6 +1482,38 @@ services: servicegroups: diskspace nrpe: "/usr/lib/nagios/plugins/check_disk 95 97 /storage/snapshot-farm-10" hosts: lw10 + + - + name: disk usage on nfs farm 1 + servicegroups: diskspace + nrpe: "/usr/lib/nagios/plugins/check_disk 95 97 /auto.dsa/snapshot-1" + hosts: lw07 + - + name: disk usage on nfs farm 2 + servicegroups: diskspace + nrpe: "/usr/lib/nagios/plugins/check_disk 95 97 /auto.dsa/snapshot-2" + hosts: lw07 + - + name: disk usage on nfs farm 3 + servicegroups: diskspace + nrpe: "/usr/lib/nagios/plugins/check_disk 95 97 /auto.dsa/snapshot-3" + hosts: lw07 + - + name: disk usage on nfs farm 4 + servicegroups: diskspace + nrpe: "/usr/lib/nagios/plugins/check_disk 95 97 /auto.dsa/snapshot-4" + hosts: lw07 + - + name: disk usage on nfs farm 09 + servicegroups: diskspace + nrpe: "/usr/lib/nagios/plugins/check_disk 95 97 /auto.dsa/snapshot-09" + hosts: lw07 + - + name: disk usage on nfs farm 10 + servicegroups: diskspace + nrpe: "/usr/lib/nagios/plugins/check_disk 95 97 /auto.dsa/snapshot-10" + hosts: lw07 + - name: disk usage on /srv/morgue.debian.org/ servicegroups: diskspace @@ -1545,16 +1584,10 @@ services: name: free memory - percent nrpe: "/usr/lib/nagios/plugins/dsa-check-memory -m pct" hostgroups: computers - - - name: process - getty - nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:8 -c 1: -u root -C getty -a /sbin/getty" - hostgroups: computers - excludehosts: zelenka, zandonai - excludehostgroups: jessie, stretch - name: process - getty nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:8 -c 1: -u root -C agetty -a /sbin/agetty" - hostgroups: jessie, stretch + hostgroups: computers - name: processes - zombies @@ -1668,19 +1701,10 @@ services: name: puppetized firewall nrpe: "/usr/lib/nagios/plugins/dsa-check-file -w -f /etc/ferm/conf.d/defs.conf" hostgroups: computers - - - name: process - ulogd - nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u root -C ulogd -a '/usr/sbin/ulogd -d'" - hostgroups: computers - excludehostgroups: sparc, jessie, stretch - name: process - ulogd nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u ulog -C ulogd -a '/usr/sbin/ulogd --daemon --uid ulog'" - hostgroups: jessie, stretch - - - name: unexpected process - ulogd - nrpe: "/usr/lib/nagios/plugins/check_procs -w 0:0 -C ulogd" - hostgroups: sparc + hostgroups: computers #### - name: process - samhain @@ -1704,15 +1728,10 @@ services: excludehostgroups: brokensamhain # }}} # {{{ logging - - - name: process - syslog-ng - nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u root -C syslog-ng -a '/sbin/syslog-ng -p /var/run/syslog-ng.pid'" - hostgroups: computers - excludehostgroups: jessie, stretch - name: process - syslog-ng nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u root -C syslog-ng -a '/sbin/syslog-ng -F'" - hostgroups: jessie, stretch + hostgroups: computers - name: remote logging on lotti @@ -1793,7 +1812,7 @@ services: name: system time synced nrpe: "/usr/lib/nagios/plugins/dsa-check-timedatectl -s" hostgroups: computers - excludehostgroups: systemd-timesyncd, wheezy + excludehostgroups: systemd-timesyncd servicegroups: time - name: system time synced @@ -1810,7 +1829,7 @@ services: name: process - irqbalance nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u root -C irqbalance -a '/usr/sbin/irqbalance'" hostgroups: computers - excludehosts: harris, smetana + excludehosts: harris ### - name: process - cron @@ -1823,12 +1842,6 @@ services: hostgroups: computers excludehostgroups: alioth ### - - - name: process - monit - nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1:1 -u root -C monit -a '/usr/bin/monit -d 300 -I -c /etc/monit/monitrc -s /var/lib/monit/monit.state'" - hostgroups: computers - excludehostgroups: alioth, jessie, stretch - ### - name: MQ connection on rainier servicegroups: MQ @@ -1863,22 +1876,11 @@ services: name: process - uptimed nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u daemon -C uptimed -a '/usr/sbin/uptimed'" hostgroups: computers - ### - - - name: process - udevd - nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u root -p 1 -C udevd -a 'udevd'" - hostgroups: computers - excludehostgroups: jessie, stretch - name: process - udevd nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u root -p 1 -C systemd-udevd -a '/lib/systemd/systemd-udevd'" - hostgroups: jessie, stretch + hostgroups: computers ### - - - name: process - acpid - nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u root -C acpid -a '/usr/sbin/acpid'" - hostgroups: acpid-hosts - excludehostgroups: jessie, stretch - name: unexpected process - acpid nrpe: "/usr/lib/nagios/plugins/check_procs -w 0:0 -C acpid" @@ -1898,7 +1900,7 @@ services: - name: process - stunnel4 - puppet-ekeyd nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:6 -c 1: -u stunnel4 -C stunnel4 -a '/usr/bin/stunnel4 /etc/stunnel/puppet-ekeyd.conf'" - hostgroups: wheezy, jessie, stretch + hostgroups: computers excludehostgroups: alioth - name: process - stunnel4 - puppet-ekeyd is crazy @@ -1965,18 +1967,11 @@ services: runfrom: handel # }}} # {{{ HW health/raid - - - name: process - mdadm monitor - servicegroups: raid - nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1:1 -u root -C mdadm -a '/sbin/mdadm --monitor --pid-file /run/mdadm/monitor.pid --daemonise --scan'" - hostgroups: sw-raid - excludehostgroups: jessie, stretch - name: process - mdadm monitor servicegroups: raid nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1:1 -u root -C mdadm -a '/sbin/mdadm --monitor --scan'" hostgroups: sw-raid - excludehostgroups: wheezy - name: RAID - sw raid servicegroups: raid @@ -2094,16 +2089,10 @@ services: nrpe: "/usr/lib/nagios/plugins/check_clamd -H /var/run/clamav/clamd.ctl" hostgroups: heavy-exim, heavy-postfix depends: process - clamav - clamd - - - name: process - clamav - freshclam - nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u clamav -C freshclam -a '/usr/bin/freshclam -d --quiet'" - hostgroups: heavy-exim, heavy-postfix - excludehostgroups: jessie, stretch - name: process - clamav - freshclam nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u clamav -C freshclam -a '/usr/bin/freshclam -d --foreground=true'" hostgroups: heavy-exim, heavy-postfix - excludehostgroups: wheezy - name: unwanted process - clamav nrpe: "/usr/lib/nagios/plugins/check_procs -w 0:0 -C clamd" @@ -2116,18 +2105,11 @@ services: excludehostgroups: heavy-exim, heavy-postfix # }}} # {{{ anti-spam - - - name: process - spamd - master - nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u root -C spamd -a '/usr/sbin/spamd --create-prefs --max-children 5 --helper-home-dir -d --pidfile=/var/run/spamd.pid'" - hostgroups: spamd - excludehosts: picconi - excludehostgroups: jessie, stretch - name: process - spamd - master nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u root -C spamd -a '/usr/sbin/spamd -d --pidfile=/var/run/spamd.pid --create-prefs --max-children 5 --helper-home-dir'" hostgroups: spamd excludehosts: picconi - excludehostgroups: wheezy - name: process - spamd - master nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u root -C spamd -a '/usr/sbin/spamd -d --pidfile=/var/run/spamd.pid --create-prefs --max-children 20 --min-spare=5 --helper-home-dir'" @@ -2156,16 +2138,10 @@ services: hostgroups: computers ### - - - name: process - postgrey - nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u postgrey -a '/usr/sbin/postgrey --pidfile=/var/run/postgrey.pid --daemonize --unix=/var/run/postgrey/socket --retry-window=4 --auto-whitelist-clients=10 --exim'" - hostgroups: heavy-exim - excludehostgroups: jessie, stretch - name: process - postgrey nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u postgrey -a 'postgrey --pidfile=/var/run/postgrey.pid --daemonize --unix=/var/run/postgrey/socket --retry-window=4 --auto-whitelist-clients=10 --exim'" hostgroups: heavy-exim - excludehostgroups: wheezy - name: process - postgrey nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1: -u postgrey -a 'postgrey --pidfile=/var/run/postgrey.pid --daemonize --inet=127.0.0.1:60000'" @@ -2364,15 +2340,52 @@ services: - name: network service - https cert check: dsa_check_cert!443 - hostgroups: apache-https, https-service + hostgroups: apache-https, https-service, haproxy-https-host depends: network service - https check_interval: 60 - name: unwanted network service - https check: dsa_check_port_closed!443 hostgroups: apache2-hosts - excludehostgroups: apache-https + excludehostgroups: apache-https, haproxy-https-host check_interval: 60 + + ### + - + name: process - haproxy - master + nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1:1 -u root -a '/usr/sbin/haproxy-systemd-wrapper'" + hostgroups: haproxy-hosts + - + name: process - haproxy - worker + nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:6 -c 1:15 -u haproxy -a '/usr/sbin/haproxy '" + hostgroups: haproxy-hosts + depends: process - haproxy - master + - + name: network service - https + check: check_https + hostgroups: haproxy-https-host + depends: "process - haproxy - master" + check_interval: 120 + + - + name: unwanted process - haproxy + nrpe: "/usr/lib/nagios/plugins/check_procs -w 0 -C haproxy" + hostgroups: computers + excludehostgroups: haproxy-hosts + + ### + - + name: process - varnish + nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:2 -c 1:15 -u vcache -a '/usr/sbin/varnishd -j unix,user=vcache -F -a '" + hostgroups: varnish-hosts + excludehostgroups: jessie + - + name: unwanted process - varnish + nrpe: "/usr/lib/nagios/plugins/check_procs -w 0 -C varnishd" + hostgroups: computers + excludehostgroups: varnish-hosts + + # }}} # {{{ FTP - @@ -2385,15 +2398,11 @@ services: name: unwanted process - postgresql nrpe: "/usr/lib/nagios/plugins/check_procs -w 0 -C postgres" hostgroups: computers - excludehostgroups: postgres91-hosts, postgres94-hosts, postgres96-hosts + excludehostgroups: postgres94-hosts, postgres96-hosts - name: unwanted process - postgresql 9.0 nrpe: "/usr/lib/nagios/plugins/check_procs -w 0 -C postgres -a '9.0/bin/postgres'" hostgroups: computers - - - name: process - postgresql91 - master - nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:10 -c 1: -u postgres -C postgres -a '/usr/lib/postgresql/9.1/bin/postgres'" - hostgroups: postgres91-hosts - name: process - postgresql94 - master nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:10 -c 1: -u postgres -C postgres -a '/usr/lib/postgresql/9.4/bin/postgres'" @@ -2820,7 +2829,7 @@ services: - name: system - all services running nrpe: "/usr/bin/sudo /usr/lib/nagios/plugins/dsa-check-systemd-services" - hostgroups: jessie, stretch + hostgroups: computers ### - name: process - slapd @@ -2887,7 +2896,6 @@ services: name: puppet - agent check nrpe: "/usr/lib/nagios/plugins/dsa-check-statusfile /var/cache/dsa/nagios/puppet-agent" hostgroups: computers - excludehosts: moszumanska check_interval: 60 retry_interval: 15 # }}}