X-Git-Url: https://git.adam-barratt.org.uk/?a=blobdiff_plain;ds=sidebyside;f=config%2Fnagios-master.cfg;h=911fa93f96e16987e5c00b639f5f2a2912da29a2;hb=fb1dfd0900ee419e17df34c9e1b0cf9bc5011c0f;hp=999aa941387fd5b3fffc6dd47aa371e296262619;hpb=fc8a680d8b01ca1f7bb36c795944b26abfac12ac;p=mirror%2Fdsa-nagios.git diff --git a/config/nagios-master.cfg b/config/nagios-master.cfg index 999aa94..911fa93 100644 --- a/config/nagios-master.cfg +++ b/config/nagios-master.cfg @@ -85,10 +85,6 @@ servers: address: 72.52.94.70 parents: gw-ubcece hostgroups: layer3-infrastructure - gw-karlsruhe: - address: 129.143.59.214 - parents: gw-ubcece - hostgroups: layer3-infrastructure gw-leaseweb: address: 185.17.185.190 parents: gw-ubcece @@ -447,7 +443,7 @@ servers: x86-bm-01: address: 5.153.231.32 parents: ganeti-bytemark - hostgroups: computers, kvmdomains, stretch, no-bacula, systemd-timesyncd + hostgroups: computers, pybuildd, hassrvfs, kvmdomains, stretch, systemd-timesyncd tate: address: 5.153.231.33 parents: ganeti-bytemark @@ -643,7 +639,7 @@ servers: x86-grnet-01: address: 194.177.211.203 parents: ganeti-grnet - hostgroups: computers, buildd, hassrvfs, kvmdomains, stretch, systemd-timesyncd + hostgroups: computers, pybuildd, hassrvfs, kvmdomains, stretch, systemd-timesyncd vittoria: address: 194.177.211.205 parents: ganeti-grnet @@ -687,27 +683,32 @@ servers: lw01: address: 185.17.185.177 parents: gw-leaseweb - hostgroups: computers, service, jessie, dl180, nfs-server, rsyncd-hosts + hostgroups: computers, service, stretch, dl180, nfs-server, rsyncd-hosts lw02: address: 185.17.185.178 parents: gw-leaseweb - hostgroups: computers, service, jessie, dl180, nfs-server, rsyncd-hosts + hostgroups: computers, service, stretch, dl180, nfs-server, rsyncd-hosts lw03: address: 185.17.185.179 parents: gw-leaseweb - hostgroups: computers, service, jessie, dl180, nfs-server, rsyncd-hosts + hostgroups: computers, service, stretch, dl180, nfs-server, rsyncd-hosts lw04: address: 185.17.185.180 parents: gw-leaseweb - hostgroups: computers, service, jessie, dl180, nfs-server, rsyncd-hosts + hostgroups: computers, service, stretch, dl180, nfs-server, rsyncd-hosts lw07: address: 185.17.185.187 parents: gw-leaseweb - hostgroups: computers, service, jessie, dl180, nfs-client, autofs, hassrvfs, postgres94-hosts, apache2-hosts + hostgroups: computers, service, stretch, dl180, nfs-client, autofs, hassrvfs, postgres96-hosts, apache2-hosts, haproxy-hosts, haproxy-https-host, varnish-hosts + lw07-2: + address: 185.17.185.185 + parents: lw07 + hostgroups: secondary-IPs, https-service + lw08: address: 185.17.185.189 parents: gw-leaseweb - hostgroups: computers, service, jessie, dl180, nfs-client, autofs, hassrvfs, apache2-hosts + hostgroups: computers, service, stretch, dl180, nfs-client, autofs, hassrvfs, apache2-hosts lw09: address: 185.17.185.181 parents: gw-leaseweb @@ -717,13 +718,6 @@ servers: parents: gw-leaseweb hostgroups: computers, service, stretch, dl180 # }}} - # {{{ gw-karlsruhe - zemlinsky: - address: 129.143.160.6 - parents: gw-karlsruhe - hostgroups: computers, buildd, stretch - contacts: pkern - # }}} # {{{ gw-manda czerny: address: 82.195.75.109 @@ -818,7 +812,7 @@ servers: zani: address: 148.100.88.22 parents: gw-marist - hostgroups: computers, buildd, hassrvfs, stretch, incomingmailrelayed + hostgroups: computers, pybuildd, hassrvfs, stretch, incomingmailrelayed # }}} # {{{ gw-osuosl byrd: @@ -851,7 +845,11 @@ servers: sallinen: address: 193.62.202.26 parents: gw-sanger - hostgroups: computers, service, stretch, dl380, nfs-client, autofs, postgres96-hosts + hostgroups: computers, service, stretch, dl380, nfs-client, autofs, postgres96-hosts, apache2-hosts, haproxy-hosts, haproxy-https-host, varnish-hosts + sallinen-2: + address: 193.62.202.27 + parents: sallinen + hostgroups: secondary-IPs, https-service sibelius: address: 193.62.202.28 parents: gw-sanger @@ -995,11 +993,19 @@ servers: godard: address: 209.87.16.44 parents: ubc-gateway - hostgroups: computers, service, kvmdomains, stretch, hassrvfs, apache2-hosts, apache-https, systemd-timesyncd, postfix-hosts, postgres96-hosts, manyprocesses + hostgroups: computers, service, kvmdomains, stretch, hassrvfs, apache2-hosts, apache-https, systemd-timesyncd, postfix-hosts, postgres96-hosts, crazymanyprocesses debussy: address: 209.87.16.46 parents: ubc-gateway hostgroups: computers, service, kvmdomains, stretch, systemd-timesyncd, apache2-hosts, apache-https, broken_https_default_vhost + kantuser: + address: 209.87.16.47 + parents: ubc-gateway + hostgroups: computers, service, kvmdomains, stretch, systemd-timesyncd, apache2-hosts + grabbe: + address: 209.87.16.48 + parents: ubc-gateway + hostgroups: computers, service, kvmdomains, stretch, systemd-timesyncd, apache2-hosts, apache-https # }}} # {{{ gw-umn #saens: @@ -1095,13 +1101,8 @@ hostgroups: computers: alias: computers private: 1 - extinfo-icon_image: base/debian.png - extinfo-icon_image_alt: Debian GNU/Linux - extinfo-notes_url: https://db.debian.org/machines.cgi?host=%s layer3-infrastructure: alias: Layer 3 Devices - extinfo-icon_image: base/switch40.png - extinfo-icon_image_alt: router notacomputer: alias: Systems that are not really systems. Yeah :) private: 1 @@ -1118,6 +1119,8 @@ hostgroups: alias: machines running services buildd: alias: buildd systems + pybuildd: + alias: buildd systems running pybuildd general: alias: general purpose developer accessible machines @@ -1232,6 +1235,17 @@ hostgroups: private: 1 manyprocesses: alias: hosts with lots and lots of (kernel) processes + crazymanyprocesses: + alias: hosts with stupidly lots of processes + varnish-hosts: + alias: hosts running varnish + private: 1 + haproxy-hosts: + alias: hosts running haproxy + private: 1 + haproxy-https-host: + alias: "host providing https on the standard port via haproxy" + private: 1 no-bacula: alias: hosts which are not being backed up with bacula @@ -1540,11 +1554,15 @@ services: name: processes - total nrpe: "/usr/lib/nagios/plugins/check_procs 620 700" hostgroups: computers - excludehostgroups: manyprocesses + excludehostgroups: manyprocesses, crazymanyprocesses - name: processes - total hostgroups: manyprocesses nrpe: "/usr/lib/nagios/plugins/check_procs 1500 1700" + - + name: processes - total + hostgroups: crazymanyprocesses + nrpe: "/usr/lib/nagios/plugins/check_procs 15000 25000" - name: free memory - mb nrpe: "/usr/lib/nagios/plugins/dsa-check-memory -m mb" @@ -1604,7 +1622,7 @@ services: remotecheck: "/usr/lib/nagios/plugins/dsa-check-bacula $HOSTNAME$.debian.org" runfrom: dinis hostgroups: computers - excludehostgroups: buildd, porterbox, no-bacula + excludehostgroups: buildd, pybuildd, porterbox, no-bacula check_interval: 60 retry_interval: 15 - @@ -1613,7 +1631,7 @@ services: remotecheck: "/usr/lib/nagios/plugins/dsa-check-bacula -w 1080 -c 1560 $HOSTNAME$.debian.org F" runfrom: dinis hostgroups: computers - excludehostgroups: buildd, porterbox, no-bacula + excludehostgroups: buildd, pybuildd, porterbox, no-bacula check_interval: 60 retry_interval: 15 - @@ -1966,11 +1984,6 @@ services: name: "sso CRL" nrpe: "if [ -e /var/lib/dsa/sso/ca.crl ]; then /usr/lib/nagios/plugins/dsa-check-crl-expire -w 129600 -c 86400 /var/lib/dsa/sso/ca.crl; else echo 'No sso/ca.crl on this host.'; fi" hostgroups: computers - - - name: SSL certs - puppet - hosts: global - remotecheck: "/usr/lib/nagios/plugins/dsa-check-cert-expire-dir /etc/puppet/modules/ssl/files/servicecerts" - runfrom: handel - name: SSL certs - LE hosts: global @@ -2377,15 +2390,51 @@ services: - name: network service - https cert check: dsa_check_cert!443 - hostgroups: apache-https, https-service + hostgroups: apache-https, https-service, haproxy-https-host depends: network service - https check_interval: 60 - name: unwanted network service - https check: dsa_check_port_closed!443 hostgroups: apache2-hosts - excludehostgroups: apache-https + excludehostgroups: apache-https, haproxy-https-host check_interval: 60 + + ### + - + name: process - haproxy - master + nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:1 -c 1:1 -u root -a '/usr/sbin/haproxy-systemd-wrapper'" + hostgroups: haproxy-hosts + - + name: process - haproxy - worker + nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:6 -c 1:15 -u haproxy -a '/usr/sbin/haproxy '" + hostgroups: haproxy-hosts + depends: process - haproxy - master + - + name: network service - https + check: check_https + hostgroups: haproxy-https-host + depends: "process - haproxy - master" + check_interval: 120 + + - + name: unwanted process - haproxy + nrpe: "/usr/lib/nagios/plugins/check_procs -w 0 -C haproxy" + hostgroups: computers + excludehostgroups: haproxy-hosts + + ### + - + name: process - varnish + nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:2 -c 1:15 -u vcache -a '/usr/sbin/varnishd -j unix,user=vcache -F -a '" + hostgroups: varnish-hosts + - + name: unwanted process - varnish + nrpe: "/usr/lib/nagios/plugins/check_procs -w 0 -C varnishd" + hostgroups: computers + excludehostgroups: varnish-hosts + + # }}} # {{{ FTP - @@ -2427,19 +2476,20 @@ services: nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:5 -u buildd -C buildd -a '/usr/bin/buildd'" hostgroups: buildd contact_groups: buildd + - + name: process - buildd + servicegroups: buildd + nrpe: "/usr/lib/nagios/plugins/check_procs -w 1:5 -u buildd -C python3 -a 'buildd.py'" + hostgroups: pybuildd + contact_groups: buildd - name: processes - zombie schroot nrpe: "(/usr/lib/nagios/plugins/check_procs -a schroot -s Zs -c 0 > /dev/null || /usr/lib/nagios/plugins/check_procs -a schroot -s Zs -c 0) && /usr/lib/nagios/plugins/check_procs -a schroot -s ZNs -c 0" - hostgroups: buildd + hostgroups: buildd, pybuildd contact_groups: +buildd check_interval: 5 max_check_attempts: 24 retry_interval: 5 - - - name: processes - lvcreate - nrpe: "/usr/lib/nagios/plugins/check_procs -m 'ELAPSED' -c 500 -C lvcreate -u root -a 'lvcreate'" - hostgroups: buildd - contact_groups: +buildd # }}} # {{{ NFS Stuff -