Alerts


/etc/prometheus/alerts.yml > infra_nodes
NodeExporterDown (9 active)
# NodeExporterDown: warning-level alert raised once up{job="node"} has been 0
# for 10 minutes on a target.
alert: NodeExporterDown
expr: 'up{job="node"} == 0'
for: 10m
labels:
  severity: warning
annotations:
  # summary names the target by its "vm" label; description uses "instance".
  summary: 'Node exporter down on {{ $labels.vm }}'
  description: 'Node exporter unreachable on {{ $labels.instance }} for 10 minutes.'
  # Percent-encoded graph link; the query decodes to up{job="node"}==0.
  source: 'https://prometheus.bosspacific.com.au/graph?query=up%7Bjob%3D%22node%22%7D%3D%3D0'
Labels State Active Since Value
alertname="NodeExporterDown" instance="192.168.1.75:9100" job="node" role="jenkins" severity="warning" vm="ci" firing 2026-01-27 03:49:51 +0000 UTC 0
alertname="NodeExporterDown" instance="192.168.1.50:9100" job="node" role="odoo" severity="warning" vm="erp" firing 2026-01-24 22:51:46 +0000 UTC 0
alertname="NodeExporterDown" instance="192.168.1.80:9100" job="node" role="search" severity="warning" vm="search" firing 2026-01-27 03:49:51 +0000 UTC 0
alertname="NodeExporterDown" instance="192.168.1.90:9100" job="node" role="keycloak-host" severity="warning" vm="auth" firing 2026-01-24 22:51:46 +0000 UTC 0
alertname="NodeExporterDown" instance="192.168.1.200:9100" job="node" role="ispconfig" severity="warning" vm="wolfware" firing 2026-01-24 23:56:06 +0000 UTC 0
alertname="NodeExporterDown" instance="192.168.1.30:9100" job="node" role="docker" severity="warning" vm="docker" firing 2026-01-27 03:49:51 +0000 UTC 0
alertname="NodeExporterDown" instance="192.168.1.60:9100" job="node" role="gitlab" severity="warning" vm="git" firing 2026-01-24 22:51:46 +0000 UTC 0
alertname="NodeExporterDown" instance="192.168.1.40:9100" job="node" role="postgres" severity="warning" vm="pg" firing 2026-01-24 22:51:46 +0000 UTC 0
alertname="NodeExporterDown" instance="192.168.1.70:9100" job="node" role="nextcloud" severity="warning" vm="cloud" firing 2026-01-24 22:51:46 +0000 UTC 0
KeycloakMetricsDown (0 active)
# KeycloakMetricsDown: critical alert raised once up{job="keycloak"} has been 0
# for 3 minutes.
alert: KeycloakMetricsDown
expr: 'up{job="keycloak"} == 0'
for: 3m
labels:
  severity: critical
annotations:
  summary: 'Keycloak metrics down'
  description: 'Keycloak /metrics endpoint is unreachable for 3 minutes.'
  # Query string is percent-encoded (decodes to up{job="keycloak"}==0) so the
  # link carries no raw braces or double quotes, matching the NodeExporterDown
  # source annotation.
  source: 'https://prometheus.bosspacific.com.au/graph?query=up%7Bjob%3D%22keycloak%22%7D%3D%3D0'
/etc/prometheus/alerts.yml > service_checks
HostSshDown (0 active)
# HostSshDown: warning-level alert raised once a probe_tcp target whose
# instance label matches the regex below has had probe_success == 0 for
# 10 minutes.
alert: HostSshDown
# Kept on one quoted line so the spaces inside the instance regex are explicit
# rather than implied by scalar line folding.
# NOTE(review): regex reconstructed from a folded scalar as ".* host .*"
# (literal spaces around "host") - confirm against the rules file.
expr: 'probe_success{instance=~".* host .*",job="probe_tcp"} == 0'
for: 10m
labels:
  severity: warning
annotations:
  summary: 'Host SSH down: {{ $labels.instance }}'
  description: 'SSH TCP probe failing for {{ $labels.instance }} for 10 minutes.'
  # Query string is percent-encoded; it decodes to
  # probe_success{job="probe_tcp"}==0.
  # NOTE(review): the linked query has no instance matcher, so it is broader
  # than expr above - confirm that is intended.
  source: 'https://prometheus.bosspacific.com.au/graph?query=probe_success%7Bjob%3D%22probe_tcp%22%7D%3D%3D0'
PostgresTcpDown (0 active)
# PostgresTcpDown: critical alert raised once a probe_tcp target whose instance
# label starts with "postgres" has had probe_success == 0 for 5 minutes.
alert: PostgresTcpDown
expr: 'probe_success{instance=~"postgres.*",job="probe_tcp"} == 0'
for: 5m
labels:
  severity: critical
annotations:
  summary: 'Postgres TCP down: {{ $labels.instance }}'
  description: 'Blackbox TCP probe failing for {{ $labels.instance }} for 5 minutes.'
  # Query string is percent-encoded; it decodes to
  # probe_success{job="probe_tcp"}==0.
  # NOTE(review): the linked query has no instance matcher, so it is broader
  # than expr above - confirm that is intended.
  source: 'https://prometheus.bosspacific.com.au/graph?query=probe_success%7Bjob%3D%22probe_tcp%22%7D%3D%3D0'
ServiceHttpDown (0 active)
# ServiceHttpDown: critical alert raised once any probe_http target has had
# probe_success == 0 for 5 minutes.
alert: ServiceHttpDown
expr: 'probe_success{job="probe_http"} == 0'
for: 5m
labels:
  severity: critical
annotations:
  summary: 'HTTP service down: {{ $labels.instance }}'
  description: 'Blackbox HTTP probe failing for {{ $labels.instance }} for 5 minutes.'
  # Query string is percent-encoded (decodes to
  # probe_success{job="probe_http"}==0) so the link carries no raw braces or
  # double quotes.
  source: 'https://prometheus.bosspacific.com.au/graph?query=probe_success%7Bjob%3D%22probe_http%22%7D%3D%3D0'