Alerts


/etc/prometheus/alerts.yml > infra_nodes
NodeExporterDown (1 active)
# Fires when a target of the "node" job has reported up == 0 for 10 minutes.
# NOTE(review): the summary renders the `vm` label while the description uses
# `instance` -- confirm every node target actually carries a `vm` label.
alert: NodeExporterDown
expr: 'up{job="node"} == 0'
for: 10m
labels:
  severity: warning
annotations:
  description: 'Node exporter unreachable on {{ $labels.instance }} for 10 minutes.'
  source: 'https://prometheus.bosspacific.com.au/graph?query=up%7Bjob%3D%22node%22%7D%3D%3D0'
  summary: 'Node exporter down on {{ $labels.vm }}'
Labels State Active Since Value
alertname="NodeExporterDown" instance="192.168.1.70:9100" job="node" role="nextcloud" severity="warning" vm="cloud" firing 2026-03-25 23:01:21.701478546 +0000 UTC 0
KeycloakMetricsDown (0 active)
# Fires when a target of the "keycloak" job has reported up == 0 for 3 minutes.
alert: KeycloakMetricsDown
expr: 'up{job="keycloak"} == 0'
for: 3m
labels:
  severity: critical
annotations:
  description: 'Keycloak /metrics endpoint is unreachable for 3 minutes.'
  # The query string is percent-encoded ({ } " =) so the link stays a valid URL
  # when rendered by notification clients.
  # NOTE(review): confirm this deployment honours the `query=` parameter on
  # /graph; the stock Prometheus expression browser reads `g0.expr`.
  source: 'https://prometheus.bosspacific.com.au/graph?query=up%7Bjob%3D%22keycloak%22%7D%3D%3D0'
  summary: 'Keycloak metrics down'
/etc/prometheus/alerts.yml > service_checks
HostSshDown (1 active)
# Fires when a probe_tcp target whose instance label matches ".* host .*" has
# reported probe_success == 0 for 10 minutes.
alert: HostSshDown
# Kept on one quoted line: the regex contains a literal space, which must not
# depend on YAML line folding inside the string literal.
expr: 'probe_success{instance=~".* host .*",job="probe_tcp"} == 0'
for: 10m
labels:
  severity: warning
annotations:
  description: 'SSH TCP probe failing for {{ $labels.instance }} for 10 minutes.'
  # The query string is percent-encoded ({ } " =) so the link stays a valid URL
  # when rendered by notification clients. The linked query is broader than
  # expr: it selects every failing probe_tcp target, not only "host" instances.
  source: 'https://prometheus.bosspacific.com.au/graph?query=probe_success%7Bjob%3D%22probe_tcp%22%7D%3D%3D0'
  summary: 'Host SSH down: {{ $labels.instance }}'
Labels State Active Since Value
alertname="HostSshDown" instance="cloud host (192.168.1.70)" job="probe_tcp" name="cloud host (192.168.1.70)" severity="warning" firing 2026-03-25 23:01:17.017314157 +0000 UTC 0
PostgresTcpDown (0 active)
# Fires when a probe_tcp target whose instance label starts with "postgres" has
# reported probe_success == 0 for 5 minutes.
alert: PostgresTcpDown
expr: 'probe_success{instance=~"postgres.*",job="probe_tcp"} == 0'
for: 5m
labels:
  severity: critical
annotations:
  description: 'Blackbox TCP probe failing for {{ $labels.instance }} for 5 minutes.'
  # The query string is percent-encoded ({ } " =) so the link stays a valid URL
  # when rendered by notification clients. The linked query is broader than
  # expr: it selects every failing probe_tcp target, not only postgres ones.
  source: 'https://prometheus.bosspacific.com.au/graph?query=probe_success%7Bjob%3D%22probe_tcp%22%7D%3D%3D0'
  summary: 'Postgres TCP down: {{ $labels.instance }}'
ServiceHttpDown (0 active)
# Fires when any target of the "probe_http" job has reported
# probe_success == 0 for 5 minutes.
alert: ServiceHttpDown
expr: 'probe_success{job="probe_http"} == 0'
for: 5m
labels:
  severity: critical
annotations:
  description: 'Blackbox HTTP probe failing for {{ $labels.instance }} for 5 minutes.'
  # The query string is percent-encoded ({ } " =) so the link stays a valid URL
  # when rendered by notification clients.
  source: 'https://prometheus.bosspacific.com.au/graph?query=probe_success%7Bjob%3D%22probe_http%22%7D%3D%3D0'
  summary: 'HTTP service down: {{ $labels.instance }}'