|
/etc/prometheus/alerts.yml > infra_nodes
|
# Warning alert: the `up` series for job "node" has been 0 for 10 minutes.
alert: NodeExporterDown
# Expression kept on a single line; a continuation line at column 0 is not a
# valid fold of a plain scalar and would break the mapping.
expr: up{job="node"} == 0
for: 10m
labels:
  severity: warning
annotations:
  description: Node exporter unreachable on {{ $labels.instance }} for 10 minutes.
  source: https://prometheus.bosspacific.com.au/graph?query=up%7Bjob%3D%22node%22%7D%3D%3D0
  # The summary renders the `vm` label, while the description renders `instance`.
  summary: Node exporter down on {{ $labels.vm }}
| Labels |
State |
Active Since |
Value |
|
alertname="NodeExporterDown"
instance="192.168.1.70:9100"
job="node"
role="nextcloud"
severity="warning"
vm="cloud"
|
firing |
2026-03-25 23:01:21.701478546 +0000 UTC |
0 |
| Annotations |
- description
- Node exporter unreachable on 192.168.1.70:9100 for 10 minutes.
- source
- https://prometheus.bosspacific.com.au/graph?query=up%7Bjob%3D%22node%22%7D%3D%3D0
- summary
- Node exporter down on cloud
|
|
# Critical alert: the `up` series for job "keycloak" has been 0 for 3 minutes.
alert: KeycloakMetricsDown
# Expression kept on a single line; a continuation line at column 0 is not a
# valid fold of a plain scalar and would break the mapping.
expr: up{job="keycloak"} == 0
for: 3m
labels:
  severity: critical
annotations:
  description: Keycloak /metrics endpoint is unreachable for 3 minutes.
  # NOTE(review): the query string is not percent-encoded, unlike the
  # NodeExporterDown source link - confirm the link opens correctly.
  source: https://prometheus.bosspacific.com.au/graph?query=up{job="keycloak"}==0
  summary: Keycloak metrics down
|
|
/etc/prometheus/alerts.yml > service_checks
|
# Warning alert: `probe_success` for job "probe_tcp" has been 0 for 10 minutes
# on any target whose `instance` label matches the regex ".* host .*".
alert: HostSshDown
# Expression kept on a single line so the regex string literal is not split
# across a line break.
expr: probe_success{instance=~".* host .*",job="probe_tcp"} == 0
for: 10m
labels:
  severity: warning
annotations:
  description: SSH TCP probe failing for {{ $labels.instance }} for 10 minutes.
  # NOTE(review): this query omits the `instance` matcher used in `expr`, so
  # the link may list more series than the alert evaluates - confirm intent.
  source: https://prometheus.bosspacific.com.au/graph?query=probe_success{job="probe_tcp"}==0
  # Quoted because the value contains ": ".
  summary: 'Host SSH down: {{ $labels.instance }}'
| Labels |
State |
Active Since |
Value |
|
alertname="HostSshDown"
instance="cloud host (192.168.1.70)"
job="probe_tcp"
name="cloud host (192.168.1.70)"
severity="warning"
|
firing |
2026-03-25 23:01:17.017314157 +0000 UTC |
0 |
| Annotations |
- description
- SSH TCP probe failing for cloud host (192.168.1.70) for 10 minutes.
- source
- https://prometheus.bosspacific.com.au/graph?query=probe_success{job="probe_tcp"}==0
- summary
- Host SSH down: cloud host (192.168.1.70)
|
|
|
|
# Critical alert: `probe_success` for job "probe_http" has been 0 for 5 minutes.
alert: ServiceHttpDown
# Expression kept on a single line; a continuation line at column 0 is not a
# valid fold of a plain scalar and would break the mapping.
expr: probe_success{job="probe_http"} == 0
for: 5m
labels:
  severity: critical
annotations:
  description: Blackbox HTTP probe failing for {{ $labels.instance }} for 5 minutes.
  # NOTE(review): the query string is not percent-encoded, unlike the
  # NodeExporterDown source link - confirm the link opens correctly.
  source: https://prometheus.bosspacific.com.au/graph?query=probe_success{job="probe_http"}==0
  # Quoted because the value contains ": ".
  summary: 'HTTP service down: {{ $labels.instance }}'
|