Files
2025-11-22 09:57:32 -08:00

67 lines
2.1 KiB
Plaintext

---
# Prometheus alerting rules, group "hosts": node-exporter reachability,
# Nomad client disk usage, Nomad job/evaluation state, and Consul service
# health checks.
groups:
  - name: hosts
    rules:
      # A target scraped under job "node-exporter" reports up == 0.
      # NOTE(review): a 5s hold is shorter than Prometheus's default 1m
      # evaluation interval, so it gives little flap protection - confirm
      # the interval configured on this server.
      # NOTE(review): the annotations read a `node` label, presumably added
      # by relabeling - verify it is present on `up`.
      - alert: NodeDown
        expr: up{job="node-exporter"} == 0
        for: 5s
        labels:
          severity: critical
        annotations:
          summary: "{{$labels.node}} Down"
          description: "{{$labels.node}} is down"

      # Reported used-percent of /dev/sda1 stays above 80 for 5 minutes.
      # NOTE(review): only the series with disk="/dev/sda1" is matched;
      # hosts whose disk carries another device name are not covered -
      # confirm this is intended.
      - alert: Disk High
        expr: nomad_client_host_disk_used_percent{disk="/dev/sda1"} > 80
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "{{$labels.host}} Disk Usage > 80%"
          description: "{{$labels.host}} disk usage at {{ $value }}%"

      # NomadJobFailed / NomadJobLost: increase() of the job-summary series
      # over the trailing 1h must stay above 0 for 60 minutes.
      # NOTE(review): increase() is documented for counters; confirm these
      # series are counters (for gauges, delta() is the documented choice).
      # NOTE(review): a single rise keeps the expression true for roughly
      # one hour, so with `for: 60m` the alert may only fire when rises keep
      # recurring - confirm that is the intent.
      - alert: NomadJobFailed
        expr: increase(nomad_nomad_job_summary_failed[1h]) > 0
        for: 60m
        labels:
          severity: warning
        annotations:
          summary: "Nomad job failed (instance {{ $labels.instance }})"
          description: "Nomad job failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: NomadJobLost
        expr: increase(nomad_nomad_job_summary_lost[1h]) > 0
        for: 60m
        labels:
          severity: warning
        annotations:
          summary: "Nomad job lost (instance {{ $labels.instance }})"
          description: "Nomad job lost\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # The job-summary "queued" series stays above 0 for 2 minutes.
      - alert: NomadJobQueued
        expr: nomad_nomad_job_summary_queued > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Nomad job queued (instance {{ $labels.instance }})"
          description: "Nomad job queued\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # The blocked-evaluations series stays above 0 for 10 minutes.
      - alert: NomadBlockedEvaluation
        expr: nomad_nomad_blocked_evals_total_blocked > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Nomad blocked evaluation (instance {{ $labels.instance }})"
          description: "Nomad blocked evaluation\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # A Consul catalog service/node pair reports healthy == 0 for
      # 10 minutes; the description surfaces the service_name and service_id
      # labels of the failing series.
      - alert: ConsulServiceHealthcheckFailed
        expr: consul_catalog_service_node_healthy == 0
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "Consul service healthcheck failed (instance {{ $labels.instance }})"
          description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"