---
# Prometheus alerting rules: a single rule group, "hosts", containing alerts
# for node-exporter scrape health, host disk usage as reported by Nomad,
# Nomad job/evaluation state, and Consul service health checks.
groups:
  - name: hosts
    rules:
      # Fires when the `up` series for job "node-exporter" has been 0 for 5s.
      # NOTE(review): the annotations read `$labels.node`; whether the `up`
      # series carries a `node` label is not visible in this file. If it does
      # not, the text renders with an empty name -- confirm against the
      # scrape/relabel configuration.
      - alert: NodeDown
        expr: up{job="node-exporter"} == 0
        for: 5s
        labels:
          severity: critical
        annotations:
          summary: "{{$labels.node}} Down"
          description: "{{$labels.node}} is down"

      # Fires when used-disk percentage stays above 80 for 5 minutes.
      # Only series labelled disk="/dev/sda1" are matched.
      # NOTE(review): this is the only alert name containing a space; the
      # others are CamelCase. Left unchanged because anything matching on
      # `alertname` would break on a rename.
      - alert: Disk High
        expr: nomad_client_host_disk_used_percent{disk="/dev/sda1"} > 80
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "{{$labels.host}} Disk Usage > 80%"
          description: "{{$labels.host}} disk usage at {{ $value }}%"

      # Fires when the failed-job summary value increased over the trailing
      # hour and that condition has held for 60 minutes.
      # NOTE(review): a 1h `increase()` window combined with `for: 60m` may
      # delay or suppress firing for a single isolated failure -- confirm
      # this is intended.
      - alert: NomadJobFailed
        expr: increase(nomad_nomad_job_summary_failed[1h]) > 0
        for: 60m
        labels:
          severity: warning
        annotations:
          summary: "Nomad job failed (instance {{ $labels.instance }})"
          description: "Nomad job failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Same shape as NomadJobFailed, but for the lost-job summary value.
      - alert: NomadJobLost
        expr: increase(nomad_nomad_job_summary_lost[1h]) > 0
        for: 60m
        labels:
          severity: warning
        annotations:
          summary: "Nomad job lost (instance {{ $labels.instance }})"
          description: "Nomad job lost\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Fires when the queued-job summary value stays above 0 for 2 minutes.
      - alert: NomadJobQueued
        expr: nomad_nomad_job_summary_queued > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Nomad job queued (instance {{ $labels.instance }})"
          description: "Nomad job queued\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Fires when the blocked-evaluations value stays above 0 for 10 minutes.
      - alert: NomadBlockedEvaluation
        expr: nomad_nomad_blocked_evals_total_blocked > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Nomad blocked evaluation (instance {{ $labels.instance }})"
          description: "Nomad blocked evaluation\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Fires when a Consul catalog service/node health series has been 0
      # for 10 minutes. The description prints the `service_name` and
      # `service_id` labels of the offending series.
      - alert: ConsulServiceHealthcheckFailed
        expr: consul_catalog_service_node_healthy == 0
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "Consul service healthcheck failed (instance {{ $labels.instance }})"
          description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"