# File-viewer metadata captured during extraction (not part of the rules): 67 lines, 2.1 KiB, plaintext.
---
# Prometheus alerting rules for the `hosts` rule group: node-exporter target
# availability, host disk usage, Nomad job/evaluation state and Consul service
# health. Each rule sets a `severity` label and `summary`/`description`
# annotations rendered from the alert's labels and value.
groups:
  - name: hosts
    rules:
      # Fires when the `up` series of a `node-exporter` target is 0.
      # NOTE(review): the annotations read `$labels.node`; `up` normally only
      # carries `job`/`instance`, so presumably `node` is attached by target
      # relabeling - confirm, otherwise the rendered text starts empty.
      # NOTE(review): `for: 5s` is likely shorter than the rule evaluation
      # interval, so the pending period is effectively one evaluation - confirm
      # this is intended.
      - alert: NodeDown
        expr: up{job="node-exporter"} == 0
        for: 5s
        labels:
          severity: critical
        annotations:
          summary: "{{$labels.node}} Down"
          description: "{{$labels.node}} is down"

      # Fires when the reported used-disk percentage of /dev/sda1 stays above
      # 80 for 5 minutes.
      # NOTE(review): the alert name contains a space, unlike the CamelCase
      # names of the other rules; left unchanged because routes or silences may
      # match on the existing alertname.
      # NOTE(review): annotations read `$labels.host` - confirm the series
      # carries that label.
      - alert: Disk High
        expr: nomad_client_host_disk_used_percent{disk="/dev/sda1"} > 80
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "{{$labels.host}} Disk Usage > 80%"
          description: "{{$labels.host}} disk usage at {{ $value }}%"

      # Fires when the failed-job summary increased within the last hour and
      # that condition has held for 60 minutes.
      # NOTE(review): the 1h range equals the `for` duration, so one isolated
      # increase may leave the window before the pending period elapses -
      # confirm that only sustained failures are meant to alert.
      - alert: NomadJobFailed
        expr: increase(nomad_nomad_job_summary_failed[1h]) > 0
        for: 60m
        labels:
          severity: warning
        annotations:
          summary: "Nomad job failed (instance {{ $labels.instance }})"
          description: "Nomad job failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Fires when the lost-job summary increased within the last hour and
      # that condition has held for 60 minutes.
      # NOTE(review): same 1h-range / 60m-`for` interplay as NomadJobFailed -
      # confirm that only sustained losses are meant to alert.
      - alert: NomadJobLost
        expr: increase(nomad_nomad_job_summary_lost[1h]) > 0
        for: 60m
        labels:
          severity: warning
        annotations:
          summary: "Nomad job lost (instance {{ $labels.instance }})"
          description: "Nomad job lost\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Fires when the queued-job summary stays above 0 for 2 minutes.
      - alert: NomadJobQueued
        expr: nomad_nomad_job_summary_queued > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Nomad job queued (instance {{ $labels.instance }})"
          description: "Nomad job queued\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Fires when the blocked-evaluations gauge stays above 0 for 10 minutes.
      - alert: NomadBlockedEvaluation
        expr: nomad_nomad_blocked_evals_total_blocked > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Nomad blocked evaluation (instance {{ $labels.instance }})"
          description: "Nomad blocked evaluation\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Fires when a Consul catalog service node reports healthy == 0 for
      # 10 minutes; the description names the service and the failing check
      # from the `service_name` / `service_id` labels.
      - alert: ConsulServiceHealthcheckFailed
        expr: consul_catalog_service_node_healthy == 0
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "Consul service healthcheck failed (instance {{ $labels.instance }})"
          description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"