sync
This commit is contained in:
66
consul_backup/alerts
Normal file
66
consul_backup/alerts
Normal file
@@ -0,0 +1,66 @@
|
||||
---
# Alerting rules in the Prometheus rule-file layout
# (groups -> rules -> alert / expr / for / labels / annotations).
# One group, `hosts`, covering node, disk, Nomad and Consul conditions.
groups:
  - name: hosts
    rules:
      # Fires when the `up` series of job "node-exporter" equals 0.
      # NOTE(review): `for: 5s` is a very short hold time; if it is shorter
      # than the scrape/evaluation interval a single failed scrape can page.
      # Confirm this is intended.
      # NOTE(review): the annotations read `$labels.node`; verify that the
      # `up` series actually carries a `node` label (e.g. via relabeling).
      - alert: NodeDown
        expr: up{job="node-exporter"} == 0
        for: 5s
        labels:
          severity: critical
        annotations:
          summary: "{{$labels.node}} Down"
          description: "{{$labels.node}} is down"

      # Fires when the used-percent metric for disk "/dev/sda1" stays
      # above 80 for 5 minutes.
      # NOTE(review): this is the only alert name containing a space; the
      # other rules use CamelCase. Left unchanged because renaming would
      # alter the `alertname` that routes or silences may match on.
      - alert: Disk High
        expr: nomad_client_host_disk_used_percent{disk="/dev/sda1"} > 80
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "{{$labels.host}} Disk Usage > 80%"
          description: "{{$labels.host}} disk usage at {{ $value }}%"

      # Fires when the failed-job summary metric has increased over the
      # last hour and that condition has held for 60 minutes.
      # NOTE(review): the 1h `increase()` window combined with `for: 60m`
      # means a single isolated increment may slide out of the window
      # before the hold time elapses, so the alert would not fire for it.
      # Confirm that only sustained failures are meant to alert.
      # NOTE(review): if this metric is a gauge rather than a counter,
      # `increase()` is not the appropriate function. Verify.
      - alert: NomadJobFailed
        expr: increase(nomad_nomad_job_summary_failed[1h]) > 0
        for: 60m
        labels:
          severity: warning
        annotations:
          summary: "Nomad job failed (instance {{ $labels.instance }})"
          description: "Nomad job failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Same shape as NomadJobFailed, applied to the lost-job summary metric.
      # NOTE(review): the same 1h-window / 60m-hold interaction applies here.
      - alert: NomadJobLost
        expr: increase(nomad_nomad_job_summary_lost[1h]) > 0
        for: 60m
        labels:
          severity: warning
        annotations:
          summary: "Nomad job lost (instance {{ $labels.instance }})"
          description: "Nomad job lost\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Fires when the queued-job summary metric stays above 0 for 2 minutes.
      - alert: NomadJobQueued
        expr: nomad_nomad_job_summary_queued > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Nomad job queued (instance {{ $labels.instance }})"
          description: "Nomad job queued\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Fires when the blocked-evaluations metric stays above 0 for
      # 10 minutes.
      - alert: NomadBlockedEvaluation
        expr: nomad_nomad_blocked_evals_total_blocked > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Nomad blocked evaluation (instance {{ $labels.instance }})"
          description: "Nomad blocked evaluation\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Fires when the Consul catalog service-node health metric equals 0
      # for 10 minutes.
      - alert: ConsulServiceHealthcheckFailed
        expr: consul_catalog_service_node_healthy == 0
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "Consul service healthcheck failed (instance {{ $labels.instance }})"
          description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
Reference in New Issue
Block a user