# Residue from the repository web view (not part of the rules themselves):
#   1 / 0 / Fork 0
# File: HPCplaybooks/roles/prom_server/templates/etc/alerting.rules
# 72 lines, 2.6 KiB, Plaintext

# Prometheus alerting rules for targets scraped under job="node".
#
# Every rule carries severity=page. The `for:` value is how long the
# expression must stay true before the alert fires.
#
# NOTE(review): `node_filesystem_free` and `node_boot_time` are the metric
# names used by node_exporter releases before 0.16; newer releases expose
# `node_filesystem_free_bytes` / `node_boot_time_seconds`. Confirm against the
# exporter version actually deployed before renaming anything here.
groups:
  - name: basic
    rules:
      # Target could not be scraped (up == 0) for 10 minutes.
      - alert: InstanceDown
        expr: 'up{job="node"} == 0'
        for: 10m
        labels:
          severity: page
        annotations:
          description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 10 minutes.'
          summary: 'Instance {{ $labels.instance }} down'

      # The timex sync status reported by the node is 0 (not synchronised).
      - alert: Time not being synced
        expr: 'node_timex_sync_status{job="node"} == 0'
        for: 5m
        labels:
          severity: page
        annotations:
          description: '{{ $labels.instance }} is not configured to sync its time with an external ntp server'
          summary: 'Instance {{ $labels.instance }} no ntp configured.'

      # Clock offset larger than one second in either direction.
      # abs() is required: without it a clock that is *behind* by more than
      # a second (negative offset) would never trigger the alert.
      - alert: clock wrong
        expr: 'abs(node_timex_offset_seconds{job="node"}) > 1'
        for: 10m
        labels:
          severity: page
        annotations:
          description: '{{ $labels.instance }} has a clock offset > 1 second.'
          summary: '{{ $labels.instance }} has clock drift.'

      # Linear extrapolation of the last 2h of free space predicts a value
      # below zero 8 hours (8 * 3600 s) from now. /tmp, /local and
      # /target/gpfs3 are excluded from the prediction.
      - alert: DiskWillFillIn8Hours
        expr: 'predict_linear(node_filesystem_free{job="node",mountpoint!~"/tmp|/local|/target/gpfs3"}[2h], 8 * 3600) < 0'
        for: 2h
        labels:
          severity: page
        annotations:
          # The series is per filesystem, so name the mountpoint as DiskFull does.
          description: 'Instance {{ $labels.instance }} will fill up {{ $labels.mountpoint }} within 8 hours'
          summary: '{{ $labels.instance }} disk full'

      # Same prediction over a longer horizon: last 6h of data extrapolated
      # 72 hours (72 * 3600 s) ahead.
      - alert: DiskWillFillIn72Hours
        expr: 'predict_linear(node_filesystem_free{job="node",mountpoint!~"/tmp|/local|/target/gpfs3"}[6h], 72 * 3600) < 0'
        for: 8h
        labels:
          severity: page
        annotations:
          description: 'Instance {{ $labels.instance }} will fill up {{ $labels.mountpoint }} within 72 hours'
          summary: '{{ $labels.instance }} disk almost full'

      # Free space below 5242880 (5 * 1024 * 1024) on any filesystem that is
      # not one of the excluded mountpoints and whose fstype does not start
      # with "fuse". /tmp is excluded here because tmpFull covers it with a
      # longer `for:`.
      - alert: DiskFull
        expr: 'node_filesystem_free{job="node",mountpoint!~"/tmp|/net|/cvmfs|/var/lib/nfs/rpc_pipefs|/misc|/run/docker/netns/.+?|/cgroup.+?", fstype!~"fuse.+?"} < 5242880'
        for: 5m
        labels:
          severity: page
        annotations:
          description: 'Instance {{ $labels.instance }} has a full {{ $labels.mountpoint }}.'
          summary: '{{ $labels.instance }} Disk full'

      # Same threshold as DiskFull, but only for /tmp and only after it has
      # been low for 30 minutes.
      - alert: tmpFull
        expr: 'node_filesystem_free{job="node",mountpoint="/tmp"} < 5242880'
        for: 30m
        labels:
          severity: page
        annotations:
          description: 'Instance {{ $labels.instance }} Has a full /tmp'
          summary: '{{ $labels.instance }} /tmp full'

      # The boot-time gauge moved by more than 10 (presumably seconds --
      # confirm the metric's unit) within the last hour.
      - alert: NodeRebooted
        expr: 'delta(node_boot_time[1h]) > 10'
        for: 1m
        labels:
          severity: page
        annotations:
          description: 'Instance {{ $labels.instance }} has been rebooted.'
          summary: '{{ $labels.instance }} rebooted'
# - alert: TestAlert
# expr: probe_success{instance="195.169.22.220:11211",job="blackbox"} == 0
# for: 1m