# Forked from HPC/HPCplaybooks.
# File listing metadata: 72 lines, 2.6 KiB, plaintext.
# Prometheus alerting rules: a single rule group ("basic") with node-level
# alerts for instance availability, time synchronisation, disk space and
# reboots. Every rule selects job="node" and is labelled severity "page".
groups:
  - name: basic
    rules:
      # The `up` series for a job="node" target has been 0 for 10 minutes.
      - alert: InstanceDown
        expr: up{job="node"} == 0
        for: 10m
        labels:
          severity: page
        annotations:
          description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 10 minutes.'
          summary: 'Instance {{ $labels.instance }} down'

      # node_timex_sync_status has been 0 for 5 minutes.
      # NOTE(review): the alert name contains spaces, unlike the CamelCase
      # names used elsewhere in this group. It is left unchanged because the
      # alertname may be referenced by routing or silences - confirm before
      # renaming.
      - alert: Time not being synced
        expr: node_timex_sync_status{job="node"} == 0
        for: 5m
        labels:
          severity: page
        annotations:
          description: '{{ $labels.instance }} is not configured to sync its time with an external ntp server'
          summary: 'Instance {{ $labels.instance }} no ntp configured.'

      # Clock offset larger than 1 second in either direction for 10 minutes.
      # abs() is required: without it, a negative offset below -1 second
      # would never satisfy "> 1" and the alert would stay silent.
      - alert: clock wrong
        expr: abs(node_timex_offset_seconds{job="node"}) > 1
        for: 10m
        labels:
          severity: page
        annotations:
          description: '{{ $labels.instance }} has a clock offset > 1 second.'
          summary: '{{ $labels.instance }} has clock drift.'

      # Linear extrapolation of the last 2h of free space goes below zero
      # within 8 hours (8 * 3600 seconds). The mountpoints /tmp, /local and
      # /target/gpfs3 are excluded.
      - alert: DiskWillFillIn8Hours
        expr: predict_linear(node_filesystem_free{job="node",mountpoint!~"/tmp|/local|/target/gpfs3"}[2h], 8 * 3600) < 0
        for: 2h
        labels:
          severity: page
        annotations:
          description: 'Instance {{ $labels.instance }} will fill up {{ $labels.mountpoint }} within 8 hours'
          summary: '{{ $labels.instance }} disk full'

      # Same prediction over a 6h window, looking 72 hours ahead.
      - alert: DiskWillFillIn72Hours
        expr: predict_linear(node_filesystem_free{job="node",mountpoint!~"/tmp|/local|/target/gpfs3"}[6h], 72 * 3600) < 0
        for: 8h
        labels:
          severity: page
        annotations:
          description: 'Instance {{ $labels.instance }} will fill up {{ $labels.mountpoint }} within 72 hours'
          summary: '{{ $labels.instance }} disk almost full'

      # Free space below 5242880 (5 * 1024 * 1024; 5 MiB assuming the metric
      # is reported in bytes) for 5 minutes. /tmp is excluded here and
      # handled by the tmpFull rule; the other excluded mountpoints and the
      # fuse filesystem types are filtered out by the selector.
      - alert: DiskFull
        expr: node_filesystem_free{job="node",mountpoint!~"/tmp|/net|/cvmfs|/var/lib/nfs/rpc_pipefs|/misc|/run/docker/netns/.+?|/cgroup.+?", fstype!~"fuse.+?"} < 5242880
        for: 5m
        labels:
          severity: page
        annotations:
          description: 'Instance {{ $labels.instance }} has a full {{ $labels.mountpoint }}.'
          summary: '{{ $labels.instance }} Disk full'

      # Same 5242880 threshold as DiskFull, applied to /tmp only and with a
      # longer hold time of 30 minutes.
      - alert: tmpFull
        expr: node_filesystem_free{job="node",mountpoint="/tmp"} < 5242880
        for: 30m
        labels:
          severity: page
        annotations:
          description: 'Instance {{ $labels.instance }} Has a full /tmp'
          summary: '{{ $labels.instance }} /tmp full'

      # The boot-time value changed by more than 10 within the last hour.
      - alert: NodeRebooted
        expr: delta(node_boot_time[1h]) > 10
        for: 1m
        labels:
          severity: page
        annotations:
          description: 'Instance {{ $labels.instance }} has been rebooted.'
          summary: '{{ $labels.instance }} rebooted'
      # - alert: TestAlert
      #   expr: probe_success{instance="195.169.22.220:11211",job="blackbox"} == 0
      #   for: 1m