Added a generic prometheus server role.
roles/prom_server/templates/etc/alerting.rules (new file, 71 lines)
@@ -0,0 +1,71 @@
groups:
- name: basic
  rules:
  - alert: InstanceDown
    expr: up{job="node"} == 0
    for: 10m
    labels:
      severity: page
    annotations:
      description: '{{ $labels.instance }} of job {{ $labels.job }} has been down
        for more than 10 minutes.'
      summary: Instance {{ $labels.instance }} down
  - alert: Time not being synced
    expr: node_timex_sync_status{job="node"} == 0
    for: 5m
    labels:
      severity: page
    annotations:
      description: '{{ $labels.instance }} is not configured to sync its time with an external ntp server'
      summary: Instance {{ $labels.instance }} no ntp configured.
  - alert: clock wrong
    expr: node_timex_offset_seconds{job="node"} > 1
    for: 10m
    labels:
      severity: page
    annotations:
      description: '{{ $labels.instance }} has a clock offset > 1 second.'
      summary: '{{ $labels.instance }} has clock drift.'
  - alert: DiskWillFillIn8Hours
    expr: predict_linear(node_filesystem_free{job="node",mountpoint!~"/tmp|/local|/target/gpfs3"}[2h], 8 * 3600) < 0
    for: 2h
    labels:
      severity: page
    annotations:
      description: Instance {{ $labels.instance }} will fill up within 8 hours
      summary: '{{ $labels.instance }} disk full'
  - alert: DiskWillFillIn72Hours
    expr: predict_linear(node_filesystem_free{job="node",mountpoint!~"/tmp|/local|/target/gpfs3"}[6h], 72 * 3600) < 0
    for: 8h
    labels:
      severity: page
    annotations:
      description: Instance {{ $labels.instance }} will fill up within 72 hours
      summary: '{{ $labels.instance }} disk almost full'
  - alert: DiskFull
    expr: node_filesystem_free{job="node",mountpoint!~"/tmp|/net|/cvmfs|/var/lib/nfs/rpc_pipefs|/cvmfs|/misc|/run/docker/netns/.+?|/cgroup.+?", fstype!~"fuse.+?"} < 5.24288e+06
    for: 5m
    labels:
      severity: page
    annotations:
      description: Instance {{ $labels.instance }} has a full {{ $labels.mountpoint }}.
      summary: '{{ $labels.instance }} Disk full'
  - alert: tmpFull
    expr: node_filesystem_free{job="node",mountpoint="/tmp"} < 5242880
    for: 30m
    labels:
      severity: page
    annotations:
      description: Instance {{ $labels.instance }} Has a full /tmp
      summary: '{{ $labels.instance }} /tmp full'
  - alert: NodeRebooted
    expr: delta(node_boot_time[1h]) > 10
    for: 1m
    labels:
      severity: page
    annotations:
      description: Instance {{ $labels.instance }} has been rebooted.
      summary: '{{ $labels.instance }} rebooted'
# - alert: TestAlert
#   expr: probe_success{instance="195.169.22.220:11211",job="blackbox"} == 0
#   for: 1m
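
Note on the rules above: the two DiskWillFillIn* alerts rely on predict_linear(), which fits a linear regression over the last 2h (or 6h) of node_filesystem_free samples and fires when the value extrapolated 8 (or 72) hours ahead drops below zero; DiskFull is a plain threshold. The tasks that install this template are not part of this commit; a minimal sketch of such a task is shown below (task name, destination path and handler are assumptions). Because the rules contain literal {{ $labels.* }} placeholders, the sketch uses the copy module so Jinja2 does not try to parse them; with the template module the file would need {% raw %} / {% endraw %} guards.

    # Hypothetical deploy task, not part of this commit; names and paths are assumptions.
    - name: Install Prometheus alerting rules
      copy:
        src: "{{ role_path }}/templates/etc/alerting.rules"  # copy leaves the {{ $labels.* }} placeholders untouched
        dest: /etc/prometheus/alerting.rules
        owner: root
        group: root
        mode: '0644'
      notify: restart prometheus  # assumes a 'restart prometheus' handler exists in the role
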
roles/prom_server/templates/etc/cadvisor.json (new empty file)
roles/prom_server/templates/etc/prometheus.yml (new file, 55 lines)
@@ -0,0 +1,55 @@
# my global config
global:
  scrape_interval: 60s     # By default, scrape targets every 15 seconds.
  evaluation_interval: 60s # By default, evaluate rules every 15 seconds.
  # scrape_timeout is set to the global default (10s).

  # Attach these labels to any time series or alerts when communicating with
  # external systems (federation, remote storage, Alertmanager).
  external_labels:
    monitor: {{ ansible_hostname }}

# alert
alerting:
  alertmanagers:
  - scheme: http
    static_configs:
    - targets:
      - "alertmanager.kube.hpc.rug.nl"
    basic_auth:
      username: hpc
      password: {{ alertmanager_pass }}

# Load and evaluate rules in this file every 'evaluation_interval' seconds.
rule_files:
  - '/etc/prometheus/alerting.rules'

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # peregrine
  - job_name: 'node'
    scrape_interval: 120s
    file_sd_configs:
      - files:
        - targets.json

  # peregrine
  - job_name: 'ipmi'
    scrape_interval: 120s
    file_sd_configs:
      - files:
        - ipmi-targets.json


  # Scrape the cadvisor container exporter
  - job_name: 'cadvisor'
    scrape_interval: 60s
    file_sd_configs:
      - files:
        - cadvisor.json
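
Note: prometheus.yml is an Ansible/Jinja2 template. {{ ansible_hostname }} and {{ alertmanager_pass }} are filled in when the role renders it, so alertmanager_pass must be defined somewhere in the inventory. A minimal sketch of how it could be supplied is below; the file name and value are placeholders, and in practice the secret would live in ansible-vault.

    # group_vars/prom_server.yml -- hypothetical location; keep the real value in ansible-vault
    alertmanager_pass: "changeme"  # placeholder; consumed by the basic_auth block above
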
roles/prom_server/templates/etc/targets.json (new file, 10 lines)
@@ -0,0 +1,10 @@
[
  {
    "targets": [
    ],
    "labels": {
      "env": "blank",
      "job": "node"
    }
  }
]
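
Note: the template deliberately ships with an empty targets list and env set to "blank"; the node job's file_sd_configs watches this file on the server, so targets can be added later without touching prometheus.yml. A populated entry might look like the sketch below; the hostnames and env value are placeholders, and 9100 is the default node_exporter port.

    [
      {
        "targets": [ "node01.example.org:9100", "node02.example.org:9100" ],
        "labels": { "env": "production", "job": "node" }
      }
    ]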