forked from HPC/HPCplaybooks
Added a generic prometheus server role.
This commit is contained in:
parent
d68ec10d2c
commit
3bd13d018a
@@ -1,3 +1,4 @@
 [defaults]
 inventory = hosts.py
 stdout_callback = debug
+vault_password_file = .vault_pass.txt
5 prometheus.yml Normal file
@@ -0,0 +1,5 @@
---
- hosts: all
  become: True
  roles:
    - prom_server
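A note on scope: the playbook above targets all hosts, so every machine in the inventory gets the Prometheus server role. If the inventory ever holds more than the monitoring host itself, a group-scoped variant keeps the role contained. A minimal sketch, assuming a hypothetical inventory group named prometheus_servers (not part of this commit):

---
- hosts: prometheus_servers  # hypothetical group name, an assumption
  become: True
  roles:
    - prom_server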
59 roles/prom_server/tasks/main.yml Normal file
@@ -0,0 +1,59 @@
---
- include_vars: vars/secrets.yml

- name: Create the prometheus data and config directories.
  file:
    path: "{{ item }}"
    state: directory
    mode: 0777
  with_items:
    - /srv/prometheus/etc/prometheus
    - /srv/prometheus/prometheus

- name: Install prometheus.yml
  template:
    src: templates/etc/{{ item }}
    dest: /srv/prometheus/etc/prometheus/{{ item }}
    mode: 0644
    owner: root
    group: root
  with_items:
    - prometheus.yml

- name: Install other settings files.
  copy:
    src: templates/etc/{{ item }}
    dest: /srv/prometheus/etc/prometheus/{{ item }}
    mode: 0644
    owner: root
    group: root
  with_items:
    - alerting.rules
    - targets.json
  tags:
    - service-files

- name: Install service files.
  template:
    src: templates/prometheus.service
    dest: /etc/systemd/system/prometheus.service
    mode: 0644
    owner: root
    group: root
  tags:
    - service-files

- name: Reload systemd so it picks up the new unit file.
  command: systemctl daemon-reload

- name: Enable service at boot.
  systemd:
    name: prometheus.service
    enabled: yes

- name: Make sure services are started.
  systemd:
    name: prometheus.service
    state: restarted
  tags:
    - start-service
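As an aside, the daemon-reload step and the two systemd tasks at the end of this file could be collapsed: the Ansible systemd module can reload the systemd daemon itself via its daemon_reload option. A sketch of that consolidation (an alternative, not what this commit does):

- name: Reload systemd, enable prometheus at boot and (re)start it.
  systemd:
    name: prometheus.service
    daemon_reload: yes
    enabled: yes
    state: restarted
  tags:
    - start-service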
71 roles/prom_server/templates/etc/alerting.rules Normal file
@@ -0,0 +1,71 @@
groups:
- name: basic
  rules:
  - alert: InstanceDown
    expr: up{job="node"} == 0
    for: 10m
    labels:
      severity: page
    annotations:
      description: '{{ $labels.instance }} of job {{ $labels.job }} has been down
        for more than 10 minutes.'
      summary: Instance {{ $labels.instance }} down
  - alert: TimeNotBeingSynced
    expr: node_timex_sync_status{job="node"} == 0
    for: 5m
    labels:
      severity: page
    annotations:
      description: '{{ $labels.instance }} is not configured to sync its time with an external ntp server'
      summary: Instance {{ $labels.instance }} has no ntp configured.
  - alert: ClockWrong
    expr: node_timex_offset_seconds{job="node"} > 1
    for: 10m
    labels:
      severity: page
    annotations:
      description: '{{ $labels.instance }} has a clock offset > 1 second.'
      summary: '{{ $labels.instance }} has clock drift.'
  - alert: DiskWillFillIn8Hours
    expr: predict_linear(node_filesystem_free{job="node",mountpoint!~"/tmp|/local|/target/gpfs3"}[2h], 8 * 3600) < 0
    for: 2h
    labels:
      severity: page
    annotations:
      description: Instance {{ $labels.instance }} will fill up within 8 hours
      summary: '{{ $labels.instance }} disk full'
  - alert: DiskWillFillIn72Hours
    expr: predict_linear(node_filesystem_free{job="node",mountpoint!~"/tmp|/local|/target/gpfs3"}[6h], 72 * 3600) < 0
    for: 8h
    labels:
      severity: page
    annotations:
      description: Instance {{ $labels.instance }} will fill up within 72 hours
      summary: '{{ $labels.instance }} disk almost full'
  - alert: DiskFull
    expr: node_filesystem_free{job="node",mountpoint!~"/tmp|/net|/cvmfs|/var/lib/nfs/rpc_pipefs|/cvmfs|/misc|/run/docker/netns/.+?|/cgroup.+?", fstype!~"fuse.+?"} < 5.24288e+06
    for: 5m
    labels:
      severity: page
    annotations:
      description: Instance {{ $labels.instance }} has a full {{ $labels.mountpoint }}.
      summary: '{{ $labels.instance }} Disk full'
  - alert: tmpFull
    expr: node_filesystem_free{job="node",mountpoint="/tmp"} < 5242880
    for: 30m
    labels:
      severity: page
    annotations:
      description: Instance {{ $labels.instance }} has a full /tmp
      summary: '{{ $labels.instance }} /tmp full'
  - alert: NodeRebooted
    expr: delta(node_boot_time[1h]) > 10
    for: 1m
    labels:
      severity: page
    annotations:
      description: Instance {{ $labels.instance }} has been rebooted.
      summary: '{{ $labels.instance }} rebooted'
#  - alert: TestAlert
#    expr: probe_success{instance="195.169.22.220:11211",job="blackbox"} == 0
#    for: 1m
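These rules can be checked without a running server: since Prometheus 2.5, promtool can unit-test alerting rules ("promtool test rules <file>"), and the v2.6.0 image pinned in the service file below ships promtool. A hypothetical test file for the InstanceDown rule — the file name, instance label and relative paths here are assumptions, not part of this commit:

# tests/alerting_test.yml (hypothetical); run with: promtool test rules tests/alerting_test.yml
rule_files:
  - ../roles/prom_server/templates/etc/alerting.rules
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # a node-exporter target that is down for the whole test window
      - series: 'up{job="node", instance="example-node:9100"}'
        values: '0x20'
    alert_rule_test:
      - eval_time: 15m
        alertname: InstanceDown
        exp_alerts:
          - exp_labels:
              severity: page
              job: node
              instance: example-node:9100
            exp_annotations:
              summary: 'Instance example-node:9100 down'
              description: 'example-node:9100 of job node has been down for more than 10 minutes.'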
0 roles/prom_server/templates/etc/cadvisor.json Normal file
55 roles/prom_server/templates/etc/prometheus.yml Normal file
@@ -0,0 +1,55 @@
# my global config
global:
  scrape_interval: 60s      # Scrape targets every 60 seconds.
  evaluation_interval: 60s  # Evaluate rules every 60 seconds.
  # scrape_timeout is set to the global default (10s).

  # Attach these labels to any time series or alerts when communicating with
  # external systems (federation, remote storage, Alertmanager).
  external_labels:
    monitor: {{ ansible_hostname }}

# alert
alerting:
  alertmanagers:
  - scheme: http
    static_configs:
    - targets:
      - "alertmanager.kube.hpc.rug.nl"
    basic_auth:
      username: hpc
      password: {{ alertmanager_pass }}

# Load and evaluate rules in this file every 'evaluation_interval' seconds.
rule_files:
  - '/etc/prometheus/alerting.rules'

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # peregrine
  - job_name: 'node'
    scrape_interval: 120s
    file_sd_configs:
      - files:
        - targets.json

  # peregrine
  - job_name: 'ipmi'
    scrape_interval: 120s
    file_sd_configs:
      - files:
        - ipmi-targets.json

  # Scrape the cadvisor container exporter
  - job_name: 'cadvisor'
    scrape_interval: 60s
    file_sd_configs:
      - files:
        - cadvisor.json
10 roles/prom_server/templates/etc/targets.json Normal file
@@ -0,0 +1,10 @@
[
    {
        "targets": [
        ],
        "labels": {
            "env": "blank",
            "job": "node"
        }
    }
]
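The committed targets.json deliberately ships with an empty target list; it is only the skeleton watched by the node job's file_sd_configs, and Prometheus re-reads file_sd files whenever they change, with no restart required. A filled-in example — the hostnames are placeholders, not real targets:

[
    {
        "targets": [
            "node1.example.org:9100",
            "node2.example.org:9100"
        ],
        "labels": {
            "env": "production",
            "job": "node"
        }
    }
]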
19 roles/prom_server/templates/prometheus.service Normal file
@@ -0,0 +1,19 @@
[Unit]
Description=Prometheus monitoring
After=docker.service
Requires=docker.service

[Service]
TimeoutStartSec=0
Restart=always
ExecStartPre=-/usr/bin/docker kill %n
ExecStartPre=-/usr/bin/docker rm %n
ExecStart=/usr/bin/docker run --name %n \
  --network host \
  -v /srv/prometheus/prometheus:/prometheus \
  -v /srv/prometheus/etc/prometheus:/etc/prometheus \
  prom/prometheus:v2.6.0 \
  --storage.tsdb.retention 365d --config.file=/etc/prometheus/prometheus.yml \
  --storage.tsdb.path=/prometheus --web.enable-lifecycle
[Install]
WantedBy=multi-user.target
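Because the container is started with --web.enable-lifecycle, configuration changes can be applied by POSTing to Prometheus' reload endpoint instead of restarting the unit. A sketch of an Ansible task doing that (an illustration, not part of this commit; assumes the server listens on localhost:9090 as configured above):

- name: Ask prometheus to reload its configuration.
  uri:
    url: http://localhost:9090/-/reload
    method: POST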
8 roles/prom_server/vars/secrets.yml Normal file
@@ -0,0 +1,8 @@
$ANSIBLE_VAULT;1.1;AES256
35653034666233356434653337323037616464346462626436613836626633653661613162393235
3731313333396465616430306530653430353730636662350a326134643635636364363566313933
38303164616631316265393330343566383232333337386661643534356263323137616362393662
3636366636613934660a366631616666366331326331623261396435656533313563666464396439
38663533386634323933646166306666626533623730613363396639633638393864396264313836
39343132653439376361353462626332336134626661656236366636623932363638656530313966
616665383932306236346236633636623561
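The vaulted file presumably defines alertmanager_pass, since templates/etc/prometheus.yml interpolates it. A sketch of what the plaintext could look like before running "ansible-vault encrypt roles/prom_server/vars/secrets.yml" (which picks up the .vault_pass.txt configured in [defaults] above); the value is a placeholder, not the real secret:

---
alertmanager_pass: "example-password"  # placeholder; keep only the encrypted form in git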