forked from HPC/HPCplaybooks

Added a generic Prometheus server role.

This commit is contained in:
Egon Rijpkema 2018-12-21 11:59:20 +01:00
parent d68ec10d2c
commit 3bd13d018a
9 changed files with 228 additions and 0 deletions

ansible.cfg

@ -1,3 +1,4 @@
[defaults]
inventory = hosts.py
stdout_callback = debug
vault_password_file = .vault_pass.txt
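
The vault_password_file setting lets ansible-playbook decrypt the vault non-interactively. A minimal sketch of seeding that file (the password value is a placeholder; the file should stay out of version control):

# Store the vault password locally; never commit this file.
echo 'my-vault-password' > .vault_pass.txt
chmod 600 .vault_pass.txt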

prometheus.yml

@ -0,0 +1,5 @@
---
- hosts: all
  become: True
  roles:
    - prom_server
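
With the inventory and vault settings from ansible.cfg in place, the play can be applied directly. A minimal invocation (which hosts are hit depends on what hosts.py returns):

# Deploy the prom_server role to all inventory hosts.
ansible-playbook prometheus.yml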

roles/prom_server/tasks/main.yml

@ -0,0 +1,59 @@
---
- include_vars: vars/secrets.yml

- name: Create prometheus config and data directories.
  file:
    path: "{{ item }}"
    state: directory
    # World-writable so the non-root user inside the container can write.
    mode: '0777'
  with_items:
    - /srv/prometheus/etc/prometheus
    - /srv/prometheus/prometheus

- name: Install prometheus.yml
  template:
    src: templates/etc/{{ item }}
    dest: /srv/prometheus/etc/prometheus/{{ item }}
    mode: '0644'
    owner: root
    group: root
  with_items:
    - prometheus.yml

- name: Install other settings files.
  copy:
    src: templates/etc/{{ item }}
    dest: /srv/prometheus/etc/prometheus/{{ item }}
    mode: '0644'
    owner: root
    group: root
  with_items:
    - alerting.rules
    - targets.json
  tags:
    - service-files

- name: Install service files.
  template:
    src: templates/prometheus.service
    dest: /etc/systemd/system/prometheus.service
    mode: '0644'
    owner: root
    group: root
  tags:
    - service-files

- name: Reload systemd so it picks up the new unit file.
  systemd:
    daemon_reload: yes

- name: Enable service at boot.
  systemd:
    name: prometheus.service
    enabled: yes

- name: Make sure the service is (re)started.
  systemd:
    name: prometheus.service
    state: restarted
  tags:
    - start-service
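
The tags defined above allow partial runs. For example, to push out only the configuration and unit files and then restart the server:

# Re-deploy config/unit files and restart, skipping the rest of the role.
ansible-playbook prometheus.yml --tags service-files,start-service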

roles/prom_server/templates/etc/alerting.rules

@ -0,0 +1,71 @@
groups:
- name: basic
  rules:
  - alert: InstanceDown
    expr: up{job="node"} == 0
    for: 10m
    labels:
      severity: page
    annotations:
      description: '{{ $labels.instance }} of job {{ $labels.job }} has been down
        for more than 10 minutes.'
      summary: Instance {{ $labels.instance }} down
  - alert: TimeNotBeingSynced
    expr: node_timex_sync_status{job="node"} == 0
    for: 5m
    labels:
      severity: page
    annotations:
      description: '{{ $labels.instance }} is not configured to sync its time with an external ntp server'
      summary: Instance {{ $labels.instance }} has no ntp configured.
  - alert: ClockWrong
    expr: node_timex_offset_seconds{job="node"} > 1
    for: 10m
    labels:
      severity: page
    annotations:
      description: '{{ $labels.instance }} has a clock offset > 1 second.'
      summary: '{{ $labels.instance }} has clock drift.'
  - alert: DiskWillFillIn8Hours
    expr: predict_linear(node_filesystem_free{job="node",mountpoint!~"/tmp|/local|/target/gpfs3"}[2h], 8 * 3600) < 0
    for: 2h
    labels:
      severity: page
    annotations:
      description: Instance {{ $labels.instance }} will fill up within 8 hours
      summary: '{{ $labels.instance }} disk full'
  - alert: DiskWillFillIn72Hours
    expr: predict_linear(node_filesystem_free{job="node",mountpoint!~"/tmp|/local|/target/gpfs3"}[6h], 72 * 3600) < 0
    for: 8h
    labels:
      severity: page
    annotations:
      description: Instance {{ $labels.instance }} will fill up within 72 hours
      summary: '{{ $labels.instance }} disk almost full'
  - alert: DiskFull
    expr: node_filesystem_free{job="node",mountpoint!~"/tmp|/net|/cvmfs|/var/lib/nfs/rpc_pipefs|/misc|/run/docker/netns/.+?|/cgroup.+?", fstype!~"fuse.+?"} < 5.24288e+06
    for: 5m
    labels:
      severity: page
    annotations:
      description: Instance {{ $labels.instance }} has a full {{ $labels.mountpoint }}.
      summary: '{{ $labels.instance }} Disk full'
  - alert: tmpFull
    expr: node_filesystem_free{job="node",mountpoint="/tmp"} < 5242880
    for: 30m
    labels:
      severity: page
    annotations:
      description: Instance {{ $labels.instance }} has a full /tmp
      summary: '{{ $labels.instance }} /tmp full'
  - alert: NodeRebooted
    expr: delta(node_boot_time[1h]) > 10
    for: 1m
    labels:
      severity: page
    annotations:
      description: Instance {{ $labels.instance }} has been rebooted.
      summary: '{{ $labels.instance }} rebooted'
# - alert: TestAlert
#   expr: probe_success{instance="195.169.22.220:11211",job="blackbox"} == 0
#   for: 1m
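
Rule files can be validated before they are rolled out. A quick check, assuming a promtool binary from the same Prometheus release is available locally:

# Validate syntax and alert names; exits non-zero on errors.
promtool check rules alerting.rules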

roles/prom_server/templates/etc/prometheus.yml

@ -0,0 +1,55 @@
# my global config
global:
  scrape_interval: 60s     # Scrape targets every 60 seconds (the default is 15s).
  evaluation_interval: 60s # Evaluate rules every 60 seconds (the default is 15s).
  # scrape_timeout is set to the global default (10s).

  # Attach these labels to any time series or alerts when communicating with
  # external systems (federation, remote storage, Alertmanager).
  external_labels:
    monitor: '{{ ansible_hostname }}'

# Alertmanager configuration.
alerting:
  alertmanagers:
  - scheme: http
    static_configs:
    - targets:
      - "alertmanager.kube.hpc.rug.nl"
    basic_auth:
      username: hpc
      password: '{{ alertmanager_pass }}'

# Load and evaluate rules in this file every 'evaluation_interval' seconds.
rule_files:
  - '/etc/prometheus/alerting.rules'

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # peregrine node exporters
  - job_name: 'node'
    scrape_interval: 120s
    file_sd_configs:
    - files:
      - targets.json

  # peregrine ipmi exporters
  - job_name: 'ipmi'
    scrape_interval: 120s
    file_sd_configs:
    - files:
      - ipmi-targets.json

  # Scrape the cadvisor container exporter.
  - job_name: 'cadvisor'
    scrape_interval: 60s
    file_sd_configs:
    - files:
      - cadvisor.json
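
Because the service unit below starts Prometheus with --web.enable-lifecycle, a changed config can be checked and reloaded without restarting the container. A sketch, assuming the server listens on the default port 9090:

# Validate the rendered config with the promtool shipped in the same image.
docker run --rm -v /srv/prometheus/etc/prometheus:/etc/prometheus \
    --entrypoint /bin/promtool prom/prometheus:v2.6.0 \
    check config /etc/prometheus/prometheus.yml

# Ask the running server to re-read its configuration.
curl -X POST http://localhost:9090/-/reload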

roles/prom_server/templates/etc/targets.json

@ -0,0 +1,10 @@
[
  {
    "targets": [
    ],
    "labels": {
      "env": "blank",
      "job": "node"
    }
  }
]
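
The file is installed with an empty target list. A hypothetical populated entry (host names and env label invented for illustration; 9100 is the usual node exporter port):

[
  {
    "targets": [
      "node1.example.org:9100",
      "node2.example.org:9100"
    ],
    "labels": {
      "env": "production",
      "job": "node"
    }
  }
]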

roles/prom_server/templates/prometheus.service

@ -0,0 +1,19 @@
[Unit]
Description=Prometheus monitoring
After=docker.service
Requires=docker.service

[Service]
TimeoutStartSec=0
Restart=always
ExecStartPre=-/usr/bin/docker kill %n
ExecStartPre=-/usr/bin/docker rm %n
ExecStart=/usr/bin/docker run --name %n \
    --network host \
    -v /srv/prometheus/prometheus:/prometheus \
    -v /srv/prometheus/etc/prometheus:/etc/prometheus \
    prom/prometheus:v2.6.0 \
    --storage.tsdb.retention=365d --config.file=/etc/prometheus/prometheus.yml \
    --storage.tsdb.path=/prometheus --web.enable-lifecycle

[Install]
WantedBy=multi-user.target
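
Once the role has enabled and started the unit, it behaves like any other systemd service:

# Follow the container's logs through the journal.
journalctl -u prometheus.service -f

# Confirm the unit is active and enabled at boot.
systemctl status prometheus.service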

roles/prom_server/vars/secrets.yml

@ -0,0 +1,8 @@
$ANSIBLE_VAULT;1.1;AES256
35653034666233356434653337323037616464346462626436613836626633653661613162393235
3731313333396465616430306530653430353730636662350a326134643635636364363566313933
38303164616631316265393330343566383232333337386661643534356263323137616362393662
3636366636613934660a366631616666366331326331623261396435656533313563666464396439
38663533386634323933646166306666626533623730613363396639633638393864396264313836
39343132653439376361353462626332336134626661656236366636623932363638656530313966
616665383932306236346236633636623561
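
The alertmanager_pass variable used by the prometheus.yml template comes from this vault. With the vault_password_file from ansible.cfg in place, it can be inspected or changed with ansible-vault (the path is assumed from the include_vars in the tasks above):

# View or edit the encrypted variables; the password comes from .vault_pass.txt.
ansible-vault view roles/prom_server/vars/secrets.yml
ansible-vault edit roles/prom_server/vars/secrets.yml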