Added a generic prometheus server role.
This commit is contained in:
parent
d68ec10d2c
commit
3bd13d018a
|
@ -1,3 +1,4 @@
|
|||
# Default settings Ansible applies when run from this repository.
[defaults]
|
||||
# Inventory source; presumably a dynamic inventory script - confirm.
inventory = hosts.py
|
||||
# Select the 'debug' stdout callback for task output.
stdout_callback = debug
|
||||
# File from which the vault password is read.
vault_password_file = .vault_pass.txt
|
||||
|
|
|
@ -0,0 +1,5 @@
|
|||
---
# Apply the prom_server role to every host in the inventory, with privilege
# escalation enabled for all of its tasks.
- name: Deploy the prometheus server
  hosts: all
  become: true
  roles:
    - prom_server
|
|
@ -0,0 +1,59 @@
|
|||
---
|
||||
# Load extra variables for this role before any task needs them.
# NOTE(review): presumably this is the vault-encrypted file that defines
# alertmanager_pass - confirm.
- name: Load secret variables
  include_vars:
    file: vars/secrets.yml
|
||||
|
||||
# Create the host directories that prometheus.service bind-mounts into the
# container: one for configuration, one for the time-series database.
# NOTE(review): 0777 makes both directories world-writable; consider a
# dedicated owner with a tighter mode once the container's uid is confirmed.
- name: Create prometheus configuration and data directories
  file:
    path: "{{ item }}"
    state: directory
    mode: '0777'
  with_items:
    - /srv/prometheus/etc/prometheus
    - /srv/prometheus/prometheus
|
||||
|
||||
# Render the main Prometheus configuration from its Jinja2 template.
# The mode is a quoted octal string: a bare 644 is read as the decimal number
# 644 (octal 1204), which yields the wrong permissions.
- name: Install prometheus.yml
  template:
    src: templates/etc/{{ item }}
    dest: /srv/prometheus/etc/prometheus/{{ item }}
    mode: '0644'
    owner: root
    group: root
  with_items:
    - prometheus.yml
|
||||
|
||||
# Copy the files that must reach Prometheus verbatim. They are copied rather
# than templated, so the Go-template placeholders in alerting.rules are not
# touched by Jinja2.
# The mode is a quoted octal string: a bare 644 is read as the decimal number
# 644 (octal 1204), which yields the wrong permissions.
- name: Install other settings files.
  copy:
    src: templates/etc/{{ item }}
    dest: /srv/prometheus/etc/prometheus/{{ item }}
    mode: '0644'
    owner: root
    group: root
  with_items:
    - alerting.rules
    - targets.json
  tags:
    - service-files
|
||||
|
||||
# Install the systemd unit that runs Prometheus.
# The mode is a quoted octal string: a bare 644 is read as the decimal number
# 644 (octal 1204), which yields the wrong permissions.
- name: Install service files.
  template:
    src: templates/prometheus.service
    dest: /etc/systemd/system/prometheus.service
    mode: '0644'
    owner: root
    group: root
  tags:
    - service-files
|
||||
|
||||
# Make systemd re-read its unit files so a newly installed or changed
# prometheus.service is picked up. This uses the systemd module instead of
# shelling out to systemctl.
- name: Reload systemd unit files
  systemd:
    daemon_reload: true
|
||||
|
||||
# Register prometheus.service so that it is started on every boot.
- name: enable service at boot
  systemd:
    name: prometheus.service
    enabled: true
|
||||
|
||||
# Always restart the service (state is 'restarted', not 'started') so that
# freshly installed configuration and unit files take effect.
- name: Restart prometheus so configuration changes take effect.
  systemd:
    name: prometheus.service
    state: restarted
  tags:
    - start-service
|
|
@ -0,0 +1,71 @@
|
|||
# Prometheus alerting rules. Every rule carries the label severity=page.
# The {{ ... }} placeholders are Prometheus (Go) templates that are expanded
# when an alert fires.
groups:
  - name: basic
    rules:
      # A node target has been unreachable for 10 minutes.
      - alert: InstanceDown
        expr: up{job="node"} == 0
        for: 10m
        labels:
          severity: page
        annotations:
          description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 10 minutes.'
          summary: Instance {{ $labels.instance }} down

      # The node reports that its clock is not synchronised.
      - alert: Time not being synced
        expr: node_timex_sync_status{job="node"} == 0
        for: 5m
        labels:
          severity: page
        annotations:
          description: '{{ $labels.instance }} is not configured to sync its time with an external ntp server'
          summary: Instance {{ $labels.instance }} no ntp configured.

      # abs() makes the rule fire for clocks that run behind as well as ahead;
      # a plain "> 1" comparison never matches a negative offset.
      - alert: clock wrong
        expr: abs(node_timex_offset_seconds{job="node"}) > 1
        for: 10m
        labels:
          severity: page
        annotations:
          description: '{{ $labels.instance }} has a clock offset > 1 second.'
          summary: '{{ $labels.instance }} has clock drift.'

      # Linear extrapolation of the last 2h predicts no free space within 8h.
      - alert: DiskWillFillIn8Hours
        expr: predict_linear(node_filesystem_free{job="node",mountpoint!~"/tmp|/local|/target/gpfs3"}[2h], 8 * 3600) < 0
        for: 2h
        labels:
          severity: page
        annotations:
          description: Instance {{ $labels.instance }} will fill up within 8 hours
          summary: '{{ $labels.instance }} disk full'

      # Linear extrapolation of the last 6h predicts no free space within 72h.
      - alert: DiskWillFillIn72Hours
        expr: predict_linear(node_filesystem_free{job="node",mountpoint!~"/tmp|/local|/target/gpfs3"}[6h], 72 * 3600) < 0
        for: 8h
        labels:
          severity: page
        annotations:
          description: Instance {{ $labels.instance }} will fill up within 72 hours
          summary: '{{ $labels.instance }} disk almost full'

      # Less than 5 MiB (5242880 bytes) free on a filesystem that is not
      # excluded by mount point or by a fuse* filesystem type.
      - alert: DiskFull
        expr: node_filesystem_free{job="node",mountpoint!~"/tmp|/net|/cvmfs|/var/lib/nfs/rpc_pipefs|/misc|/run/docker/netns/.+?|/cgroup.+?", fstype!~"fuse.+?"} < 5.24288e+06
        for: 5m
        labels:
          severity: page
        annotations:
          description: Instance {{ $labels.instance }} has a full {{ $labels.mountpoint }}.
          summary: '{{ $labels.instance }} Disk full'

      # /tmp has its own rule with a longer grace period (30m).
      - alert: tmpFull
        expr: node_filesystem_free{job="node",mountpoint="/tmp"} < 5242880
        for: 30m
        labels:
          severity: page
        annotations:
          description: Instance {{ $labels.instance }} Has a full /tmp
          summary: '{{ $labels.instance }} /tmp full'

      # The reported boot time changed by more than 10 seconds within the
      # last hour.
      - alert: NodeRebooted
        expr: delta(node_boot_time[1h]) > 10
        for: 1m
        labels:
          severity: page
        annotations:
          description: Instance {{ $labels.instance }} has been rebooted.
          summary: '{{ $labels.instance }} rebooted'

      # - alert: TestAlert
      #   expr: probe_success{instance="195.169.22.220:11211",job="blackbox"} == 0
      #   for: 1m
|
|
@ -0,0 +1,55 @@
|
|||
# Prometheus server configuration, rendered by Ansible from a Jinja2 template.
# Substituted values go through to_json so they always render as a valid,
# properly escaped double-quoted YAML string, whatever characters they contain.
global:
  scrape_interval: 60s  # Scrape targets every 60 seconds unless a job overrides it.
  evaluation_interval: 60s  # Evaluate rules every 60 seconds.
  # scrape_timeout is set to the global default (10s).

  # Attach these labels to any time series or alerts when communicating with
  # external systems (federation, remote storage, Alertmanager).
  external_labels:
    monitor: {{ ansible_hostname | to_json }}

# Alertmanager instance that receives the alerts, using HTTP basic auth.
alerting:
  alertmanagers:
    - scheme: http
      static_configs:
        - targets:
            - "alertmanager.kube.hpc.rug.nl"
      basic_auth:
        username: hpc
        password: {{ alertmanager_pass | to_json }}

# Load and evaluate rules in this file every 'evaluation_interval' seconds.
rule_files:
  - '/etc/prometheus/alerting.rules'

# Scrape configurations: Prometheus itself plus three file-discovered jobs.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # peregrine
  - job_name: 'node'
    scrape_interval: 120s
    file_sd_configs:
      - files:
          - targets.json

  # peregrine
  # NOTE(review): the role's copy task installs only alerting.rules and
  # targets.json; ipmi-targets.json and cadvisor.json below must be provided
  # by other means - confirm.
  - job_name: 'ipmi'
    scrape_interval: 120s
    file_sd_configs:
      - files:
          - ipmi-targets.json

  # Scrape the cadvisor container exporter
  - job_name: 'cadvisor'
    scrape_interval: 60s
    file_sd_configs:
      - files:
          - cadvisor.json
|
|
@ -0,0 +1,10 @@
|
|||
[
|
||||
{
|
||||
"targets": [
|
||||
],
|
||||
"labels": {
|
||||
"env": "blank",
|
||||
"job": "node"
|
||||
}
|
||||
}
|
||||
]
|
|
@ -0,0 +1,19 @@
|
|||
# systemd unit that runs the Prometheus server as a Docker container.
[Unit]
Description=Prometheus monitoring
# Order after the Docker daemon and require it to be running.
After=docker.service
Requires=docker.service

[Service]
# 0 turns the start timeout off.
TimeoutStartSec=0
Restart=always
# Clear out any leftover container named after this unit (%n). The leading
# '-' tells systemd to ignore a failure, e.g. when no such container exists.
ExecStartPre=-/usr/bin/docker kill %n
ExecStartPre=-/usr/bin/docker rm %n
# Host networking; data and configuration are bind-mounted from
# /srv/prometheus on the host.
ExecStart=/usr/bin/docker run --name %n \
    --network host \
    -v /srv/prometheus/prometheus:/prometheus \
    -v /srv/prometheus/etc/prometheus:/etc/prometheus \
    prom/prometheus:v2.6.0 \
    --storage.tsdb.retention 365d --config.file=/etc/prometheus/prometheus.yml \
    --storage.tsdb.path=/prometheus --web.enable-lifecycle

[Install]
WantedBy=multi-user.target
|
|
@ -0,0 +1,8 @@
|
|||
$ANSIBLE_VAULT;1.1;AES256
|
||||
35653034666233356434653337323037616464346462626436613836626633653661613162393235
|
||||
3731313333396465616430306530653430353730636662350a326134643635636364363566313933
|
||||
38303164616631316265393330343566383232333337386661643534356263323137616362393662
|
||||
3636366636613934660a366631616666366331326331623261396435656533313563666464396439
|
||||
38663533386634323933646166306666626533623730613363396639633638393864396264313836
|
||||
39343132653439376361353462626332336134626661656236366636623932363638656530313966
|
||||
616665383932306236346236633636623561
|
Reference in New Issue