From 3bd13d018adcc4f6e35956e58c697790b958771c Mon Sep 17 00:00:00 2001
From: Egon Rijpkema
Date: Fri, 21 Dec 2018 11:59:20 +0100
Subject: [PATCH] Added a generic prometheus server role.

---
 ansible.cfg                                   |  1 +
 prometheus.yml                                |  5 ++
 roles/prom_server/tasks/main.yml              | 65 +++++++++++++++++
 .../prom_server/templates/etc/alerting.rules  | 71 +++++++++++++++++++
 roles/prom_server/templates/etc/cadvisor.json |  0
 .../prom_server/templates/etc/prometheus.yml  | 55 ++++++++++++++
 roles/prom_server/templates/etc/targets.json  | 10 +++
 .../prom_server/templates/prometheus.service  | 19 +++++
 roles/prom_server/vars/secrets.yml            |  8 +++
 9 files changed, 234 insertions(+)
 create mode 100644 prometheus.yml
 create mode 100644 roles/prom_server/tasks/main.yml
 create mode 100644 roles/prom_server/templates/etc/alerting.rules
 create mode 100644 roles/prom_server/templates/etc/cadvisor.json
 create mode 100644 roles/prom_server/templates/etc/prometheus.yml
 create mode 100644 roles/prom_server/templates/etc/targets.json
 create mode 100644 roles/prom_server/templates/prometheus.service
 create mode 100644 roles/prom_server/vars/secrets.yml

diff --git a/ansible.cfg b/ansible.cfg
index 26c2661..08d3681 100644
--- a/ansible.cfg
+++ b/ansible.cfg
@@ -1,3 +1,4 @@
 [defaults]
 inventory = hosts.py
 stdout_callback = debug
+vault_password_file = .vault_pass.txt
diff --git a/prometheus.yml b/prometheus.yml
new file mode 100644
index 0000000..4bca6f8
--- /dev/null
+++ b/prometheus.yml
@@ -0,0 +1,5 @@
+---
+- hosts: all
+  become: true
+  roles:
+    - prom_server
diff --git a/roles/prom_server/tasks/main.yml b/roles/prom_server/tasks/main.yml
new file mode 100644
index 0000000..d71b575
--- /dev/null
+++ b/roles/prom_server/tasks/main.yml
@@ -0,0 +1,65 @@
+---
+- include_vars: vars/secrets.yml
+
+# Host directories that prometheus.service bind-mounts into the container.
+# NOTE(review): 0777 is world-writable - presumably so the container user can
+# write its data; confirm and tighten if possible.
+- file:
+    path: "{{ item }}"
+    state: directory
+    mode: "0777"
+  with_items:
+    - /srv/prometheus/etc/prometheus
+    - /srv/prometheus/prometheus
+
+- name: Install prometheus.yml
+  template:
+    src: templates/etc/{{ item }}
+    dest: /srv/prometheus/etc/prometheus/{{ item }}
+    mode: "0644"
+    owner: root
+    group: root
+  with_items:
+    - prometheus.yml
+
+# Copied verbatim instead of templated: alerting.rules contains Prometheus
+# "{{ $labels... }}" placeholders that Jinja2 would otherwise try to render.
+- name: Install other settings files.
+  copy:
+    src: templates/etc/{{ item }}
+    dest: /srv/prometheus/etc/prometheus/{{ item }}
+    mode: "0644"
+    owner: root
+    group: root
+  with_items:
+    - alerting.rules
+    - targets.json
+
+  tags:
+    - service-files
+
+- name: Install service files.
+  template:
+    src: templates/prometheus.service
+    dest: /etc/systemd/system/prometheus.service
+    mode: "0644"
+    owner: root
+    group: root
+  tags:
+    - service-files
+
+- name: Reload systemd so it picks up the unit file
+  systemd:
+    daemon_reload: true
+
+- name: enable service at boot
+  systemd:
+    name: prometheus.service
+    enabled: true
+
+- name: make sure services are started.
+  systemd:
+    name: prometheus.service
+    state: restarted
+  tags:
+    - start-service
diff --git a/roles/prom_server/templates/etc/alerting.rules b/roles/prom_server/templates/etc/alerting.rules
new file mode 100644
index 0000000..8ab7fa2
--- /dev/null
+++ b/roles/prom_server/templates/etc/alerting.rules
@@ -0,0 +1,71 @@
+groups:
+- name: basic
+  rules:
+  - alert: InstanceDown
+    expr: up{job="node"} == 0
+    for: 10m
+    labels:
+      severity: page
+    annotations:
+      description: '{{ $labels.instance }} of job {{ $labels.job }} has been down
+        for more than 10 minutes.'
+      summary: Instance {{ $labels.instance }} down
+  - alert: Time not being synced
+    expr: node_timex_sync_status{job="node"} == 0
+    for: 5m
+    labels:
+      severity: page
+    annotations:
+      description: '{{ $labels.instance }} is not configured to sync its time with an external ntp server'
+      summary: Instance {{ $labels.instance }} no ntp configured.
+  - alert: clock wrong
+    expr: node_timex_offset_seconds{job="node"} > 1
+    for: 10m
+    labels:
+      severity: page
+    annotations:
+      description: '{{ $labels.instance }} has a clock offset > 1 second.'
+      summary: '{{ $labels.instance }} has clock drift.'
+  - alert: DiskWillFillIn8Hours
+    expr: predict_linear(node_filesystem_free{job="node",mountpoint!~"/tmp|/local|/target/gpfs3"}[2h], 8 * 3600) < 0
+    for: 2h
+    labels:
+      severity: page
+    annotations:
+      description: Instance {{ $labels.instance }} will fill up within 8 hours
+      summary: '{{ $labels.instance }} disk full'
+  - alert: DiskWillFillIn72Hours
+    expr: predict_linear(node_filesystem_free{job="node",mountpoint!~"/tmp|/local|/target/gpfs3"}[6h], 72 * 3600) < 0
+    for: 8h
+    labels:
+      severity: page
+    annotations:
+      description: Instance {{ $labels.instance }} will fill up within 72 hours
+      summary: '{{ $labels.instance }} disk almost full'
+  - alert: DiskFull
+    expr: node_filesystem_free{job="node",mountpoint!~"/tmp|/net|/cvmfs|/var/lib/nfs/rpc_pipefs|/cvmfs|/misc|/run/docker/netns/.+?|/cgroup.+?", fstype!~"fuse.+?"} < 5.24288e+06
+    for: 5m
+    labels:
+      severity: page
+    annotations:
+      description: Instance {{ $labels.instance }} has a full {{ $labels.mountpoint }}.
+      summary: '{{ $labels.instance }} Disk full'
+  - alert: tmpFull
+    expr: node_filesystem_free{job="node",mountpoint="/tmp"} < 5242880
+    for: 30m
+    labels:
+      severity: page
+    annotations:
+      description: Instance {{ $labels.instance }} Has a full /tmp
+      summary: '{{ $labels.instance }} /tmp full'
+  - alert: NodeRebooted
+    expr: delta(node_boot_time[1h]) > 10
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      description: Instance {{ $labels.instance }} has been rebooted.
+      summary: '{{ $labels.instance }} rebooted'
+#  - alert: TestAlert
+#    expr: probe_success{instance="195.169.22.220:11211",job="blackbox"} == 0
+#    for: 1m
diff --git a/roles/prom_server/templates/etc/cadvisor.json b/roles/prom_server/templates/etc/cadvisor.json
new file mode 100644
index 0000000..e69de29
diff --git a/roles/prom_server/templates/etc/prometheus.yml b/roles/prom_server/templates/etc/prometheus.yml
new file mode 100644
index 0000000..6fea748
--- /dev/null
+++ b/roles/prom_server/templates/etc/prometheus.yml
@@ -0,0 +1,55 @@
+# Global settings; individual scrape jobs may override scrape_interval.
+global:
+  scrape_interval: 60s  # Scrape targets every 60 seconds.
+  evaluation_interval: 60s  # Evaluate rules every 60 seconds.
+  # scrape_timeout is set to the global default (10s).
+
+  # Attach these labels to any time series or alerts when communicating with
+  # external systems (federation, remote storage, Alertmanager).
+  external_labels:
+    monitor: "{{ ansible_hostname }}"
+
+# Alertmanager instance(s) that alerts are sent to.
+alerting:
+  alertmanagers:
+  - scheme: http  # NOTE(review): basic_auth below travels unencrypted over http - use https if the Alertmanager supports it
+    static_configs:
+    - targets:
+      - "alertmanager.kube.hpc.rug.nl"
+    basic_auth:
+      username: hpc
+      password: {{ alertmanager_pass | to_json }}
+
+# Load and evaluate rules in this file every 'evaluation_interval' seconds.
+rule_files:
+- '/etc/prometheus/alerting.rules'
+
+# Scrape configurations: Prometheus itself first, followed by the jobs whose
+# targets are discovered from JSON files.
+scrape_configs:
+  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
+  - job_name: 'prometheus'
+    static_configs:
+      - targets: ['localhost:9090']
+
+  # peregrine - 'node' job; targets are read from targets.json
+  - job_name: 'node'
+    scrape_interval: 120s
+    file_sd_configs:
+      - files:
+        - targets.json
+
+  # peregrine - 'ipmi' job; targets are read from ipmi-targets.json
+  - job_name: 'ipmi'
+    scrape_interval: 120s
+    file_sd_configs:
+      - files:
+        - ipmi-targets.json  # NOTE(review): not created by this patch - confirm where it comes from
+
+
+  # Scrape the cadvisor container exporter
+  - job_name: 'cadvisor'
+    scrape_interval: 60s
+    file_sd_configs:
+      - files:
+        - cadvisor.json  # NOTE(review): added empty by this patch and not installed by the role tasks - confirm
diff --git a/roles/prom_server/templates/etc/targets.json b/roles/prom_server/templates/etc/targets.json
new file mode 100644
index 0000000..df604e3
--- /dev/null
+++ b/roles/prom_server/templates/etc/targets.json
@@ -0,0 +1,10 @@
+[
+  {
+    "targets": [
+    ],
+    "labels": {
+      "env": "blank",
+      "job": "node"
+    }
+  }
+]
diff --git a/roles/prom_server/templates/prometheus.service b/roles/prom_server/templates/prometheus.service
new file mode 100644
index 0000000..fab877c
--- /dev/null
+++ b/roles/prom_server/templates/prometheus.service
@@ -0,0 +1,19 @@
+[Unit]
+Description=Prometheus monitoring
+After=docker.service
+Requires=docker.service
+
+[Service]
+TimeoutStartSec=0
+Restart=always
+ExecStartPre=-/usr/bin/docker kill %n
+ExecStartPre=-/usr/bin/docker rm %n
+ExecStart=/usr/bin/docker run --name %n \
+  --network host \
+  -v /srv/prometheus/prometheus:/prometheus \
+  -v /srv/prometheus/etc/prometheus:/etc/prometheus \
+  prom/prometheus:v2.6.0 \
+  --storage.tsdb.retention 365d --config.file=/etc/prometheus/prometheus.yml \
+  --storage.tsdb.path=/prometheus --web.enable-lifecycle
+[Install]
+WantedBy=multi-user.target
diff --git a/roles/prom_server/vars/secrets.yml b/roles/prom_server/vars/secrets.yml
new file mode 100644
index 0000000..2085b6e
--- /dev/null
+++ b/roles/prom_server/vars/secrets.yml
@@ -0,0 +1,8 @@
+$ANSIBLE_VAULT;1.1;AES256
+35653034666233356434653337323037616464346462626436613836626633653661613162393235
+3731313333396465616430306530653430353730636662350a326134643635636364363566313933
+38303164616631316265393330343566383232333337386661643534356263323137616362393662
+3636366636613934660a366631616666366331326331623261396435656533313563666464396439
+38663533386634323933646166306666626533623730613363396639633638393864396264313836
+39343132653439376361353462626332336134626661656236366636623932363638656530313966
+616665383932306236346236633636623561