Added a generic prometheus server role.
This commit is contained in:
		@@ -1,3 +1,4 @@
 | 
			
		||||
[defaults]
 | 
			
		||||
inventory = hosts.py
 | 
			
		||||
stdout_callback = debug
 | 
			
		||||
vault_password_file = .vault_pass.txt
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										5
									
								
								prometheus.yml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										5
									
								
								prometheus.yml
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,5 @@
 | 
			
		||||
---
 | 
			
		||||
- hosts: all
 | 
			
		||||
  become: true
 | 
			
		||||
  roles:
 | 
			
		||||
     - prom_server
 | 
			
		||||
							
								
								
									
										59
									
								
								roles/prom_server/tasks/main.yml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										59
									
								
								roles/prom_server/tasks/main.yml
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,59 @@
 | 
			
		||||
---
 | 
			
		||||
- include_vars: vars/secrets.yml
 | 
			
		||||
 | 
			
		||||
- file:
 | 
			
		||||
    path: "{{ item }}"
 | 
			
		||||
    state: directory
 | 
			
		||||
    mode: "0777"
 | 
			
		||||
  with_items:
 | 
			
		||||
    - /srv/prometheus/etc/prometheus
 | 
			
		||||
    - /srv/prometheus/prometheus
 | 
			
		||||
 | 
			
		||||
- name: Install prometheus.yml
 | 
			
		||||
  template:
 | 
			
		||||
    src: templates/etc/{{ item }}
 | 
			
		||||
    dest: /srv/prometheus/etc/prometheus/{{ item }}
 | 
			
		||||
    mode: "0644"
 | 
			
		||||
    owner: root
 | 
			
		||||
    group: root
 | 
			
		||||
  with_items:
 | 
			
		||||
    - prometheus.yml
 | 
			
		||||
 | 
			
		||||
- name: Install other settings files.
 | 
			
		||||
  copy:
 | 
			
		||||
    src: templates/etc/{{ item }}
 | 
			
		||||
    dest: /srv/prometheus/etc/prometheus/{{ item }}
 | 
			
		||||
    mode: "0644"
 | 
			
		||||
    owner: root
 | 
			
		||||
    group: root
 | 
			
		||||
  with_items:
 | 
			
		||||
    - alerting.rules
 | 
			
		||||
    - targets.json
 | 
			
		||||
 | 
			
		||||
  tags:
 | 
			
		||||
    - service-files
 | 
			
		||||
 | 
			
		||||
- name: Install service files.
 | 
			
		||||
  template:
 | 
			
		||||
    src: templates/prometheus.service
 | 
			
		||||
    dest: /etc/systemd/system/prometheus.service
 | 
			
		||||
    mode: "0644"
 | 
			
		||||
    owner: root
 | 
			
		||||
    group: root
 | 
			
		||||
  tags:
 | 
			
		||||
    - service-files
 | 
			
		||||
 | 
			
		||||
- name: Reload systemd daemon
 | 
			
		||||
  command: systemctl daemon-reload
 | 
			
		||||
 | 
			
		||||
- name: enable service at boot
 | 
			
		||||
  systemd:
 | 
			
		||||
    name: prometheus.service
 | 
			
		||||
    enabled: yes
 | 
			
		||||
 | 
			
		||||
- name: Make sure services are started.
 | 
			
		||||
  systemd:
 | 
			
		||||
    name: prometheus.service
 | 
			
		||||
    state: restarted
 | 
			
		||||
  tags:
 | 
			
		||||
    - start-service
 | 
			
		||||
							
								
								
									
										71
									
								
								roles/prom_server/templates/etc/alerting.rules
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										71
									
								
								roles/prom_server/templates/etc/alerting.rules
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,71 @@
 | 
			
		||||
groups:
 | 
			
		||||
- name: basic
 | 
			
		||||
  rules:
 | 
			
		||||
  - alert: InstanceDown
 | 
			
		||||
    expr: up{job="node"} == 0
 | 
			
		||||
    for: 10m
 | 
			
		||||
    labels:
 | 
			
		||||
      severity: page
 | 
			
		||||
    annotations:
 | 
			
		||||
      description: '{{ $labels.instance }} of job {{ $labels.job }} has been down
 | 
			
		||||
        for more than 10 minutes.'
 | 
			
		||||
      summary: Instance {{ $labels.instance }} down
 | 
			
		||||
  - alert: Time not being synced
 | 
			
		||||
    expr: node_timex_sync_status{job="node"} == 0
 | 
			
		||||
    for: 5m
 | 
			
		||||
    labels:
 | 
			
		||||
      severity: page
 | 
			
		||||
    annotations:
 | 
			
		||||
      description: '{{ $labels.instance }} is not configured to sync its time with an external ntp server'
 | 
			
		||||
      summary: Instance {{ $labels.instance }} no ntp configured.
 | 
			
		||||
  - alert: clock wrong
 | 
			
		||||
    expr: node_timex_offset_seconds{job="node"} > 1
 | 
			
		||||
    for: 10m
 | 
			
		||||
    labels:
 | 
			
		||||
      severity: page
 | 
			
		||||
    annotations:
 | 
			
		||||
      description: '{{ $labels.instance }} has a clock offset > 1 second.'
 | 
			
		||||
      summary: '{{ $labels.instance }} has clock drift.'
 | 
			
		||||
  - alert: DiskWillFillIn8Hours
 | 
			
		||||
    expr: predict_linear(node_filesystem_free{job="node",mountpoint!~"/tmp|/local|/target/gpfs3"}[2h], 8 * 3600) < 0
 | 
			
		||||
    for: 2h
 | 
			
		||||
    labels:
 | 
			
		||||
      severity: page
 | 
			
		||||
    annotations:
 | 
			
		||||
      description: Instance {{ $labels.instance }} will fill up within 8 hours
 | 
			
		||||
      summary: '{{ $labels.instance }} disk full'
 | 
			
		||||
  - alert: DiskWillFillIn72Hours
 | 
			
		||||
    expr: predict_linear(node_filesystem_free{job="node",mountpoint!~"/tmp|/local|/target/gpfs3"}[6h], 72 * 3600) < 0
 | 
			
		||||
    for: 8h
 | 
			
		||||
    labels:
 | 
			
		||||
      severity: page
 | 
			
		||||
    annotations:
 | 
			
		||||
      description: Instance {{ $labels.instance }} will fill up within 72 hours
 | 
			
		||||
      summary: '{{ $labels.instance }} disk almost full'
 | 
			
		||||
  - alert: DiskFull
 | 
			
		||||
    expr: node_filesystem_free{job="node",mountpoint!~"/tmp|/net|/cvmfs|/var/lib/nfs/rpc_pipefs|/cvmfs|/misc|/run/docker/netns/.+?|/cgroup.+?", fstype!~"fuse.+?"} < 5.24288e+06
 | 
			
		||||
    for: 5m
 | 
			
		||||
    labels:
 | 
			
		||||
      severity: page
 | 
			
		||||
    annotations:
 | 
			
		||||
      description: Instance {{ $labels.instance }} has a full {{ $labels.mountpoint }}.
 | 
			
		||||
      summary: '{{ $labels.instance }} Disk full'
 | 
			
		||||
  - alert: tmpFull
 | 
			
		||||
    expr: node_filesystem_free{job="node",mountpoint="/tmp"} < 5242880
 | 
			
		||||
    for: 30m
 | 
			
		||||
    labels:
 | 
			
		||||
      severity: page
 | 
			
		||||
    annotations:
 | 
			
		||||
      description: Instance {{ $labels.instance }} Has a full /tmp
 | 
			
		||||
      summary: '{{ $labels.instance }} /tmp full'
 | 
			
		||||
  - alert: NodeRebooted
 | 
			
		||||
    expr: delta(node_boot_time[1h]) > 10
 | 
			
		||||
    for: 1m
 | 
			
		||||
    labels:
 | 
			
		||||
      severity: page
 | 
			
		||||
    annotations:
 | 
			
		||||
      description: Instance {{ $labels.instance }} has been rebooted.
 | 
			
		||||
      summary: '{{ $labels.instance }} rebooted'
 | 
			
		||||
#  - alert: TestAlert
 | 
			
		||||
#    expr: probe_success{instance="195.169.22.220:11211",job="blackbox"} == 0
 | 
			
		||||
#    for: 1m
 | 
			
		||||
							
								
								
									
										0
									
								
								roles/prom_server/templates/etc/cadvisor.json
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								roles/prom_server/templates/etc/cadvisor.json
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										55
									
								
								roles/prom_server/templates/etc/prometheus.yml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										55
									
								
								roles/prom_server/templates/etc/prometheus.yml
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,55 @@
 | 
			
		||||
# my global config
 | 
			
		||||
global:
 | 
			
		||||
  scrape_interval:     60s # Scrape targets every 60 seconds.
 | 
			
		||||
  evaluation_interval: 60s # Evaluate rules every 60 seconds.
 | 
			
		||||
  # scrape_timeout is set to the global default (10s).
 | 
			
		||||
 | 
			
		||||
  # Attach these labels to any time series or alerts when communicating with
 | 
			
		||||
  # external systems (federation, remote storage, Alertmanager).
 | 
			
		||||
  external_labels:
 | 
			
		||||
      monitor: '{{ ansible_hostname }}'
 | 
			
		||||
 | 
			
		||||
# alert
 | 
			
		||||
alerting:
 | 
			
		||||
  alertmanagers:
 | 
			
		||||
  - scheme: http
 | 
			
		||||
    static_configs:
 | 
			
		||||
    - targets:
 | 
			
		||||
      - "alertmanager.kube.hpc.rug.nl"
 | 
			
		||||
    basic_auth:
 | 
			
		||||
       username: hpc
 | 
			
		||||
       password: '{{ alertmanager_pass }}'
 | 
			
		||||
 | 
			
		||||
# Load and evaluate rules in this file every 'evaluation_interval' seconds.
 | 
			
		||||
rule_files:
 | 
			
		||||
- '/etc/prometheus/alerting.rules'
 | 
			
		||||
 | 
			
		||||
# A scrape configuration containing exactly one endpoint to scrape:
 | 
			
		||||
# Here it's Prometheus itself.
 | 
			
		||||
scrape_configs:
 | 
			
		||||
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
 | 
			
		||||
  - job_name: 'prometheus'
 | 
			
		||||
    static_configs:
 | 
			
		||||
         - targets: ['localhost:9090']
 | 
			
		||||
 | 
			
		||||
  # peregrine
 | 
			
		||||
  - job_name: 'node'
 | 
			
		||||
    scrape_interval: 120s
 | 
			
		||||
    file_sd_configs:
 | 
			
		||||
        - files:
 | 
			
		||||
          - targets.json
 | 
			
		||||
 | 
			
		||||
  # peregrine
 | 
			
		||||
  - job_name: 'ipmi'
 | 
			
		||||
    scrape_interval: 120s
 | 
			
		||||
    file_sd_configs:
 | 
			
		||||
        - files:
 | 
			
		||||
          - ipmi-targets.json
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  # Scrape the cadvisor container exporter
 | 
			
		||||
  - job_name: 'cadvisor'
 | 
			
		||||
    scrape_interval: 60s
 | 
			
		||||
    file_sd_configs:
 | 
			
		||||
        - files:
 | 
			
		||||
          - cadvisor.json
 | 
			
		||||
							
								
								
									
										10
									
								
								roles/prom_server/templates/etc/targets.json
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										10
									
								
								roles/prom_server/templates/etc/targets.json
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,10 @@
 | 
			
		||||
[
 | 
			
		||||
    {
 | 
			
		||||
        "targets": [
 | 
			
		||||
        ],
 | 
			
		||||
        "labels": {
 | 
			
		||||
            "env": "blank",
 | 
			
		||||
            "job": "node"
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
]
 | 
			
		||||
							
								
								
									
										19
									
								
								roles/prom_server/templates/prometheus.service
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										19
									
								
								roles/prom_server/templates/prometheus.service
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,19 @@
 | 
			
		||||
[Unit]
 | 
			
		||||
Description=Prometheus monitoring
 | 
			
		||||
After=docker.service
 | 
			
		||||
Requires=docker.service
 | 
			
		||||
 | 
			
		||||
[Service]
 | 
			
		||||
TimeoutStartSec=0
 | 
			
		||||
Restart=always
 | 
			
		||||
ExecStartPre=-/usr/bin/docker kill %n
 | 
			
		||||
ExecStartPre=-/usr/bin/docker rm %n
 | 
			
		||||
ExecStart=/usr/bin/docker run --name %n \
 | 
			
		||||
    --network host \
 | 
			
		||||
    -v /srv/prometheus/prometheus:/prometheus  \
 | 
			
		||||
    -v /srv/prometheus/etc/prometheus:/etc/prometheus  \
 | 
			
		||||
    prom/prometheus:v2.6.0 \
 | 
			
		||||
    --storage.tsdb.retention 365d --config.file=/etc/prometheus/prometheus.yml \
 | 
			
		||||
    --storage.tsdb.path=/prometheus --web.enable-lifecycle
 | 
			
		||||
[Install]
 | 
			
		||||
WantedBy=multi-user.target
 | 
			
		||||
							
								
								
									
										8
									
								
								roles/prom_server/vars/secrets.yml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										8
									
								
								roles/prom_server/vars/secrets.yml
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,8 @@
 | 
			
		||||
$ANSIBLE_VAULT;1.1;AES256
 | 
			
		||||
35653034666233356434653337323037616464346462626436613836626633653661613162393235
 | 
			
		||||
3731313333396465616430306530653430353730636662350a326134643635636364363566313933
 | 
			
		||||
38303164616631316265393330343566383232333337386661643534356263323137616362393662
 | 
			
		||||
3636366636613934660a366631616666366331326331623261396435656533313563666464396439
 | 
			
		||||
38663533386634323933646166306666626533623730613363396639633638393864396264313836
 | 
			
		||||
39343132653439376361353462626332336134626661656236366636623932363638656530313966
 | 
			
		||||
616665383932306236346236633636623561
 | 
			
		||||
		Reference in New Issue
	
	Block a user