10 Commits

Author SHA1 Message Date
14c9a646bf Update 'users.yml' 2019-01-28 15:09:22 +00:00
1872a4edcf Automatic updates and security reboots.
These are meant for hosts with a public ip.
2019-01-24 10:25:00 +01:00
82231aa8ba Added somewhat generic nod-exporter and cadvisor playbooks. 2019-01-02 13:34:52 +01:00
7fc312e523 Added nsswitch so that /etc/hosts file will work. 2018-12-21 12:51:01 +01:00
3bd13d018a Added a generic prometheus server role. 2018-12-21 11:59:40 +01:00
d68ec10d2c Setup log rotation for docker. 2018-10-23 11:09:45 +02:00
8be6056f96 Updated golang version 2018-10-16 10:46:23 +02:00
0622a319da Accidental capital. 2018-10-10 13:34:48 +02:00
253c438348 Added kees 2018-08-13 14:04:09 +02:00
678882be7b added playbook for stealth 2018-08-13 14:03:56 +02:00
22 changed files with 368 additions and 3 deletions

View File

@ -1,3 +1,4 @@
[defaults]
inventory = hosts.py
stdout_callback = debug
vault_password_file = .vault_pass.txt

5
cadvisor.yml Normal file
View File

@ -0,0 +1,5 @@
---
- hosts: all
become: True
roles:
- cadvisor

5
node-exporter.yml Normal file
View File

@ -0,0 +1,5 @@
---
- hosts: all
become: True
roles:
- cadvisor

5
prometheus.yml Normal file
View File

@ -0,0 +1,5 @@
---
- hosts: all
become: True
roles:
- prom_server

View File

@ -1,4 +1,4 @@
FROM golang:1.9-stretch
FROM golang:1.11-stretch
MAINTAINER Egon Rijpkema <e.m.a.rijpkema@rug.nl>

View File

@ -0,0 +1,25 @@
---
- name: Install service files.
template:
src: templates/cadvisor.service
dest: /etc/systemd/system/cadvisor.service
mode: 644
owner: root
group: root
tags:
- service-files
- name: install service files
command: systemctl daemon-reload
- name: enable service at boot
systemd:
name: cadvisor
enabled: yes
- name: make sure servcies are started.
systemd:
name: cadvisor.service
state: restarted
tags:
- start-service

View File

@ -0,0 +1,22 @@
[Unit]
Description=Prometheus container monitoring.
After=docker.service
Requires=docker.service
[Service]
TimeoutStartSec=0
Restart=always
ExecStartPre=-/usr/bin/docker kill %n
ExecStartPre=-/usr/bin/docker rm %n
ExecStart=/usr/bin/docker run --name %n \
--volume=/:/rootfs:ro \
--volume=/var/run:/var/run:rw \
--volume=/sys:/sys:ro \
--volume=/var/lib/docker/:/var/lib/docker:ro \
--volume=/dev/disk/:/dev/disk:ro \
--publish=8181:8080 \
google/cadvisor:latest
[Install]
WantedBy=multi-user.target

View File

@ -0,0 +1,7 @@
{
"log-driver": "json-file",
"log-opts": {
"max-size": "10m",
"max-file": "10"
}
}

View File

@ -19,6 +19,12 @@
- python-docker
when: ansible_distribution == 'Ubuntu' and ansible_distribution_release == 'xenial'
- name: Setup log rotation.
copy:
src: files/daemon.json
dest: /etc/docker/daemon.json
tags: ['settings']
- name: make sure service is started
systemd:
name: docker.service

View File

@ -0,0 +1,64 @@
---
- include_vars: vars/secrets.yml
- file:
path: "{{ item }}"
state: directory
mode: 0777
with_items:
- /srv/prometheus/etc/prometheus
- /srv/prometheus/prometheus
- name: Install prometheus.yml
template:
src: templates/etc/{{ item }}
dest: /srv/prometheus/etc/prometheus/{{ item }}
mode: 644
owner: root
group: root
with_items:
- prometheus.yml
- name: Install other settings files.
copy:
src: templates/etc/{{ item }}
dest: /srv/prometheus/etc/prometheus/{{ item }}
mode: 644
owner: root
group: root
with_items:
- alerting.rules
- targets.json
- name: Install nsswitch see https://github.com/jumanjiman/docker-ssllabs-scan/blob/6d9f48c9ac4f1df3eebef3ab28e2dd44a9ba4998/scanner/etc/nsswitch.conf
copy:
src: templates/etc/nsswitch.conf
dest: /srv/prometheus/etc/nsswitch.conf
mode: 644
owner: root
group: root
- name: Install service files.
template:
src: templates/prometheus.service
dest: /etc/systemd/system/prometheus.service
mode: 644
owner: root
group: root
tags:
- service-files
- name: install service files
command: systemctl daemon-reload
- name: enable service at boot
systemd:
name: prometheus.service
enabled: yes
- name: make sure servcies are started.
systemd:
name: prometheus.service
state: restarted
tags:
- start-service

View File

@ -0,0 +1,71 @@
groups:
- name: basic
rules:
- alert: InstanceDown
expr: up{job="node"} == 0
for: 10m
labels:
severity: page
annotations:
description: '{{ $labels.instance }} of job {{ $labels.job }} has been down
for more than 10 minutes.'
summary: Instance {{ $labels.instance }} down
- alert: Time not being synced
expr: node_timex_sync_status{job="node"} == 0
for: 5m
labels:
severity: page
annotations:
description: '{{ $labels.instance }} is not configured to sync its time with an external ntp server'
summary: Instance {{ $labels.instance }} no ntp configured.
- alert: clock wrong
expr: node_timex_offset_seconds{job="node"} > 1
for: 10m
labels:
severity: page
annotations:
description: '{{ $labels.instance }} has a clock offset > 1 second.'
summary: '{{ $labels.instance }} has clock drift.'
- alert: DiskWillFillIn8Hours
expr: predict_linear(node_filesystem_free{job="node",mountpoint!~"/tmp|/local|/target/gpfs3"}[2h], 8 * 3600) < 0
for: 2h
labels:
severity: page
annotations:
description: Instance {{ $labels.instance }} will fill up within 8 hours
summary: '{{ $labels.instance }} disk full'
- alert: DiskWillFillIn72Hours
expr: predict_linear(node_filesystem_free{job="node",mountpoint!~"/tmp|/local|/target/gpfs3"}[6h], 72 * 3600) < 0
for: 8h
labels:
severity: page
annotations:
description: Instance {{ $labels.instance }} will fill up within 72 hours
summary: '{{ $labels.instance }} disk almost full'
- alert: DiskFull
expr: node_filesystem_free{job="node",mountpoint!~"/tmp|/net|/cvmfs|/var/lib/nfs/rpc_pipefs|/cvmfs|/misc|/run/docker/netns/.+?|/cgroup.+?", fstype!~"fuse.+?"} < 5.24288e+06
for: 5m
labels:
severity: page
annotations:
description: Instance {{ $labels.instance }} has a full {{ $labels.mountpoint }}.
summary: '{{ $labels.instance }} Disk full'
- alert: tmpFull
expr: node_filesystem_free{job="node",mountpoint="/tmp"} < 5242880
for: 30m
labels:
severity: page
annotations:
description: Instance {{ $labels.instance }} Has a full /tmp
summary: '{{ $labels.instance }} /tmp full'
- alert: NodeRebooted
expr: delta(node_boot_time[1h]) > 10
for: 1m
labels:
severity: page
annotations:
description: Instance {{ $labels.instance }} has been rebooted.
summary: '{{ $labels.instance }} rebooted'
# - alert: TestAlert
# expr: probe_success{instance="195.169.22.220:11211",job="blackbox"} == 0
# for: 1m

View File

@ -0,0 +1,3 @@
# https://github.com/golang/go/blob/go1.9.1/src/net/conf.go#L194-L275
# https://golang.org/pkg/net/
hosts: files dns

View File

@ -0,0 +1,55 @@
# my global config
global:
scrape_interval: 60s # By default, scrape targets every 15 seconds.
evaluation_interval: 60s # By default, scrape targets every 15 seconds.
# scrape_timeout is set to the global default (10s).
# Attach these labels to any time series or alerts when communicating with
# external systems (federation, remote storage, Alertmanager).
external_labels:
monitor: {{ ansible_hostname }}
# alert
alerting:
alertmanagers:
- scheme: http
static_configs:
- targets:
- "alertmanager.kube.hpc.rug.nl"
basic_auth:
username: hpc
password: {{ alertmanager_pass }}
# Load and evaluate rules in this file every 'evaluation_interval' seconds.
rule_files:
- '/etc/prometheus/alerting.rules'
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
# The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
# peregrine
- job_name: 'node'
scrape_interval: 120s
file_sd_configs:
- files:
- targets.json
# peregrine
- job_name: 'ipmi'
scrape_interval: 120s
file_sd_configs:
- files:
- ipmi-targets.json
# Scrape the cadvisor container exporter
- job_name: 'cadvisor'
scrape_interval: 60s
file_sd_configs:
- files:
- cadvisor.json

View File

@ -0,0 +1,10 @@
[
{
"targets": [
],
"labels": {
"env": "blank",
"job": "node"
}
}
]

View File

@ -0,0 +1,20 @@
[Unit]
Description=Prometheus monitoring
After=docker.service
Requires=docker.service
[Service]
TimeoutStartSec=0
Restart=always
ExecStartPre=-/usr/bin/docker kill %n
ExecStartPre=-/usr/bin/docker rm %n
ExecStart=/usr/bin/docker run --name %n \
--network host \
-v /srv/prometheus/prometheus:/prometheus \
-v /srv/prometheus/etc/nsswitch.conf/etc/nsswitch.conf \
-v /srv/prometheus/etc/prometheus:/etc/prometheus \
prom/prometheus:v2.6.0 \
--storage.tsdb.retention 365d --config.file=/etc/prometheus/prometheus.yml \
--storage.tsdb.path=/prometheus --web.enable-lifecycle
[Install]
WantedBy=multi-user.target

View File

@ -0,0 +1,8 @@
$ANSIBLE_VAULT;1.1;AES256
35653034666233356434653337323037616464346462626436613836626633653661613162393235
3731313333396465616430306530653430353730636662350a326134643635636364363566313933
38303164616631316265393330343566383232333337386661643534356263323137616362393662
3636366636613934660a366631616666366331326331623261396435656533313563666464396439
38663533386634323933646166306666626533623730613363396639633638393864396264313836
39343132653439376361353462626332336134626661656236366636623932363638656530313966
616665383932306236346236633636623561

Binary file not shown.

View File

@ -0,0 +1,16 @@
---
- user:
name: kees
comment: "Kees Visser"
group: admin
- authorized_key:
user: kees
key: 'ssh-dss AAAAB3NzaC1kc3MAAACBALg7GbHKk2jYPNXUgW69AKKnCALjroTtwCA0bt4zde1mavYNoQK8JY/pe4BSOQtsyo3JECYzmAZwoNbq8nJCh8ORf5tKs8njEykZ0n7BVWtCT/jh9EFPTFhFK864TdFVCvwtIafAL4kEVNvJ0wrJYa1mN/ds03HWliv+3Shj6x0dAAAAFQDxlwgId3zlrXiCfk3ciAHN5b2ScwAAAIEArZ3/Hg7FECh5Fjf7lnBQZW7sjG5OLZRJIZlj2/jYnvIRUrsN2XmebwO4Q5q7g7FLWlfbg+x2Lmv1OWf/zGd3U6aAx8M+d+nTWDtWpQNvcE99HlfOs9Q4Rzxx6ZOyaZn57lCva/nCmLe0DTPVB8rvocMmqe1r3n7/KgxxKttbWRUAAACAfH2y4JPt2AcVdHnHiibpQBtxK/9m6AEjsB/g02tMXHZletMs9jF6kGynan7xJqRqvWxkGS1ClHIUdt2uK6A6pbqOf2BwcBIxAdljLRrZOyvmW9KTqduHMemYv6xQnpNGb8moWq5V5FKiATvd/LB46O1zwZejJErfj70LRE98Hv4= stealth@operator'
state: present
- name: unpack stealth tarrball.
unarchive:
src: files/stealth-linux.tar
dest: /home/kees

16
security.yml Normal file
View File

@ -0,0 +1,16 @@
---
- name: Install roles needed for exposed hosts
hosts: all
become: true
roles:
- geerlingguy.repo-epel
- geerlingguy.security
tasks:
- cron:
name: Reboot to load new kernel.
weekday: 1
minute: 45
hour: 11
user: root
job: /bin/needs-restarting -r >/dev/null 2>&1 || /sbin/shutdown -r +60 "restarting to apply updates"
cron_file: reboot

5
stealth.yml Normal file
View File

@ -0,0 +1,5 @@
---
- hosts: all
become: True
roles:
- stealth

View File

@ -51,6 +51,16 @@
key: 'ssh-rsa AAAAB3NzaC1yc2EAAAABIwAAAQEArQsJ0g/a5YOHlk7xcMpHNxiN+up4syzLZfgiICECET/SCDXUN4Xh3BlSWng8hMQMD5sNSADF4AghdLKfuqXG1MMSvzGSVTcRwiZ+Hq6YCoiinpQw0qu7LOZVZeoG8f7sGwhBqe0wKeyPe6Q7nRe0CXvM+aU4XfZz18O/d3mU1S7cEiue02MgH6ff6VTJFqOtLGpL1rILJn3t58N+2CCWxJwGplkp7hRJ9TnhQqCO+PN/p/4neusjembRu5lX+AKX1mv91WYURkxfLE3CWe9V9YJVG0lLgfXDMyghqkTwf8UsMHS5FBy8oTvuC55EhX+xm2Peo1lZlzy7t5Hg2fWYFQ== h.meijering@rug.nl'
state: present
- user:
name: kees
comment: "Kees Visser"
group: admin
- authorized_key:
user: kees
key: 'ssh-dss AAAAB3NzaC1kc3MAAACBALg7GbHKk2jYPNXUgW69AKKnCALjroTtwCA0bt4zde1mavYNoQK8JY/pe4BSOQtsyo3JECYzmAZwoNbq8nJCh8ORf5tKs8njEykZ0n7BVWtCT/jh9EFPTFhFK864TdFVCvwtIafAL4kEVNvJ0wrJYa1mN/ds03HWliv+3Shj6x0dAAAAFQDxlwgId3zlrXiCfk3ciAHN5b2ScwAAAIEArZ3/Hg7FECh5Fjf7lnBQZW7sjG5OLZRJIZlj2/jYnvIRUrsN2XmebwO4Q5q7g7FLWlfbg+x2Lmv1OWf/zGd3U6aAx8M+d+nTWDtWpQNvcE99HlfOs9Q4Rzxx6ZOyaZn57lCva/nCmLe0DTPVB8rvocMmqe1r3n7/KgxxKttbWRUAAACAfH2y4JPt2AcVdHnHiibpQBtxK/9m6AEjsB/g02tMXHZletMs9jF6kGynan7xJqRqvWxkGS1ClHIUdt2uK6A6pbqOf2BwcBIxAdljLRrZOyvmW9KTqduHMemYv6xQnpNGb8moWq5V5FKiATvd/LB46O1zwZejJErfj70LRE98Hv4= stealth@operator'
state: present
- user:
name: alex
comment: "Alex Pothaar"
@ -113,3 +123,14 @@
user: robin
key: 'ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCXeVMbqjC0EKu8cmuxN+88l0TnzJUuRaFLufka2Mx9Adj8PtAZ4l9IP7f+O97ylbNQvci9DcC38NNe62b0ECutin3jUX9trvROYgxVMR/P89y139CSwWqBrHm29WLHdz9A0vO094HNzhp4xFVnblBUAFt3CCDIxvl59coV2bWgTykmVEoni9SSjqKgcC1hT0mIGcaDb428x9DsteJSakSNYwFbnbEbukA7Y5KQnbzaMl/h97C2FOsxiU5JZoiHgKNXCR5jkFsHzc3OEphXW1Ba4EnqsqUecpnfUr6OueFYR6a/q+AtIKVYT10lzCimXui/uf5zkntq1Kga/h3VtgmV root@robin-HP-Compaq-Elite-8300-MT'
state: present
- user:
name: henkjan
comment: "Henk-Jan Zilverberg"
group: admin
state: present
- authorized_key:
user: henkjan
key: 'ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIKOKgVel0GbF67zZaVR0TFo82e5XeZOP1e3Ld3gIdaER h.j.zilverberg@rug.nl'
state: present