1
0
forked from HPC/HPCplaybooks

10 Commits

Author SHA1 Message Date
14c9a646bf Update 'users.yml' 2019-01-28 15:09:22 +00:00
1872a4edcf Automatic updates and security reboots.
These are meant for hosts with a public ip.
2019-01-24 10:25:00 +01:00
82231aa8ba Added somewhat generic nod-exporter and cadvisor playbooks. 2019-01-02 13:34:52 +01:00
7fc312e523 Added nsswitch so that /etc/hosts file will work. 2018-12-21 12:51:01 +01:00
3bd13d018a Added a generic prometheus server role. 2018-12-21 11:59:40 +01:00
d68ec10d2c Setup log rotation for docker. 2018-10-23 11:09:45 +02:00
8be6056f96 Updated golang version 2018-10-16 10:46:23 +02:00
0622a319da Accidental capital. 2018-10-10 13:34:48 +02:00
253c438348 Added kees 2018-08-13 14:04:09 +02:00
678882be7b added playbook for stealth 2018-08-13 14:03:56 +02:00
24 changed files with 368 additions and 107 deletions

View File

@ -1,3 +1,4 @@
# Ansible defaults for runs started from this directory.
[defaults]
# Inventory source. NOTE(review): hosts.py is not shown here; presumably a dynamic inventory script — confirm.
inventory = hosts.py
# Print task results through the "debug" stdout callback.
stdout_callback = debug
# File the vault password is read from. NOTE(review): make sure this file is kept out of version control.
vault_password_file = .vault_pass.txt

5
cadvisor.yml Normal file
View File

@ -0,0 +1,5 @@
---
# Apply the cadvisor role to every inventory host, with privilege escalation.
- hosts: all
  become: true
  roles:
    - cadvisor

View File

@ -1,81 +0,0 @@
molgenis[01:99]
molgenis[100:110]
[molgenis1-70]
molgenis[01:70]
[no-httpd]
molgenis02
molgenis07
molgenis11
molgenis15
molgenis23
molgenis24
molgenis25
molgenis28
molgenis30
molgenis32
molgenis33
molgenis36
molgenis37
molgenis38
molgenis39
molgenis40
molgenis42
molgenis43
molgenis44
molgenis45
molgenis46
molgenis47
molgenis48
molgenis49
molgenis54
molgenis57
molgenis59
molgenis61
molgenis64
molgenis65
molgenis69
molgenis70
[localhost-certfile]
molgenis03
molgenis06
molgenis04
molgenis05
molgenis09
molgenis12
molgenis13
molgenis17
molgenis16
molgenis19
molgenis20
molgenis26
molgenis21
molgenis41
molgenis51
molgenis50
molgenis52
molgenis53
molgenis56
molgenis58
molgenis68
molgenis18
molgenis55
molgenis60
molgenis66
molgenis67
[fqdn-certfile]
molgenis01
molgenis10
molgenis14
molgenis22
molgenis08
molgenis31
molgenis27
molgenis29
molgenis34
molgenis35
molgenis62
molgenis63

View File

@ -1,23 +0,0 @@
---
# Play 1: hosts in "fqdn-certfile" receive the certificate and key under
# per-host file names derived from ansible_hostname.
# NOTE(review): this play runs with become disabled although it writes below
# /etc/pki/tls — confirm the connecting user may write there.
- hosts: fqdn-certfile
  become: false
  tasks:
    - copy:
        backup: true
        src: "newcertsmolgenis/{{ ansible_hostname }}_gcc_rug_nl.crt"
        dest: "/etc/pki/tls/certs/{{ ansible_hostname }}_gcc_rug_nl/{{ ansible_hostname }}_gcc_rug_nl.crt"
    - copy:
        backup: true
        src: "newcertsmolgenis/rsa.{{ ansible_hostname }}.gcc.rug.nl.key"
        dest: "/etc/pki/tls/private/{{ ansible_hostname }}_gcc_rug_nl/{{ ansible_hostname }}_gcc_rug_nl.key"

# Play 2: hosts in "localhost-certfile" receive the same source files, but
# installed as localhost.crt / localhost.key.
- hosts: localhost-certfile
  become: true
  tasks:
    - copy:
        backup: true
        src: "newcertsmolgenis/{{ ansible_hostname }}_gcc_rug_nl.crt"
        dest: /etc/pki/tls/certs/localhost.crt
    - copy:
        backup: true
        src: "newcertsmolgenis/rsa.{{ ansible_hostname }}.gcc.rug.nl.key"
        dest: /etc/pki/tls/private/localhost.key

5
node-exporter.yml Normal file
View File

@ -0,0 +1,5 @@
---
# Playbook named node-exporter.yml; applies one role to every inventory host.
# NOTE(review): the role listed below is "cadvisor", identical to cadvisor.yml.
# This looks like a copy-paste slip — confirm the name of the node exporter
# role and use it here.
- hosts: all
  become: true
  roles:
    - cadvisor

5
prometheus.yml Normal file
View File

@ -0,0 +1,5 @@
---
# Apply the prom_server role to every inventory host, with privilege escalation.
- hosts: all
  become: true
  roles:
    - prom_server

View File

@ -1,4 +1,4 @@
FROM golang:1.9-stretch
FROM golang:1.11-stretch
MAINTAINER Egon Rijpkema <e.m.a.rijpkema@rug.nl>

View File

@ -0,0 +1,25 @@
---
# Install the cAdvisor systemd unit, make systemd aware of it, enable it at
# boot and (re)start it.
- name: Install service files.
  template:
    src: templates/cadvisor.service
    dest: /etc/systemd/system/cadvisor.service
    # Must be a quoted octal string: a bare 644 is read as the decimal integer
    # 644, which does not correspond to rw-r--r--.
    mode: '0644'
    owner: root
    group: root
  tags:
    - service-files

# Uses the systemd module instead of shelling out to "systemctl daemon-reload".
- name: Reload systemd so it picks up the unit file.
  systemd:
    daemon_reload: true

- name: Enable service at boot.
  systemd:
    name: cadvisor.service
    enabled: true

# state "restarted" restarts the unit on every run, so a changed unit file
# always takes effect.
- name: Make sure services are started.
  systemd:
    name: cadvisor.service
    state: restarted
  tags:
    - start-service

View File

@ -0,0 +1,22 @@
# systemd unit that runs cAdvisor as a Docker container.
[Unit]
Description=Prometheus container monitoring.
After=docker.service
Requires=docker.service
[Service]
TimeoutStartSec=0
Restart=always
# The leading "-" makes systemd ignore a failing command; %n is the unit name,
# so these two lines clear out any container left over under that name.
ExecStartPre=-/usr/bin/docker kill %n
ExecStartPre=-/usr/bin/docker rm %n
# Host paths are mounted into the container (/var/run read-write, the others
# read-only) and container port 8080 is published on host port 8181.
# NOTE(review): the image tag is the unpinned "latest" — consider pinning a version.
ExecStart=/usr/bin/docker run --name %n \
--volume=/:/rootfs:ro \
--volume=/var/run:/var/run:rw \
--volume=/sys:/sys:ro \
--volume=/var/lib/docker/:/var/lib/docker:ro \
--volume=/dev/disk/:/dev/disk:ro \
--publish=8181:8080 \
google/cadvisor:latest
[Install]
WantedBy=multi-user.target

View File

@ -0,0 +1,7 @@
{
"log-driver": "json-file",
"log-opts": {
"max-size": "10m",
"max-file": "10"
}
}

View File

@ -15,10 +15,16 @@
# Install Docker and its Python bindings, only on Ubuntu xenial.
# Module arguments are written as native YAML and the package list is handed
# to apt in one call; each package is listed once.
- name: install docker
  apt:
    name:
      - docker-engine
      - python-docker
    state: latest
  when: ansible_distribution == 'Ubuntu' and ansible_distribution_release == 'xenial'
# Ship files/daemon.json to the host as the Docker daemon configuration.
# NOTE(review): no restart/reload of docker is visible in this excerpt after
# the file changes — confirm the new settings are picked up.
- name: Setup log rotation.
  copy:
    dest: /etc/docker/daemon.json
    src: files/daemon.json
  tags:
    - settings
- name: make sure service is started
systemd:
name: docker.service

View File

@ -0,0 +1,64 @@
---
# Deploy a dockerised Prometheus server: load variables, create the host
# directories, install configuration files and the systemd unit, then
# enable and (re)start the service.
- name: Load variables from vars/secrets.yml.
  include_vars: vars/secrets.yml

- name: Create the Prometheus configuration and data directories.
  file:
    path: "{{ item }}"
    state: directory
    # Same permissions as before, now quoted so the YAML parser cannot re-type
    # the value. NOTE(review): world-writable — presumably so the container's
    # user can write its data; confirm and tighten if possible.
    mode: '0777'
  with_items:
    - /srv/prometheus/etc/prometheus
    - /srv/prometheus/prometheus

# File modes below are quoted octal strings: a bare 644 is read as the decimal
# integer 644, which does not correspond to rw-r--r--.
- name: Install prometheus.yml
  template:
    src: templates/etc/{{ item }}
    dest: /srv/prometheus/etc/prometheus/{{ item }}
    mode: '0644'
    owner: root
    group: root
  with_items:
    - prometheus.yml

# Installed with copy, not template, so the files are shipped verbatim.
# NOTE(review): prometheus.yml also references ipmi-targets.json and
# cadvisor.json, which are not installed here — confirm where they come from.
- name: Install other settings files.
  copy:
    src: templates/etc/{{ item }}
    dest: /srv/prometheus/etc/prometheus/{{ item }}
    mode: '0644'
    owner: root
    group: root
  with_items:
    - alerting.rules
    - targets.json

# See https://github.com/jumanjiman/docker-ssllabs-scan/blob/6d9f48c9ac4f1df3eebef3ab28e2dd44a9ba4998/scanner/etc/nsswitch.conf
- name: Install nsswitch.conf.
  copy:
    src: templates/etc/nsswitch.conf
    dest: /srv/prometheus/etc/nsswitch.conf
    mode: '0644'
    owner: root
    group: root

- name: Install service files.
  template:
    src: templates/prometheus.service
    dest: /etc/systemd/system/prometheus.service
    mode: '0644'
    owner: root
    group: root
  tags:
    - service-files

# Uses the systemd module instead of shelling out to "systemctl daemon-reload".
- name: Reload systemd so it picks up the unit file.
  systemd:
    daemon_reload: true

- name: Enable service at boot.
  systemd:
    name: prometheus.service
    enabled: true

# state "restarted" restarts the unit on every run, so configuration changes
# always take effect.
- name: Make sure services are started.
  systemd:
    name: prometheus.service
    state: restarted
  tags:
    - start-service

View File

@ -0,0 +1,71 @@
# Basic host-level alerting rules; every alert is labelled severity "page".
groups:
  - name: basic
    rules:
      # Target of the "node" job has not been scrapeable for 10 minutes.
      - alert: InstanceDown
        expr: 'up{job="node"} == 0'
        for: 10m
        labels:
          severity: page
        annotations:
          summary: 'Instance {{ $labels.instance }} down'
          description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 10 minutes.'

      # Time sync status has been 0 for 5 minutes.
      - alert: Time not being synced
        expr: 'node_timex_sync_status{job="node"} == 0'
        for: 5m
        labels:
          severity: page
        annotations:
          summary: 'Instance {{ $labels.instance }} no ntp configured.'
          description: '{{ $labels.instance }} is not configured to sync its time with an external ntp server'

      # Clock offset has been above 1 for 10 minutes.
      - alert: clock wrong
        expr: 'node_timex_offset_seconds{job="node"} > 1'
        for: 10m
        labels:
          severity: page
        annotations:
          summary: '{{ $labels.instance }} has clock drift.'
          description: '{{ $labels.instance }} has a clock offset > 1 second.'

      # Linear extrapolation over the last 2h predicts no free space in 8 hours.
      - alert: DiskWillFillIn8Hours
        expr: 'predict_linear(node_filesystem_free{job="node",mountpoint!~"/tmp|/local|/target/gpfs3"}[2h], 8 * 3600) < 0'
        for: 2h
        labels:
          severity: page
        annotations:
          summary: '{{ $labels.instance }} disk full'
          description: 'Instance {{ $labels.instance }} will fill up within 8 hours'

      # Linear extrapolation over the last 6h predicts no free space in 72 hours.
      - alert: DiskWillFillIn72Hours
        expr: 'predict_linear(node_filesystem_free{job="node",mountpoint!~"/tmp|/local|/target/gpfs3"}[6h], 72 * 3600) < 0'
        for: 8h
        labels:
          severity: page
        annotations:
          summary: '{{ $labels.instance }} disk almost full'
          description: 'Instance {{ $labels.instance }} will fill up within 72 hours'

      # Free space below 5.24288e+06 (= 5 * 1024 * 1024) on any mount point not
      # excluded by the matchers.
      # NOTE(review): "/cvmfs" appears twice in the mountpoint regex; the
      # duplicate is harmless but redundant.
      - alert: DiskFull
        expr: 'node_filesystem_free{job="node",mountpoint!~"/tmp|/net|/cvmfs|/var/lib/nfs/rpc_pipefs|/cvmfs|/misc|/run/docker/netns/.+?|/cgroup.+?", fstype!~"fuse.+?"} < 5.24288e+06'
        for: 5m
        labels:
          severity: page
        annotations:
          summary: '{{ $labels.instance }} Disk full'
          description: 'Instance {{ $labels.instance }} has a full {{ $labels.mountpoint }}.'

      # Same threshold as DiskFull, for /tmp only, with a longer hold time.
      - alert: tmpFull
        expr: 'node_filesystem_free{job="node",mountpoint="/tmp"} < 5242880'
        for: 30m
        labels:
          severity: page
        annotations:
          summary: '{{ $labels.instance }} /tmp full'
          description: 'Instance {{ $labels.instance }} Has a full /tmp'

      # Boot time changed by more than 10 within the last hour.
      - alert: NodeRebooted
        expr: 'delta(node_boot_time[1h]) > 10'
        for: 1m
        labels:
          severity: page
        annotations:
          summary: '{{ $labels.instance }} rebooted'
          description: 'Instance {{ $labels.instance }} has been rebooted.'

      # Disabled test alert, kept for reference.
      # - alert: TestAlert
      #   expr: probe_success{instance="195.169.22.220:11211",job="blackbox"} == 0
      #   for: 1m

View File

@ -0,0 +1,3 @@
# Name-service switch configuration: host names are looked up in local files
# first (i.e. /etc/hosts), then via DNS.
# Background on how Go's resolver reads this file:
# https://github.com/golang/go/blob/go1.9.1/src/net/conf.go#L194-L275
# https://golang.org/pkg/net/
hosts: files dns

View File

@ -0,0 +1,55 @@
# Prometheus server configuration, rendered by the Ansible template module.
global:
  scrape_interval: 60s  # Scrape targets every 60 seconds unless a job overrides it.
  evaluation_interval: 60s  # Evaluate rules every 60 seconds.
  # scrape_timeout is set to the global default (10s).

  # Attach these labels to any time series or alerts when communicating with
  # external systems (federation, remote storage, Alertmanager).
  external_labels:
    monitor: "{{ ansible_hostname }}"

# Where alerts are sent.
alerting:
  alertmanagers:
    - scheme: http
      static_configs:
        - targets:
            - "alertmanager.kube.hpc.rug.nl"
      basic_auth:
        username: hpc
        # Rendered through to_json so the result is always a correctly quoted
        # and escaped scalar, whatever characters the password contains.
        password: {{ alertmanager_pass | to_json }}

# Load and evaluate rules in this file every 'evaluation_interval' seconds.
rule_files:
  - '/etc/prometheus/alerting.rules'

# Scrape configurations: Prometheus itself plus three file-discovered jobs.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # peregrine
  - job_name: 'node'
    scrape_interval: 120s
    file_sd_configs:
      - files:
          - targets.json

  # peregrine
  - job_name: 'ipmi'
    scrape_interval: 120s
    file_sd_configs:
      - files:
          - ipmi-targets.json

  # Scrape the cadvisor container exporter
  - job_name: 'cadvisor'
    scrape_interval: 60s
    file_sd_configs:
      - files:
          - cadvisor.json

View File

@ -0,0 +1,10 @@
[
{
"targets": [
],
"labels": {
"env": "blank",
"job": "node"
}
}
]

View File

@ -0,0 +1,20 @@
# systemd unit that runs the Prometheus server as a Docker container.
[Unit]
Description=Prometheus monitoring
After=docker.service
Requires=docker.service

[Service]
TimeoutStartSec=0
Restart=always
# The leading "-" makes systemd ignore a failing command; %n is the unit name,
# so these two lines clear out any container left over under that name.
ExecStartPre=-/usr/bin/docker kill %n
ExecStartPre=-/usr/bin/docker rm %n
# Bind mounts: data directory, nsswitch.conf and the configuration directory.
# Each -v argument is host-path:container-path; the nsswitch.conf mount
# previously lacked the ":" separator, so the file never reached the container.
ExecStart=/usr/bin/docker run --name %n \
  --network host \
  -v /srv/prometheus/prometheus:/prometheus \
  -v /srv/prometheus/etc/nsswitch.conf:/etc/nsswitch.conf \
  -v /srv/prometheus/etc/prometheus:/etc/prometheus \
  prom/prometheus:v2.6.0 \
  --storage.tsdb.retention 365d --config.file=/etc/prometheus/prometheus.yml \
  --storage.tsdb.path=/prometheus --web.enable-lifecycle

[Install]
WantedBy=multi-user.target

View File

@ -0,0 +1,8 @@
$ANSIBLE_VAULT;1.1;AES256
35653034666233356434653337323037616464346462626436613836626633653661613162393235
3731313333396465616430306530653430353730636662350a326134643635636364363566313933
38303164616631316265393330343566383232333337386661643534356263323137616362393662
3636366636613934660a366631616666366331326331623261396435656533313563666464396439
38663533386634323933646166306666626533623730613363396639633638393864396264313836
39343132653439376361353462626332336134626661656236366636623932363638656530313966
616665383932306236346236633636623561

Binary file not shown.

View File

@ -0,0 +1,16 @@
---
# Create the kees account, authorize its public key and unpack the stealth
# tarball into its home directory.
- name: Create user kees.
  user:
    name: kees
    comment: "Kees Visser"
    group: admin

# NOTE(review): this is an ssh-dss (DSA) key; recent OpenSSH releases do not
# accept DSA keys by default — confirm the key is still usable on the targets.
- name: Authorize the public key for kees.
  authorized_key:
    user: kees
    key: 'ssh-dss AAAAB3NzaC1kc3MAAACBALg7GbHKk2jYPNXUgW69AKKnCALjroTtwCA0bt4zde1mavYNoQK8JY/pe4BSOQtsyo3JECYzmAZwoNbq8nJCh8ORf5tKs8njEykZ0n7BVWtCT/jh9EFPTFhFK864TdFVCvwtIafAL4kEVNvJ0wrJYa1mN/ds03HWliv+3Shj6x0dAAAAFQDxlwgId3zlrXiCfk3ciAHN5b2ScwAAAIEArZ3/Hg7FECh5Fjf7lnBQZW7sjG5OLZRJIZlj2/jYnvIRUrsN2XmebwO4Q5q7g7FLWlfbg+x2Lmv1OWf/zGd3U6aAx8M+d+nTWDtWpQNvcE99HlfOs9Q4Rzxx6ZOyaZn57lCva/nCmLe0DTPVB8rvocMmqe1r3n7/KgxxKttbWRUAAACAfH2y4JPt2AcVdHnHiibpQBtxK/9m6AEjsB/g02tMXHZletMs9jF6kGynan7xJqRqvWxkGS1ClHIUdt2uK6A6pbqOf2BwcBIxAdljLRrZOyvmW9KTqduHMemYv6xQnpNGb8moWq5V5FKiATvd/LB46O1zwZejJErfj70LRE98Hv4= stealth@operator'
    state: present

- name: Unpack stealth tarball.
  unarchive:
    src: files/stealth-linux.tar
    dest: /home/kees

16
security.yml Normal file
View File

@ -0,0 +1,16 @@
---
# Roles and a reboot cron entry for exposed hosts.
- name: Install roles needed for exposed hosts
  hosts: all
  become: true
  roles:
    - geerlingguy.repo-epel
    - geerlingguy.security
  tasks:
    # Cron entry in the "reboot" cron file, run as root on weekday 1 at 11:45:
    # when "needs-restarting -r" exits non-zero, a reboot is scheduled 60
    # minutes later with a broadcast message.
    - cron:
        name: Reboot to load new kernel.
        cron_file: reboot
        user: root
        weekday: 1
        hour: 11
        minute: 45
        job: '/bin/needs-restarting -r >/dev/null 2>&1 || /sbin/shutdown -r +60 "restarting to apply updates"'

5
stealth.yml Normal file
View File

@ -0,0 +1,5 @@
---
# Apply the stealth role to every inventory host, with privilege escalation.
- hosts: all
  become: true
  roles:
    - stealth

View File

@ -51,6 +51,16 @@
key: 'ssh-rsa AAAAB3NzaC1yc2EAAAABIwAAAQEArQsJ0g/a5YOHlk7xcMpHNxiN+up4syzLZfgiICECET/SCDXUN4Xh3BlSWng8hMQMD5sNSADF4AghdLKfuqXG1MMSvzGSVTcRwiZ+Hq6YCoiinpQw0qu7LOZVZeoG8f7sGwhBqe0wKeyPe6Q7nRe0CXvM+aU4XfZz18O/d3mU1S7cEiue02MgH6ff6VTJFqOtLGpL1rILJn3t58N+2CCWxJwGplkp7hRJ9TnhQqCO+PN/p/4neusjembRu5lX+AKX1mv91WYURkxfLE3CWe9V9YJVG0lLgfXDMyghqkTwf8UsMHS5FBy8oTvuC55EhX+xm2Peo1lZlzy7t5Hg2fWYFQ== h.meijering@rug.nl'
state: present
# Account for Kees Visser (primary group admin) and his authorized public key.
- user:
    group: admin
    name: kees
    comment: 'Kees Visser'
- authorized_key:
    state: present
    user: kees
    key: "ssh-dss AAAAB3NzaC1kc3MAAACBALg7GbHKk2jYPNXUgW69AKKnCALjroTtwCA0bt4zde1mavYNoQK8JY/pe4BSOQtsyo3JECYzmAZwoNbq8nJCh8ORf5tKs8njEykZ0n7BVWtCT/jh9EFPTFhFK864TdFVCvwtIafAL4kEVNvJ0wrJYa1mN/ds03HWliv+3Shj6x0dAAAAFQDxlwgId3zlrXiCfk3ciAHN5b2ScwAAAIEArZ3/Hg7FECh5Fjf7lnBQZW7sjG5OLZRJIZlj2/jYnvIRUrsN2XmebwO4Q5q7g7FLWlfbg+x2Lmv1OWf/zGd3U6aAx8M+d+nTWDtWpQNvcE99HlfOs9Q4Rzxx6ZOyaZn57lCva/nCmLe0DTPVB8rvocMmqe1r3n7/KgxxKttbWRUAAACAfH2y4JPt2AcVdHnHiibpQBtxK/9m6AEjsB/g02tMXHZletMs9jF6kGynan7xJqRqvWxkGS1ClHIUdt2uK6A6pbqOf2BwcBIxAdljLRrZOyvmW9KTqduHMemYv6xQnpNGb8moWq5V5FKiATvd/LB46O1zwZejJErfj70LRE98Hv4= stealth@operator"
- user:
name: alex
comment: "Alex Pothaar"
@ -113,3 +123,14 @@
user: robin
key: 'ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCXeVMbqjC0EKu8cmuxN+88l0TnzJUuRaFLufka2Mx9Adj8PtAZ4l9IP7f+O97ylbNQvci9DcC38NNe62b0ECutin3jUX9trvROYgxVMR/P89y139CSwWqBrHm29WLHdz9A0vO094HNzhp4xFVnblBUAFt3CCDIxvl59coV2bWgTykmVEoni9SSjqKgcC1hT0mIGcaDb428x9DsteJSakSNYwFbnbEbukA7Y5KQnbzaMl/h97C2FOsxiU5JZoiHgKNXCR5jkFsHzc3OEphXW1Ba4EnqsqUecpnfUr6OueFYR6a/q+AtIKVYT10lzCimXui/uf5zkntq1Kga/h3VtgmV root@robin-HP-Compaq-Elite-8300-MT'
state: present
# Account for Henk-Jan Zilverberg (primary group admin) and his authorized public key.
- user:
    state: present
    group: admin
    name: henkjan
    comment: 'Henk-Jan Zilverberg'
- authorized_key:
    state: present
    user: henkjan
    key: "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIKOKgVel0GbF67zZaVR0TFo82e5XeZOP1e3Ld3gIdaER h.j.zilverberg@rug.nl"
state: present