Update 'users.yml'

Automatic updates and security reboots.
These are meant for hosts with a public ip.
2019-01-28 15:09:22 +00:00 · 2019-01-24 10:25:00 +01:00 · 2019-01-02 13:34:52 +01:00 · 2018-12-21 12:51:01 +01:00 · 2018-12-21 11:59:40 +01:00 · 2018-10-23 11:09:45 +02:00
24 changed files with 368 additions and 107 deletions
--- a/ansible.cfg
+++ b/ansible.cfg
@@ -1,3 +1,4 @@
 [defaults]
 inventory = hosts.py
 stdout_callback = debug
+vault_password_file = .vault_pass.txt
--- a/cadvisor.yml
+++ b/cadvisor.yml
@@ -0,0 +1,5 @@
+---
+- hosts: all  # every inventory host
+  become: true  # the cadvisor role writes a unit file under /etc/systemd/system
+  roles:
+    - cadvisor
--- a/81
+++ b/81
@@ -1,81 +0,0 @@
-molgenis[01:99]
-molgenis[100:110]
-
-[molgenis1-70]
-molgenis[01:70]
-
-[no-httpd]
-molgenis02
-molgenis07
-molgenis11
-molgenis15
-molgenis23
-molgenis24
-molgenis25
-molgenis28
-molgenis30
-molgenis32
-molgenis33
-molgenis36
-molgenis37
-molgenis38
-molgenis39
-molgenis40
-molgenis42
-molgenis43
-molgenis44
-molgenis45
-molgenis46
-molgenis47
-molgenis48
-molgenis49
-molgenis54
-molgenis57
-molgenis59
-molgenis61
-molgenis64
-molgenis65
-molgenis69
-molgenis70
-
-[localhost-certfile]
-molgenis03
-molgenis06
-molgenis04
-molgenis05
-molgenis09
-molgenis12
-molgenis13
-molgenis17
-molgenis16
-molgenis19
-molgenis20
-molgenis26
-molgenis21
-molgenis41
-molgenis51
-molgenis50
-molgenis52
-molgenis53
-molgenis56
-molgenis58
-molgenis68
-molgenis18
-molgenis55
-molgenis60
-molgenis66
-molgenis67
-
-[fqdn-certfile]
-molgenis01
-molgenis10
-molgenis14
-molgenis22
-molgenis08
-molgenis31
-molgenis27
-molgenis29
-molgenis34
-molgenis35
-molgenis62
-molgenis63
--- a/molgenis_cert.yml
+++ b/molgenis_cert.yml
@@ -1,23 +0,0 @@
---
- hosts: fqdn-certfile
-  become: false
-  tasks:
-    - copy:
-        src: newcertsmolgenis/{{ ansible_hostname }}_gcc_rug_nl.crt
-        dest: /etc/pki/tls/certs/{{ ansible_hostname }}_gcc_rug_nl/{{ ansible_hostname }}_gcc_rug_nl.crt
-        backup: yes
-    - copy:
-        src: newcertsmolgenis/rsa.{{ ansible_hostname }}.gcc.rug.nl.key
-        dest: /etc/pki/tls/private/{{ ansible_hostname }}_gcc_rug_nl/{{ ansible_hostname }}_gcc_rug_nl.key
-        backup: yes
- hosts: localhost-certfile
-  become: true
-  tasks:
-    - copy:
-        src: newcertsmolgenis/{{ ansible_hostname }}_gcc_rug_nl.crt
-        dest: /etc/pki/tls/certs/localhost.crt
-        backup: yes
-    - copy:
-        src: newcertsmolgenis/rsa.{{ ansible_hostname }}.gcc.rug.nl.key
-        dest: /etc/pki/tls/private/localhost.key
-        backup: yes
--- a/node-exporter.yml
+++ b/node-exporter.yml
@@ -0,0 +1,5 @@
+---
+- hosts: all
+  become: True
+  roles:
+     - cadvisor  # NOTE(review): this playbook is node-exporter.yml yet applies the cadvisor role, exactly like cadvisor.yml — looks like a copy-paste slip; confirm the intended role
--- a/prometheus.yml
+++ b/prometheus.yml
@@ -0,0 +1,5 @@
+---
+- hosts: all  # every inventory host
+  become: true  # the prom_server role writes under /srv/prometheus and /etc/systemd/system
+  roles:
+    - prom_server
--- a/promtools/Dockerfile
+++ b/promtools/Dockerfile
@@ -1,4 +1,4 @@
-FROM golang:1.9-stretch
+FROM golang:1.11-stretch

 MAINTAINER Egon Rijpkema <e.m.a.rijpkema@rug.nl>

--- a/roles/cadvisor/tasks/main.yml
+++ b/roles/cadvisor/tasks/main.yml
@@ -0,0 +1,25 @@
+---
+- name: Install service files.
+  template:
+    src: templates/cadvisor.service
+    dest: /etc/systemd/system/cadvisor.service
+    mode: "0644"  # quoted octal; a bare 644 is read as a decimal number and yields the wrong permission bits
+    owner: root
+    group: root
+  tags:
+    - service-files
+
+- name: Reload systemd so it picks up the unit file.
+  command: systemctl daemon-reload
+
+- name: enable service at boot
+  systemd:
+    name: cadvisor
+    enabled: yes
+
+- name: Make sure the service is (re)started.
+  systemd:
+    name: cadvisor.service
+    state: restarted  # always restarts, so a changed unit file takes effect
+  tags:
+    - start-service
--- a/roles/cadvisor/templates/cadvisor.service
+++ b/roles/cadvisor/templates/cadvisor.service
@@ -0,0 +1,22 @@
+[Unit]
+Description=Prometheus container monitoring.
+After=docker.service
+Requires=docker.service
+
+[Service]
+TimeoutStartSec=0
+Restart=always
+ExecStartPre=-/usr/bin/docker kill %n
+ExecStartPre=-/usr/bin/docker rm %n
+# Runs cadvisor in a container named after this unit (%n); container port 8080 is published on host port 8181.
+ExecStart=/usr/bin/docker run --name %n \
+    --volume=/:/rootfs:ro \
+    --volume=/var/run:/var/run:rw \
+    --volume=/sys:/sys:ro \
+    --volume=/var/lib/docker/:/var/lib/docker:ro \
+    --volume=/dev/disk/:/dev/disk:ro \
+    --publish=8181:8080 \
+    google/cadvisor:latest
+
+[Install]
+WantedBy=multi-user.target
--- a/roles/docker/files/daemon.json
+++ b/roles/docker/files/daemon.json
@@ -0,0 +1,7 @@
+{
+  "log-driver": "json-file",
+  "log-opts": {
+    "max-size": "10m",
+    "max-file": "10"
+  }
+}
--- a/roles/docker/tasks/main.yml
+++ b/roles/docker/tasks/main.yml
@@ -15,10 +15,16 @@
 - name: install docker
  apt: pkg={{ item }} state=latest
  with_items:
-     - docker-engine
-     - python-docker
+    - docker-engine
+    - python-docker
  when: ansible_distribution == 'Ubuntu' and ansible_distribution_release == 'xenial'

+- name: Setup log rotation.
+  copy:
+    src: files/daemon.json  # json-file log driver with max-size 10m and max-file 10
+    dest: /etc/docker/daemon.json  # NOTE(review): nothing visible in this hunk restarts docker afterwards — confirm the new settings actually get loaded
+  tags: ['settings']
+
 - name: make sure service is started
  systemd:
    name: docker.service
--- a/roles/prom_server/tasks/main.yml
+++ b/roles/prom_server/tasks/main.yml
@@ -0,0 +1,64 @@
+---
+- include_vars: vars/secrets.yml  # vault-encrypted; presumably defines alertmanager_pass used by the prometheus.yml template — confirm
+
+- file:  # host directories that prometheus.service bind-mounts into the container
+    path: "{{ item }}"
+    state: directory
+    mode: "0777"  # NOTE(review): world-writable, kept as before; confirm the container user really needs this
+  with_items:
+    - /srv/prometheus/etc/prometheus
+    - /srv/prometheus/prometheus
+
+- name: Install prometheus.yml
+  template:
+    src: templates/etc/{{ item }}
+    dest: /srv/prometheus/etc/prometheus/{{ item }}
+    mode: "0644"  # quoted octal; a bare 644 is read as a decimal number and yields the wrong permission bits
+    owner: root
+    group: root
+  with_items:
+    - prometheus.yml
+
+- name: Install other settings files.
+  copy:  # copied verbatim, not templated, so the Go-template braces in alerting.rules are left alone
+    src: templates/etc/{{ item }}
+    dest: /srv/prometheus/etc/prometheus/{{ item }}
+    mode: "0644"
+    owner: root
+    group: root
+  with_items:
+    - alerting.rules
+    - targets.json  # NOTE(review): prometheus.yml also references ipmi-targets.json and cadvisor.json, which are not installed here
+
+- name: Install nsswitch see https://github.com/jumanjiman/docker-ssllabs-scan/blob/6d9f48c9ac4f1df3eebef3ab28e2dd44a9ba4998/scanner/etc/nsswitch.conf
+  copy:
+    src: templates/etc/nsswitch.conf
+    dest: /srv/prometheus/etc/nsswitch.conf
+    mode: "0644"
+    owner: root
+    group: root
+
+- name: Install service files.
+  template:
+    src: templates/prometheus.service
+    dest: /etc/systemd/system/prometheus.service
+    mode: "0644"
+    owner: root
+    group: root
+  tags:
+    - service-files
+
+- name: Reload systemd so it picks up the unit file.
+  command: systemctl daemon-reload
+
+- name: enable service at boot
+  systemd:
+    name: prometheus.service
+    enabled: yes
+
+- name: Make sure the service is (re)started.
+  systemd:
+    name: prometheus.service
+    state: restarted  # always restarts, so changed config and unit files take effect
+  tags:
+    - start-service
--- a/roles/prom_server/templates/etc/alerting.rules
+++ b/roles/prom_server/templates/etc/alerting.rules
@@ -0,0 +1,71 @@
+groups:  # Prometheus alerting rules; every rule below is scoped to job="node" except NodeRebooted
+- name: basic
+  rules:
+  - alert: InstanceDown  # up == 0 for 10m
+    expr: up{job="node"} == 0
+    for: 10m
+    labels:
+      severity: page
+    annotations:
+      description: '{{ $labels.instance }} of job {{ $labels.job }} has been down
+        for more than 10 minutes.'
+      summary: Instance {{ $labels.instance }} down
+  - alert: Time not being synced  # node_timex_sync_status == 0 for 5m
+    expr: node_timex_sync_status{job="node"} == 0
+    for: 5m
+    labels:
+      severity: page
+    annotations:
+      description: '{{ $labels.instance }} is not configured to sync its time with an external ntp server'
+      summary: Instance {{ $labels.instance }} no ntp configured.
+  - alert: clock wrong  # node_timex_offset_seconds above 1 for 10m
+    expr: node_timex_offset_seconds{job="node"} > 1
+    for: 10m
+    labels:
+      severity: page
+    annotations:
+      description: '{{ $labels.instance }} has a clock offset > 1 second.'
+      summary: '{{ $labels.instance }} has clock drift.'
+  - alert: DiskWillFillIn8Hours  # linear fit over the last 2h predicts free space below 0 within 8 * 3600 s
+    expr: predict_linear(node_filesystem_free{job="node",mountpoint!~"/tmp|/local|/target/gpfs3"}[2h], 8 * 3600) < 0
+    for: 2h
+    labels:
+      severity: page
+    annotations:
+      description: Instance {{ $labels.instance }} will fill up within 8 hours
+      summary: '{{ $labels.instance }} disk full'
+  - alert: DiskWillFillIn72Hours  # linear fit over the last 6h predicts free space below 0 within 72 * 3600 s
+    expr: predict_linear(node_filesystem_free{job="node",mountpoint!~"/tmp|/local|/target/gpfs3"}[6h], 72 * 3600) < 0
+    for: 8h
+    labels:
+      severity: page
+    annotations:
+      description: Instance {{ $labels.instance }} will fill up within 72 hours
+      summary: '{{ $labels.instance }} disk almost full'
+  - alert: DiskFull  # free value below 5.24288e+06 (5 * 1024 * 1024); NOTE(review): "/cvmfs" appears twice in the mountpoint regex
+    expr: node_filesystem_free{job="node",mountpoint!~"/tmp|/net|/cvmfs|/var/lib/nfs/rpc_pipefs|/cvmfs|/misc|/run/docker/netns/.+?|/cgroup.+?", fstype!~"fuse.+?"} < 5.24288e+06
+    for: 5m
+    labels:
+      severity: page
+    annotations:
+      description: Instance {{ $labels.instance }} has a full {{ $labels.mountpoint }}.
+      summary: '{{ $labels.instance }} Disk full'
+  - alert: tmpFull  # same 5242880 threshold as DiskFull, for mountpoint /tmp only, held for 30m
+    expr: node_filesystem_free{job="node",mountpoint="/tmp"} < 5242880
+    for: 30m
+    labels:
+      severity: page
+    annotations:
+      description: Instance {{ $labels.instance }} Has a full /tmp
+      summary: '{{ $labels.instance }} /tmp full'
+  - alert: NodeRebooted  # node_boot_time changed by more than 10 over the last hour (no job filter here)
+    expr: delta(node_boot_time[1h]) > 10
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      description: Instance {{ $labels.instance }} has been rebooted.
+      summary: '{{ $labels.instance }} rebooted'
+#  - alert: TestAlert
+#    expr: probe_success{instance="195.169.22.220:11211",job="blackbox"} == 0
+#    for: 1m
--- a/roles/prom_server/templates/etc/cadvisor.json
+++ b/roles/prom_server/templates/etc/cadvisor.json
--- a/roles/prom_server/templates/etc/nsswitch.conf
+++ b/roles/prom_server/templates/etc/nsswitch.conf
@@ -0,0 +1,3 @@
+# https://github.com/golang/go/blob/go1.9.1/src/net/conf.go#L194-L275
+# https://golang.org/pkg/net/
+hosts: files dns
--- a/roles/prom_server/templates/etc/prometheus.yml
+++ b/roles/prom_server/templates/etc/prometheus.yml
@@ -0,0 +1,55 @@
+# my global config
+global:
+  scrape_interval:     60s # Scrape targets every 60 seconds unless a job overrides it.
+  evaluation_interval: 60s # Evaluate rules every 60 seconds.
+  # scrape_timeout is set to the global default (10s).
+
+  # Attach these labels to any time series or alerts when communicating with
+  # external systems (federation, remote storage, Alertmanager).
+  external_labels:
+      monitor: "{{ ansible_hostname }}"  # quoted so any hostname renders as a YAML string
+
+# alert
+alerting:
+  alertmanagers:
+  - scheme: http  # NOTE(review): the basic_auth password below is sent over plain http
+    static_configs:
+    - targets:
+      - "alertmanager.kube.hpc.rug.nl"
+    basic_auth:
+       username: hpc
+       password: {{ alertmanager_pass | to_json }}  # to_json renders a quoted, escaped scalar, so special characters in the secret cannot break the YAML
+
+# Load and evaluate rules in this file every 'evaluation_interval' seconds.
+rule_files:
+- '/etc/prometheus/alerting.rules'
+
+# Scrape configurations: Prometheus itself, plus the node, ipmi and
+# cadvisor jobs whose targets come from file-based service discovery.
+scrape_configs:
+  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
+  - job_name: 'prometheus'
+    static_configs:
+         - targets: ['localhost:9090']
+
+  # peregrine
+  - job_name: 'node'
+    scrape_interval: 120s
+    file_sd_configs:
+        - files:
+          - targets.json
+
+  # peregrine
+  - job_name: 'ipmi'
+    scrape_interval: 120s
+    file_sd_configs:
+        - files:
+          - ipmi-targets.json
+
+
+  # Scrape the cadvisor container exporter
+  - job_name: 'cadvisor'
+    scrape_interval: 60s
+    file_sd_configs:
+        - files:
+          - cadvisor.json
--- a/roles/prom_server/templates/etc/targets.json
+++ b/roles/prom_server/templates/etc/targets.json
@@ -0,0 +1,10 @@
+[
+    {
+        "targets": [
+        ],
+        "labels": {
+            "env": "blank",
+            "job": "node"
+        }
+    }
+]
--- a/roles/prom_server/templates/prometheus.service
+++ b/roles/prom_server/templates/prometheus.service
@@ -0,0 +1,20 @@
+[Unit]
+Description=Prometheus monitoring
+After=docker.service
+Requires=docker.service
+
+[Service]
+TimeoutStartSec=0
+Restart=always
+ExecStartPre=-/usr/bin/docker kill %n
+ExecStartPre=-/usr/bin/docker rm %n
+# Bind mounts are host:container; nsswitch.conf is mounted so that /etc/hosts lookups work in the container.
+ExecStart=/usr/bin/docker run --name %n --network host \
+    -v /srv/prometheus/prometheus:/prometheus \
+    -v /srv/prometheus/etc/nsswitch.conf:/etc/nsswitch.conf \
+    -v /srv/prometheus/etc/prometheus:/etc/prometheus \
+    prom/prometheus:v2.6.0 \
+    --storage.tsdb.retention 365d --config.file=/etc/prometheus/prometheus.yml \
+    --storage.tsdb.path=/prometheus --web.enable-lifecycle
+[Install]
+WantedBy=multi-user.target
--- a/roles/prom_server/vars/secrets.yml
+++ b/roles/prom_server/vars/secrets.yml
@@ -0,0 +1,8 @@
+$ANSIBLE_VAULT;1.1;AES256
+35653034666233356434653337323037616464346462626436613836626633653661613162393235
+3731313333396465616430306530653430353730636662350a326134643635636364363566313933
+38303164616631316265393330343566383232333337386661643534356263323137616362393662
+3636366636613934660a366631616666366331326331623261396435656533313563666464396439
+38663533386634323933646166306666626533623730613363396639633638393864396264313836
+39343132653439376361353462626332336134626661656236366636623932363638656530313966
+616665383932306236346236633636623561
--- a/roles/stealth/files/stealth-linux.tar
+++ b/roles/stealth/files/stealth-linux.tar
--- a/roles/stealth/tasks/main.yml
+++ b/roles/stealth/tasks/main.yml
@@ -0,0 +1,16 @@
+---
+
+- user:  # create account "kees" with primary group admin
+    name: kees
+    comment: "Kees Visser"
+    group: admin
+
+- authorized_key:  # NOTE(review): this is an ssh-dss (DSA) key — newer OpenSSH releases reject these by default; confirm it is still accepted on the targets
+    user: kees
+    key: 'ssh-dss AAAAB3NzaC1kc3MAAACBALg7GbHKk2jYPNXUgW69AKKnCALjroTtwCA0bt4zde1mavYNoQK8JY/pe4BSOQtsyo3JECYzmAZwoNbq8nJCh8ORf5tKs8njEykZ0n7BVWtCT/jh9EFPTFhFK864TdFVCvwtIafAL4kEVNvJ0wrJYa1mN/ds03HWliv+3Shj6x0dAAAAFQDxlwgId3zlrXiCfk3ciAHN5b2ScwAAAIEArZ3/Hg7FECh5Fjf7lnBQZW7sjG5OLZRJIZlj2/jYnvIRUrsN2XmebwO4Q5q7g7FLWlfbg+x2Lmv1OWf/zGd3U6aAx8M+d+nTWDtWpQNvcE99HlfOs9Q4Rzxx6ZOyaZn57lCva/nCmLe0DTPVB8rvocMmqe1r3n7/KgxxKttbWRUAAACAfH2y4JPt2AcVdHnHiibpQBtxK/9m6AEjsB/g02tMXHZletMs9jF6kGynan7xJqRqvWxkGS1ClHIUdt2uK6A6pbqOf2BwcBIxAdljLRrZOyvmW9KTqduHMemYv6xQnpNGb8moWq5V5FKiATvd/LB46O1zwZejJErfj70LRE98Hv4= stealth@operator'
+    state: present
+
+- name: unpack stealth tarrball.
+  unarchive:
+    src: files/stealth-linux.tar  # archive shipped in the role's files directory
+    dest: /home/kees  # unpacked into the home of the account created above
--- a/security.yml
+++ b/security.yml
@@ -0,0 +1,16 @@
+---
+- name: Install roles needed for exposed hosts
+  hosts: all
+  become: true
+  roles:
+    - geerlingguy.repo-epel
+    - geerlingguy.security
+  tasks:
+    - cron:  # reboot 60 minutes later whenever needs-restarting -r exits non-zero
+        name: Reboot to load new kernel.
+        cron_file: reboot
+        user: root
+        weekday: 1
+        hour: 11
+        minute: 45
+        job: /bin/needs-restarting -r >/dev/null 2>&1 || /sbin/shutdown -r +60 "restarting to apply updates"
--- a/stealth.yml
+++ b/stealth.yml
@@ -0,0 +1,5 @@
+---
+- hosts: all  # every inventory host
+  become: true  # the stealth role creates a user and unpacks into /home/kees
+  roles:
+    - stealth
--- a/users.yml
+++ b/users.yml
@@ -51,6 +51,16 @@
        key: 'ssh-rsa AAAAB3NzaC1yc2EAAAABIwAAAQEArQsJ0g/a5YOHlk7xcMpHNxiN+up4syzLZfgiICECET/SCDXUN4Xh3BlSWng8hMQMD5sNSADF4AghdLKfuqXG1MMSvzGSVTcRwiZ+Hq6YCoiinpQw0qu7LOZVZeoG8f7sGwhBqe0wKeyPe6Q7nRe0CXvM+aU4XfZz18O/d3mU1S7cEiue02MgH6ff6VTJFqOtLGpL1rILJn3t58N+2CCWxJwGplkp7hRJ9TnhQqCO+PN/p/4neusjembRu5lX+AKX1mv91WYURkxfLE3CWe9V9YJVG0lLgfXDMyghqkTwf8UsMHS5FBy8oTvuC55EhX+xm2Peo1lZlzy7t5Hg2fWYFQ== h.meijering@rug.nl'
        state: present

+    - user:  # NOTE(review): roles/stealth/tasks/main.yml creates this same account and key — confirm both places are needed
+        name: kees
+        comment: "Kees Visser"
+        group: admin
+
+    - authorized_key:  # NOTE(review): ssh-dss (DSA) key — newer OpenSSH releases reject these by default; confirm it is still accepted
+        user: kees
+        key: 'ssh-dss AAAAB3NzaC1kc3MAAACBALg7GbHKk2jYPNXUgW69AKKnCALjroTtwCA0bt4zde1mavYNoQK8JY/pe4BSOQtsyo3JECYzmAZwoNbq8nJCh8ORf5tKs8njEykZ0n7BVWtCT/jh9EFPTFhFK864TdFVCvwtIafAL4kEVNvJ0wrJYa1mN/ds03HWliv+3Shj6x0dAAAAFQDxlwgId3zlrXiCfk3ciAHN5b2ScwAAAIEArZ3/Hg7FECh5Fjf7lnBQZW7sjG5OLZRJIZlj2/jYnvIRUrsN2XmebwO4Q5q7g7FLWlfbg+x2Lmv1OWf/zGd3U6aAx8M+d+nTWDtWpQNvcE99HlfOs9Q4Rzxx6ZOyaZn57lCva/nCmLe0DTPVB8rvocMmqe1r3n7/KgxxKttbWRUAAACAfH2y4JPt2AcVdHnHiibpQBtxK/9m6AEjsB/g02tMXHZletMs9jF6kGynan7xJqRqvWxkGS1ClHIUdt2uK6A6pbqOf2BwcBIxAdljLRrZOyvmW9KTqduHMemYv6xQnpNGb8moWq5V5FKiATvd/LB46O1zwZejJErfj70LRE98Hv4= stealth@operator'
+        state: present
+
    - user:
        name: alex
        comment: "Alex Pothaar"
@@ -113,3 +123,14 @@
        user: robin
        key: 'ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCXeVMbqjC0EKu8cmuxN+88l0TnzJUuRaFLufka2Mx9Adj8PtAZ4l9IP7f+O97ylbNQvci9DcC38NNe62b0ECutin3jUX9trvROYgxVMR/P89y139CSwWqBrHm29WLHdz9A0vO094HNzhp4xFVnblBUAFt3CCDIxvl59coV2bWgTykmVEoni9SSjqKgcC1hT0mIGcaDb428x9DsteJSakSNYwFbnbEbukA7Y5KQnbzaMl/h97C2FOsxiU5JZoiHgKNXCR5jkFsHzc3OEphXW1Ba4EnqsqUecpnfUr6OueFYR6a/q+AtIKVYT10lzCimXui/uf5zkntq1Kga/h3VtgmV root@robin-HP-Compaq-Elite-8300-MT'
        state: present
+
+    - user:  # create account "henkjan" with primary group admin
+        name: henkjan
+        comment: "Henk-Jan Zilverberg"
+        group: admin
+        state: present
+
+    - authorized_key:  # install the ed25519 public key below for henkjan
+        user: henkjan
+        key: 'ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIKOKgVel0GbF67zZaVR0TFo82e5XeZOP1e3Ld3gIdaER h.j.zilverberg@rug.nl'
+        state: present
Author	SHA1	Message	Date
p219755	14c9a646bf	Update 'users.yml'	2019-01-28 15:09:22 +00:00
Egon Rijpkema	1872a4edcf	Automatic updates and security reboots. These are meant for hosts with a public ip.	2019-01-24 10:25:00 +01:00
Egon Rijpkema	82231aa8ba	Added somewhat generic nod-exporter and cadvisor playbooks.	2019-01-02 13:34:52 +01:00
Egon Rijpkema	7fc312e523	Added nsswitch so that /etc/hosts file will work.	2018-12-21 12:51:01 +01:00
Egon Rijpkema	3bd13d018a	Added a generic prometheus server role.	2018-12-21 11:59:40 +01:00
Egon Rijpkema	d68ec10d2c	Setup log rotation for docker.	2018-10-23 11:09:45 +02:00
Egon Rijpkema	8be6056f96	Updated golang version	2018-10-16 10:46:23 +02:00
Egon Rijpkema	0622a319da	Accidental capital.	2018-10-10 13:34:48 +02:00
Egon Rijpkema	253c438348	Added kees	2018-08-13 14:04:09 +02:00
Egon Rijpkema	678882be7b	added playbook for stealth	2018-08-13 14:03:56 +02:00