From 29dc390edf93c54a7567eabfcb36acfb68e652c6 Mon Sep 17 00:00:00 2001 From: paul Date: Wed, 27 Jul 2016 16:09:16 +0800 Subject: [PATCH 1/4] add alertmanager --- alertmanager/config.yml | 33 +++++++++++++++++++++++++++++++++ docker-compose.yml | 15 ++++++++++++++- prometheus/alert.rules | 7 +++++++ prometheus/prometheus.yml | 1 + 4 files changed, 55 insertions(+), 1 deletion(-) create mode 100644 alertmanager/config.yml create mode 100644 prometheus/alert.rules diff --git a/alertmanager/config.yml b/alertmanager/config.yml new file mode 100644 index 0000000..8031e7f --- /dev/null +++ b/alertmanager/config.yml @@ -0,0 +1,33 @@ +global: + # The smarthost and SMTP sender used for mail notifications. + smtp_smarthost: 'localhost:25' + smtp_from: 'alertmanager@example.org' + smtp_auth_username: 'alertmanager' + smtp_auth_password: 'password' + # The auth token for Hipchat. + hipchat_auth_token: '1234556789' + # Alternative host for Hipchat. + hipchat_url: 'https://hipchat.foobar.org/' + +# The directory from which notification templates are read. +templates: +- '/etc/alertmanager/template/*.tmpl' + +# The root route on which each incoming alert enters. +route: + group_by: [cluster] + # If an alert isn't caught by a route, send it slack. + receiver: slack_general + + # The child route trees. + routes: + # Send severity=slack alerts to slack. 
+ - match: + severity: slack + receiver: slack_general +receivers: +- name: slack_general + slack_configs: + - api_url: 'https://hooks.slack.com/services/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' + channel: '#prometheus' + send_resolved: true \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 6a75aa9..00f9c2b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -20,10 +20,12 @@ services: command: - '-config.file=/etc/prometheus/prometheus.yml' - '-storage.local.path=/prometheus' + - '-alertmanager.url=http://alertmanager:9093' expose: - 9090 links: - cadvisor:cadvisor + - alertmanager:alertmanager depends_on: - cadvisor networks: @@ -35,7 +37,18 @@ services: - 9100 networks: - back-tier - + alertmanager: + image: prom/alertmanager + ports: + - 9093:9093 + volumes: + - ./alertmanager/:/etc/alertmanager/ + networks: + - back-tier + command: + - '-config.file=/etc/alertmanager/config.yml' + - '-storage.path=/alertmanager' + cadvisor: image: google/cadvisor volumes: diff --git a/prometheus/alert.rules b/prometheus/alert.rules new file mode 100644 index 0000000..1688ee9 --- /dev/null +++ b/prometheus/alert.rules @@ -0,0 +1,7 @@ +ALERT instance_down +IF up == 0 +FOR 5s +LABELS {severity="page"} +ANNOTATIONS { + DESCRIPTION="{{$labels.instance}} of job {{$labels.job}} has been down for more than 5 seconds.", + SUMMARY="Instance {{$labels.instance}} down"} \ No newline at end of file diff --git a/prometheus/prometheus.yml b/prometheus/prometheus.yml index 4fd78c0..c537348 100644 --- a/prometheus/prometheus.yml +++ b/prometheus/prometheus.yml @@ -11,6 +11,7 @@ global: # Load and evaluate rules in this file every 'evaluation_interval' seconds. 
rule_files: + - "alert.rules" # - "first.rules" # - "second.rules" From 7d246ab608a67a28f4abf344c21a9b47b750e4e6 Mon Sep 17 00:00:00 2001 From: paul Date: Wed, 27 Jul 2016 18:27:31 +0800 Subject: [PATCH 2/4] add slack integration --- alertmanager/config.yml | 37 +++++++------------------------------ docker-compose.yml | 2 ++ prometheus/alert.rules | 16 +++++++++------- 3 files changed, 18 insertions(+), 37 deletions(-) diff --git a/alertmanager/config.yml b/alertmanager/config.yml index 8031e7f..6e9cc78 100644 --- a/alertmanager/config.yml +++ b/alertmanager/config.yml @@ -1,33 +1,10 @@ -global: - # The smarthost and SMTP sender used for mail notifications. - smtp_smarthost: 'localhost:25' - smtp_from: 'alertmanager@example.org' - smtp_auth_username: 'alertmanager' - smtp_auth_password: 'password' - # The auth token for Hipchat. - hipchat_auth_token: '1234556789' - # Alternative host for Hipchat. - hipchat_url: 'https://hipchat.foobar.org/' - -# The directory from which notification templates are read. -templates: -- '/etc/alertmanager/template/*.tmpl' - -# The root route on which each incoming alert enters. route: - group_by: [cluster] - # If an alert isn't caught by a route, send it slack. - receiver: slack_general + receiver: 'slack' - # The child route trees. - routes: - # Send severity=slack alerts to slack. 
- - match: - severity: slack - receiver: slack_general receivers: -- name: slack_general - slack_configs: - - api_url: 'https://hooks.slack.com/services/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' - channel: '#prometheus' - send_resolved: true \ No newline at end of file + - name: 'slack' + slack_configs: + - send_resolved: true + username: 'llitfkitfk' + channel: '#prometheus' + api_url: 'https://hooks.slack.com/services/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' \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 00f9c2b..baee31e 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -23,6 +23,8 @@ services: - '-alertmanager.url=http://alertmanager:9093' expose: - 9090 + ports: + - 9090:9090 links: - cadvisor:cadvisor - alertmanager:alertmanager diff --git a/prometheus/alert.rules b/prometheus/alert.rules index 1688ee9..697931b 100644 --- a/prometheus/alert.rules +++ b/prometheus/alert.rules @@ -1,7 +1,9 @@ -ALERT instance_down -IF up == 0 -FOR 5s -LABELS {severity="page"} -ANNOTATIONS { - DESCRIPTION="{{$labels.instance}} of job {{$labels.job}} has been down for more than 5 seconds.", - SUMMARY="Instance {{$labels.instance}} down"} \ No newline at end of file +ALERT service_down + IF up == 0 + +ALERT high_load + IF node_load1 > 0.5 + ANNOTATIONS { + summary = "Instance {{ $labels.instance }} under high load", + description = "{{ $labels.instance }} of job {{ $labels.job }} is under high load.", + } \ No newline at end of file From 03117a3f35bcaf4a9cce0a56a33392f2cf55bb29 Mon Sep 17 00:00:00 2001 From: paul Date: Wed, 27 Jul 2016 18:48:24 +0800 Subject: [PATCH 3/4] add alert related dashboard json file --- HighLoadDashboard.json | 259 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 259 insertions(+) create mode 100644 HighLoadDashboard.json diff --git a/HighLoadDashboard.json b/HighLoadDashboard.json new file mode 100644 index 0000000..66da47c --- /dev/null +++ b/HighLoadDashboard.json @@ -0,0 +1,259 @@ +{ + 
"__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "3.1.0" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + } + ], + "id": null, + "title": "High Load", + "tags": [], + "style": "dark", + "timezone": "browser", + "editable": true, + "hideControls": false, + "sharedCrosshair": false, + "rows": [ + { + "collapse": false, + "editable": true, + "height": 323.625, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 1, + "isNew": true, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_load1", + "hide": false, + "intervalFactor": 2, + "legendFormat": "", + "metric": "node_load1", + "refId": "A", + "step": 10 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Panel Title", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + 
"logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "title": "Row" + }, + { + "collapse": false, + "editable": true, + "height": 407.4375, + "panels": [ + { + "aliasColors": { + "ALERTS{alertname=\"high_load\",alertstate=\"firing\",instance=\"node-exporter:9100\",job=\"prometheus\"}": "#BF1B00" + }, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 3, + "isNew": true, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "ALERTS", + "intervalFactor": 1, + "metric": "ALERTS", + "refId": "A", + "step": 5 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Panel Title", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "title": "New row" + } + ], + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "templating": { + 
"list": [] + }, + "annotations": { + "list": [] + }, + "refresh": "10s", + "schemaVersion": 12, + "version": 4, + "links": [], + "gnetId": null +} \ No newline at end of file From 16fb60bfe7d22da6ceabf22a3eff72146cd317f5 Mon Sep 17 00:00:00 2001 From: paul Date: Wed, 27 Jul 2016 18:49:38 +0800 Subject: [PATCH 4/4] update alertmanager config --- alertmanager/config.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/alertmanager/config.yml b/alertmanager/config.yml index 6e9cc78..bfd485d 100644 --- a/alertmanager/config.yml +++ b/alertmanager/config.yml @@ -5,6 +5,6 @@ receivers: - name: 'slack' slack_configs: - send_resolved: true - username: 'llitfkitfk' - channel: '#prometheus' - api_url: 'https://hooks.slack.com/services/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' \ No newline at end of file + username: '' + channel: '#' + api_url: '' \ No newline at end of file