Setting Up a Prometheus + Grafana Monitoring System with docker-compose
# Installing Docker
If you have installed Docker before, remove it first:
sudo yum remove docker docker-client docker-client-latest docker-common docker-latest docker-latest-logrotate docker-logrotate docker-engine
Install the required dependencies:
sudo yum install -y yum-utils device-mapper-persistent-data lvm2
**Download the repo file for your distribution:**
sudo yum-config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo
Replace the repository URL with the TUNA mirror:
sudo sed -i 's+download.docker.com+mirrors.tuna.tsinghua.edu.cn/docker-ce+' /etc/yum.repos.d/docker-ce.repo
Finally, refresh the cache and install:
sudo yum makecache fast
sudo yum install docker-ce
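Installation does not start the daemon. On a systemd-based CentOS host you would typically start and enable it, then confirm the client can reach it; a quick check along these lines:

```sh
# Start the Docker daemon now and enable it at boot
sudo systemctl enable --now docker
# Verify that the client can talk to the daemon
sudo docker version
```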
# Installing docker-compose
Download URL:
https://github.com/docker/compose/releases/download/v2.15.0/docker-compose-linux-x86_64
Download the binary, upload it to the server, and then run:
mv docker-compose-linux-x86_64 /usr/local/bin/docker-compose
chmod +x /usr/local/bin/docker-compose
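If the server has direct internet access, you can skip the manual upload and fetch the binary in place; a sketch using the same release URL as above (curl's -L follows GitHub's redirect), ending with a version check that should print v2.15.0:

```sh
# Download the docker-compose binary straight to its install path
sudo curl -L https://github.com/docker/compose/releases/download/v2.15.0/docker-compose-linux-x86_64 \
  -o /usr/local/bin/docker-compose
sudo chmod +x /usr/local/bin/docker-compose
# Confirm the binary runs
docker-compose version
```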
# Deploying Prometheus, Grafana, and Alertmanager
Create the Prometheus configuration file prometheus.yml with the following content:
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["alertmanager:9093"]

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "rules.yml"
  - "quote.yml"
  #- "rabbitmq.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ["localhost:9090"]
  # - job_name: 'cadvisor'
  #   static_configs:
  #     - targets: ['10.10.10.235:8088']
  #       labels:
  #         instance: cadvisor
  - job_name: 'airm-php'
    static_configs:
      - targets: ['10.10.10.13:8199']
        labels:
          instance: airm-php
  - job_name: 'airm-c++'
    static_configs:
      - targets: ['10.10.10.155:8199']
        labels:
          instance: airm-c++
  - job_name: 'get-front'
    static_configs:
      - targets: ['10.10.10.75:8199']
        labels:
          instance: get-front
  # - job_name: 'gg-website'
  #   static_configs:
  #     - targets: ['10.10.10.200:8199']
  #       labels:
  #         instance: gg-website
  - job_name: 'IDC-BMS'
    static_configs:
      - targets: ['10.10.10.16:8199']
        labels:
          instance: IDC-BMS
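It is worth validating this file before starting the stack. The prom/prometheus image ships promtool, so one option (a sketch; the ./prometheus/config path matches the volume mount used in the compose file below) is to run it in a throwaway container. Note that prometheus.yml references rules.yml and quote.yml, so run the check once those files, created below, exist:

```sh
# Validate prometheus.yml and the rule files it references
docker run --rm \
  -v "$(pwd)/prometheus/config:/config" \
  --entrypoint promtool \
  prom/prometheus:latest check config /config/prometheus.yml
```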
Next, create the two rule files, rules.yml and quote.yml. First, rules.yml:
groups:
  - name: Node_Down
    rules:
      - alert: NodeInstanceDown
        expr: up == 0
        for: 3m
        labels:
          severity: critical
          level: 1
        annotations:
          summary: "{{ $labels.instance }} is down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down."
  - name: airm_RabbitMQ_queue_messages
    interval: 5m
    rules:
      - alert: airm_RabbitMQ
        expr: rabbitmq_queue_messages_unacked{job="airm_RabbitMQ"} >= 1
        for: 3m
        labels:
          severity: critical
          level: 1
        annotations:
          summary: "{{ $labels.instance }} has a problem"
          description: "{{ $labels.instance }} of job {{ $labels.job }} is reporting a problem."
  - name: gf_RabbitMQ_queue_messages
    interval: 5m
    rules:
      - alert: gf_RabbitMQ_queue_messages
        expr: rabbitmq_queue_messages_unacked{job="gf_RabbitMQ"} != 0
        for: 10s
        labels:
          severity: critical
          level: 1
        annotations:
          summary: "{{ $labels.instance }} has a problem"
          description: "{{ $labels.instance }} of job {{ $labels.job }} is reporting a problem."
  - name: gf_RabbitMQ_published
    interval: 5m
    rules:
      - alert: gf_RabbitMQ_published
        expr: irate(rabbitmq_channel_messages_published_total{instance="gf_RabbitMQ", job="gf_RabbitMQ"}[2m]) == 0
        for: 3m
        labels:
          severity: critical
          level: 1
        annotations:
          summary: "{{ $labels.instance }} has a problem"
          description: "{{ $labels.instance }} of job {{ $labels.job }} is reporting a problem."
  - name: RabbitMQ_Down
    rules:
      - alert: Rabbitmq_down
        expr: rabbitmq_up{job='airm_RabbitMQ'} != 1
        labels:
          status: critical
          team: Rabbitmq_monitor
          level: 1
        annotations:
          description: "Instance: {{ $labels.instance }} is down!"
          value: '{{ $value }}'
          summary: "The host node is down"
Then quote.yml:
groups:
  #- name: gf-md
  #  interval: 10m # evaluation interval
  #  rules:
  #    - alert: gf_md
  #      expr: irate(source_server_receive_tick_counter{instance="gf-md_hq"}[2m]) == 0
  #      for: 10m # how long the condition must hold before the alert fires
  #      labels:
  #        severity: critical
  #        level: 2
  #      annotations:
  #        summary: "{{ $labels.instance }} has a problem"
  #        description: "{{ $labels.instance }} of job {{ $labels.job }} is reporting a problem."
  - name: airm-nano
    interval: 5m # evaluation interval
    rules:
      - alert: airm_nano
        expr: irate(source_server_receive_tick_counter{instance="airm-nano_hq", job="airm-nano_hq", source="nano_source"}[2m]) == 0
        for: 5m # how long the condition must hold before the alert fires
        labels:
          severity: critical
          level: 2
        annotations:
          summary: "{{ $labels.instance }} has a problem"
          description: "{{ $labels.instance }} of job {{ $labels.job }} is reporting a problem."
  - name: gf-md_hq_qianhai
    interval: 5m # evaluation interval
    rules:
      - alert: gf_md_hq_qianhai
        expr: irate(source_server_receive_tick_counter{instance="gf-md_hq_qianhai"}[2m]) == 0
        for: 5m
        labels:
          severity: critical
          level: 2
        annotations:
          summary: "{{ $labels.instance }} has a problem"
          description: "{{ $labels.instance }} of job {{ $labels.job }} is reporting a problem."
  - name: airm-fortex
    rules:
      - alert: airm_fortex
        expr: irate(source_server_receive_tick_counter{instance="airm-fortex_hq"}[2m]) == 0
        for: 3m
        labels:
          severity: critical
          level: 2
        annotations:
          summary: "{{ $labels.instance }} has a problem"
          description: "{{ $labels.instance }} of job {{ $labels.job }} is reporting a problem."
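promtool can also lint the rule files on their own; a sketch under the same assumptions as the earlier check:

```sh
# Validate the alerting rules in both files
docker run --rm \
  -v "$(pwd)/prometheus/config:/config" \
  --entrypoint promtool \
  prom/prometheus:latest check rules /config/rules.yml /config/quote.yml
```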
Create the alertmanager.yml file:
global:
  # Time after which an alert is declared resolved if it is no longer reported
  resolve_timeout: 5m
route:
  # Default receiver
  receiver: 'webhook'
  # How long to wait after an alert first fires, so alerts in the same group can be sent together
  group_wait: 10s
  # Interval between notifications for new alerts added to an already-firing group
  group_interval: 30m
  # Interval between repeated notifications for the same alert, to cut down duplicate messages
  repeat_interval: 2h
  #repeat_interval: 60s
  # Labels used to group alerts
  group_by: ['alertname','severity','channel','env']
  routes:
    - receiver: webhook
      match_re:
        severity: critical|warning
      continue: true
    - receiver: webhook1
      match:
        severity: critical
      continue: true
    - receiver: prometheusalert-phone # SMS alerts
      match_re: # regex match
        level: 1|2
receivers:
  - name: 'webhook'
    webhook_configs:
      # - url: http://10.10.10.250:8060/dingtalk/ops_dingding/send
      - url: http://10.10.10.250:8060/dingtalk/webhook/send
        # Also notify when an alert is resolved
        send_resolved: true
  - name: 'webhook1'
    webhook_configs:
      # - url: http://10.10.10.250:8060/dingtalk/ops_dingding/send
      - url: http://10.10.10.250:8060/dingtalk/webhook1/send
        # Also notify when an alert is resolved
        send_resolved: true
  - name: 'prometheusalert-phone'
    webhook_configs:
      - url: 'http://10.10.10.250:8081/prometheusalert?type=alydx&tpl=ali-phone&phone=xxx'
      #- url: 'http://10.10.10.250:8081/prometheusalert?type=alydx&tpl=ali-phone&phone=xxx'
        send_resolved: true
#inhibit_rules:
#  - source_match: # When this alert fires, other alerts are inhibited
#      severity: 'critical'
#    target_match: # Alerts to be inhibited
#      severity: 'High'
#    equal: ['id', 'instance']
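Alertmanager ships an analogous checker, amtool, which is bundled in the prom/alertmanager image; a sketch along the same lines as the promtool checks:

```sh
# Validate the Alertmanager routing and receiver configuration
docker run --rm \
  -v "$(pwd)/prometheus/config:/config" \
  --entrypoint amtool \
  prom/alertmanager:latest check-config /config/alertmanager.yml
```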
Finally, the docker-compose.yml file that ties everything together:
version: '2'
services:
  # Prometheus service
  prometheus:
    # Docker Hub image
    image: prom/prometheus:latest
    # Container name
    container_name: prometheus
    # Hostname inside the container
    hostname: prometheus
    # Restart the container automatically
    restart: always
    # Host-to-container port mapping
    ports:
      - '9090:9090'
    # Mount the host config directory into the container at /config
    volumes:
      - './prometheus/config:/config'
      - '/data/prometheus/data/prometheus:/prometheus/data'
      - '/etc/localtime:/etc/localtime'
    command:
      # Configuration file inside the container
      - '--config.file=/config/prometheus.yml'
      # Enable hot reload of the configuration
      - '--web.enable-lifecycle'
    networks:
      - monitor
  # Alerting component
  alertmanager:
    image: prom/alertmanager:latest
    container_name: alertmanager
    hostname: alertmanager
    restart: always
    ports:
      - '9093:9093'
    volumes:
      - './prometheus/config:/config'
      - '/data/prometheus/data/alertmanager:/alertmanager/data'
      - '/etc/localtime:/etc/localtime'
    command:
      - '--config.file=/config/alertmanager.yml'
    networks:
      - monitor
  # Visualization dashboard
  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    hostname: grafana
    restart: always
    ports:
      - '3000:3000'
    volumes:
      # Grafana configuration (including the SMTP mail settings)
      - './grafana/config/grafana.ini:/etc/grafana/grafana.ini'
      - '/data/grafana/data/grafana:/var/lib/grafana'
      - '/etc/localtime:/etc/localtime'
      - './grafana/config/ldap.toml:/etc/grafana/ldap.toml'
    networks:
      - monitor
networks:
  monitor:
    driver: bridge
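The bind mounts above point at host directories that must exist before the first start, and the Prometheus and Grafana containers run as non-root users, so the data directories usually need matching ownership. A sketch (uid 65534 is the nobody user the official Prometheus and Alertmanager images run as, and 472 is Grafana's default; double-check against your image versions):

```sh
# Create the config and data directories referenced by the compose file
mkdir -p ./prometheus/config ./grafana/config
sudo mkdir -p /data/prometheus/data/prometheus \
              /data/prometheus/data/alertmanager \
              /data/grafana/data/grafana
# Hand the data directories over to the in-container users
sudo chown -R 65534:65534 /data/prometheus/data
sudo chown -R 472:472 /data/grafana/data/grafana
```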
Start the stack with docker-compose:
docker-compose up -d
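Once the containers are up, docker-compose ps should show all three services as Up, and the web UIs should answer on the mapped ports: Prometheus on 9090, Alertmanager on 9093, and Grafana on 3000. Because the compose file starts Prometheus with --web.enable-lifecycle, later configuration edits can be applied without restarting the container:

```sh
# Check container status
docker-compose ps
# Hot-reload Prometheus after changing prometheus.yml or the rule files
curl -X POST http://localhost:9090/-/reload
```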