Setting Up a Prometheus + Grafana Monitoring System with docker-compose
# Installing Docker
If you have installed Docker before, remove it first:
sudo yum remove docker docker-client docker-client-latest docker-common docker-latest docker-latest-logrotate docker-logrotate docker-engine
Install the required dependencies:
sudo yum install -y yum-utils device-mapper-persistent-data lvm2
**Download the repo file for your distribution:**
sudo yum-config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo
Replace the repository URL with the TUNA mirror:
sudo sed -i 's+download.docker.com+mirrors.tuna.tsinghua.edu.cn/docker-ce+' /etc/yum.repos.d/docker-ce.repo
Finally, refresh the cache and install:
sudo yum makecache fast
sudo yum install docker-ce
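Installation does not start the daemon. On a systemd-based CentOS host you would typically start and enable it, then confirm the client can reach it; a quick check along these lines:

```sh
# Start the Docker daemon now and enable it at boot
sudo systemctl enable --now docker
# Verify that the client can talk to the daemon
sudo docker version
```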
# Installing docker-compose
Download URL:
https://github.com/docker/compose/releases/download/v2.15.0/docker-compose-linux-x86_64
Download the binary, upload it to the server, and then run:
mv docker-compose-linux-x86_64 /usr/local/bin/docker-compose
chmod +x /usr/local/bin/docker-compose
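If the server has direct internet access, you can skip the manual upload and fetch the binary in place; a sketch using the same release URL as above (curl's -L follows GitHub's redirect), ending with a version check that should print v2.15.0:

```sh
# Download the docker-compose binary straight to its install path
sudo curl -L https://github.com/docker/compose/releases/download/v2.15.0/docker-compose-linux-x86_64 \
  -o /usr/local/bin/docker-compose
sudo chmod +x /usr/local/bin/docker-compose
# Confirm the binary runs
docker-compose version
```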
# Deploying Prometheus, Grafana, and Alertmanager
Create the Prometheus configuration file prometheus.yml with the following content:
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["alertmanager:9093"]

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "rules.yml"
  - "quote.yml"
  #- "rabbitmq.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ["localhost:9090"]
  # - job_name: 'cadvisor'
  #   static_configs:
  #     - targets: ['10.10.10.235:8088']
  #       labels:
  #         instance: cadvisor
  - job_name: 'airm-php'
    static_configs:
      - targets: ['10.10.10.13:8199']
        labels:
          instance: airm-php
  - job_name: 'airm-c++'
    static_configs:
      - targets: ['10.10.10.155:8199']
        labels:
          instance: airm-c++
  - job_name: 'get-front'
    static_configs:
      - targets: ['10.10.10.75:8199']
        labels:
          instance: get-front
  # - job_name: 'gg-website'
  #   static_configs:
  #     - targets: ['10.10.10.200:8199']
  #       labels:
  #         instance: gg-website
  - job_name: 'IDC-BMS'
    static_configs:
      - targets: ['10.10.10.16:8199']
        labels:
          instance: IDC-BMS
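It is worth validating this file before starting the stack. The prom/prometheus image ships promtool, so one option (a sketch; the ./prometheus/config path matches the volume mount used in the compose file below) is to run it in a throwaway container. Note that prometheus.yml references rules.yml and quote.yml, so run the check once those files, created below, exist:

```sh
# Validate prometheus.yml and the rule files it references
docker run --rm \
  -v "$(pwd)/prometheus/config:/config" \
  --entrypoint promtool \
  prom/prometheus:latest check config /config/prometheus.yml
```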
Next, create the two rule files, rules.yml and quote.yml. First, rules.yml:
groups:
  - name: Node_Down
    rules:
      - alert: NodeInstanceDown
        expr: up == 0
        for: 3m
        labels:
          severity: critical
          level: 1
        annotations:
          summary: "{{ $labels.instance }} is down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down."
  - name: airm_RabbitMQ_queue_messages
    interval: 5m
    rules:
      - alert: airm_RabbitMQ
        expr: rabbitmq_queue_messages_unacked{job="airm_RabbitMQ"} >= 1
        for: 3m
        labels:
          severity: critical
          level: 1
        annotations:
          summary: "{{ $labels.instance }} has a problem"
          description: "{{ $labels.instance }} of job {{ $labels.job }} is reporting a problem."
  - name: gf_RabbitMQ_queue_messages
    interval: 5m
    rules:
      - alert: gf_RabbitMQ_queue_messages
        expr: rabbitmq_queue_messages_unacked{job="gf_RabbitMQ"} != 0
        for: 10s
        labels:
          severity: critical
          level: 1
        annotations:
          summary: "{{ $labels.instance }} has a problem"
          description: "{{ $labels.instance }} of job {{ $labels.job }} is reporting a problem."
  - name: gf_RabbitMQ_published
    interval: 5m
    rules:
      - alert: gf_RabbitMQ_published
        expr: irate(rabbitmq_channel_messages_published_total{instance="gf_RabbitMQ", job="gf_RabbitMQ"}[2m]) == 0
        for: 3m
        labels:
          severity: critical
          level: 1
        annotations:
          summary: "{{ $labels.instance }} has a problem"
          description: "{{ $labels.instance }} of job {{ $labels.job }} is reporting a problem."
  - name: RabbitMQ_Down
    rules:
      - alert: Rabbitmq_down
        expr: rabbitmq_up{job='airm_RabbitMQ'} != 1
        labels:
          status: critical
          team: Rabbitmq_monitor
          level: 1
        annotations:
          description: "Instance: {{ $labels.instance }} is down!"
          value: '{{ $value }}'
          summary: "The host node is down"
Then quote.yml:
groups:
  #- name: gf-md
  #  interval: 10m # evaluation interval
  #  rules:
  #    - alert: gf_md
  #      expr: irate(source_server_receive_tick_counter{instance="gf-md_hq"}[2m]) == 0
  #      for: 10m # how long the condition must hold before the alert fires
  #      labels:
  #        severity: critical
  #        level: 2
  #      annotations:
  #        summary: "{{ $labels.instance }} has a problem"
  #        description: "{{ $labels.instance }} of job {{ $labels.job }} is reporting a problem."
  - name: airm-nano
    interval: 5m # evaluation interval
    rules:
      - alert: airm_nano
        expr: irate(source_server_receive_tick_counter{instance="airm-nano_hq", job="airm-nano_hq", source="nano_source"}[2m]) == 0
        for: 5m # how long the condition must hold before the alert fires
        labels:
          severity: critical
          level: 2
        annotations:
          summary: "{{ $labels.instance }} has a problem"
          description: "{{ $labels.instance }} of job {{ $labels.job }} is reporting a problem."
  - name: gf-md_hq_qianhai
    interval: 5m # evaluation interval
    rules:
      - alert: gf_md_hq_qianhai
        expr: irate(source_server_receive_tick_counter{instance="gf-md_hq_qianhai"}[2m]) == 0
        for: 5m
        labels:
          severity: critical
          level: 2
        annotations:
          summary: "{{ $labels.instance }} has a problem"
          description: "{{ $labels.instance }} of job {{ $labels.job }} is reporting a problem."
  - name: airm-fortex
    rules:
      - alert: airm_fortex
        expr: irate(source_server_receive_tick_counter{instance="airm-fortex_hq"}[2m]) == 0
        for: 3m
        labels:
          severity: critical
          level: 2
        annotations:
          summary: "{{ $labels.instance }} has a problem"
          description: "{{ $labels.instance }} of job {{ $labels.job }} is reporting a problem."
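promtool can also lint the rule files on their own; a sketch under the same assumptions as the earlier check:

```sh
# Validate the alerting rules in both files
docker run --rm \
  -v "$(pwd)/prometheus/config:/config" \
  --entrypoint promtool \
  prom/prometheus:latest check rules /config/rules.yml /config/quote.yml
```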
Create the alertmanager.yml file:
global:
  # Time after which an alert is declared resolved if it is no longer reported
  resolve_timeout: 5m
route:
  # Default receiver
  receiver: 'webhook'
  # How long to wait after an alert first fires, so alerts in the same group can be sent together
  group_wait: 10s
  # Interval between notifications for new alerts added to an already-firing group
  group_interval: 30m
  # Interval between repeated notifications for the same alert, to cut down duplicate messages
  repeat_interval: 2h
  #repeat_interval: 60s
  # Labels used to group alerts
  group_by: ['alertname','severity','channel','env']
  routes:
    - receiver: webhook
      match_re:
        severity: critical|warning
      continue: true
    - receiver: webhook1
      match:
        severity: critical
      continue: true
    - receiver: prometheusalert-phone # SMS alerts
      match_re: # regex match
        level: 1|2
receivers:
  - name: 'webhook'
    webhook_configs:
      # - url: http://10.10.10.250:8060/dingtalk/ops_dingding/send
      - url: http://10.10.10.250:8060/dingtalk/webhook/send
        # Also notify when an alert is resolved
        send_resolved: true
  - name: 'webhook1'
    webhook_configs:
      # - url: http://10.10.10.250:8060/dingtalk/ops_dingding/send
      - url: http://10.10.10.250:8060/dingtalk/webhook1/send
        # Also notify when an alert is resolved
        send_resolved: true
  - name: 'prometheusalert-phone'
    webhook_configs:
      - url: 'http://10.10.10.250:8081/prometheusalert?type=alydx&tpl=ali-phone&phone=xxx'
      #- url: 'http://10.10.10.250:8081/prometheusalert?type=alydx&tpl=ali-phone&phone=xxx'
        send_resolved: true
#inhibit_rules:
#  - source_match: # When this alert fires, other alerts are inhibited
#      severity: 'critical'
#    target_match: # Alerts to be inhibited
#      severity: 'High'
#    equal: ['id', 'instance']
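Alertmanager ships an analogous checker, amtool, which is bundled in the prom/alertmanager image; a sketch along the same lines as the promtool checks:

```sh
# Validate the Alertmanager routing and receiver configuration
docker run --rm \
  -v "$(pwd)/prometheus/config:/config" \
  --entrypoint amtool \
  prom/alertmanager:latest check-config /config/alertmanager.yml
```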
Finally, the docker-compose.yml file that ties everything together:
version: '2'
services:
  # Prometheus service
  prometheus:
    # Docker Hub image
    image: prom/prometheus:latest
    # Container name
    container_name: prometheus
    # Hostname inside the container
    hostname: prometheus
    # Restart the container automatically
    restart: always
    # Host-to-container port mapping
    ports:
      - '9090:9090'
    # Mount the host config directory into the container at /config
    volumes:
      - './prometheus/config:/config'
      - '/data/prometheus/data/prometheus:/prometheus/data'
      - '/etc/localtime:/etc/localtime'
    command:
      # Configuration file inside the container
      - '--config.file=/config/prometheus.yml'
      # Enable hot reload of the configuration
      - '--web.enable-lifecycle'
    networks:
      - monitor
  # Alerting component
  alertmanager:
    image: prom/alertmanager:latest
    container_name: alertmanager
    hostname: alertmanager
    restart: always
    ports:
      - '9093:9093'
    volumes:
      - './prometheus/config:/config'
      - '/data/prometheus/data/alertmanager:/alertmanager/data'
      - '/etc/localtime:/etc/localtime'
    command:
      - '--config.file=/config/alertmanager.yml'
    networks:
      - monitor
  # Visualization dashboard
  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    hostname: grafana
    restart: always
    ports:
      - '3000:3000'
    volumes:
      # Grafana configuration (including the SMTP mail settings)
      - './grafana/config/grafana.ini:/etc/grafana/grafana.ini'
      - '/data/grafana/data/grafana:/var/lib/grafana'
      - '/etc/localtime:/etc/localtime'
      - './grafana/config/ldap.toml:/etc/grafana/ldap.toml'
    networks:
      - monitor
networks:
  monitor:
    driver: bridge
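The bind mounts above point at host directories that must exist before the first start, and the Prometheus and Grafana containers run as non-root users, so the data directories usually need matching ownership. A sketch (uid 65534 is the nobody user the official Prometheus and Alertmanager images run as, and 472 is Grafana's default; double-check against your image versions):

```sh
# Create the config and data directories referenced by the compose file
mkdir -p ./prometheus/config ./grafana/config
sudo mkdir -p /data/prometheus/data/prometheus \
              /data/prometheus/data/alertmanager \
              /data/grafana/data/grafana
# Hand the data directories over to the in-container users
sudo chown -R 65534:65534 /data/prometheus/data
sudo chown -R 472:472 /data/grafana/data/grafana
```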
Start the stack with docker-compose:
docker-compose up -d
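Once the containers are up, docker-compose ps should show all three services as Up, and the web UIs should answer on the mapped ports: Prometheus on 9090, Alertmanager on 9093, and Grafana on 3000. Because the compose file starts Prometheus with --web.enable-lifecycle, later configuration edits can be applied without restarting the container:

```sh
# Check container status
docker-compose ps
# Hot-reload Prometheus after changing prometheus.yml or the rule files
curl -X POST http://localhost:9090/-/reload
```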