![](https://img.laitimes.com/img/9ZDMuAjOiMmIsIjOiQnIsISPrdEZwZ1Rh5WNXp1bwNjW1ZUba9VZwlHdsATOfd3bkFGazxCMx8VesATMfhHLlN3XnxCMwEzX0xiRGZkRGZ0Xy9GbvNGLpZTY1EmMZVDUSFTU4VFRR9Fd4VGdsYTMfVmepNHLrJXYtJXZ0F2dvwVZnFWbp1zczV2YvJHctM3cv1Ce-cmbw5SN4YzMyE2N0IWO0EWMiVjNxYzX0QjMwADMzEzLcBTMxIDMy8CXn9Gbi9CXzV2Zh1WavwVbvNmLvR3YxUjL1M3Lc9CX6MHc0RHaiojIsJye.png)
最近公司开始接触这两个东西,加上看到了一张告警框架的区域分布图。发现还是挺有意思的,亚洲基本都喜欢搞Zabbix这一套系统,而欧美等国家用Prometheus比较多。之前尝试搞过,没太懂,现在了解了基本怎么搞。比较难的是自己去写语句来搞监控,zabbix会shell即可,这个目前理解都是一些接口查询语句,自定义也能开发,把值传递给接口即可。目前使用下来感觉就个人少量服务器告警还是尝试用一下NETDATA,我这搞了一下,服务器(2GB /1 core)带不动。
概览
- 我都是用 docker 搞的，先说说每个组件都是干啥的吧？
组件 | 作用 | 监控端(需要监控的主机) | 展示端(数据展示) | 补充说明 |
Node Exporter | 收集Host硬件和操作系统信息 | YES | NO | 主机信息 |
cAdvisor | 负责收集Host上运行的容器信息 | docker 信息采集 ||
Prometheus Server | 普罗米修斯监控主服务器 | 收集上面两个组件的数据并存储提供给Grafana来采集,随便安装到哪个机器上都行。 | ||
Grafana | 展示普罗米修斯监控界面 | 把数据可视化出来 | ||
Alertmanager | 告警发送 | 非必须 | 可在Grafana配置,比Grafana好一些 | |
Pushgateway | 自定义指标推送/告警 | 自定义需要 | NO | 自定义 |
- 注意一点就是各个组件的关系、对应端口以及配置(注意容器中的localhost不能访问容器外的信息)。
- 安装
# node-exporter: exposes host hardware/OS metrics on host port 90 (container 9100).
# The host's /proc, /sys and / are bind-mounted read-only, and node-exporter is
# told to read from those mount points via --path.* — without these flags it
# would report the CONTAINER's /proc and /sys, not the host's.
docker run -d -p 90:9100 \
  -v "/proc:/host/proc:ro" \
  -v "/sys:/host/sys:ro" \
  -v "/:/rootfs:ro" \
  -v "/etc/localtime:/etc/localtime" \
  --name=node-exporter \
  prom/node-exporter \
  --path.procfs=/host/proc \
  --path.sysfs=/host/sys \
  --path.rootfs=/rootfs
# cAdvisor: collects per-container (docker) resource metrics,
# exposed on host port 80 (container 8080).
# NOTE: the original passed both -d and --detach=true (the same flag twice);
# one is enough. Flag style unified to long-form --volume throughout.
docker run -d \
  --name=cadvisor \
  --volume=/:/rootfs:ro \
  --volume=/var/run:/var/run:rw \
  --volume=/sys:/sys:ro \
  --volume=/var/lib/docker/:/var/lib/docker:ro \
  --volume=/etc/localtime:/etc/localtime \
  --publish=80:8080 \
  google/cadvisor:latest
- prometheus 配置文件
# my global config
global:
  scrape_interval: 15s      # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s  # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            # NOTE(review): 'alertmanager' resolves by container name only on a
            # user-defined docker network; otherwise use host-IP:59093 — confirm.
            - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
  - /etc/prometheus/alert_rules.yml

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      # 监听的地址 (cAdvisor on host port 80, node-exporter on host port 90)
      - targets: ['localhost:80', 'localhost:90']
  - job_name: 'mail-base'
    static_configs:
      - targets: ['xxx.xxx.xxx.xxx:80', 'xxx.xxx.xxx.xxx:90']
  - job_name: 'mail-docker'
    static_configs:
      - targets: ['xxx.xxx.xxx.xxx:80', 'xxx.xxx.xxx.xxx:90']
- 告警配置文件
groups:
  - name: ali
    rules:
      # Alert for any instance that is unreachable for >5 minutes.
      - alert: InstanceDown
        expr: up == 0
        for: 5m
        labels:
          severity: page
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
      # Alert for any instance that has a median request latency >1s.
      - alert: APIHighRequestLatency
        expr: api_http_request_latencies_second{quantile="0.5"} > 1
        for: 10m
        annotations:
          summary: "High request latency on {{ $labels.instance }}"
          description: "{{ $labels.instance }} has a median request latency above 1s (current value: {{ $value }}s)"
# Prometheus server on host port 9090; config and alert rules are bind-mounted
# from /etc/prometheus on the host. --web.enable-lifecycle allows reloading the
# config via HTTP POST to /-/reload instead of restarting the container.
docker run --detach \
  --publish 9090:9090 \
  --volume /etc/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml \
  --volume /etc/prometheus/alert_rules.yml:/etc/prometheus/alert_rules.yml \
  --name prometheus \
  prom/prometheus \
  --config.file=/etc/prometheus/prometheus.yml \
  --web.enable-lifecycle
- 建立文件夹并授权(没有授权启动不了)
# Create Grafana's data directory on the host. The grafana/grafana container
# runs as UID 472 ("grafana"), so give that UID ownership instead of the
# world-writable chmod 777 — without correct permissions the container
# fails to start.
mkdir -p /etc/grafana
chown -R 472:472 /etc/grafana
docker run -d \
  -p 3000:3000 \
  --name=grafana \
  -v /etc/grafana:/var/lib/grafana \
  grafana/grafana
- Alertmanager 配置文件
global:
  resolve_timeout: 5m
  # NOTE(review): port 465 is implicit-TLS SMTP (smtps); smtp_require_tls
  # controls STARTTLS, which does not apply on 465 — confirm against the
  # mail provider before changing.
  smtp_smarthost: 'xxxxxx.emperinter.info:465'
  smtp_from: '[email protected]'
  smtp_auth_username: '[email protected]'
  smtp_auth_password: 'xxxxxxxxxxx^'
  smtp_require_tls: false

route:
  receiver: team-test-mails
  group_by: ['alertname']
  group_wait: 30s
  group_interval: 1m
  repeat_interval: 2m

receivers:
  - name: 'team-test-mails'
    email_configs:
      - to: '[email protected]'
        send_resolved: true
# Alertmanager: host port 59093 -> container 9093; config bind-mounted
# from /etc/prometheus/alertmanager.yml on the host.
docker run -d \
  -p 59093:9093 \
  --name Alertmanager \
  -v /etc/prometheus/alertmanager.yml:/etc/alertmanager/alertmanager.yml \
  docker.io/prom/alertmanager:latest