----alertmanager.yml
# Alertmanager-wide defaults, including the SMTP account used for mail.
global:
  # Declare an alert resolved when it has not been re-sent for this long.
  resolve_timeout: 5m
  # SMTP server of the 163 mail service.
  smtp_smarthost: 'smtp.163.com:465'
  # Mailbox the notifications are sent from.
  smtp_from: '[email protected]'
  # Mailbox account used for SMTP authentication.
  smtp_auth_username: '[email protected]'
  # Mailbox authorization code, NOT the account login password.
  # NOTE(review): keep the real value out of version control.
  smtp_auth_password: 'xxxxxxxxxxx'
  # Whether TLS is required for the SMTP connection.
  # NOTE(review): port 465 is usually implicit-TLS SMTP; confirm the
  # connection is still encrypted with this setting.
  smtp_require_tls: false
# Root route: every alert is grouped and delivered according to these settings.
route:
  # Alerts sharing these label values are batched into one group.
  group_by:
    - alertname
  # Alerts arriving for a new group within this wait are merged into a single
  # notification to the receiver.
  group_wait: 5s
  # Interval between notifications for the same group.
  group_interval: 5s
  # If an alert is still not fixed after this long, the notification is sent
  # again.
  repeat_interval: 5m
  # Receiver that notifications are delivered to.
  receiver: 'email'
# Notification targets that routes can refer to by name.
receivers:
  # The name must match the `receiver` referenced by the route.
  - name: 'email'
    email_configs:
      # Mailbox the notification is sent to; add one `- to:` entry per
      # additional recipient.
      - to: '[email protected]'
        # Also notify when an alert has been resolved.
        send_resolved: true
# Inhibition: while an alert matching `source_match` is firing, notifications
# for alerts matching `target_match` are muted, provided both alerts carry the
# same value for every label listed under `equal`.
# NOTE(review): the node alert rules accompanying this configuration label
# their alerts `severity: emergency`, which neither matcher below selects;
# confirm whether this rule is meant to cover them.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal:
      - alertname
      - dev
      - instance
----prometheus.yml
# Global settings for this Prometheus server.
global:
  # Scrape targets every 15 seconds (the default is every 1 minute).
  scrape_interval: 15s
  # Evaluate rules every 15 seconds (the default is every 1 minute).
  evaluation_interval: 15s
  # scrape_timeout is left at the global default (10s).
# Alertmanager instances that firing alerts are sent to.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - "xxx.xxx.xxx:9093"
# Rule files are loaded once and then evaluated periodically according to the
# global 'evaluation_interval'.
rule_files:
  - "/opt/prometheus-2.37.0.linux-amd64/rules/*.yml"
  # - "first_rules.yml"
  # - "second_rules.yml"
# Scrape jobs. Each job name is attached as a `job=<job_name>` label to every
# time series scraped through that job.
scrape_configs:
  # Prometheus scraping itself.
  # metrics_path defaults to '/metrics' and scheme defaults to 'http'.
  - job_name: "prometheus"
    static_configs:
      - targets:
          - "xxx.xxx.xxx:9090"

  # Hosts scraped on port 9100 under the "node_export" job.
  - job_name: "node_export"
    static_configs:
      - targets:
          - "xxx.xxx.xxx:9100"
          - "xxx.xxx.xxx:9100"
          - "xxx.xxx.xxx:9100"
# Alerting rules for targets scraped by the "node_export" job.
groups:
# Target availability: fires when a node_export target cannot be scraped.
- name: node-alert
  rules:
  # NOTE(review): "WODN" looks like a typo for "DOWN". The name is left
  # unchanged because silences or routes may match on this alertname; rename
  # it once those have been checked.
  - alert: node status is WODN
    # `up` is 0 when the last scrape of the target failed.
    expr: up{job="node_export"} == 0
    # The condition must hold for 2 minutes before the alert fires.
    for: 2m
    labels:
      severity: emergency
      instance: "{{ $labels.instance }}"
    annotations:
      summary: "node: {{ $labels.instance }} down"
      # Duration corrected to agree with `for: 2m` (previously "5 minutes").
      description: "{{$labels.instance}} down more than 2 minutes"
      value: "{{ $value }}"
# Memory pressure: fires when used memory exceeds 50% of total memory.
- name: mem-used
  rules:
  - alert: 'node mem_used > 50%'
    # Used memory = MemTotal - MemFree - Buffers - Cached, as a percentage of
    # MemTotal. The folded scalar yields the same single-line expression.
    expr: >-
      (((node_memory_MemTotal_bytes - node_memory_MemFree_bytes
      - node_memory_Buffers_bytes - node_memory_Cached_bytes)
      / (node_memory_MemTotal_bytes )) * 100 ) > 50
    # The condition must hold for 2 minutes before the alert fires.
    for: 2m
    labels:
      severity: emergency
      instance: "{{ $labels.instance }}"
    annotations:
      summary: "node: {{ $labels.instance }} mem_used > 50%"
      # Previously a copy-pasted "down more than 2 minutes" message.
      description: "{{$labels.instance}} memory usage above 50% for more than 2 minutes"
      value: "{{ $value }}"
# Disk usage: fires when a filesystem's used percentage exceeds the threshold
# in the expression below.
- name: disk-used
  rules:
  # NOTE(review): the alert name says 50% but the expression fires above 30%.
  # The name is left unchanged because silences or routes may match on it;
  # decide which threshold is intended and align the name and the expression.
  - alert: 'node disk_used > 50%'
    # used / (avail + used) * 100, where used = size - free.
    # The folded scalar yields the same single-line expression.
    expr: >-
      (((node_filesystem_size_bytes-node_filesystem_free_bytes)
      *100/(node_filesystem_avail_bytes
      +(node_filesystem_size_bytes-node_filesystem_free_bytes)))) > 30
    # The condition must hold for 2 minutes before the alert fires.
    for: 2m
    labels:
      severity: emergency
      instance: "{{ $labels.instance }}"
    annotations:
      # Texts now state the threshold the expression actually uses (30%);
      # previously the summary said 50% and the description said "down".
      summary: "node: {{ $labels.instance }} disk_used > 30%"
      description: "{{$labels.instance}} disk usage above 30% for more than 2 minutes"
      value: "{{ $value }}"