
Alertmanager email alerting configuration notes

----alertmanager.yml
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.163.com:465'   # 163 Mail SMTP server and port
  smtp_from: '[email protected]'  # sender address
  smtp_auth_username: '[email protected]' # SMTP auth username (the mailbox account)
  smtp_auth_password: 'xxxxxxxxxxx'   # mailbox authorization code, not the login password
  smtp_require_tls: false   # whether to require TLS

route:
  group_by: ['alertname']    # group alerts by these labels
  group_wait: 5s       # alerts arriving for the same group within this wait window are merged into a single notification to the receiver
  group_interval: 5s    # interval between notifications for the same group
  repeat_interval: 5m  # re-send interval: if the alert has not been resolved within this time, the notification is sent again
  receiver: 'email'    # receiver that notifications are routed to

receivers:
- name: 'email'        # receiver name; must match the receiver referenced in the route
  email_configs:
  - to: '[email protected]' # recipient address; list multiple recipients one per line
    send_resolved: true    # also send a notification when the alert is resolved
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']      
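
With this file loaded, a quick way to confirm the email route works end to end is to push a hand-crafted alert directly at Alertmanager's v2 API and check the inbox. The script below is only a minimal sketch, not part of the configuration above: it assumes Alertmanager is reachable on localhost:9093, and the alert name and labels are hypothetical test values.

----send_test_alert.py (hypothetical helper script)
# Minimal sketch: POST one test alert to Alertmanager's v2 API.
# Assumes Alertmanager listens on localhost:9093; adjust the URL for your host.
import datetime
import json
import urllib.request

ALERTMANAGER_URL = "http://localhost:9093/api/v2/alerts"

now = datetime.datetime.now(datetime.timezone.utc)
alerts = [{
    "labels": {
        "alertname": "TestEmailRoute",    # hypothetical name, used only for this test
        "severity": "warning",
        "instance": "test-host:9100",
    },
    "annotations": {
        "summary": "manual test alert to verify the email receiver",
    },
    "startsAt": now.isoformat(),
    "endsAt": (now + datetime.timedelta(minutes=5)).isoformat(),
}]

req = urllib.request.Request(
    ALERTMANAGER_URL,
    data=json.dumps(alerts).encode(),
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urllib.request.urlopen(req) as resp:
    # Alertmanager answers 200 when the alert is accepted into a group
    print("status:", resp.status)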
----prometheus.yml
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
           - xxx.xxx.xxx:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
    - /opt/prometheus-2.37.0.linux-amd64/rules/*.yml
  # - "first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"

    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.

    static_configs:
      - targets: ["xxx.xxx.xxx:9090"]

  - job_name: "node_export"
    static_configs:
      - targets: ["xxx.xxx.xxx:9100","xxx.xxx.xxx:9100","xxx.xxx.xxx:9100"]      
----rules file (any *.yml under /opt/prometheus-2.37.0.linux-amd64/rules/, matched by rule_files above)
groups:
- name: node-alert
  rules:
  - alert: NodeDown
    expr: up{job="node_export"} == 0
    for: 2m
    labels:
      severity: emergency
      instance: "{{ $labels.instance }}"
    annotations:
      summary: "node: {{ $labels.instance }} down"
      description: "{{$labels.instance}} down more than 5 minutes"
      value: "{{ $value }}"
- name: mem-used
  rules:
  - alert: NodeMemUsageHigh
    expr: (((node_memory_MemTotal_bytes - node_memory_MemFree_bytes - node_memory_Buffers_bytes - node_memory_Cached_bytes) / (node_memory_MemTotal_bytes )) * 100 ) > 50
    for: 2m
    labels:
      severity: emergency
      instance: "{{ $labels.instance }}"
    annotations:
      summary: "node: {{ $labels.instance }} mem_used > 50%"
      description: "{{$labels.instance}} down more than 2 minutes"
      value: "{{ $value }}"
- name: disk-used
  rules:
  - alert: NodeDiskUsageHigh
    expr: (((node_filesystem_size_bytes-node_filesystem_free_bytes) *100/(node_filesystem_avail_bytes +(node_filesystem_size_bytes-node_filesystem_free_bytes)))) > 30
    for: 2m
    labels:
      severity: emergency
      instance: "{{ $labels.instance }}"
    annotations:
      summary: "node: {{ $labels.instance }} disk_used > 50%"
      description: "{{$labels.instance}} down more than 2 minutes"
      value: "{{ $value }}"      
