Prometheus (part 3): monitoring alerts

1: Configure the Prometheus configuration file

vim /path/to/prometheus.yml
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).


# Alertmanager configuration: where Prometheus sends firing alerts
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["localhost:9093"]
          # - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
# Location of the alerting rule files
rule_files:
  - /rules_path/to/*.rules
  # - "first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
# Configure the services to scrape
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"

    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.

    static_configs:
      - targets: ["localhost:9090"]

  # Host-level (server) metrics exported by node_exporter
  - job_name: "node_exporter"
    static_configs:
      - targets: ["localhost:9100"]

2: Edit the rule file and define the alerting rules you need

vim /rules_path/to/*.rules
groups:
- name: hostStatsAlert # rule group name
  rules:
  # Rule 1: CPU usage. Threshold is 0.3 (30%) for testing only; 0.85 (85%) is the normal value.
  - alert: hostCpuUsageAlert # alertname
    # Per-instance average CPU busy ratio over the last 5 minutes (PromQL)
    expr: sum(avg without (cpu)(irate(node_cpu_seconds_total{mode!='idle'}[5m]))) by (instance) > 0.3
    for: 1m
    labels: # custom labels, used later as routing/grouping criteria
      severity: page
      temes: '1' # label values must be strings, so quote the number
    annotations:
      summary: "Instance {{ $labels.instance }} CPU usage high"
      description: "{{ $labels.instance }} CPU usage above 30% (current value: {{ $value }})"
  # Rule 2: memory usage above 85%
  - alert: hostMemUsageAlert
    expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)/node_memory_MemTotal_bytes > 0.85
    for: 1m
    labels:
      severity: page
      temes: '2'
    annotations:
      summary: "Instance {{ $labels.instance }} MEM usage high"
      description: "{{ $labels.instance }} MEM usage above 85% (current value: {{ $value }})"
  # Rule 3: filesystem usage above 85%
  # NOTE(review): node_filesystem_files_free / node_filesystem_files measures inode
  # usage, not byte usage; for disk-space usage use
  # node_filesystem_avail_bytes / node_filesystem_size_bytes instead -- confirm intent.
  - alert: disk_usage
    expr: 1 - (node_filesystem_files_free{instance=~'localhost:9100',fstype=~"ext.?|xfs"} / node_filesystem_files{instance=~'localhost:9100',fstype=~"ext.?|xfs"}) > 0.85
    for: 1m
    labels:
      severity: page
    annotations:
      summary: "{{ $labels.mountpoint }} usage high"
      description: "{{ $labels.mountpoint }} disk usage above 85% (current value: {{ $value }})"
  # Rule 4: disk I/O utilization above 85%.
  # Alert names must match [a-zA-Z_:][a-zA-Z0-9_:]* -- "I/O Utilization" (space, slash)
  # is rejected by Prometheus, so use a valid identifier.
  - alert: hostDiskIoUtilizationAlert
    expr: irate(node_disk_io_time_seconds_total{instance="localhost:9100",device=~"sda"}[1m]) > 0.85
    for: 1m
    labels:
      severity: page
    annotations:
      summary: "Instance {{ $labels.instance }} disk I/O utilization high"
      description: "{{ $labels.instance }} disk I/O utilization above 85% (current value: {{ $value }})"

3: Modify the Alertmanager configuration file

vim /alertmanager_install_path/to/alertmanager.yml

The following configures only the email (mailbox) notification settings

global:
  # NOTE(review): port 25 is plaintext SMTP; Alertmanager requires TLS by default,
  # so either use 465/587 with TLS or set smtp_require_tls: false -- confirm.
  smtp_smarthost: 'smtp.qq.com:25'
  smtp_from: '发送人邮箱'           # sender email address
  smtp_auth_username: '发送人名字'  # SMTP account/login name
  smtp_auth_password: '授权码'      # authorization code (app password)


# Root route
route:
  group_by: ['alertname']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 1h
  receiver: 'web.hook'
  # Child routes
  routes:
  - receiver: 'mail' # receiver name
    group_wait: 10s  # wait time before sending the first notification
    match_re:        # regex match on labels
      temes: '2'     # custom label; match_re values are regex strings, so quote the number
  - receiver: 'mail1'
    group_wait: 10s
    match_re:
      temes: '1'
receivers:
  - name: 'web.hook'
    webhook_configs:
      - url: 'http://127.0.0.1:5001/'

  - name: 'mail'   # receiver name
    email_configs:
      - to: '*****@qq.com' # destination email address
  - name: 'mail1'
    email_configs:
      - to: '*****@qq.com'

inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']

4: Configure the above information and restart related services after the configuration is completed.

systemctl restart prometheus
systemctl restart alertmanager

prometheus.service

# Create the Prometheus systemd unit (note: "prometheus", not the "promethues" typo)
cat > /etc/systemd/system/prometheus.service <<EOF
[Unit]
Description=Prometheus monitoring server
Documentation=https://prometheus.io
After=network.target

[Service]
Restart=on-failure
ExecStart=/prometheus_install_path/to/prometheus --config.file=/prometheus_install_path/to/prometheus.yml

[Install]
WantedBy=multi-user.target

EOF

alertmanager.service

# Create the Alertmanager systemd unit.
# BUG FIX: the original wrote to promethues.service, which would overwrite the
# Prometheus unit instead of creating one for Alertmanager.
cat > /etc/systemd/system/alertmanager.service <<EOF
[Unit]
Description=alertmanager exporter service
Documentation=https://prometheus.io
After=network.target

[Service]
Type=simple
User=root
Group=root
ExecStart=/alertmanager_install_path/to/alertmanager --config.file=/alertmanager_install_path/to/alertmanager.yml
Restart=on-failure

[Install]
WantedBy=multi-user.target

EOF

Guess you like

Origin blog.csdn.net/m0_51828898/article/details/132279510