Rapidly deploy a Prometheus monitoring server with docker-compose and monitor a Ceph cluster, with DingTalk webhook alerting

The current environment is as follows:

Ceph cluster, 4 nodes:

192.168.100.21  ceph-node1

192.168.100.22  ceph-node2

192.168.100.23  ceph-node3

192.168.100.25  ceph-node5

# The Ceph cluster is already deployed, with four OSDs and three MONs; block storage is not in use.

 

One monitoring server:

192.168.100.26 — Prometheus, Grafana and the other components below are all deployed as containers on this host:

Prometheus: 
Grafana: 
alertmanager: 
prometheus-webhook-alert:
cAdvisor:

The docker-compose.yml is organized as follows:

version: "2"
networks:
    monitor:
        driver: bridge
services:
  prometheus:
    image: prom/prometheus
    container_name: prometheus
    hostname: prometheus
    restart: always
    volumes:
    - /Prometheus/config/prometheus.yml:/etc/prometheus/prometheus.yml
    - ./config/alertmanager-rule.yml:/etc/prometheus/alertmanager-rule.yml
    - /etc/localtime:/etc/localtime
    ports:
    - "9090:9090"
    networks:
    - monitor

  prometheus-webhook-alert:
    image: timonwong/prometheus-webhook-dingtalk:v0.3.0
    container_name: prometheus-webhook-alertmanagers
    hostname: webhook-alertmanagers
    restart: always
    volumes:
    - /etc/localtime:/etc/localtime
    ports:
    - "8060:8060"
    entrypoint: /bin/prometheus-webhook-dingtalk --ding.profile="webhook1=https://oapi.dingtalk.com/robot/send?access_token=****"  # DingTalk webhook; request your own robot token
    networks:
    - monitor
  
  alertmanager:
    image: prom/alertmanager
    container_name: alertmanager
    hostname: alertmanager
    restart: always
    volumes:
      - ./config/alertmanager.yml:/etc/alertmanager/alertmanager.yml
      - /etc/localtime:/etc/localtime
    ports:
      - "9093:9093"
    networks:
      - monitor

  grafana:
    image: grafana/grafana
    container_name: grafana
    hostname: grafana
    restart: always
    volumes:
    - /etc/localtime:/etc/localtime
    - ./grafana-piechart:/var/lib/grafana/plugins/grafana-piechart-panel
    ports:
    - "3000:3000"
    networks:
    - monitor
    
  cadvisor:
    image: google/cadvisor:latest
    container_name: cadvisor
    hostname: cadvisor
    restart: always
    volumes:
    - /:/rootfs:ro
    - /var/run:/var/run:rw
    - /sys:/sys:ro
    - /var/lib/docker/:/var/lib/docker:ro
    - /etc/localtime:/etc/localtime
    ports:
    - "8080:8080"
    networks:
    - monitor
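
Before starting the stack, the host paths referenced by the volume mounts above need to exist. Below is a minimal sketch of the assumed directory layout (inferred from the mounts; adjust to your own paths) plus the standard docker-compose commands to bring everything up:

# assumed layout on 192.168.100.26 (prometheus.yml is mounted from /Prometheus/config/)
# .
# ├── docker-compose.yml
# ├── config/
# │   ├── alertmanager.yml
# │   └── alertmanager-rule.yml
# └── grafana-piechart/

docker-compose up -d                 # start all five containers in the background
docker-compose ps                    # verify that every service is Up
docker-compose logs -f prometheus    # follow Prometheus logs if a target misbehaves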

The key configuration files are as follows:

# Prometheus configuration file

cat /Prometheus/config/prometheus.yml

 

# my global config
global:
  scrape_interval:     15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.

# Alertmanager configuration
alerting:
  alertmanagers:
  - static_configs:
    - targets: ["192.168.100.26:9093"]
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "alertmanager-rule.yml"

scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    static_configs:
    - targets: ['192.168.100.26:9090']
 
  - job_name: 'cadvisor-1'
    static_configs:
    - targets: ['192.168.100.26:8080']
 
  - job_name: 'node-1'
    scrape_interval: 4s
    static_configs:
    - targets: ['192.168.100.26:9100']

  - job_name: 'cadvisor-2'
    static_configs:
    - targets: ['192.168.100.25:8080']

  - job_name: 'node-2'
    scrape_interval: 4s
    static_configs:
    - targets: ['192.168.100.25:9100']

  - job_name: 'ceph'
    scrape_interval: 4s
    static_configs:
    - targets: ['192.168.100.21:9128']
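
The node-* and ceph jobs above assume exporters are already listening on ports 9100 and 9128 on the target hosts; the compose file does not start them. A hedged sketch of running them as containers, assuming prom/node-exporter and the digitalocean/ceph_exporter image (which defaults to port 9128) — swap in whichever exporters you actually use:

# on 192.168.100.25 and 192.168.100.26: node exporter on :9100
docker run -d --name node-exporter --restart always \
  -p 9100:9100 prom/node-exporter

# on ceph-node1 (192.168.100.21): Ceph exporter on :9128
# it needs ceph.conf and a keyring so it can query the cluster
docker run -d --name ceph-exporter --restart always \
  -v /etc/ceph:/etc/ceph:ro \
  -p 9128:9128 digitalocean/ceph_exporter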

 

# Alertmanager configuration: alert grouping, filtering, and the webhook address

cat  ./config/alertmanager.yml

global:
  resolve_timeout: 5m
route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'web.hook'

receivers:
- name: 'web.hook'
  webhook_configs:
  - url: 'http://192.168.100.26:8060/dingtalk/webhook1/send'
    send_resolved: true

inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
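
To check the Alertmanager → DingTalk path without waiting for a real alert, you can push a synthetic alert into Alertmanager by hand. A minimal sketch using its v1 alerts API (the alert name and labels here are made up for the test); the DingTalk group should receive a message within roughly the 10s group_wait:

curl -XPOST http://192.168.100.26:9093/api/v1/alerts -d '[
  {
    "labels": {"alertname": "TestAlert", "severity": "warning", "instance": "manual-test"},
    "annotations": {"description": "manual test alert, please ignore"}
  }
]'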

# Alerting rule configuration file

cat ./config/alertmanager-rule.yml

groups:
- name: ceph-rule
  rules:
  - alert: CephOsdDown
    expr: ceph_osd_down > 0
    for: 2m
    labels:
      product: Ceph test cluster
    annotations:
      warn: "{{ $labels.instance }}: {{ $value }} OSD(s) down"
      description: "{{ $labels.instance }}: OSD {{ $labels.osd }} current status {{ $labels.status }}"

  - alert: CephClusterSpaceUsage
    expr: ceph_cluster_used_bytes / ceph_cluster_capacity_bytes * 100 > 80
    for: 2m
    labels:
      product: Ceph test cluster
    annotations:
      warn: "{{ $labels.instance }}: cluster space is insufficient"
      description: "{{ $labels.instance }}: current space usage is {{ $value }}"
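
After editing the rule file it is worth validating it before Prometheus loads it. A sketch using the promtool binary shipped inside the prom/prometheus image (the /bin/promtool path and entrypoint override are assumptions about that image), followed by a restart so the change is picked up — the compose file does not enable the hot-reload endpoint:

docker run --rm --entrypoint /bin/promtool \
  -v $(pwd)/config/alertmanager-rule.yml:/tmp/alertmanager-rule.yml \
  prom/prometheus check rules /tmp/alertmanager-rule.yml

# no --web.enable-lifecycle flag is set, so restart instead of hot-reloading
docker-compose restart prometheus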
