The environment is as follows:
Ceph cluster, four nodes:
192.168.100.21 ceph-node1
192.168.100.22 ceph-node2
192.168.100.23 ceph-node3
192.168.100.25 ceph-node5
# The Ceph cluster is already deployed with four OSDs and three MONs; block storage is not in use yet.
One monitoring server:
192.168.100.26 grafana — everything on this host is deployed in containers:
Prometheus:
Grafana:
alertmanager:
prometheus-webhook-alert:
cAdvisor:
The docker-compose.yml is organized as follows:
version: "2" networks: monitor: driver: bridge services: prometheus: image: prom/prometheus container_name: prometheu hostname: prometheu restart: always volumes: - /Prometheus/config/prometheus.yml:/etc/prometheus/prometheus.yml - ./config/alertmanager-rule.yml:/etc/prometheus/alertmanager-rule.yml - /etc/localtime:/etc/localtime ports: - "9090:9090" networks: - monitor prometheus-webhook-alert: image: timonwong/prometheus-webhook-dingtalk:v0.3.0 container_name: prometheus-webhook-alertmanagers hostname: webhook-alertmanagers restart: always volumes: - /etc/localtime:/etc/localtime ports: - "8060:8060" entrypoint: /bin/prometheus-webhook-dingtalk --ding.profile="webhook1=https://oapi.dingtalk.com/robot/send?access_token=****#钉钉webhook自己去申请一个" networks: - monitor alertmanager: image: prom/alertmanager container_name: alertmanager hostname: alertmanager restart: always volumes: - ./config/alertmanager.yml:/etc/alertmanager/alertmanager.yml - /etc/localtime:/etc/localtime ports: - "9093:9093" networks: - monitor grafana: image: grafana/grafana container_name: grafana hostname: grafana restart: always volumes: - /etc/localtime:/etc/localtime - ./grafana-piechart:/var/lib/grafana/plugins/grafana-piechart-panel ports: - "3000:3000" networks: - monitor cadvisor: image: google/cadvisor:latest container_name: cadvisor hostname: cadvisor restart: always volumes: - /:/rootfs:ro - /var/run:/var/run:rw - /sys:/sys:ro - /var/lib/docker/:/var/lib/docker:ro - /etc/localtime:/etc/localtime ports: - "8080:8080" networks: - monitor
The key configuration files are as follows:
# Prometheus configuration file
cat /Prometheus/config/prometheus.yml
# my global config
global:
  scrape_interval: 15s     # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["192.168.100.26:9093"]

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "alertmanager-rule.yml"

scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    static_configs:
      - targets: ['192.168.100.26:9090']

  - job_name: 'cadvisor-1'
    static_configs:
      - targets: ['192.168.100.26:8080']

  - job_name: 'node-1'
    scrape_interval: 4s
    static_configs:
      - targets: ['192.168.100.26:9100']

  - job_name: 'cadvisor-2'
    static_configs:
      - targets: ['192.168.100.25:8080']

  - job_name: 'node-2'
    scrape_interval: 4s
    static_configs:
      - targets: ['192.168.100.25:9100']

  - job_name: 'ceph'
    scrape_interval: 4s
    static_configs:
      - targets: ['192.168.100.21:9128']
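The node-1/node-2 jobs above scrape a node_exporter on port 9100 of the two Docker hosts, and the ceph job scrapes a Ceph exporter on port 9128 of ceph-node1; neither exporter is part of the compose file. A sketch of running them as containers, assuming a recent prom/node-exporter image and the digitalocean/ceph_exporter image (which listens on 9128 by default and reads the cluster config from /etc/ceph):

# On 192.168.100.25 and 192.168.100.26: host metrics for the node-1/node-2 jobs
docker run -d --name node-exporter --net host --pid host \
  -v /:/host:ro,rslave \
  prom/node-exporter --path.rootfs=/host

# On ceph-node1 (192.168.100.21): cluster metrics for the ceph job
docker run -d --name ceph-exporter --net host \
  -v /etc/ceph:/etc/ceph:ro \
  digitalocean/ceph_exporter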
# Alertmanager configuration file: how alerts are grouped and inhibited, plus the webhook address
cat ./config/alertmanager.yml
global:
  resolve_timeout: 5m

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'web.hook'

receivers:
  - name: 'web.hook'
    webhook_configs:
      - url: 'http://192.168.100.26:8060/dingtalk/webhook1/send'
        send_resolved: true

inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
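Before (re)starting the stack it is worth validating the routing and receiver syntax; a sketch using the amtool binary bundled in the prom/alertmanager image, assuming the config lives in ./config on the monitoring host:

docker run --rm \
  -v $PWD/config/alertmanager.yml:/etc/alertmanager/alertmanager.yml \
  --entrypoint amtool \
  prom/alertmanager \
  check-config /etc/alertmanager/alertmanager.yml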
# Alert rule configuration file
cat ./alertmanager-rule.yml
groups:
- name: ceph-rule
  rules:
  - alert: CephOSDDown
    expr: ceph_osd_down > 0
    for: 2m
    labels:
      product: Ceph test cluster
    annotations:
      warn: "{{ $labels.instance }}: {{ $value }} OSD(s) are down: {{ $labels }}"
      description: "{{ $labels.instance }}: OSD {{ $labels.osd }} is currently in state {{ $labels.status }}"
  - alert: CephClusterSpaceUsage
    expr: ceph_cluster_used_bytes / ceph_cluster_capacity_bytes * 100 > 80
    for: 2m
    labels:
      product: Ceph test cluster
    annotations:
      warn: "{{ $labels.instance }}: cluster space is running low"
      description: "{{ $labels.instance }}: current space usage is {{ $value }}"
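A stray brace in the {{ ... }} templates will stop Prometheus from loading the rule file, so it helps to lint it with promtool (bundled in the prom/prometheus image) before reloading; a sketch assuming the file sits in ./config on the monitoring host:

docker run --rm \
  -v $PWD/config/alertmanager-rule.yml:/etc/prometheus/alertmanager-rule.yml \
  --entrypoint promtool \
  prom/prometheus \
  check rules /etc/prometheus/alertmanager-rule.yml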