简介
本文采用 docker-compose 方式部署 Prometheus 相关组件,用来监控系统、容器和服务信息,并通过钉钉(DingTalk)实现告警(服务监控暂未实现,后续更新)。
准备工作
创建组件映射文件夹
# Create the bind-mount directories used by each component.
mkdir -p /data/prometheus/{cadvisor,exporter,grafana,prometheus,dingtalk,alertmanager}
mkdir -p /data/prometheus/prometheus/{conf,data}
# Directory that stores the alerting rule files.
mkdir -p /data/prometheus/prometheus/conf/rules
mkdir -p /data/prometheus/grafana/data
mkdir -p /data/prometheus/dingtalk/conf
mkdir -p /data/prometheus/alertmanager/conf
touch /data/prometheus/dingtalk/conf/config.yml
# Make the Grafana data directory world-writable.
chmod 777 -R /data/prometheus/grafana/data/
# Start throwaway containers just to copy out the default config files,
# then remove them again.
docker run -d --name prometheus prom/prometheus
docker cp prometheus:/etc/prometheus/prometheus.yml /data/prometheus/prometheus/conf/
docker rm -f prometheus
docker run -d --name alertmanager -p 9093:9093 prom/alertmanager:latest
docker cp alertmanager:/etc/alertmanager/alertmanager.yml /data/prometheus/alertmanager/conf/
docker rm -f alertmanager
# Edit the copied Prometheus config and set the scrape targets to host ports 6001-6004.
vim /data/prometheus/prometheus/conf/prometheus.yml
- targets: ["192.168.30.33:6001","192.168.30.33:6002","192.168.30.33:6003","192.168.30.33:6004"]
# tree /data/prometheus/
/data/prometheus/
├── alertmanager
│   └── conf
├── cadvisor
├── dingtalk
│   └── conf
│       └── config.yml
├── exporter
├── grafana
│   └── data
└── prometheus
    ├── conf
    │   ├── prometheus.yml
    │   └── rules
    └── data
部署(node-exporter、cadvisor、prometheus、grafana、prometheus-webhook-dingtalk、alertmanager)
# Create the compose file with the content shown below.
vi /data/prometheus/prometheus_module.yaml
# Compose stack for the monitoring host: node-exporter, cAdvisor, Prometheus,
# Grafana, the DingTalk webhook bridge and Alertmanager.
# Every service mounts the host's Asia/Shanghai zoneinfo as /etc/localtime.
version: '3.3'
services:
  # Host metrics exporter, published on host port 6001.
  node-exporter:
    image: prom/node-exporter:latest
    container_name: node-exporter
    hostname: node-exporter
    restart: always
    user: root
    pid: host
    command:
      # "$$" is Compose escaping for a literal "$" inside the regex.
      # NOTE(review): these two filesystem flags are the legacy names; newer
      # node_exporter releases spell them
      # --collector.filesystem.mount-points-exclude / --collector.filesystem.fs-types-exclude.
      # Confirm against the image version actually pulled by ":latest".
      - '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)'
      - '--collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$$'
      # NOTE(review): no volume in this file is mounted at /node_exporter/prom.
      - '--collector.textfile.directory=/node_exporter/prom'
      # Point the exporter at the host trees bind-mounted below. Each flag is
      # one argv entry in --flag=value form, and rootfs is given once, matching
      # the '/:/rootfs:ro' mount.
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--path.rootfs=/rootfs'
    volumes:
      - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
      - '/proc:/host/proc:ro'
      - '/sys:/host/sys:ro'
      - '/:/rootfs:ro'
    ports:
      - "6001:9100"

  # Container metrics, published on host port 6002.
  # NOTE(review): newer cAdvisor releases are published as
  # gcr.io/cadvisor/cadvisor; confirm whether google/cadvisor is still wanted.
  cadvisor:
    image: google/cadvisor
    container_name: dc-cadvisor
    restart: always
    privileged: true
    volumes:
      - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
      - /dev/disk/:/dev/disk:ro
    ports:
      - "6002:8080"

  # Prometheus server, published on host port 6003. Config and rules come from
  # the host conf directory; the TSDB lives in the host data directory.
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    hostname: prometheus
    restart: always
    user: root
    volumes:
      - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
      - /data/prometheus/prometheus/conf:/etc/prometheus
      - /data/prometheus/prometheus/data:/prometheus
    ports:
      - "6003:9090"
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
      # Keep seven days of samples.
      - '--storage.tsdb.retention.time=7d'

  # Grafana, published on host port 6004.
  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    hostname: grafana
    restart: always
    user: root
    environment:
      # NOTE(review): admin password is set to "admin"; change it before
      # exposing port 6004.
      - GF_SECURITY_ADMIN_PASSWORD=admin
    ports:
      - "6004:3000"
    volumes:
      - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
      # - /opt/grafana/defaults.ini:/etc/grafana/grafana.ini
      - /data/prometheus/grafana/data:/var/lib/grafana

  # Webhook bridge that forwards Alertmanager notifications to DingTalk,
  # published on host port 6005.
  prometheus-webhook-dingtalk:
    image: "timonwong/prometheus-webhook-dingtalk"
    container_name: dingtalk-prometheus-webhook
    hostname: prometheus-webhook-dingtalk
    restart: always
    ports:
      - "6005:8060"
    volumes:
      - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
      - /data/prometheus/dingtalk/conf/config.yml:/etc/prometheus-webhook-dingtalk/config.yml
    # entrypoint: /bin/prometheus-webhook-dingtalk --ding.profile="FBI=https://oapi.dingtalk.com/robot/send?access_token=************f530163"

  # Alertmanager, container ports 9093/9094 published on host ports 6006/6007.
  alertmanager:
    image: prom/alertmanager:latest
    container_name: alertmanager
    hostname: alertmanager
    restart: always
    ports:
      - "6006:9093"
      - "6007:9094"
    volumes:
      - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
      - /data/prometheus/alertmanager/conf/:/etc/alertmanager/
    command:
      - "--config.file=/etc/alertmanager/alertmanager.yml"
# Start every service defined in the compose file in the background (-d).
docker-compose -f /data/prometheus/prometheus_module.yaml up -d
grafana展示
grafana使用模板(本人使用8919+15798):
8919(系统信息)
193、15798(容器信息)
配置钉钉告警
# Edit the Alertmanager configuration copied out of the container earlier.
vim /data/prometheus/alertmanager/conf/alertmanager.yml
# Alertmanager routing: all alerts are grouped by alertname and sent to the
# single 'web.hook' receiver.
route:
  group_by: ['alertname']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 1h
  receiver: 'web.hook'
receivers:
  - name: 'web.hook'
    webhook_configs:
      # Host port 6005 with the "webhook1" target path segment.
      - url: 'http://192.168.30.33:6005/dingtalk/webhook1/send'
inhibit_rules:
  # A 'critical' alert mutes 'warning' alerts that share the same
  # alertname, dev and instance labels.
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
# Edit the prometheus-webhook-dingtalk configuration file.
vim /data/prometheus/dingtalk/conf/config.yml
# DingTalk webhook targets. Each key under "targets" is a target name; the
# Alertmanager webhook URL in this tutorial uses the name "webhook1".
targets:
  webhook1:
    url: https://oapi.dingtalk.com/robot/send?access_token=************f530163
    # secret for signature
  webhook2:
    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
  webhook_legacy:
    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
    # Customize template content
    message:
      # Use legacy template
      title: '{{ template "legacy.title" . }}'
      text: '{{ template "legacy.content" . }}'
  webhook_mention_all:
    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
    mention:
      all: true
  webhook_mention_users:
    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
    mention:
      mobiles: ['156xxxx8827', '189xxxx8325']
# Send a test alert to Alertmanager to verify the DingTalk notification path.
# Alertmanager is published on host port 6006 by the compose file. The v2 API
# is used because current Alertmanager releases no longer serve /api/v1, and
# it requires a JSON content type.
curl -XPOST -H 'Content-Type: application/json' http://192.168.30.33:6006/api/v2/alerts -d '
[
  {
    "labels": {
      "alertname": "DiskRunningFull",
      "dev": "sda1",
      "instance": "中文测试1",
      "route": "WEBHOOK"
    },
    "annotations": {
      "info": "The disk sda1 is running full",
      "summary": "please check the instance example1"
    }
  }
]
'
# Edit the Prometheus configuration to add the Alertmanager address and the rule files.
vim /data/prometheus/prometheus/conf/prometheus.yml
# my global config
global:
  scrape_interval: 15s  # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s  # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            # Alertmanager address as host:port. Mind the YAML formatting; a
            # hostname must match the real host, so use the IP address
            # directly if you are unsure about it.
            - "192.168.30.33:6006"
            # - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # Load the alert rule files in the rules directory whose names end in "_rules.yml".
  - "/etc/prometheus/rules/*_rules.yml"
  # - "first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ["192.168.30.33:6001","192.168.30.33:6002","192.168.30.33:6003","192.168.30.33:6004"]
编写告警规则
实现:
主机存活告警
cpu使用率告警
内存使用率告警
磁盘使用率告警
主机磁盘读过大
主机磁盘写过大
iowait较高
Tcp TimeWait数量过多告警
# Create the host-liveness alert rule file.
vim /data/prometheus/prometheus/conf/rules/up_rules.yml
# Fires when a scrape target has reported up == 0 for one minute.
groups:
  - name: 主机存活告警
    rules:
      - alert: 主机存活告警
        expr: up == 0
        for: 1m
        labels:
          severity: error
        annotations:
          summary: "Instance {{ $labels.instance }} 停止工作"
          description: "{{ $labels.instance }} : {{ $labels.job }} 已宕机超过1分钟"
# Create the CPU usage alert rule file.
vim /data/prometheus/prometheus/conf/rules/cpu_rules.yml
# Fires when per-instance CPU usage (100 minus the averaged idle share)
# stays above 80% for 15 minutes.
groups:
  - name: CPU使用率告警
    rules:
      - alert: CPU使用率告警
        expr: 100 - (avg by (instance)(irate(node_cpu_seconds_total{mode="idle"}[1m]) )) * 100 > 80
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "CPU近15分钟使用率大于80%, 实例: {{ $labels.instance }},当前值:{{ $value }}%"
# Create the memory usage alert rule file.
vim /data/prometheus/prometheus/conf/rules/mem_rules.yml
# Fires when memory usage (1 - MemAvailable / MemTotal) stays above 80%
# for one minute.
groups:
  - name: 内存使用率告警
    rules:
      - alert: 内存使用率告警
        expr: (1 - (node_memory_MemAvailable_bytes / (node_memory_MemTotal_bytes))) * 100 > 80
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "内存使用率大于80%, 实例: {{ $labels.instance }},当前值:{{ $value }}%"
# Create the disk usage alert rule file.
vim /data/prometheus/prometheus/conf/rules/disk_rules.yml
# Fires when the root ("/") ext4/xfs filesystem is more than 80% used for
# one minute.
groups:
  - name: 磁盘使用率告警
    rules:
      - alert: 磁盘使用率告警
        expr: 100 - (node_filesystem_free_bytes{mountpoint="/",fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"} * 100) > 80
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} :{{ $labels.mountpoint }} 分区使用率过高"
          description: "{{ $labels.instance }} : {{ $labels.job }} :{{ $labels.mountpoint }} 这个分区使用大于百分之80% (当前值:{{ $value }})"
# Create the disk read throughput alert rule file.
vim /data/prometheus/prometheus/conf/rules/disk_r_rules.yml
# Fires when the summed per-instance disk read rate exceeds
# 50 * 1024 * 1024 bytes/s for five minutes.
groups:
  - name: 主机磁盘读过大
    rules:
      - alert: 主机磁盘读过大
        expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) > 50*1024 *1024
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "磁盘读过大, 实例: {{ $labels.instance }},当前值: {{ $value | humanize1024 }}。"
# Create the disk write throughput alert rule file.
vim /data/prometheus/prometheus/conf/rules/disk_w_rules.yml
# Writes > 50MB/s: fires when the summed per-instance disk write rate exceeds
# 50 * 1024 * 1024 bytes/s for five minutes.
groups:
  - name: 主机磁盘写过大
    rules:
      - alert: 主机磁盘写过大
        expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) > 50 * 1024 * 1024
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "磁盘写过大, 实例: {{ $labels.instance }},当前值: {{ $value | humanize1024 }}。"
# Create the CPU iowait alert rule file.
vim /data/prometheus/prometheus/conf/rules/iowait_rules.yml
# Fires when iowait accounts for at least 10% of per-instance CPU time over
# the last five minutes, sustained for five minutes.
groups:
  - name: iowait较高
    rules:
      - alert: iowait较高
        expr: (sum(increase(node_cpu_seconds_total{mode='iowait'}[5m]))by(instance)) / (sum(increase(node_cpu_seconds_total[5m]))by(instance)) *100 >= 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "CPU ioWait近5分钟占比大于等于10%, 实例: {{ $labels.instance }},当前值:{{ $value }}%"
# Create the TCP TIME_WAIT alert rule file. The name must end in "_rules.yml"
# to match the "/etc/prometheus/rules/*_rules.yml" glob in prometheus.yml;
# a file named "tcp_time_wait.rules.yml" would never be loaded.
vim /data/prometheus/prometheus/conf/rules/tcp_time_wait_rules.yml
# Fires when the number of TCP sockets in TIME_WAIT is at least 5000 for one
# minute. The value is a socket count, so the summary prints it without "%".
groups:
  - name: Tcp TimeWait数量过多告警
    rules:
      - alert: Tcp TimeWait数量过多告警
        expr: node_sockstat_TCP_tw >= 5000
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Tcp TimeWait数量大于5000, 实例: {{ $labels.instance }},当前值:{{ $value }}"
docker restart prometheus # Restart Prometheus so the configuration changes take effect
新增client:docker部署
# On the new client host: create a directory for the compose file, then create the file.
mkdir -p /data/docker-compose/
vim /data/docker-compose/docker-compose.yaml
# Compose file for a monitored client host: node-exporter and cAdvisor.
version: '3.3'
services:
  # Host metrics exporter, published on host port 6001.
  node-exporter:
    image: prom/node-exporter:latest
    container_name: node-exporter
    hostname: node-exporter
    restart: always
    user: root
    pid: host
    command:
      # "$$" is Compose escaping for a literal "$" inside the regex.
      # NOTE(review): these two filesystem flags are the legacy names; newer
      # node_exporter releases spell them
      # --collector.filesystem.mount-points-exclude / --collector.filesystem.fs-types-exclude.
      # Confirm against the image version actually pulled by ":latest".
      - '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)'
      - '--collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$$'
      # NOTE(review): no volume in this file is mounted at /node_exporter/prom.
      - '--collector.textfile.directory=/node_exporter/prom'
      # Point the exporter at the host trees bind-mounted below. Each flag is
      # one argv entry in --flag=value form, and rootfs is given once, matching
      # the '/:/rootfs:ro' mount.
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--path.rootfs=/rootfs'
    volumes:
      - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
      - '/proc:/host/proc:ro'
      - '/sys:/host/sys:ro'
      - '/:/rootfs:ro'
    ports:
      - "6001:9100"

  # Container metrics, published on host port 6002.
  # NOTE(review): newer cAdvisor releases are published as
  # gcr.io/cadvisor/cadvisor; confirm whether google/cadvisor is still wanted.
  cadvisor:
    image: google/cadvisor
    container_name: dc-cadvisor
    restart: always
    privileged: true
    volumes:
      - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
      - /dev/disk/:/dev/disk:ro
    ports:
      - "6002:8080"
# Start the client services. The compose file is named explicitly so the
# command works from any working directory.
docker-compose -f /data/docker-compose/docker-compose.yaml up -d
# Register the new client with Prometheus
vim /data/prometheus/prometheus/conf/prometheus.yml
"IP:6001","IP:6002" # Add these entries
docker restart prometheus
新增client:linux系统部署
# Install node_exporter 1.4.0 from the release tarball and run it in the background.
cd /usr/local/
wget https://github.com/prometheus/node_exporter/releases/download/v1.4.0/node_exporter-1.4.0.linux-amd64.tar.gz
tar zxf node_exporter-1.4.0.linux-amd64.tar.gz
# The target directory must exist before moving several files into it;
# otherwise mv fails with "target is not a directory".
mkdir -p /usr/local/node_exporter
mv /usr/local/node_exporter-1.4.0.linux-amd64/* /usr/local/node_exporter/
cd /usr/local/node_exporter/
nohup ./node_exporter &