Kubernetes 1.7 部署 Prometheus 监控服务

一、背景简介

       原生的 Kubernetes 是集成的 InfluxDB + Heapster + Grafana 监控组件,存在一定局限;

       本文主要介绍,在 Kubernetes 集群中部署 Prometheus 监控系统的一般方法(核心功能安装)

二、基本步骤

2.1  镜像准备

      prom/prometheus:v1.7.0  【主监控程序】

      prom/alertmanager:v0.14.0 【告警管理器】

      prom/node-exporter:v0.14.0   【 Node 采集器】

      quay.io/coreos/kube-state-metrics:v1.3.0  【k8s 采集器】

      k8s.gcr.io/addon-resizer:1.7 【k8s 采集器附属】

2.2  创建命名空间(专门存放监控 Pods)

# kubectl create namespace monitoring

      【 也可以通过Yaml 文件形式创建】

# vim monitoring_namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring
# kubectl create -f monitoring_namespace.yaml

2.3  创建 Prometheus core ConfigMap

# vim prometheus_core_config_map.yaml
 
 
apiVersion: v1
kind: ConfigMap
metadata:
  creationTimestamp: null
  name: prometheus-core
  namespace: monitoring
data:
  prometheus.yaml: |
    global:
      scrape_interval: 10s
      scrape_timeout: 10s
      evaluation_interval: 10s
    rule_files:
      - "/etc/prometheus-rules/*.rules"
    scrape_configs:
      
      # Data From Kubelet Daemon
      # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L37
      - job_name: 'kubernetes-node-kubelet'
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        kubernetes_sd_configs:
          - role: node
        relabel_configs:
          - source_labels: [__address__]
            regex: '(.*):10250'
            replacement: '${1}:10255'
            target_label: __address__
      
      # Data From EndPoints (Service EndPoints)
      # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L79
      - job_name: 'kubernetes-endpoints'
        kubernetes_sd_configs:
          - role: endpoints
        relabel_configs:
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
            action: keep
            regex: true
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
            action: replace
            target_label: __scheme__
            regex: (https?)
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
            action: replace
            target_label: __address__
            regex: (.+)(?::\d+);(\d+)
            replacement: $1:$2
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_service_name]
            action: replace
            target_label: kubernetes_name

      # No Data Now, For ClusterIP ?
      # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L119
      - job_name: 'kubernetes-services'
        metrics_path: /probe
        params:
          module: [http_2xx]
        kubernetes_sd_configs:
          - role: service
        relabel_configs:
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
            action: keep
            regex: true
          - source_labels: [__address__]
            target_label: __param_target
          - target_label: __address__
            replacement: blackbox
          - source_labels: [__param_target]
            target_label: instance
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_service_name]
            target_label: kubernetes_name

      # Data From Pods
      # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L156
      - job_name: 'kubernetes-pods'
        kubernetes_sd_configs:
          - role: pod
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
            action: keep
            regex: true
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
            action: replace
            regex: (.+):(?:\d+);(\d+)
            replacement: ${1}:${2}
            target_label: __address__
          - action: labelmap
            regex: __meta_kubernetes_pod_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_pod_name]
            action: replace
            target_label: kubernetes_pod_name
          - source_labels: [__meta_kubernetes_pod_container_port_number]
            action: keep
            regex: \d{4}

      # Data From cAdvisor
      - job_name: 'kubernetes-cadvisor'
        kubernetes_sd_configs:
        - role: node
        relabel_configs:
        - source_labels: [__meta_kubernetes_node_address_InternalIP]
          target_label: __address__
          regex: (.*)
          replacement: $1:4194
# kubectl create -f prometheus_core_config_map.yaml

2.4  创建 Prometheus rule ConfigMap

# vim prometheus_rules_config_map.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  creationTimestamp: null
  name: prometheus-rules
  namespace: monitoring
data:
  cpu-usage.rules: |
    ALERT NodeCPUUsage
      IF (100 - (avg by (instance) (irate(node_cpu{name="node-exporter",mode="idle"}[5m])) * 100)) > 75
      FOR 2m
      LABELS {
        severity="page"
      }
      ANNOTATIONS {
        SUMMARY = "{{$labels.instance}}: High CPU usage detected",
        DESCRIPTION = "{{$labels.instance}}: CPU usage is above 75% (current value is: {{ $value }})"
      }
  instance-availability.rules: |
    ALERT InstanceDown
      IF up == 0
      FOR 1m
      LABELS { severity = "page" }
      ANNOTATIONS {
        summary = "Instance {{ $labels.instance }} down",
        description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute.",
      }
  low-disk-space.rules: |
    ALERT NodeLowRootDisk
      IF ((node_filesystem_size{mountpoint="/etc/hosts"} - node_filesystem_free{mountpoint="/etc/hosts"} ) / node_filesystem_size{mountpoint="/etc/hosts"} * 100) > 75
      FOR 2m
      LABELS {
        severity="page"
      }
      ANNOTATIONS {
        SUMMARY = "{{$labels.instance}}: Low root disk space",
        DESCRIPTION = "{{$labels.instance}}: Root disk usage is above 75% (current value is: {{ $value }})"
      }

    ALERT NodeLowDataDisk
      IF ((node_filesystem_size{mountpoint="/data-disk"} - node_filesystem_free{mountpoint="/data-disk"} ) / node_filesystem_size{mountpoint="/data-disk"} * 100) > 75
      FOR 2m
      LABELS {
        severity="page"
      }
      ANNOTATIONS {
        SUMMARY = "{{$labels.instance}}: Low data disk space",
        DESCRIPTION = "{{$labels.instance}}: Data disk usage is above 75% (current value is: {{ $value }})"
      }
  mem-usage.rules: |
    ALERT NodeSwapUsage
      IF (((node_memory_SwapTotal-node_memory_SwapFree)/node_memory_SwapTotal)*100) > 75
      FOR 2m
      LABELS {
        severity="page"
      }
      ANNOTATIONS {
        SUMMARY = "{{$labels.instance}}: Swap usage detected",
        DESCRIPTION = "{{$labels.instance}}: Swap usage usage is above 75% (current value is: {{ $value }})"
      }

    ALERT NodeMemoryUsage
      IF (((node_memory_MemTotal-node_memory_MemFree-node_memory_Cached)/(node_memory_MemTotal)*100)) > 75
      FOR 2m
      LABELS {
        severity="page"
      }
      ANNOTATIONS {
        SUMMARY = "{{$labels.instance}}: High memory usage detected",
        DESCRIPTION = "{{$labels.instance}}: Memory usage is above 75% (current value is: {{ $value }})"
      }
# kubectl create -f prometheus_rules_config_map.yaml

2.5  创建 Prometheus alert ConfigMap

# vim prometheus_alert_config_map.yaml
kind: ConfigMap
apiVersion: v1
metadata:
  name: prometheus-alertmanager
  namespace: monitoring
data:
  config.yml: |-
    global:
      resolve_timeout: 5m
      smtp_smarthost: '<SMTP_IP>:<PORT>'
      smtp_from: '[email protected]'
      smtp_require_tls: false
    route:
      receiver: live-monitoring
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 4h
      group_by: [alertname]

    receivers:
    - name: live-monitoring
      email_configs:
      - to: '[email protected]'
      - to: '[email protected]'
# kubectl create -f prometheus_alert_config_map.yaml

2.6  创建 Prometheus 实例

# vim prometheus_deployment.yaml
apiVersion: rbac.authorization.k8s.io/v1beta1  
kind: ClusterRole  
metadata:  
  name: prometheus  
rules:  
- apiGroups: [""]  
  resources:  
  - nodes  
  - services  
  - endpoints  
  - pods  
  verbs: ["get", "list", "watch"]  
- apiGroups: [""]  
  resources:  
  - configmaps  
  verbs: ["get"]  
- nonResourceURLs: ["/metrics"]  
  verbs: ["get"]  
---  
apiVersion: v1  
kind: ServiceAccount  
metadata:  
  name: prometheus-k8s  
  namespace: monitoring  
---  
apiVersion: rbac.authorization.k8s.io/v1beta1  
kind: ClusterRoleBinding  
metadata:  
  name: prometheus  
roleRef:  
  apiGroup: rbac.authorization.k8s.io  
  kind: ClusterRole  
  name: prometheus  
subjects:  
- kind: ServiceAccount  
  name: prometheus-k8s  
  namespace: monitoring  
---  
apiVersion: extensions/v1beta1  
kind: Deployment  
metadata:  
  name: prometheus-core  
  namespace: monitoring  
  labels:  
    app: prometheus  
    component: core  
spec:  
  replicas: 1  
  template:  
    metadata:  
      name: prometheus-main  
      labels:  
        app: prometheus  
        component: core  
    spec:  
      serviceAccountName: prometheus-k8s  
      containers:  
      - name: prometheus  
        image: prom/prometheus:v1.7.0  
        args:  
          - '-storage.local.retention=12h'  
          - '-storage.local.memory-chunks=500000'  
          - '-config.file=/etc/prometheus/prometheus.yaml'  
          - '-alertmanager.url=http://127.0.0.1:9093/'  
        ports:  
        - name: web-ui  
          containerPort: 9090  
        resources:  
          requests:  
            cpu: 500m  
            memory: 500M  
          limits:  
            cpu: 500m  
            memory: 500M  
        volumeMounts:  
        - name: config-volume  
          mountPath: /etc/prometheus  
        - name: rules-volume  
          mountPath: /etc/prometheus-rules  
      - image: prom/alertmanager:v0.14.0
        name: alertmanager
        args:
        - "--config.file=/etc/alertmanager/config.yml"
        - "--storage.path=/alertmanager"
        ports:
        - containerPort: 9093
          protocol: TCP
          name: http
        volumeMounts:
        - name: alertmanager-config-volume
          mountPath: /etc/alertmanager
        resources:
          requests:
            cpu: 50m
            memory: 50Mi
          limits:
            cpu: 200m
            memory: 200Mi
      volumes:  
      - name: config-volume  
        configMap:  
          name: prometheus-core  
      - name: rules-volume  
        configMap:  
          name: prometheus-rules  
      - name: alertmanager-config-volume
        configMap:
          name: prometheus-alertmanager
---  
apiVersion: v1  
kind: Service  
metadata:  
  name: prometheus  
  namespace: monitoring  
  labels:  
    app: prometheus  
    component: core  
  annotations:  
    prometheus.io/scrape: 'true'  
spec:  
  type: NodePort  
  ports:  
    - port: 9090  
      protocol: TCP  
      name: core-web-ui  
      nodePort: 30122  
    - port: 9093  
      protocol: TCP  
      name: alert-web-ui  
      nodePort: 30123 
  selector:  
    app: prometheus  
    component: core  
# kubectl create -f prometheus_deployment.yaml

2.7  开启 Node Exporter (通过 EndPoints 收集)

# vim prometheus_node_exporter.yaml
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
  name: prometheus-node-exporter
  namespace: monitoring
  labels:
    app: prometheus
    component: node-exporter
spec:
  template:
    metadata:
      name: prometheus-node-exporter
      labels:
        app: prometheus
        component: node-exporter
    spec:
      containers:
      - image: prom/node-exporter:v0.14.0
        name: prometheus-node-exporter
        ports:
        - name: prom-node-exp
          containerPort: 9100
          hostPort: 9100
      hostNetwork: true
      hostPID: true     
---
apiVersion: v1
kind: Service
metadata:
  annotations:
    prometheus.io/scrape: 'true'
  name: prometheus-node-exporter
  namespace: monitoring
  labels:
    app: prometheus
    component: node-exporter
spec:
  clusterIP: None
  ports:
    - name: prometheus-node-exporter
      port: 9100
      protocol: TCP
  selector:
    app: prometheus
    component: node-exporter
  type: ClusterIP
# kubectl create -f prometheus_node_exporter.yaml

2.8  开启 Kubernetes 状态 Exporter (通过 EndPoints 收集)

# vim prometheus_kube_stat_exporter.yaml
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
  name: kube-state-metrics
rules:
- apiGroups: [""]
  resources:
  - configmaps
  - secrets
  - nodes
  - pods
  - services
  - resourcequotas
  - replicationcontrollers
  - limitranges
  - persistentvolumeclaims
  - persistentvolumes
  - namespaces
  - endpoints
  verbs: ["list", "watch"]
- apiGroups: ["extensions"]
  resources:
  - daemonsets
  - deployments
  - replicasets
  verbs: ["list", "watch"]
- apiGroups: ["apps"]
  resources:
  - statefulsets
  verbs: ["list", "watch"]
- apiGroups: ["batch"]
  resources:
  - cronjobs
  - jobs
  verbs: ["list", "watch"]
- apiGroups: ["autoscaling"]
  resources:
  - horizontalpodautoscalers
  verbs: ["list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: Role
metadata:
  namespace: monitoring
  name: kube-state-metrics-resizer
rules:
- apiGroups: [""]
  resources:
  - pods
  verbs: ["get"]
- apiGroups: ["extensions"]
  resources:
  - deployments
  resourceNames: ["kube-state-metrics"]
  verbs: ["get", "update"]
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: kube-state-metrics
  namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
  name: kube-state-metrics
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: kube-state-metrics
subjects:
- kind: ServiceAccount
  name: kube-state-metrics
  namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: RoleBinding
metadata:
  name: kube-state-metrics
  namespace: monitoring
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: kube-state-metrics-resizer
subjects:
- kind: ServiceAccount
  name: kube-state-metrics
  namespace: monitoring
---
apiVersion: apps/v1beta1
kind: Deployment
metadata:
  name: kube-state-metrics
  namespace: monitoring
spec:
  selector:
    matchLabels:
      k8s-app: kube-state-metrics
  replicas: 1
  template:
    metadata:
      labels:
        k8s-app: kube-state-metrics
    spec:
      serviceAccountName: kube-state-metrics
      containers:
      - name: kube-state-metrics
        image: quay.io/coreos/kube-state-metrics:v1.3.0
        ports:
        - name: http-metrics
          containerPort: 8080
        - name: telemetry
          containerPort: 8081
        readinessProbe:
          httpGet:
            path: /healthz
            port: 8080
          initialDelaySeconds: 5
          timeoutSeconds: 5
      - name: addon-resizer
        image: k8s.gcr.io/addon-resizer:1.7
        resources:
          limits:
            cpu: 100m
            memory: 30Mi
          requests:
            cpu: 100m
            memory: 30Mi
        env:
          - name: MY_POD_NAME
            valueFrom:
              fieldRef:
                fieldPath: metadata.name
          - name: MY_POD_NAMESPACE
            valueFrom:
              fieldRef:
                fieldPath: metadata.namespace
        command:
          - /pod_nanny
          - --container=kube-state-metrics
          - --cpu=100m
          - --extra-cpu=1m
          - --memory=100Mi
          - --extra-memory=2Mi
          - --threshold=5
          - --deployment=kube-state-metrics
---
apiVersion: v1
kind: Service
metadata:
  name: kube-state-metrics
  namespace: monitoring
  labels:
    k8s-app: kube-state-metrics
  annotations:
    prometheus.io/scrape: 'true'
spec:
  ports:
  - name: http-metrics
    port: 8080
    targetPort: http-metrics
    protocol: TCP
  - name: telemetry
    port: 8081
    targetPort: telemetry
    protocol: TCP
  selector:
    k8s-app: kube-state-metrics
# kubectl create -f prometheus_kube_stat_exporter.yaml

2.9  查看 prometheus web-ui

      浏览器访问:http:<任意 k8s Node IP>:30122/targets

          

2.10 查看 alert manager web-ui

       浏览器访问:http:<任意 k8s Node IP>:30123

       

2.11  邮箱收到告警样式

        







  



猜你喜欢

转载自blog.csdn.net/shida_csdn/article/details/80224986