【kubernetes/k8s Source Code Analysis】Reading the perf-tests Source Code

Copyright notice: this is the blogger's original article, licensed under CC 4.0 BY-SA; please include the original source link and this notice when reposting.
Original link: https://blog.csdn.net/zhonglinzhang/article/details/102721682

    github: https://github.com/kubernetes/perf-tests/tree/master/clusterloader2 

    perf-tests is a performance testing framework: users define the performance test strategy themselves through configuration files. Taking the kubernetes density test as an example, a clusterloader2 run looks like this:

./clusterloader --kubeconfig=/root/.kube/config --provider=kubemark --masterip=master.node.local --mastername=root \
    --testconfig=testing/density/config.yaml \
    --report-dir=./reports \
    --alsologtostderr 2>&1 | tee /tmp/tmp.log

      Measurements are the observation routines that collect the data, whether from metrics, from events, or from resource data fetched through the apiserver. The density test includes APIResponsiveness, SaturationPodStartupLatency, WaitForRunningSaturationRCs, SchedulingThroughput, and PodStartupLatency (section 2). Later articles will analyze how these measurements are implemented.
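
      Each of these implements clusterloader2's Measurement interface, roughly as defined in pkg/measurement (the exact config type name may differ between versions):

type Measurement interface {
	// Execute runs one action of the measurement (e.g. "start" or
	// "gather"), driven by the Params of the step, and returns summaries.
	Execute(config *MeasurementConfig) ([]summary.Summary, error)
	// Dispose releases any resources the measurement holds.
	Dispose()
	// String returns the measurement's name.
	String() string
}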

   Section 1 covers the steps of the following configuration:

steps:
- name: Starting measurements
  measurements:
  - Identifier: APIResponsiveness
    Method: APIResponsiveness
    Params:
      action: reset
  - Identifier: APIResponsivenessPrometheus
    Method: APIResponsivenessPrometheus
    Params:
      action: start
  # TODO(oxddr): figure out how many probers to run in function of cluster
  - Identifier: InClusterNetworkLatency
    Method: InClusterNetworkLatency
    Params:
      action: start
      replicasPerProbe: {{AddInt 2 (DivideInt .Nodes 100)}}
  - Identifier: DnsLookupLatency
    Method: DnsLookupLatency
    Params:
      action: start
      replicasPerProbe: {{AddInt 2 (DivideInt .Nodes 100)}}
  - Identifier: TestMetrics
    Method: TestMetrics
    Params:
      action: start
      nodeMode: {{$NODE_MODE}}
      resourceConstraints: {{$DENSITY_RESOURCE_CONSTRAINTS_FILE}}
      systemPodMetricsEnabled: {{$ENABLE_SYSTEM_POD_METRICS}}
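
As a worked example of the template arithmetic above: on a 300-node cluster, {{AddInt 2 (DivideInt .Nodes 100)}} evaluates to 2 + 300/100 = 5 probe replicas per measurement.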

1. TestMetrics

    TestMetrics bundles the following sub-measurements, mirrored by the struct below:

  • EtcdMetrics
  • SchedulingMetrics
  • MetricsForE2E
  • ResourceUsageSummary
  • CPUProfile (for etcd, apiserver, scheduler, controller-manager, etc.)
  • MemoryProfile
  • SystemPodMetrics

type testMetrics struct {
	etcdMetrics                    measurement.Measurement
	schedulingMetrics              measurement.Measurement
	metricsForE2E                  measurement.Measurement
	resourceUsageSummary           measurement.Measurement
	etcdCPUProfile                 measurement.Measurement
	etcdMemoryProfile              measurement.Measurement
	etcdMutexProfile               measurement.Measurement
	apiserverCPUProfile            measurement.Measurement
	apiserverMemoryProfile         measurement.Measurement
	schedulerCPUProfile            measurement.Measurement
	schedulerMemoryProfile         measurement.Measurement
	controllerManagerCPUProfile    measurement.Measurement
	controllerManagerMemoryProfile measurement.Measurement
	systemPodMetrics               measurement.Measurement
}
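
     Conceptually testMetrics is a composite measurement: one Execute call fans the requested action out to every sub-measurement above and concatenates their summaries. A minimal self-contained sketch of that pattern, with simplified stand-in types rather than the real clusterloader2 API (it fails fast here for brevity):

import "fmt"

// Measurement is a simplified stand-in for clusterloader2's interface.
type Measurement interface {
	Execute(action string) ([]string, error)
	String() string
}

// compositeMeasurement forwards one action to all of its children and
// gathers their summaries.
type compositeMeasurement struct {
	children []Measurement
}

func (c *compositeMeasurement) Execute(action string) ([]string, error) {
	var summaries []string
	for _, m := range c.children {
		s, err := m.Execute(action)
		if err != nil {
			return nil, fmt.Errorf("%v: %v", m, err)
		}
		summaries = append(summaries, s...)
	}
	return summaries, nil
}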

    1.1 EtcdMetrics

      Data is fetched with curl http://localhost:2379/metrics, or curl http://localhost:2382/metrics on newer versions:

etcd_disk_backend_commit_duration_seconds_bucket{le="0.001"} 0
etcd_disk_backend_commit_duration_seconds_bucket{le="0.002"} 0
etcd_disk_backend_commit_duration_seconds_bucket{le="0.004"} 0
etcd_disk_backend_commit_duration_seconds_bucket{le="0.008"} 0
etcd_disk_backend_commit_duration_seconds_bucket{le="0.016"} 13
etcd_disk_backend_commit_duration_seconds_bucket{le="0.032"} 60965
etcd_disk_backend_commit_duration_seconds_bucket{le="0.064"} 126990
etcd_disk_backend_commit_duration_seconds_bucket{le="0.128"} 139193
etcd_disk_backend_commit_duration_seconds_bucket{le="0.256"} 141012
etcd_disk_backend_commit_duration_seconds_bucket{le="0.512"} 141441
etcd_disk_backend_commit_duration_seconds_bucket{le="1.024"} 141472
etcd_disk_backend_commit_duration_seconds_bucket{le="2.048"} 141477
etcd_disk_backend_commit_duration_seconds_bucket{le="4.096"} 141478
etcd_disk_backend_commit_duration_seconds_bucket{le="8.192"} 141478
etcd_disk_backend_commit_duration_seconds_bucket{le="+Inf"} 141478
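
Reading the histogram: buckets are cumulative, so 141012 of 141478 commits (about 99.7%) finished within 256 ms, and the single slowest commit fell into the 2.048 s to 4.096 s bucket.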

The relevant metric labels:

  • etcd_disk_backend_commit_duration_seconds_bucket: the latency distribution of commits called by the backend
  • etcd_debugging_snap_save_total_duration_seconds_bucket: the total latency distribution of saves called by snapshot
  • etcd_disk_wal_fsync_duration_seconds_bucket: the latency distribution of WAL fsync calls
  • etcd_network_peer_round_trip_time_seconds_bucket: the round-trip time between etcd peers

     getEtcdMetrics fetches the samples from those endpoints, and stopAndSummarize then buckets them by the metric names above:

func (e *etcdMetricsMeasurement) stopAndSummarize(host, provider string) error {
	defer e.Dispose()
	// Do some one-off collection of metrics.
	samples, err := e.getEtcdMetrics(host, provider)
	if err != nil {
		return err
	}
	for _, sample := range samples {
		switch sample.Metric[model.MetricNameLabel] {
		case "etcd_disk_backend_commit_duration_seconds_bucket":
			measurementutil.ConvertSampleToBucket(sample, &e.metrics.BackendCommitDuration)
		case "etcd_debugging_snap_save_total_duration_seconds_bucket":
			measurementutil.ConvertSampleToBucket(sample, &e.metrics.SnapshotSaveTotalDuration)
		case "etcd_disk_wal_fsync_duration_seconds_bucket":
			measurementutil.ConvertSampleToBucket(sample, &e.metrics.WalFsyncDuration)
		case "etcd_network_peer_round_trip_time_seconds_bucket":
			measurementutil.ConvertSampleToBucket(sample, &e.metrics.PeerRoundTripTime)
		}
	}
	return nil
}
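
     Under the hood, ConvertSampleToBucket groups samples by their labels other than le and accumulates the count into the matching le bucket. A simplified, self-contained sketch with stand-in types (not the exact upstream code):

import "reflect"

// sample is a stand-in for prometheus' model.Sample.
type sample struct {
	metric map[string]string // label set, including "le"
	value  float64
}

type histogram struct {
	Labels  map[string]string `json:"labels"`
	Buckets map[string]int    `json:"buckets"`
}

// convertSampleToBucket accumulates one histogram sample into hists:
// find the histogram whose non-"le" labels match, or create it, then
// add the sample's value to the matching "le" bucket.
func convertSampleToBucket(s sample, hists *[]histogram) {
	labels := map[string]string{}
	for k, v := range s.metric {
		if k != "le" {
			labels[k] = v
		}
	}
	for i := range *hists {
		if reflect.DeepEqual((*hists)[i].Labels, labels) {
			(*hists)[i].Buckets[s.metric["le"]] += int(s.value)
			return
		}
	}
	*hists = append(*hists, histogram{
		Labels:  labels,
		Buckets: map[string]int{s.metric["le"]: int(s.value)},
	})
}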

    1.2 SchedulingMetrics

      Data is fetched via curl -XGET http://localhost:10251/metrics; the metric name filtered on is scheduler_scheduling_latency_seconds.

      The operations covered are predicate_evaluation, priority_evaluation, preemption_evaluation, and binding:

scheduler_scheduling_latency_seconds{operation="binding",quantile="0.5"} 0.0472671
scheduler_scheduling_latency_seconds{operation="binding",quantile="0.9"} 0.100486235
scheduler_scheduling_latency_seconds{operation="binding",quantile="0.99"} 0.113182433
scheduler_scheduling_latency_seconds_sum{operation="binding"} 14.535012662000003
scheduler_scheduling_latency_seconds_count{operation="binding"} 240
scheduler_scheduling_latency_seconds{operation="predicate_evaluation",quantile="0.5"} 0.000407027
scheduler_scheduling_latency_seconds{operation="predicate_evaluation",quantile="0.9"} 0.000732365
scheduler_scheduling_latency_seconds{operation="predicate_evaluation",quantile="0.99"} 0.001816966
scheduler_scheduling_latency_seconds_sum{operation="predicate_evaluation"} 0.134087715
scheduler_scheduling_latency_seconds_count{operation="predicate_evaluation"} 240

for _, sample := range samples {
	if sample.Metric[model.MetricNameLabel] != schedulingLatencyMetricName {
		continue
	}

	var metric *measurementutil.LatencyMetric
	switch sample.Metric[schedulermetric.OperationLabel] {
	case schedulermetric.PredicateEvaluation:
		metric = &result.PredicateEvaluationLatency
	case schedulermetric.PriorityEvaluation:
		metric = &result.PriorityEvaluationLatency
	case schedulermetric.PreemptionEvaluation:
		metric = &result.PreemptionEvaluationLatency
	case schedulermetric.Binding:
		metric = &result.BindingLatency
	}
	if metric == nil {
		continue
	}
	// The sample's "quantile" label is then parsed and the value is
	// recorded into the matching LatencyMetric (elided in this excerpt).
}

    1.3 APIResponsiveness

      Fetched via curl -XGET http://localhost:8080/metrics (the code does the equivalent through client-go); the filtered metric names are apiserver_request_latencies_summary and apiserver_request_count.

      There are far too many series to list in full, so only a few are shown:

apiserver_request_latencies_summary{resource="pods",scope="namespace",subresource="",verb="LIST",quantile="0.5"} 2050
apiserver_request_latencies_summary{resource="pods",scope="namespace",subresource="",verb="LIST",quantile="0.9"} 2050
apiserver_request_latencies_summary{resource="pods",scope="namespace",subresource="",verb="LIST",quantile="0.99"} 2050
apiserver_request_latencies_summary_sum{resource="pods",scope="namespace",subresource="",verb="LIST"} 78645
apiserver_request_latencies_summary_count{resource="pods",scope="namespace",subresource="",verb="LIST"} 5
apiserver_request_latencies_summary{resource="pods",scope="namespace",subresource="",verb="POST",quantile="0.5"} 39036
apiserver_request_latencies_summary{resource="pods",scope="namespace",subresource="",verb="POST",quantile="0.9"} 120477
apiserver_request_latencies_summary{resource="pods",scope="namespace",subresource="",verb="POST",quantile="0.99"} 144922
apiserver_request_latencies_summary_sum{resource="pods",scope="namespace",subresource="",verb="POST"} 1.5654715e+07
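
Note that the raw apiserver_request_latencies_summary values are reported in microseconds; APIResponsiveness converts them to the milliseconds used in its report, producing entries like the following: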

{
  "data": {
    "Perc50": 0.498,
    "Perc90": 0.498,
    "Perc99": 0.498
  },
  "unit": "ms",
  "labels": {
    "Count": "2",
    "Resource": "pods",
    "Scope": "cluster",
    "Subresource": "",
    "Verb": "LIST"
  }
}

    1.4 systemPodMetrics

     Collects only the restart counts of system pods, i.e. the pods in the kube-system namespace. The gathered result looks like this:

  "pods": [
    {
      "name": "calico-node-b6mq4",
      "containers": []
    },
    {
      "name": "calico-kube-controllers-8597b9886-dvkl4",
      "containers": [
        {
          "name": "calico-kube-controllers",
          "restartCount": 0
        }
      ]
    },
    {
      "name": "calico-node-kcp6w",
      "containers": []
    },
    {
      "name": "calico-node-jvj54",
      "containers": []
    },
    {
      "name": "calico-node-p75cp",
      "containers": [
        {
          "name": "calico-node",
          "restartCount": 0
        }
      ]
    }
  ]
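
     A minimal sketch of how such data can be gathered with client-go, using stand-in types that mirror the JSON shape above and the older context-free List signature seen elsewhere in this post (not the exact upstream code):

import (
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	clientset "k8s.io/client-go/kubernetes"
)

type containerMetrics struct {
	Name         string `json:"name"`
	RestartCount int32  `json:"restartCount"`
}

type podMetrics struct {
	Name       string             `json:"name"`
	Containers []containerMetrics `json:"containers"`
}

// gatherSystemPodMetrics lists the pods in kube-system and records each
// container's restart count.
func gatherSystemPodMetrics(c clientset.Interface) ([]podMetrics, error) {
	pods, err := c.CoreV1().Pods("kube-system").List(metav1.ListOptions{})
	if err != nil {
		return nil, err
	}
	result := make([]podMetrics, 0, len(pods.Items))
	for _, pod := range pods.Items {
		pm := podMetrics{Name: pod.Name, Containers: []containerMetrics{}}
		for _, cs := range pod.Status.ContainerStatuses {
			pm.Containers = append(pm.Containers, containerMetrics{
				Name:         cs.Name,
				RestartCount: cs.RestartCount,
			})
		}
		result = append(result, pm)
	}
	return result, nil
}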

    Section 2 is explained against the following configuration:

- name: Starting saturation pod measurements
  measurements:
  - Identifier: SaturationPodStartupLatency
    Method: PodStartupLatency
    Params:
      action: start
      labelSelector: group = saturation
      threshold: {{$saturationDeploymentTimeout}}s
  - Identifier: WaitForRunningSaturationDeployments
    Method: WaitForControlledPodsRunning
    Params:
      action: start
      apiVersion: apps/v1
      kind: Deployment
      labelSelector: group = saturation
      operationTimeout: {{$saturationDeploymentHardTimeout}}s
  - Identifier: SchedulingThroughput
    Method: SchedulingThroughput
    Params:
      action: start
      labelSelector: group = saturation

2. PodStartupLatency

    gatherScheduleTimes lists the events in the measured namespace whose involved object kind is Pod and whose source is the default scheduler:

func (p *podStartupLatencyMeasurement) gatherScheduleTimes(c clientset.Interface) error {
	selector := fields.Set{
		"involvedObject.kind": "Pod",
		"source":              corev1.DefaultSchedulerName,
	}.AsSelector().String()
	options := metav1.ListOptions{FieldSelector: selector}
	schedEvents, err := c.CoreV1().Events(p.selector.Namespace).List(options)
	if err != nil {
		return err
	}
	// Each returned event's timestamp is then recorded into
	// podStartupEntries as that pod's schedulePhase entry (loop elided
	// in this excerpt).
	return nil
}

     The following code runs when the measurement starts: for every pod already in the Running phase, using namespace/name as the key, it stores entries into podStartupEntries: the watch time is set to time.Now(), the create time to the pod's CreationTimestamp, and the run time to the latest container start time:

if pod.Status.Phase == corev1.PodRunning {
	key := createMetaNamespaceKey(pod.Namespace, pod.Name)
	if _, found := p.podStartupEntries.Get(key, createPhase); !found {
		p.podStartupEntries.Set(key, watchPhase, time.Now())
		p.podStartupEntries.Set(key, createPhase, pod.CreationTimestamp.Time)
		var startTime metav1.Time
		for _, cs := range pod.Status.ContainerStatuses {
			if cs.State.Running != nil {
				if startTime.Before(&cs.State.Running.StartedAt) {
					startTime = cs.State.Running.StartedAt
				}
			}
		}
		if startTime != metav1.NewTime(time.Time{}) {
			p.podStartupEntries.Set(key, runPhase, startTime.Time)
		} else {
			klog.Errorf("%s: pod %v (%v) is reported to be running, but none of its containers is", p, pod.Name, pod.Namespace)
		}
	}
}

   2.1 The transitions being measured are the following:

podStartupLatency := p.podStartupEntries.CalculateTransitionsLatency(map[string]measurementutil.Transition{
   "create_to_schedule": {
      From: createPhase,
      To:   schedulePhase,
   },
   "schedule_to_run": {
      From: schedulePhase,
      To:   runPhase,
   },
   "run_to_watch": {
      From: runPhase,
      To:   watchPhase,
   },
   "schedule_to_watch": {
      From: schedulePhase,
      To:   watchPhase,
   },
   "pod_startup": {
      From:      createPhase,
      To:        watchPhase,
      Threshold: p.threshold,
   },
})
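
    CalculateTransitionsLatency computes, for each named transition, the per-pod difference between the two recorded phase timestamps and reduces it to percentiles. A simplified sketch of the core computation, with a stand-in data layout rather than the upstream implementation:

import "time"

// transitionLatencies computes To-From for every pod key that has both
// phase timestamps recorded; the resulting samples are then reduced to
// Perc50/Perc90/Perc99 for the report.
func transitionLatencies(entries map[string]map[string]time.Time, from, to string) []time.Duration {
	var out []time.Duration
	for _, phases := range entries {
		f, okFrom := phases[from]
		t, okTo := phases[to]
		if okFrom && okTo {
			out = append(out, t.Sub(f))
		}
	}
	return out
}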

    2.2 The test result looks like this:

{
  "version": "1.0",
  "dataItems": [
    {
      "data": {
        "Perc50": 0,
        "Perc90": 0,
        "Perc99": 0
      },
      "unit": "ms",
      "labels": {
        "Metric": "create_to_schedule"
      }
    },
    {
      "data": {
        "Perc50": 0,
        "Perc90": 1000,
        "Perc99": 1000
      },
      "unit": "ms",
      "labels": {
        "Metric": "schedule_to_run"
      }
    },
    {
      "data": {
        "Perc50": 1090.348398,
        "Perc90": 1757.317935,
        "Perc99": 2299.464715
      },
      "unit": "ms",
      "labels": {
        "Metric": "run_to_watch"
      }
    },
    {
      "data": {
        "Perc50": 1578.38022,
        "Perc90": 2286.247476,
        "Perc99": 2630.67882
      },
      "unit": "ms",
      "labels": {
        "Metric": "schedule_to_watch"
      }
    },
    {
      "data": {
        "Perc50": 1580.205515,
        "Perc90": 2286.247476,
        "Perc99": 2630.67882
      },
      "unit": "ms",
      "labels": {
        "Metric": "pod_startup"
      }
    }
  ]
}
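
Note that create_to_schedule and schedule_to_run only ever land on whole seconds (0 or 1000 ms) because the create and schedule timestamps come from the API with one-second granularity; run_to_watch and the later transitions include the locally taken watch time and therefore show real sub-second values.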

3. metricsForE2EMeasurement

     The metricsForE2EMeasurement measurement mainly collects metrics from kube-apiserver, kube-controller-manager, kube-scheduler, and similar components; the kubelet is an optional component.

    3.1 The metrics watched on kube-apiserver are:

var interestingApiServerMetricsLabels = []string{
   "apiserver_init_events_total",
   "apiserver_request_count",
   "apiserver_request_latencies_summary",
   "etcd_request_latencies_summary",
}
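
      Only samples whose __name__ appears in this allowlist are kept when the component's /metrics output is summarized; all other series are dropped.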

      apiserver_request_count covers LIST, WATCH, GET and other verbs per resource, taken from the kube-apiserver's metrics endpoint. The example below shows a LIST of nodes requested 4 times (by clusterloader itself, per the client label):

      {
        "metric": {
          "__name__": "apiserver_request_count",
          "client": "clusterloader/v0.0.0 (linux/amd64) kubernetes/$Format",
          "code": "200",
          "contentType": "application/vnd.kubernetes.protobuf",
          "resource": "nodes",
          "scope": "cluster",
          "subresource": "",
          "verb": "LIST"
        },
        "value": [
          0,
          "4"
        ]
      },

     apiserver_request_latencies_summary likewise covers LIST, WATCH, GET and other verbs per resource. The example below shows the PUT latency on the status subresource of apiservices at the 0.5, 0.9, and 0.99 quantiles:

      {
        "metric": {
          "__name__": "apiserver_request_latencies_summary",
          "quantile": "0.5",
          "resource": "apiservices",
          "scope": "cluster",
          "subresource": "status",
          "verb": "PUT"
        },
        "value": [
          0,
          "2308"
        ]
      },

      {
        "metric": {
          "__name__": "apiserver_request_latencies_summary",
          "quantile": "0.9",
          "resource": "apiservices",
          "scope": "cluster",
          "subresource": "status",
          "verb": "PUT"
        },
        "value": [
          0,
          "3941"
        ]
      },

      {
        "metric": {
          "__name__": "apiserver_request_latencies_summary",
          "quantile": "0.99",
          "resource": "apiservices",
          "scope": "cluster",
          "subresource": "status",
          "verb": "PUT"
        },
        "value": [
          0,
          "4137"
        ]
      },

Summary:

    Most of the measured indicators are obtained from component metrics.

    Some data comes from events instead, for example podStartupLatency.

The EtcdMetrics measurement covers:

  • etcd_disk_backend_commit_duration_seconds_bucket
  • etcd_debugging_snap_save_total_duration_seconds_bucket
  • etcd_disk_wal_fsync_duration_seconds_bucket
  • etcd_network_peer_round_trip_time_seconds_bucket (possibly not implemented)

The metrics_for_e2e measurement covers the following kube-apiserver metrics:

  • apiserver_init_events_total (possibly not implemented)
  • apiserver_request_count
  • apiserver_request_latencies_summary
  • etcd_request_latencies_summary (possibly not implemented)
