k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/measurement/common/network/network_performance_measurement.go (about)

     1  /*
     2  Copyright 2021 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
// Package network captures network performance metrics
// for protocols such as TCP, UDP and HTTP. The metrics are collected for baseline (1:1)
// and scale (N:M) pod ratios. Client and server pods located on different worker
// nodes exchange traffic for a specified time to measure the performance metrics.
    21  package network
    22  
    23  import (
    24  	"context"
    25  	"embed"
    26  	"fmt"
    27  	"math"
    28  	"sync"
    29  	"time"
    30  
    31  	"k8s.io/apimachinery/pkg/api/meta"
    32  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    33  	"k8s.io/apimachinery/pkg/runtime"
    34  	"k8s.io/apimachinery/pkg/runtime/schema"
    35  	"k8s.io/apimachinery/pkg/util/wait"
    36  	"k8s.io/client-go/dynamic"
    37  	"k8s.io/client-go/kubernetes"
    38  	"k8s.io/client-go/tools/cache"
    39  	"k8s.io/klog/v2"
    40  	"k8s.io/perf-tests/clusterloader2/pkg/framework"
    41  	"k8s.io/perf-tests/clusterloader2/pkg/framework/client"
    42  	"k8s.io/perf-tests/clusterloader2/pkg/measurement"
    43  	measurementutil "k8s.io/perf-tests/clusterloader2/pkg/measurement/util"
    44  	"k8s.io/perf-tests/clusterloader2/pkg/util"
    45  )
    46  
const (
	// podReadyTimeout bounds how long createAndWaitForWorkerPods waits for the
	// worker pods.
	podReadyTimeout = 5 * time.Minute
	// initialDelayForTestExecution is the delay after which pods start sending traffic.
	// The delay is used to synchronize all client pods to send traffic at the same time.
	initialDelayForTestExecution = 15 * time.Second
	// networkPerformanceMetricsName indicates the measurement name;
	// netperfNamespace is the namespace that holds the worker pods and the
	// NetworkTestRequest custom resources.
	networkPerformanceMetricsName = "NetworkPerformanceMetrics"
	netperfNamespace              = "netperf"
)
    56  
// Paths (relative to the embedded manifests filesystem) of the manifests applied
// by this measurement, followed by the names of the cluster-scoped objects that
// cleanupCluster deletes.
const (
	workerPodDeploymentManifestFilePath = "manifests/*deployment.yaml"
	networkTestRequestFilePath          = "manifests/networktestrequests.yaml"
	crdManifestFilePath                 = "manifests/*CustomResourceDefinition.yaml"
	clusterRoleBindingFilePath          = "manifests/roleBinding.yaml"
	customResourceDefinitionName        = "networktestrequests.clusterloader.io"
	rbacName                            = "networktestrequests-rbac"
)
    65  
var (
	// crdGvk identifies CustomResourceDefinition objects; used when deleting
	// the NetworkTestRequest CRD during cleanup.
	crdGvk = schema.GroupVersionKind{
		Group:   "apiextensions.k8s.io",
		Kind:    "CustomResourceDefinition",
		Version: "v1",
	}

	// rbacGvk identifies ClusterRoleBinding objects; used when deleting the
	// measurement's cluster role binding during cleanup.
	rbacGvk = schema.GroupVersionKind{
		Group:   "rbac.authorization.k8s.io",
		Kind:    "ClusterRoleBinding",
		Version: "v1",
	}

	// crGvk identifies the NetworkTestRequest custom resource that the
	// informer watches for metric updates.
	crGvk = schema.GroupVersionKind{
		Group:   "clusterloader.io",
		Kind:    "NetworkTestRequest",
		Version: "v1alpha1",
	}

	// manifestsFS holds the contents of the manifests directory, embedded at
	// build time.
	//
	//go:embed manifests
	manifestsFS embed.FS
)
    88  
    89  func init() {
    90  	klog.Info("Registering Network Performance Measurement")
    91  	if err := measurement.Register(networkPerformanceMetricsName, createNetworkPerformanceMeasurement); err != nil {
    92  		klog.Fatalf("Cannot register %s: %v", networkPerformanceMetricsName, err)
    93  	}
    94  }
    95  
    96  func createNetworkPerformanceMeasurement() measurement.Measurement {
    97  	return &networkPerformanceMeasurement{}
    98  }
    99  
// networkPerformanceMeasurement holds the state of one network performance
// measurement run: cluster clients, test parameters, the discovered worker
// pods and the metrics reported back through NetworkTestRequest updates.
type networkPerformanceMeasurement struct {
	// k8sClient, dynamicClient and framework are taken from the measurement
	// config in initialize; resourceInterface is the dynamic client scoped to
	// the NetworkTestRequest resource in netperfNamespace (set in initializeInformer).
	k8sClient         kubernetes.Interface
	dynamicClient     dynamic.Interface
	resourceInterface dynamic.ResourceInterface
	framework         *framework.Framework

	// Test parameters. NOTE(review): these are not assigned anywhere in this
	// view — presumably populated by validate from the config; confirm.
	numberOfServers int
	numberOfClients int
	podRatioType    string
	testDuration    time.Duration
	protocol        string

	// podInfo stores list of podData for every worker node.
	// metricLock guards metricVal.
	podInfo    workerPodInfo
	metricLock sync.Mutex
	// metricVal stores MetricResponse received from worker node.
	// stopCh is closed to stop the informer started in initializeInformer.
	metricVal [][]float64
	stopCh    chan struct{}
	// startTimeStampForTestExecution is a futureTimeStamp sent to all pods to start
	// sending the traffic at the same time.
	startTimeStampForTestExecution int64
}
   122  
   123  func (npm *networkPerformanceMeasurement) Execute(config *measurement.Config) ([]measurement.Summary, error) {
   124  	action, err := util.GetString(config.Params, "action")
   125  	if err != nil {
   126  		return nil, err
   127  	}
   128  	switch action {
   129  	case "start":
   130  		if err = npm.validate(config); err != nil {
   131  			return nil, err
   132  		}
   133  		return nil, npm.start(config)
   134  	case "gather":
   135  		summary, err := npm.gather()
   136  		if err != nil {
   137  			return nil, err
   138  		}
   139  		return []measurement.Summary{summary}, err
   140  	default:
   141  		return nil, fmt.Errorf("unknown action: %v", action)
   142  	}
   143  }
   144  
   145  func (npm *networkPerformanceMeasurement) start(config *measurement.Config) error {
   146  	npm.initialize(config)
   147  	if err := npm.prepareCluster(); err != nil {
   148  		return err
   149  	}
   150  	if err := npm.createAndWaitForWorkerPods(); err != nil {
   151  		return err
   152  	}
   153  	if err := npm.storeWorkerPods(); err != nil {
   154  		return err
   155  	}
   156  	if err := npm.initializeInformer(); err != nil {
   157  		return err
   158  	}
   159  
   160  	switch npm.podRatioType {
   161  	case oneToOne, manyToMany:
   162  		npm.execNToMTest()
   163  	default:
   164  		return fmt.Errorf("invalid Pod Ratio: %v", npm.podRatioType)
   165  	}
   166  	return nil
   167  }
   168  
   169  func (npm *networkPerformanceMeasurement) initialize(config *measurement.Config) {
   170  	npm.k8sClient = config.ClusterFramework.GetClientSets().GetClient()
   171  	npm.framework = config.ClusterFramework
   172  	npm.dynamicClient = config.ClusterFramework.GetDynamicClients().GetClient()
   173  	npm.podInfo.workerPodMap = make(map[string]podList)
   174  }
   175  
   176  func (npm *networkPerformanceMeasurement) initializeInformer() error {
   177  	gvr, _ := meta.UnsafeGuessKindToResource(crGvk)
   178  	npm.resourceInterface = npm.dynamicClient.Resource(gvr).Namespace(netperfNamespace)
   179  	informer, err := getInformer(netperfNamespace, npm.dynamicClient, gvr)
   180  	if err != nil {
   181  		return fmt.Errorf("error getting informer:%s", err)
   182  	}
   183  	informer.AddEventHandler(cache.ResourceEventHandlerFuncs{
   184  		UpdateFunc: func(oldObj interface{}, newObj interface{}) {
   185  			npm.handleUpdateNetworkTestEvents(newObj)
   186  		},
   187  	})
   188  	npm.stopCh = make(chan struct{})
   189  	go informer.Run(npm.stopCh)
   190  	return nil
   191  }
   192  
   193  func (npm *networkPerformanceMeasurement) prepareCluster() error {
   194  	if err := client.CreateNamespace(npm.k8sClient, netperfNamespace); err != nil {
   195  		return fmt.Errorf("error while creating namespace: %v", err)
   196  	}
   197  	if err := npm.framework.ApplyTemplatedManifests(manifestsFS, clusterRoleBindingFilePath, nil); err != nil {
   198  		return fmt.Errorf("error while creating clusterRoleBinding: %v", err)
   199  	}
   200  	if err := npm.framework.ApplyTemplatedManifests(manifestsFS, crdManifestFilePath, nil); err != nil {
   201  		return fmt.Errorf("error while creating CRD: %v", err)
   202  	}
   203  	return nil
   204  }
   205  
   206  func (npm *networkPerformanceMeasurement) cleanupCluster() {
   207  	if npm.framework == nil {
   208  		klog.V(1).Infof("Network measurement %s wasn't started, skipping the Dispose() step", npm)
   209  		return
   210  	}
   211  	if err := npm.framework.DeleteObject(crdGvk, "", customResourceDefinitionName); err != nil {
   212  		klog.Errorf("Failed to deleted CRD: %v", err)
   213  	}
   214  	if err := npm.framework.DeleteObject(rbacGvk, "", rbacName); err != nil {
   215  		klog.Errorf("Failed to delete clusterRoleBinding: %v", err)
   216  	}
   217  	if err := client.DeleteNamespace(npm.k8sClient, netperfNamespace); err != nil {
   218  		klog.Errorf("Failed to delete namespace: %v", err)
   219  	}
   220  	if err := client.WaitForDeleteNamespace(npm.k8sClient, netperfNamespace, client.DefaultNamespaceDeletionTimeout); err != nil {
   221  		klog.Errorf("Waiting for namespace deletion failed: %v", err)
   222  	}
   223  }
   224  
   225  func (npm *networkPerformanceMeasurement) createAndWaitForWorkerPods() error {
   226  	// Create worker pods
   227  	var replicas = map[string]interface{}{"Replicas": npm.numberOfClients + npm.numberOfServers}
   228  	if err := npm.framework.ApplyTemplatedManifests(manifestsFS, workerPodDeploymentManifestFilePath, replicas); err != nil {
   229  		return fmt.Errorf("failed to create worked pods: %v ", err)
   230  	}
   231  	// Wait for all worker pods to be ready
   232  	ctx, cancel := context.WithTimeout(context.TODO(), podReadyTimeout)
   233  	defer cancel()
   234  	selector := &util.ObjectSelector{Namespace: netperfNamespace}
   235  	options := &measurementutil.WaitForPodOptions{
   236  		DesiredPodCount:     func() int { return npm.numberOfClients + npm.numberOfServers },
   237  		CallerName:          networkPerformanceMetricsName,
   238  		WaitForPodsInterval: 2 * time.Second,
   239  	}
   240  	podStore, err := measurementutil.NewPodStore(npm.k8sClient, selector)
   241  	if err != nil {
   242  		return err
   243  	}
   244  	_, err = measurementutil.WaitForPods(ctx, podStore, options)
   245  	return err
   246  }
   247  
   248  func (*networkPerformanceMeasurement) String() string {
   249  	return networkPerformanceMetricsName
   250  }
   251  
   252  func newWorkerPodData(podName, nodeName, podIP string) *workerPodData {
   253  	return &workerPodData{podName: podName, workerNode: nodeName, podIP: podIP}
   254  }
   255  
   256  func (npm *networkPerformanceMeasurement) storeWorkerPods() error {
   257  	pods, err := npm.k8sClient.CoreV1().Pods(netperfNamespace).List(context.TODO(), metav1.ListOptions{})
   258  	if err != nil {
   259  		return err
   260  	}
   261  	for _, pod := range pods.Items {
   262  		if pod.Status.PodIP == "" {
   263  			klog.Errorf("IP address not found for %s", pod.Name)
   264  			continue
   265  		}
   266  		npm.addPodWorker(newWorkerPodData(pod.Name, pod.Spec.NodeName, pod.Status.PodIP))
   267  	}
   268  	return nil
   269  }
   270  
   271  func (npm *networkPerformanceMeasurement) gather() (measurement.Summary, error) {
   272  	npm.waitForMetricsFromPods()
   273  
   274  	close(npm.stopCh)
   275  	npm.cleanupCluster()
   276  
   277  	resultSummary := npm.createResultSummary()
   278  	content, err := util.PrettyPrintJSON(&measurementutil.PerfData{
   279  		Version:   "v1",
   280  		DataItems: resultSummary.DataItems,
   281  	})
   282  	if err != nil {
   283  		klog.Infof("Failed to print metrics: %v", err)
   284  	}
   285  	summaryName := fmt.Sprintf(npm.String() + "_" + resultSummary.podRatio + "_" + resultSummary.protocol + "_" + resultSummary.service)
   286  	return measurement.CreateSummary(summaryName, "json", content), nil
   287  }
   288  
   289  // Dispose disposes resources,objects after the measurement.
   290  func (npm *networkPerformanceMeasurement) Dispose() {
   291  	klog.Infof("Stopping network performance measurement...")
   292  	npm.cleanupCluster()
   293  }
   294  
   295  func (npm *networkPerformanceMeasurement) handleUpdateNetworkTestEvents(newObj interface{}) {
   296  	newRuntimeObj, ok := newObj.(runtime.Object)
   297  	if newObj != nil && !ok {
   298  		klog.Errorf("Unexpected object type: %v", newObj)
   299  		return
   300  	}
   301  	// TODO(#1757): Stop relying on unstructured data.
   302  	resourceContent, err := runtime.DefaultUnstructuredConverter.ToUnstructured(newRuntimeObj)
   303  	if err != nil {
   304  		klog.Errorf("Failed to convert to unstructured: %v", newObj)
   305  		return
   306  	}
   307  	var metricResp metricResponse
   308  	metricMap, ok := resourceContent["status"].(map[string]interface{})
   309  	if !ok {
   310  		klog.Errorf("Object doesn't have Status field: %v", newObj)
   311  		return
   312  	}
   313  	if err = getMetricResponse(metricMap, &metricResp); err != nil {
   314  		klog.Errorf("Failed to get metrics response: %v", err)
   315  	}
   316  	if len(metricResp.Metrics) == 0 && metricResp.Error != "" {
   317  		klog.Errorf("Metrics error: %v", metricResp.Error)
   318  	}
   319  	npm.metricLock.Lock()
   320  	defer npm.metricLock.Unlock()
   321  	npm.metricVal = append(npm.metricVal, metricResp.Metrics)
   322  }
   323  
   324  func (npm *networkPerformanceMeasurement) execNToMTest() {
   325  	podPairList := npm.podInfo.formUniquePodPair()
   326  	npm.createCustomResourcePerUniquePodPair(podPairList)
   327  }
   328  
   329  func (npm *networkPerformanceMeasurement) createCustomResourcePerUniquePodPair(uniquePodPairList []podPair) {
   330  	currTime := time.Now()
   331  	npm.startTimeStampForTestExecution = currTime.Add(initialDelayForTestExecution).Unix()
   332  	for pairIndex, podPair := range uniquePodPairList {
   333  		templateMapping := npm.populateTemplate(podPair, pairIndex)
   334  		if err := npm.framework.ApplyTemplatedManifests(manifestsFS, networkTestRequestFilePath, templateMapping); err != nil {
   335  			klog.Error(err)
   336  		}
   337  	}
   338  }
   339  
   340  func (npm *networkPerformanceMeasurement) addPodWorker(data *workerPodData) {
   341  	if _, ok := npm.podInfo.workerPodMap[data.workerNode]; !ok {
   342  		npm.podInfo.workerPodMap[data.workerNode] = []workerPodData{}
   343  	}
   344  	npm.podInfo.workerPodMap[data.workerNode] = append(npm.podInfo.workerPodMap[data.workerNode], *data)
   345  }
   346  
   347  func (npm *networkPerformanceMeasurement) calculateMetricDataValue(dataElem *measurementutil.DataItem, metricIndex int) {
   348  	var aggregatePodPairMetrics []float64
   349  	var metricResponse []float64
   350  
   351  	npm.metricLock.Lock()
   352  	defer npm.metricLock.Unlock()
   353  
   354  	switch npm.podRatioType {
   355  	case oneToOne:
   356  		for _, metricResponse = range npm.metricVal {
   357  			if len(metricResponse) > 0 {
   358  				dataElem.Data[value] = metricResponse[metricIndex]
   359  			} else {
   360  				dataElem.Data[value] = 0
   361  			}
   362  		}
   363  		klog.Info("Metric value: ", dataElem.Data[value])
   364  	case manyToMany:
   365  		for _, metricResponse = range npm.metricVal {
   366  			if len(metricResponse) > 0 {
   367  				// Sometimes iperf gives negative values for latency. As short-term fix
   368  				// we are considering them as zero.
   369  				if metricIndex == udpLatencyAverage && metricResponse[metricIndex] < 0 {
   370  					aggregatePodPairMetrics = append(aggregatePodPairMetrics, 0)
   371  					continue
   372  				}
   373  				aggregatePodPairMetrics = append(aggregatePodPairMetrics, metricResponse[metricIndex])
   374  			}
   375  		}
   376  		percentile := getPercentile(aggregatePodPairMetrics)
   377  		dataElem.Data[perc05] = percentile.percentile05
   378  		dataElem.Data[perc50] = percentile.percentile50
   379  		dataElem.Data[perc95] = percentile.percentile95
   380  		klog.Info("Aggregate Metric value: ", aggregatePodPairMetrics)
   381  	}
   382  }
   383  
   384  func (npm *networkPerformanceMeasurement) createResultSummary() testResultSummary {
   385  	var resultSummary testResultSummary
   386  	switch npm.protocol {
   387  	case protocolTCP:
   388  		npm.getMetricData(&resultSummary, tcpBandwidth, throughput)
   389  		resultSummary.protocol = protocolTCP
   390  	case protocolUDP:
   391  		npm.getMetricData(&resultSummary, udpPacketPerSecond, packetPerSecond)
   392  		npm.getMetricData(&resultSummary, udpJitter, jitter)
   393  		npm.getMetricData(&resultSummary, udpLatencyAverage, latency)
   394  		npm.getMetricData(&resultSummary, udpLostPacketsPercentage, lostPackets)
   395  		resultSummary.protocol = protocolUDP
   396  	case protocolHTTP:
   397  		npm.getMetricData(&resultSummary, httpResponseTime, responseTime)
   398  		resultSummary.protocol = protocolHTTP
   399  	}
   400  	resultSummary.service = "P2P"
   401  	resultSummary.podRatio = npm.podRatioType
   402  	return resultSummary
   403  }
   404  
   405  func (npm *networkPerformanceMeasurement) getMetricData(data *testResultSummary, metricIndex int, metricName string) {
   406  	var metricDataItem measurementutil.DataItem
   407  	metricDataItem.Data = make(map[string]float64)
   408  	metricDataItem.Labels = make(map[string]string)
   409  	metricDataItem.Labels["Metric"] = metricName
   410  	npm.calculateMetricDataValue(&metricDataItem, metricIndex)
   411  	metricDataItem.Unit = metricUnitMap[metricDataItem.Labels["Metric"]]
   412  	data.DataItems = append(data.DataItems, metricDataItem)
   413  	klog.V(3).Infof("TestResultSummary: %v", data)
   414  }
   415  
   416  func (npm *networkPerformanceMeasurement) waitForMetricsFromPods() {
   417  	bufferTime := time.Duration(math.Max(10, float64(npm.numberOfClients)*0.1)) * time.Second
   418  	timeout := initialDelayForTestExecution + npm.testDuration + bufferTime
   419  	interval := 2 * time.Second
   420  	if err := wait.Poll(interval, timeout, npm.checkResponseReceivedFromPods); err != nil {
   421  		// TODO: Log pod names from which response is not received.
   422  		klog.Errorf("Failed to receive response from %v pods", npm.numberOfClients-len(npm.metricVal))
   423  	}
   424  }
   425  
   426  func (npm *networkPerformanceMeasurement) checkResponseReceivedFromPods() (bool, error) {
   427  	npm.metricLock.Lock()
   428  	defer npm.metricLock.Unlock()
   429  	return len(npm.metricVal) == npm.numberOfClients, nil
   430  }