k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/measurement/common/network/network_performance_measurement.go (about) 1 /* 2 Copyright 2021 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 // Package network captures network performance metrics 18 // for protocol TCP,UDP,HTTP etc. The metrics are collected for baseline (1:1), 19 // scale (N:M) pod ratios.Client and server pods located on different worker 20 // nodes exchange traffic for specified time to measure the performance metrics. 21 package network 22 23 import ( 24 "context" 25 "embed" 26 "fmt" 27 "math" 28 "sync" 29 "time" 30 31 "k8s.io/apimachinery/pkg/api/meta" 32 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 33 "k8s.io/apimachinery/pkg/runtime" 34 "k8s.io/apimachinery/pkg/runtime/schema" 35 "k8s.io/apimachinery/pkg/util/wait" 36 "k8s.io/client-go/dynamic" 37 "k8s.io/client-go/kubernetes" 38 "k8s.io/client-go/tools/cache" 39 "k8s.io/klog/v2" 40 "k8s.io/perf-tests/clusterloader2/pkg/framework" 41 "k8s.io/perf-tests/clusterloader2/pkg/framework/client" 42 "k8s.io/perf-tests/clusterloader2/pkg/measurement" 43 measurementutil "k8s.io/perf-tests/clusterloader2/pkg/measurement/util" 44 "k8s.io/perf-tests/clusterloader2/pkg/util" 45 ) 46 47 const ( 48 podReadyTimeout = 5 * time.Minute 49 // initialDelay in seconds after which pod starts sending traffic. 50 // Delay is used to synchronize all client pods to send traffic at same time. 51 initialDelayForTestExecution = 15 * time.Second 52 // networkPerformanceMetricsName indicates the measurement name 53 networkPerformanceMetricsName = "NetworkPerformanceMetrics" 54 netperfNamespace = "netperf" 55 ) 56 57 const ( 58 workerPodDeploymentManifestFilePath = "manifests/*deployment.yaml" 59 networkTestRequestFilePath = "manifests/networktestrequests.yaml" 60 crdManifestFilePath = "manifests/*CustomResourceDefinition.yaml" 61 clusterRoleBindingFilePath = "manifests/roleBinding.yaml" 62 customResourceDefinitionName = "networktestrequests.clusterloader.io" 63 rbacName = "networktestrequests-rbac" 64 ) 65 66 var ( 67 crdGvk = schema.GroupVersionKind{ 68 Group: "apiextensions.k8s.io", 69 Kind: "CustomResourceDefinition", 70 Version: "v1", 71 } 72 73 rbacGvk = schema.GroupVersionKind{ 74 Group: "rbac.authorization.k8s.io", 75 Kind: "ClusterRoleBinding", 76 Version: "v1", 77 } 78 79 crGvk = schema.GroupVersionKind{ 80 Group: "clusterloader.io", 81 Kind: "NetworkTestRequest", 82 Version: "v1alpha1", 83 } 84 85 //go:embed manifests 86 manifestsFS embed.FS 87 ) 88 89 func init() { 90 klog.Info("Registering Network Performance Measurement") 91 if err := measurement.Register(networkPerformanceMetricsName, createNetworkPerformanceMeasurement); err != nil { 92 klog.Fatalf("Cannot register %s: %v", networkPerformanceMetricsName, err) 93 } 94 } 95 96 func createNetworkPerformanceMeasurement() measurement.Measurement { 97 return &networkPerformanceMeasurement{} 98 } 99 100 type networkPerformanceMeasurement struct { 101 k8sClient kubernetes.Interface 102 dynamicClient dynamic.Interface 103 resourceInterface dynamic.ResourceInterface 104 framework *framework.Framework 105 106 numberOfServers int 107 numberOfClients int 108 podRatioType string 109 testDuration time.Duration 110 protocol string 111 112 // workerPodInfo stores list of podData for every worker node. 113 podInfo workerPodInfo 114 metricLock sync.Mutex 115 // metricVal stores MetricResponse received from worker node. 116 metricVal [][]float64 117 stopCh chan struct{} 118 // startTimeStampForTestExecution is a futureTimeStamp sent to all pods to start 119 // sending the traffic at the same time. 120 startTimeStampForTestExecution int64 121 } 122 123 func (npm *networkPerformanceMeasurement) Execute(config *measurement.Config) ([]measurement.Summary, error) { 124 action, err := util.GetString(config.Params, "action") 125 if err != nil { 126 return nil, err 127 } 128 switch action { 129 case "start": 130 if err = npm.validate(config); err != nil { 131 return nil, err 132 } 133 return nil, npm.start(config) 134 case "gather": 135 summary, err := npm.gather() 136 if err != nil { 137 return nil, err 138 } 139 return []measurement.Summary{summary}, err 140 default: 141 return nil, fmt.Errorf("unknown action: %v", action) 142 } 143 } 144 145 func (npm *networkPerformanceMeasurement) start(config *measurement.Config) error { 146 npm.initialize(config) 147 if err := npm.prepareCluster(); err != nil { 148 return err 149 } 150 if err := npm.createAndWaitForWorkerPods(); err != nil { 151 return err 152 } 153 if err := npm.storeWorkerPods(); err != nil { 154 return err 155 } 156 if err := npm.initializeInformer(); err != nil { 157 return err 158 } 159 160 switch npm.podRatioType { 161 case oneToOne, manyToMany: 162 npm.execNToMTest() 163 default: 164 return fmt.Errorf("invalid Pod Ratio: %v", npm.podRatioType) 165 } 166 return nil 167 } 168 169 func (npm *networkPerformanceMeasurement) initialize(config *measurement.Config) { 170 npm.k8sClient = config.ClusterFramework.GetClientSets().GetClient() 171 npm.framework = config.ClusterFramework 172 npm.dynamicClient = config.ClusterFramework.GetDynamicClients().GetClient() 173 npm.podInfo.workerPodMap = make(map[string]podList) 174 } 175 176 func (npm *networkPerformanceMeasurement) initializeInformer() error { 177 gvr, _ := meta.UnsafeGuessKindToResource(crGvk) 178 npm.resourceInterface = npm.dynamicClient.Resource(gvr).Namespace(netperfNamespace) 179 informer, err := getInformer(netperfNamespace, npm.dynamicClient, gvr) 180 if err != nil { 181 return fmt.Errorf("error getting informer:%s", err) 182 } 183 informer.AddEventHandler(cache.ResourceEventHandlerFuncs{ 184 UpdateFunc: func(oldObj interface{}, newObj interface{}) { 185 npm.handleUpdateNetworkTestEvents(newObj) 186 }, 187 }) 188 npm.stopCh = make(chan struct{}) 189 go informer.Run(npm.stopCh) 190 return nil 191 } 192 193 func (npm *networkPerformanceMeasurement) prepareCluster() error { 194 if err := client.CreateNamespace(npm.k8sClient, netperfNamespace); err != nil { 195 return fmt.Errorf("error while creating namespace: %v", err) 196 } 197 if err := npm.framework.ApplyTemplatedManifests(manifestsFS, clusterRoleBindingFilePath, nil); err != nil { 198 return fmt.Errorf("error while creating clusterRoleBinding: %v", err) 199 } 200 if err := npm.framework.ApplyTemplatedManifests(manifestsFS, crdManifestFilePath, nil); err != nil { 201 return fmt.Errorf("error while creating CRD: %v", err) 202 } 203 return nil 204 } 205 206 func (npm *networkPerformanceMeasurement) cleanupCluster() { 207 if npm.framework == nil { 208 klog.V(1).Infof("Network measurement %s wasn't started, skipping the Dispose() step", npm) 209 return 210 } 211 if err := npm.framework.DeleteObject(crdGvk, "", customResourceDefinitionName); err != nil { 212 klog.Errorf("Failed to deleted CRD: %v", err) 213 } 214 if err := npm.framework.DeleteObject(rbacGvk, "", rbacName); err != nil { 215 klog.Errorf("Failed to delete clusterRoleBinding: %v", err) 216 } 217 if err := client.DeleteNamespace(npm.k8sClient, netperfNamespace); err != nil { 218 klog.Errorf("Failed to delete namespace: %v", err) 219 } 220 if err := client.WaitForDeleteNamespace(npm.k8sClient, netperfNamespace, client.DefaultNamespaceDeletionTimeout); err != nil { 221 klog.Errorf("Waiting for namespace deletion failed: %v", err) 222 } 223 } 224 225 func (npm *networkPerformanceMeasurement) createAndWaitForWorkerPods() error { 226 // Create worker pods 227 var replicas = map[string]interface{}{"Replicas": npm.numberOfClients + npm.numberOfServers} 228 if err := npm.framework.ApplyTemplatedManifests(manifestsFS, workerPodDeploymentManifestFilePath, replicas); err != nil { 229 return fmt.Errorf("failed to create worked pods: %v ", err) 230 } 231 // Wait for all worker pods to be ready 232 ctx, cancel := context.WithTimeout(context.TODO(), podReadyTimeout) 233 defer cancel() 234 selector := &util.ObjectSelector{Namespace: netperfNamespace} 235 options := &measurementutil.WaitForPodOptions{ 236 DesiredPodCount: func() int { return npm.numberOfClients + npm.numberOfServers }, 237 CallerName: networkPerformanceMetricsName, 238 WaitForPodsInterval: 2 * time.Second, 239 } 240 podStore, err := measurementutil.NewPodStore(npm.k8sClient, selector) 241 if err != nil { 242 return err 243 } 244 _, err = measurementutil.WaitForPods(ctx, podStore, options) 245 return err 246 } 247 248 func (*networkPerformanceMeasurement) String() string { 249 return networkPerformanceMetricsName 250 } 251 252 func newWorkerPodData(podName, nodeName, podIP string) *workerPodData { 253 return &workerPodData{podName: podName, workerNode: nodeName, podIP: podIP} 254 } 255 256 func (npm *networkPerformanceMeasurement) storeWorkerPods() error { 257 pods, err := npm.k8sClient.CoreV1().Pods(netperfNamespace).List(context.TODO(), metav1.ListOptions{}) 258 if err != nil { 259 return err 260 } 261 for _, pod := range pods.Items { 262 if pod.Status.PodIP == "" { 263 klog.Errorf("IP address not found for %s", pod.Name) 264 continue 265 } 266 npm.addPodWorker(newWorkerPodData(pod.Name, pod.Spec.NodeName, pod.Status.PodIP)) 267 } 268 return nil 269 } 270 271 func (npm *networkPerformanceMeasurement) gather() (measurement.Summary, error) { 272 npm.waitForMetricsFromPods() 273 274 close(npm.stopCh) 275 npm.cleanupCluster() 276 277 resultSummary := npm.createResultSummary() 278 content, err := util.PrettyPrintJSON(&measurementutil.PerfData{ 279 Version: "v1", 280 DataItems: resultSummary.DataItems, 281 }) 282 if err != nil { 283 klog.Infof("Failed to print metrics: %v", err) 284 } 285 summaryName := fmt.Sprintf(npm.String() + "_" + resultSummary.podRatio + "_" + resultSummary.protocol + "_" + resultSummary.service) 286 return measurement.CreateSummary(summaryName, "json", content), nil 287 } 288 289 // Dispose disposes resources,objects after the measurement. 290 func (npm *networkPerformanceMeasurement) Dispose() { 291 klog.Infof("Stopping network performance measurement...") 292 npm.cleanupCluster() 293 } 294 295 func (npm *networkPerformanceMeasurement) handleUpdateNetworkTestEvents(newObj interface{}) { 296 newRuntimeObj, ok := newObj.(runtime.Object) 297 if newObj != nil && !ok { 298 klog.Errorf("Unexpected object type: %v", newObj) 299 return 300 } 301 // TODO(#1757): Stop relying on unstructured data. 302 resourceContent, err := runtime.DefaultUnstructuredConverter.ToUnstructured(newRuntimeObj) 303 if err != nil { 304 klog.Errorf("Failed to convert to unstructured: %v", newObj) 305 return 306 } 307 var metricResp metricResponse 308 metricMap, ok := resourceContent["status"].(map[string]interface{}) 309 if !ok { 310 klog.Errorf("Object doesn't have Status field: %v", newObj) 311 return 312 } 313 if err = getMetricResponse(metricMap, &metricResp); err != nil { 314 klog.Errorf("Failed to get metrics response: %v", err) 315 } 316 if len(metricResp.Metrics) == 0 && metricResp.Error != "" { 317 klog.Errorf("Metrics error: %v", metricResp.Error) 318 } 319 npm.metricLock.Lock() 320 defer npm.metricLock.Unlock() 321 npm.metricVal = append(npm.metricVal, metricResp.Metrics) 322 } 323 324 func (npm *networkPerformanceMeasurement) execNToMTest() { 325 podPairList := npm.podInfo.formUniquePodPair() 326 npm.createCustomResourcePerUniquePodPair(podPairList) 327 } 328 329 func (npm *networkPerformanceMeasurement) createCustomResourcePerUniquePodPair(uniquePodPairList []podPair) { 330 currTime := time.Now() 331 npm.startTimeStampForTestExecution = currTime.Add(initialDelayForTestExecution).Unix() 332 for pairIndex, podPair := range uniquePodPairList { 333 templateMapping := npm.populateTemplate(podPair, pairIndex) 334 if err := npm.framework.ApplyTemplatedManifests(manifestsFS, networkTestRequestFilePath, templateMapping); err != nil { 335 klog.Error(err) 336 } 337 } 338 } 339 340 func (npm *networkPerformanceMeasurement) addPodWorker(data *workerPodData) { 341 if _, ok := npm.podInfo.workerPodMap[data.workerNode]; !ok { 342 npm.podInfo.workerPodMap[data.workerNode] = []workerPodData{} 343 } 344 npm.podInfo.workerPodMap[data.workerNode] = append(npm.podInfo.workerPodMap[data.workerNode], *data) 345 } 346 347 func (npm *networkPerformanceMeasurement) calculateMetricDataValue(dataElem *measurementutil.DataItem, metricIndex int) { 348 var aggregatePodPairMetrics []float64 349 var metricResponse []float64 350 351 npm.metricLock.Lock() 352 defer npm.metricLock.Unlock() 353 354 switch npm.podRatioType { 355 case oneToOne: 356 for _, metricResponse = range npm.metricVal { 357 if len(metricResponse) > 0 { 358 dataElem.Data[value] = metricResponse[metricIndex] 359 } else { 360 dataElem.Data[value] = 0 361 } 362 } 363 klog.Info("Metric value: ", dataElem.Data[value]) 364 case manyToMany: 365 for _, metricResponse = range npm.metricVal { 366 if len(metricResponse) > 0 { 367 // Sometimes iperf gives negative values for latency. As short-term fix 368 // we are considering them as zero. 369 if metricIndex == udpLatencyAverage && metricResponse[metricIndex] < 0 { 370 aggregatePodPairMetrics = append(aggregatePodPairMetrics, 0) 371 continue 372 } 373 aggregatePodPairMetrics = append(aggregatePodPairMetrics, metricResponse[metricIndex]) 374 } 375 } 376 percentile := getPercentile(aggregatePodPairMetrics) 377 dataElem.Data[perc05] = percentile.percentile05 378 dataElem.Data[perc50] = percentile.percentile50 379 dataElem.Data[perc95] = percentile.percentile95 380 klog.Info("Aggregate Metric value: ", aggregatePodPairMetrics) 381 } 382 } 383 384 func (npm *networkPerformanceMeasurement) createResultSummary() testResultSummary { 385 var resultSummary testResultSummary 386 switch npm.protocol { 387 case protocolTCP: 388 npm.getMetricData(&resultSummary, tcpBandwidth, throughput) 389 resultSummary.protocol = protocolTCP 390 case protocolUDP: 391 npm.getMetricData(&resultSummary, udpPacketPerSecond, packetPerSecond) 392 npm.getMetricData(&resultSummary, udpJitter, jitter) 393 npm.getMetricData(&resultSummary, udpLatencyAverage, latency) 394 npm.getMetricData(&resultSummary, udpLostPacketsPercentage, lostPackets) 395 resultSummary.protocol = protocolUDP 396 case protocolHTTP: 397 npm.getMetricData(&resultSummary, httpResponseTime, responseTime) 398 resultSummary.protocol = protocolHTTP 399 } 400 resultSummary.service = "P2P" 401 resultSummary.podRatio = npm.podRatioType 402 return resultSummary 403 } 404 405 func (npm *networkPerformanceMeasurement) getMetricData(data *testResultSummary, metricIndex int, metricName string) { 406 var metricDataItem measurementutil.DataItem 407 metricDataItem.Data = make(map[string]float64) 408 metricDataItem.Labels = make(map[string]string) 409 metricDataItem.Labels["Metric"] = metricName 410 npm.calculateMetricDataValue(&metricDataItem, metricIndex) 411 metricDataItem.Unit = metricUnitMap[metricDataItem.Labels["Metric"]] 412 data.DataItems = append(data.DataItems, metricDataItem) 413 klog.V(3).Infof("TestResultSummary: %v", data) 414 } 415 416 func (npm *networkPerformanceMeasurement) waitForMetricsFromPods() { 417 bufferTime := time.Duration(math.Max(10, float64(npm.numberOfClients)*0.1)) * time.Second 418 timeout := initialDelayForTestExecution + npm.testDuration + bufferTime 419 interval := 2 * time.Second 420 if err := wait.Poll(interval, timeout, npm.checkResponseReceivedFromPods); err != nil { 421 // TODO: Log pod names from which response is not received. 422 klog.Errorf("Failed to receive response from %v pods", npm.numberOfClients-len(npm.metricVal)) 423 } 424 } 425 426 func (npm *networkPerformanceMeasurement) checkResponseReceivedFromPods() (bool, error) { 427 npm.metricLock.Lock() 428 defer npm.metricLock.Unlock() 429 return len(npm.metricVal) == npm.numberOfClients, nil 430 }