k8s.io/kubernetes@v1.29.3/test/e2e/framework/kubelet/stats.go (about) 1 /* 2 Copyright 2019 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package kubelet 18 19 import ( 20 "bytes" 21 "context" 22 "encoding/json" 23 "fmt" 24 "sort" 25 "strings" 26 "sync" 27 "text/tabwriter" 28 "time" 29 30 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 31 utilerrors "k8s.io/apimachinery/pkg/util/errors" 32 "k8s.io/apimachinery/pkg/util/wait" 33 34 clientset "k8s.io/client-go/kubernetes" 35 restclient "k8s.io/client-go/rest" 36 kubeletstatsv1alpha1 "k8s.io/kubelet/pkg/apis/stats/v1alpha1" 37 "k8s.io/kubernetes/test/e2e/framework" 38 e2emetrics "k8s.io/kubernetes/test/e2e/framework/metrics" 39 ) 40 41 const ( 42 // timeout for proxy requests. 43 proxyTimeout = 2 * time.Minute 44 45 // dockerOperationsKey is the key for docker operation metrics. 46 // copied from k8s.io/kubernetes/pkg/kubelet/dockershim/metrics 47 dockerOperationsKey = "docker_operations_total" 48 49 // dockerOperationsErrorsKey is the key for the operation error metrics. 50 // copied from k8s.io/kubernetes/pkg/kubelet/dockershim/metrics 51 dockerOperationsErrorsKey = "docker_operations_errors_total" 52 53 // dockerOperationsTimeoutKey is the key for the operation timeout metrics. 54 // copied from k8s.io/kubernetes/pkg/kubelet/dockershim/metrics 55 dockerOperationsTimeoutKey = "docker_operations_timeout_total" 56 ) 57 58 // ContainerResourceUsage is a structure for gathering container resource usage. 59 type ContainerResourceUsage struct { 60 Name string 61 Timestamp time.Time 62 CPUUsageInCores float64 63 MemoryUsageInBytes uint64 64 MemoryWorkingSetInBytes uint64 65 MemoryRSSInBytes uint64 66 // The interval used to calculate CPUUsageInCores. 67 CPUInterval time.Duration 68 } 69 70 // ResourceUsagePerContainer is map of ContainerResourceUsage 71 type ResourceUsagePerContainer map[string]*ContainerResourceUsage 72 73 // ResourceUsagePerNode is map of ResourceUsagePerContainer. 74 type ResourceUsagePerNode map[string]ResourceUsagePerContainer 75 76 // ContainersCPUSummary is indexed by the container name with each entry a 77 // (percentile, value) map. 78 type ContainersCPUSummary map[string]map[float64]float64 79 80 // NodesCPUSummary is indexed by the node name with each entry a 81 // ContainersCPUSummary map. 82 type NodesCPUSummary map[string]ContainersCPUSummary 83 84 // RuntimeOperationMonitor is the tool getting and parsing docker operation metrics. 85 type RuntimeOperationMonitor struct { 86 client clientset.Interface 87 nodesRuntimeOps map[string]NodeRuntimeOperationErrorRate 88 } 89 90 // NodeRuntimeOperationErrorRate is the runtime operation error rate on one node. 91 type NodeRuntimeOperationErrorRate map[string]*RuntimeOperationErrorRate 92 93 // RuntimeOperationErrorRate is the error rate of a specified runtime operation. 94 type RuntimeOperationErrorRate struct { 95 TotalNumber float64 96 ErrorRate float64 97 TimeoutRate float64 98 } 99 100 // ProxyRequest performs a get on a node proxy endpoint given the nodename and rest client. 101 func ProxyRequest(ctx context.Context, c clientset.Interface, node, endpoint string, port int) (restclient.Result, error) { 102 // proxy tends to hang in some cases when Node is not ready. Add an artificial timeout for this call. #22165 103 var result restclient.Result 104 finished := make(chan struct{}, 1) 105 go func() { 106 result = c.CoreV1().RESTClient().Get(). 107 Resource("nodes"). 108 SubResource("proxy"). 109 Name(fmt.Sprintf("%v:%v", node, port)). 110 Suffix(endpoint). 111 Do(ctx) 112 113 finished <- struct{}{} 114 }() 115 select { 116 case <-finished: 117 return result, nil 118 case <-time.After(proxyTimeout): 119 return restclient.Result{}, nil 120 } 121 } 122 123 // NewRuntimeOperationMonitor returns a new RuntimeOperationMonitor. 124 func NewRuntimeOperationMonitor(ctx context.Context, c clientset.Interface) *RuntimeOperationMonitor { 125 m := &RuntimeOperationMonitor{ 126 client: c, 127 nodesRuntimeOps: make(map[string]NodeRuntimeOperationErrorRate), 128 } 129 nodes, err := m.client.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) 130 if err != nil { 131 framework.Failf("RuntimeOperationMonitor: unable to get list of nodes: %v", err) 132 } 133 for _, node := range nodes.Items { 134 m.nodesRuntimeOps[node.Name] = make(NodeRuntimeOperationErrorRate) 135 } 136 // Initialize the runtime operation error rate 137 m.GetRuntimeOperationErrorRate(ctx) 138 return m 139 } 140 141 // GetRuntimeOperationErrorRate gets runtime operation records from kubelet metrics and calculate 142 // error rates of all runtime operations. 143 func (m *RuntimeOperationMonitor) GetRuntimeOperationErrorRate(ctx context.Context) map[string]NodeRuntimeOperationErrorRate { 144 for node := range m.nodesRuntimeOps { 145 nodeResult, err := getNodeRuntimeOperationErrorRate(ctx, m.client, node) 146 if err != nil { 147 framework.Logf("GetRuntimeOperationErrorRate: unable to get kubelet metrics from node %q: %v", node, err) 148 continue 149 } 150 m.nodesRuntimeOps[node] = nodeResult 151 } 152 return m.nodesRuntimeOps 153 } 154 155 // GetLatestRuntimeOperationErrorRate gets latest error rate and timeout rate from last observed RuntimeOperationErrorRate. 156 func (m *RuntimeOperationMonitor) GetLatestRuntimeOperationErrorRate(ctx context.Context) map[string]NodeRuntimeOperationErrorRate { 157 result := make(map[string]NodeRuntimeOperationErrorRate) 158 for node := range m.nodesRuntimeOps { 159 result[node] = make(NodeRuntimeOperationErrorRate) 160 oldNodeResult := m.nodesRuntimeOps[node] 161 curNodeResult, err := getNodeRuntimeOperationErrorRate(ctx, m.client, node) 162 if err != nil { 163 framework.Logf("GetLatestRuntimeOperationErrorRate: unable to get kubelet metrics from node %q: %v", node, err) 164 continue 165 } 166 for op, cur := range curNodeResult { 167 t := *cur 168 if old, found := oldNodeResult[op]; found { 169 t.ErrorRate = (t.ErrorRate*t.TotalNumber - old.ErrorRate*old.TotalNumber) / (t.TotalNumber - old.TotalNumber) 170 t.TimeoutRate = (t.TimeoutRate*t.TotalNumber - old.TimeoutRate*old.TotalNumber) / (t.TotalNumber - old.TotalNumber) 171 t.TotalNumber -= old.TotalNumber 172 } 173 result[node][op] = &t 174 } 175 m.nodesRuntimeOps[node] = curNodeResult 176 } 177 return result 178 } 179 180 // FormatRuntimeOperationErrorRate formats the runtime operation error rate to string. 181 func FormatRuntimeOperationErrorRate(nodesResult map[string]NodeRuntimeOperationErrorRate) string { 182 lines := []string{} 183 for node, nodeResult := range nodesResult { 184 lines = append(lines, fmt.Sprintf("node %q runtime operation error rate:", node)) 185 for op, result := range nodeResult { 186 line := fmt.Sprintf("operation %q: total - %.0f; error rate - %f; timeout rate - %f", op, 187 result.TotalNumber, result.ErrorRate, result.TimeoutRate) 188 lines = append(lines, line) 189 } 190 lines = append(lines, fmt.Sprintln()) 191 } 192 return strings.Join(lines, "\n") 193 } 194 195 // getNodeRuntimeOperationErrorRate gets runtime operation error rate from specified node. 196 func getNodeRuntimeOperationErrorRate(ctx context.Context, c clientset.Interface, node string) (NodeRuntimeOperationErrorRate, error) { 197 result := make(NodeRuntimeOperationErrorRate) 198 ms, err := e2emetrics.GetKubeletMetrics(ctx, c, node) 199 if err != nil { 200 return result, err 201 } 202 // If no corresponding metrics are found, the returned samples will be empty. Then the following 203 // loop will be skipped automatically. 204 allOps := ms[dockerOperationsKey] 205 errOps := ms[dockerOperationsErrorsKey] 206 timeoutOps := ms[dockerOperationsTimeoutKey] 207 for _, sample := range allOps { 208 operation := string(sample.Metric["operation_type"]) 209 result[operation] = &RuntimeOperationErrorRate{TotalNumber: float64(sample.Value)} 210 } 211 for _, sample := range errOps { 212 operation := string(sample.Metric["operation_type"]) 213 // Should always find the corresponding item, just in case 214 if _, found := result[operation]; found { 215 result[operation].ErrorRate = float64(sample.Value) / result[operation].TotalNumber 216 } 217 } 218 for _, sample := range timeoutOps { 219 operation := string(sample.Metric["operation_type"]) 220 if _, found := result[operation]; found { 221 result[operation].TimeoutRate = float64(sample.Value) / result[operation].TotalNumber 222 } 223 } 224 return result, nil 225 } 226 227 // GetStatsSummary contacts kubelet for the container information. 228 func GetStatsSummary(ctx context.Context, c clientset.Interface, nodeName string) (*kubeletstatsv1alpha1.Summary, error) { 229 ctx, cancel := context.WithTimeout(ctx, framework.SingleCallTimeout) 230 defer cancel() 231 232 data, err := c.CoreV1().RESTClient().Get(). 233 Resource("nodes"). 234 SubResource("proxy"). 235 Name(fmt.Sprintf("%v:%v", nodeName, framework.KubeletPort)). 236 Suffix("stats/summary"). 237 Do(ctx).Raw() 238 239 if err != nil { 240 return nil, err 241 } 242 243 summary := kubeletstatsv1alpha1.Summary{} 244 err = json.Unmarshal(data, &summary) 245 if err != nil { 246 return nil, err 247 } 248 return &summary, nil 249 } 250 251 func getNodeStatsSummary(ctx context.Context, c clientset.Interface, nodeName string) (*kubeletstatsv1alpha1.Summary, error) { 252 data, err := c.CoreV1().RESTClient().Get(). 253 Resource("nodes"). 254 SubResource("proxy"). 255 Name(fmt.Sprintf("%v:%v", nodeName, framework.KubeletPort)). 256 Suffix("stats/summary"). 257 SetHeader("Content-Type", "application/json"). 258 Do(ctx).Raw() 259 260 if err != nil { 261 return nil, err 262 } 263 264 var summary *kubeletstatsv1alpha1.Summary 265 err = json.Unmarshal(data, &summary) 266 if err != nil { 267 return nil, err 268 } 269 return summary, nil 270 } 271 272 func getSystemContainerStats(summary *kubeletstatsv1alpha1.Summary) map[string]*kubeletstatsv1alpha1.ContainerStats { 273 statsList := summary.Node.SystemContainers 274 statsMap := make(map[string]*kubeletstatsv1alpha1.ContainerStats) 275 for i := range statsList { 276 statsMap[statsList[i].Name] = &statsList[i] 277 } 278 279 // Create a root container stats using information available in 280 // stats.NodeStats. This is necessary since it is a different type. 281 statsMap[rootContainerName] = &kubeletstatsv1alpha1.ContainerStats{ 282 CPU: summary.Node.CPU, 283 Memory: summary.Node.Memory, 284 } 285 return statsMap 286 } 287 288 const ( 289 rootContainerName = "/" 290 ) 291 292 // TargetContainers returns a list of containers for which we want to collect resource usage. 293 func TargetContainers() []string { 294 return []string{ 295 rootContainerName, 296 kubeletstatsv1alpha1.SystemContainerRuntime, 297 kubeletstatsv1alpha1.SystemContainerKubelet, 298 } 299 } 300 301 func formatResourceUsageStats(nodeName string, containerStats ResourceUsagePerContainer) string { 302 // Example output: 303 // 304 // Resource usage for node "e2e-test-foo-node-abcde": 305 // container cpu(cores) memory(MB) 306 // "/" 0.363 2942.09 307 // "/docker-daemon" 0.088 521.80 308 // "/kubelet" 0.086 424.37 309 // "/system" 0.007 119.88 310 buf := &bytes.Buffer{} 311 w := tabwriter.NewWriter(buf, 1, 0, 1, ' ', 0) 312 fmt.Fprintf(w, "container\tcpu(cores)\tmemory_working_set(MB)\tmemory_rss(MB)\n") 313 for name, s := range containerStats { 314 fmt.Fprintf(w, "%q\t%.3f\t%.2f\t%.2f\n", name, s.CPUUsageInCores, float64(s.MemoryWorkingSetInBytes)/(1024*1024), float64(s.MemoryRSSInBytes)/(1024*1024)) 315 } 316 w.Flush() 317 return fmt.Sprintf("Resource usage on node %q:\n%s", nodeName, buf.String()) 318 } 319 320 // GetKubeletHeapStats returns stats of kubelet heap. 321 func GetKubeletHeapStats(ctx context.Context, c clientset.Interface, nodeName string) (string, error) { 322 client, err := ProxyRequest(ctx, c, nodeName, "debug/pprof/heap", framework.KubeletPort) 323 if err != nil { 324 return "", err 325 } 326 raw, errRaw := client.Raw() 327 if errRaw != nil { 328 return "", err 329 } 330 kubeletstatsv1alpha1 := string(raw) 331 // Only dumping the runtime.MemStats numbers to avoid polluting the log. 332 numLines := 23 333 lines := strings.Split(kubeletstatsv1alpha1, "\n") 334 return strings.Join(lines[len(lines)-numLines:], "\n"), nil 335 } 336 337 func computeContainerResourceUsage(name string, oldStats, newStats *kubeletstatsv1alpha1.ContainerStats) *ContainerResourceUsage { 338 return &ContainerResourceUsage{ 339 Name: name, 340 Timestamp: newStats.CPU.Time.Time, 341 CPUUsageInCores: float64(*newStats.CPU.UsageCoreNanoSeconds-*oldStats.CPU.UsageCoreNanoSeconds) / float64(newStats.CPU.Time.Time.Sub(oldStats.CPU.Time.Time).Nanoseconds()), 342 MemoryUsageInBytes: *newStats.Memory.UsageBytes, 343 MemoryWorkingSetInBytes: *newStats.Memory.WorkingSetBytes, 344 MemoryRSSInBytes: *newStats.Memory.RSSBytes, 345 CPUInterval: newStats.CPU.Time.Time.Sub(oldStats.CPU.Time.Time), 346 } 347 } 348 349 // resourceCollector periodically polls the node, collect stats for a given 350 // list of containers, computes and cache resource usage up to 351 // maxEntriesPerContainer for each container. 352 type resourceCollector struct { 353 lock sync.RWMutex 354 node string 355 containers []string 356 client clientset.Interface 357 buffers map[string][]*ContainerResourceUsage 358 pollingInterval time.Duration 359 stop func() 360 } 361 362 func newResourceCollector(c clientset.Interface, nodeName string, containerNames []string, pollingInterval time.Duration) *resourceCollector { 363 buffers := make(map[string][]*ContainerResourceUsage) 364 return &resourceCollector{ 365 node: nodeName, 366 containers: containerNames, 367 client: c, 368 buffers: buffers, 369 pollingInterval: pollingInterval, 370 } 371 } 372 373 // Start starts a goroutine to Poll the node every pollingInterval. 374 func (r *resourceCollector) Start(ctx context.Context) { 375 ctx, cancel := context.WithCancel(ctx) 376 r.stop = cancel 377 // Keep the last observed stats for comparison. 378 oldStats := make(map[string]*kubeletstatsv1alpha1.ContainerStats) 379 go wait.UntilWithContext(ctx, func(ctx context.Context) { r.collectStats(ctx, oldStats) }, r.pollingInterval) 380 } 381 382 // Stop sends a signal to terminate the stats collecting goroutine. 383 func (r *resourceCollector) Stop() { 384 r.stop() 385 } 386 387 // collectStats gets the latest stats from kubelet stats summary API, computes 388 // the resource usage, and pushes it to the buffer. 389 func (r *resourceCollector) collectStats(ctx context.Context, oldStatsMap map[string]*kubeletstatsv1alpha1.ContainerStats) { 390 summary, err := getNodeStatsSummary(ctx, r.client, r.node) 391 if err != nil { 392 framework.Logf("Error getting node stats summary on %q, err: %v", r.node, err) 393 return 394 } 395 cStatsMap := getSystemContainerStats(summary) 396 r.lock.Lock() 397 defer r.lock.Unlock() 398 for _, name := range r.containers { 399 cStats, ok := cStatsMap[name] 400 if !ok { 401 framework.Logf("Missing info/stats for container %q on node %q", name, r.node) 402 return 403 } 404 405 if oldStats, ok := oldStatsMap[name]; ok { 406 if oldStats.CPU == nil || cStats.CPU == nil || oldStats.Memory == nil || cStats.Memory == nil { 407 continue 408 } 409 if oldStats.CPU.Time.Equal(&cStats.CPU.Time) { 410 // No change -> skip this stat. 411 continue 412 } 413 r.buffers[name] = append(r.buffers[name], computeContainerResourceUsage(name, oldStats, cStats)) 414 } 415 // Update the old stats. 416 oldStatsMap[name] = cStats 417 } 418 } 419 420 func (r *resourceCollector) GetLatest() (ResourceUsagePerContainer, error) { 421 r.lock.RLock() 422 defer r.lock.RUnlock() 423 kubeletstatsv1alpha1 := make(ResourceUsagePerContainer) 424 for _, name := range r.containers { 425 contStats, ok := r.buffers[name] 426 if !ok || len(contStats) == 0 { 427 return nil, fmt.Errorf("Resource usage on node %q is not ready yet", r.node) 428 } 429 kubeletstatsv1alpha1[name] = contStats[len(contStats)-1] 430 } 431 return kubeletstatsv1alpha1, nil 432 } 433 434 // Reset frees the stats and start over. 435 func (r *resourceCollector) Reset() { 436 r.lock.Lock() 437 defer r.lock.Unlock() 438 for _, name := range r.containers { 439 r.buffers[name] = []*ContainerResourceUsage{} 440 } 441 } 442 443 type resourceUsageByCPU []*ContainerResourceUsage 444 445 func (r resourceUsageByCPU) Len() int { return len(r) } 446 func (r resourceUsageByCPU) Swap(i, j int) { r[i], r[j] = r[j], r[i] } 447 func (r resourceUsageByCPU) Less(i, j int) bool { return r[i].CPUUsageInCores < r[j].CPUUsageInCores } 448 449 // The percentiles to report. 450 var percentiles = [...]float64{0.05, 0.20, 0.50, 0.70, 0.90, 0.95, 0.99} 451 452 // GetBasicCPUStats returns the percentiles the cpu usage in cores for 453 // containerName. This method examines all data currently in the buffer. 454 func (r *resourceCollector) GetBasicCPUStats(containerName string) map[float64]float64 { 455 r.lock.RLock() 456 defer r.lock.RUnlock() 457 result := make(map[float64]float64, len(percentiles)) 458 usages := r.buffers[containerName] 459 sort.Sort(resourceUsageByCPU(usages)) 460 for _, q := range percentiles { 461 index := int(float64(len(usages))*q) - 1 462 if index < 0 { 463 // We don't have enough data. 464 result[q] = 0 465 continue 466 } 467 result[q] = usages[index].CPUUsageInCores 468 } 469 return result 470 } 471 472 // ResourceMonitor manages a resourceCollector per node. 473 type ResourceMonitor struct { 474 client clientset.Interface 475 containers []string 476 pollingInterval time.Duration 477 collectors map[string]*resourceCollector 478 } 479 480 // NewResourceMonitor returns a new ResourceMonitor. 481 func NewResourceMonitor(c clientset.Interface, containerNames []string, pollingInterval time.Duration) *ResourceMonitor { 482 return &ResourceMonitor{ 483 containers: containerNames, 484 client: c, 485 pollingInterval: pollingInterval, 486 } 487 } 488 489 // Start starts collectors. 490 func (r *ResourceMonitor) Start(ctx context.Context) { 491 // It should be OK to monitor unschedulable Nodes 492 nodes, err := r.client.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) 493 if err != nil { 494 framework.Failf("ResourceMonitor: unable to get list of nodes: %v", err) 495 } 496 r.collectors = make(map[string]*resourceCollector, 0) 497 for _, node := range nodes.Items { 498 collector := newResourceCollector(r.client, node.Name, r.containers, r.pollingInterval) 499 r.collectors[node.Name] = collector 500 collector.Start(ctx) 501 } 502 } 503 504 // Stop stops collectors. 505 func (r *ResourceMonitor) Stop() { 506 for _, collector := range r.collectors { 507 collector.Stop() 508 } 509 } 510 511 // Reset resets collectors. 512 func (r *ResourceMonitor) Reset() { 513 for _, collector := range r.collectors { 514 collector.Reset() 515 } 516 } 517 518 // LogLatest outputs the latest resource usage into log. 519 func (r *ResourceMonitor) LogLatest() { 520 summary, err := r.GetLatest() 521 if err != nil { 522 framework.Logf("%v", err) 523 } 524 framework.Logf("%s", r.FormatResourceUsage(summary)) 525 } 526 527 // FormatResourceUsage returns the formatted string for LogLatest(). 528 // TODO(oomichi): This can be made to local function after making test/e2e/node/kubelet_perf.go use LogLatest directly instead. 529 func (r *ResourceMonitor) FormatResourceUsage(s ResourceUsagePerNode) string { 530 summary := []string{} 531 for node, usage := range s { 532 summary = append(summary, formatResourceUsageStats(node, usage)) 533 } 534 return strings.Join(summary, "\n") 535 } 536 537 // GetLatest returns the latest resource usage. 538 func (r *ResourceMonitor) GetLatest() (ResourceUsagePerNode, error) { 539 result := make(ResourceUsagePerNode) 540 errs := []error{} 541 for key, collector := range r.collectors { 542 s, err := collector.GetLatest() 543 if err != nil { 544 errs = append(errs, err) 545 continue 546 } 547 result[key] = s 548 } 549 return result, utilerrors.NewAggregate(errs) 550 } 551 552 // GetMasterNodeLatest returns the latest resource usage of master and node. 553 func (r *ResourceMonitor) GetMasterNodeLatest(usagePerNode ResourceUsagePerNode) ResourceUsagePerNode { 554 result := make(ResourceUsagePerNode) 555 var masterUsage ResourceUsagePerContainer 556 var nodesUsage []ResourceUsagePerContainer 557 for node, usage := range usagePerNode { 558 if strings.HasSuffix(node, "master") { 559 masterUsage = usage 560 } else { 561 nodesUsage = append(nodesUsage, usage) 562 } 563 } 564 nodeAvgUsage := make(ResourceUsagePerContainer) 565 for _, nodeUsage := range nodesUsage { 566 for c, usage := range nodeUsage { 567 if _, found := nodeAvgUsage[c]; !found { 568 nodeAvgUsage[c] = &ContainerResourceUsage{Name: usage.Name} 569 } 570 nodeAvgUsage[c].CPUUsageInCores += usage.CPUUsageInCores 571 nodeAvgUsage[c].MemoryUsageInBytes += usage.MemoryUsageInBytes 572 nodeAvgUsage[c].MemoryWorkingSetInBytes += usage.MemoryWorkingSetInBytes 573 nodeAvgUsage[c].MemoryRSSInBytes += usage.MemoryRSSInBytes 574 } 575 } 576 for c := range nodeAvgUsage { 577 nodeAvgUsage[c].CPUUsageInCores /= float64(len(nodesUsage)) 578 nodeAvgUsage[c].MemoryUsageInBytes /= uint64(len(nodesUsage)) 579 nodeAvgUsage[c].MemoryWorkingSetInBytes /= uint64(len(nodesUsage)) 580 nodeAvgUsage[c].MemoryRSSInBytes /= uint64(len(nodesUsage)) 581 } 582 result["master"] = masterUsage 583 result["node"] = nodeAvgUsage 584 return result 585 } 586 587 // FormatCPUSummary returns the string of human-readable CPU summary from the specified summary data. 588 func (r *ResourceMonitor) FormatCPUSummary(summary NodesCPUSummary) string { 589 // Example output for a node (the percentiles may differ): 590 // CPU usage of containers on node "e2e-test-foo-node-0vj7": 591 // container 5th% 50th% 90th% 95th% 592 // "/" 0.051 0.159 0.387 0.455 593 // "/runtime 0.000 0.000 0.146 0.166 594 // "/kubelet" 0.036 0.053 0.091 0.154 595 // "/misc" 0.001 0.001 0.001 0.002 596 var summaryStrings []string 597 var header []string 598 header = append(header, "container") 599 for _, p := range percentiles { 600 header = append(header, fmt.Sprintf("%.0fth%%", p*100)) 601 } 602 for nodeName, containers := range summary { 603 buf := &bytes.Buffer{} 604 w := tabwriter.NewWriter(buf, 1, 0, 1, ' ', 0) 605 fmt.Fprintf(w, "%s\n", strings.Join(header, "\t")) 606 for _, containerName := range TargetContainers() { 607 var s []string 608 s = append(s, fmt.Sprintf("%q", containerName)) 609 data, ok := containers[containerName] 610 for _, p := range percentiles { 611 value := "N/A" 612 if ok { 613 value = fmt.Sprintf("%.3f", data[p]) 614 } 615 s = append(s, value) 616 } 617 fmt.Fprintf(w, "%s\n", strings.Join(s, "\t")) 618 } 619 w.Flush() 620 summaryStrings = append(summaryStrings, fmt.Sprintf("CPU usage of containers on node %q\n:%s", nodeName, buf.String())) 621 } 622 return strings.Join(summaryStrings, "\n") 623 } 624 625 // LogCPUSummary outputs summary of CPU into log. 626 func (r *ResourceMonitor) LogCPUSummary() { 627 summary := r.GetCPUSummary() 628 framework.Logf("%s", r.FormatCPUSummary(summary)) 629 } 630 631 // GetCPUSummary returns summary of CPU. 632 func (r *ResourceMonitor) GetCPUSummary() NodesCPUSummary { 633 result := make(NodesCPUSummary) 634 for nodeName, collector := range r.collectors { 635 result[nodeName] = make(ContainersCPUSummary) 636 for _, containerName := range TargetContainers() { 637 data := collector.GetBasicCPUStats(containerName) 638 result[nodeName][containerName] = data 639 } 640 } 641 return result 642 } 643 644 // GetMasterNodeCPUSummary returns summary of master node CPUs. 645 func (r *ResourceMonitor) GetMasterNodeCPUSummary(summaryPerNode NodesCPUSummary) NodesCPUSummary { 646 result := make(NodesCPUSummary) 647 var masterSummary ContainersCPUSummary 648 var nodesSummaries []ContainersCPUSummary 649 for node, summary := range summaryPerNode { 650 if strings.HasSuffix(node, "master") { 651 masterSummary = summary 652 } else { 653 nodesSummaries = append(nodesSummaries, summary) 654 } 655 } 656 657 nodeAvgSummary := make(ContainersCPUSummary) 658 for _, nodeSummary := range nodesSummaries { 659 for c, summary := range nodeSummary { 660 if _, found := nodeAvgSummary[c]; !found { 661 nodeAvgSummary[c] = map[float64]float64{} 662 } 663 for perc, value := range summary { 664 nodeAvgSummary[c][perc] += value 665 } 666 } 667 } 668 for c := range nodeAvgSummary { 669 for perc := range nodeAvgSummary[c] { 670 nodeAvgSummary[c][perc] /= float64(len(nodesSummaries)) 671 } 672 } 673 result["master"] = masterSummary 674 result["node"] = nodeAvgSummary 675 return result 676 }