k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/test/e2e_node/resource_usage_test.go

//go:build linux
// +build linux

/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2enode

import (
	"context"
	"fmt"
	"strings"
	"time"

	clientset "k8s.io/client-go/kubernetes"
	kubeletstatsv1alpha1 "k8s.io/kubelet/pkg/apis/stats/v1alpha1"
	"k8s.io/kubernetes/test/e2e/framework"
	e2ekubelet "k8s.io/kubernetes/test/e2e/framework/kubelet"
	e2eperf "k8s.io/kubernetes/test/e2e/framework/perf"
	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
	imageutils "k8s.io/kubernetes/test/utils/image"
	admissionapi "k8s.io/pod-security-admission/api"

	"github.com/onsi/ginkgo/v2"
)

var _ = SIGDescribe("Resource-usage", framework.WithSerial(), framework.WithSlow(), func() {
	const (
		// Interval to poll /stats/container on a node
		containerStatsPollingPeriod = 10 * time.Second
	)

	var (
		rc *ResourceCollector
		om *e2ekubelet.RuntimeOperationMonitor
	)

	f := framework.NewDefaultFramework("resource-usage")
	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged

	ginkgo.BeforeEach(func(ctx context.Context) {
		om = e2ekubelet.NewRuntimeOperationMonitor(ctx, f.ClientSet)
		// The test collects resource usage from a standalone cAdvisor pod.
		// The cAdvisor embedded in the kubelet has a housekeeping interval of 10s,
		// which is too long to show resource usage spikes, and shortening that
		// interval would increase the kubelet's overhead. Hence we use a
		// standalone cAdvisor pod.
		e2epod.NewPodClient(f).CreateSync(ctx, getCadvisorPod())
		rc = NewResourceCollector(containerStatsPollingPeriod)
	})

	ginkgo.AfterEach(func(ctx context.Context) {
		result := om.GetLatestRuntimeOperationErrorRate(ctx)
		framework.Logf("runtime operation error metrics:\n%s", e2ekubelet.FormatRuntimeOperationErrorRate(result))
	})

	// This test measures and verifies that the node's steady-state resource usage stays within limits.
	// It collects data from a standalone cAdvisor pod with a housekeeping interval of 1s.
	// It verifies CPU usage percentiles and the latest memory usage.
	ginkgo.Context("regular resource usage tracking", func() {
		rTests := []resourceTest{
			{
				podsNr: 10,
				cpuLimits: e2ekubelet.ContainersCPUSummary{
					kubeletstatsv1alpha1.SystemContainerKubelet: {0.50: 0.30, 0.95: 0.35},
					kubeletstatsv1alpha1.SystemContainerRuntime: {0.50: 0.30, 0.95: 0.40},
				},
				memLimits: e2ekubelet.ResourceUsagePerContainer{
					kubeletstatsv1alpha1.SystemContainerKubelet: &e2ekubelet.ContainerResourceUsage{MemoryRSSInBytes: 200 * 1024 * 1024},
					kubeletstatsv1alpha1.SystemContainerRuntime: &e2ekubelet.ContainerResourceUsage{MemoryRSSInBytes: 400 * 1024 * 1024},
				},
			},
		}

		for _, testArg := range rTests {
			itArg := testArg
			desc := fmt.Sprintf("resource tracking for %d pods per node", itArg.podsNr)
			ginkgo.It(desc, func(ctx context.Context) {
				testInfo := getTestNodeInfo(f, itArg.getTestName(), desc)

				runResourceUsageTest(ctx, f, rc, itArg)

				// Log and verify resource usage
				logAndVerifyResource(ctx, f, rc, itArg.cpuLimits, itArg.memLimits, testInfo, true)
			})
		}
	})

	ginkgo.Context("regular resource usage tracking", func() {
		rTests := []resourceTest{
			{
				podsNr: 0,
			},
			{
				podsNr: 10,
			},
			{
				podsNr: 35,
			},
			{
				podsNr: 90,
			},
		}

		for _, testArg := range rTests {
			itArg := testArg
			desc := fmt.Sprintf("resource tracking for %d pods per node [Benchmark]", itArg.podsNr)
			ginkgo.It(desc, func(ctx context.Context) {
				testInfo := getTestNodeInfo(f, itArg.getTestName(), desc)

				runResourceUsageTest(ctx, f, rc, itArg)

				// Log resource usage only; benchmark runs do not verify against limits.
				logAndVerifyResource(ctx, f, rc, itArg.cpuLimits, itArg.memLimits, testInfo, false)
			})
		}
	})
})

type resourceTest struct {
	podsNr    int
	cpuLimits e2ekubelet.ContainersCPUSummary
	memLimits e2ekubelet.ResourceUsagePerContainer
}

func (rt *resourceTest) getTestName() string {
	return fmt.Sprintf("resource_%d", rt.podsNr)
}

// runResourceUsageTest creates a batch of pause pods and monitors node resource usage
// for monitoringTime, logging the running pods every reportingPeriod.
func runResourceUsageTest(ctx context.Context, f *framework.Framework, rc *ResourceCollector, testArg resourceTest) {
	const (
		// The monitoring time for one test
		monitoringTime = 10 * time.Minute
		// The periodic reporting period
		reportingPeriod = 5 * time.Minute
		// Sleep for this interval after creating pods so we measure steady-state data.
		sleepAfterCreatePods = 10 * time.Second
	)
	pods := newTestPods(testArg.podsNr, true, imageutils.GetPauseImageName(), "test_pod")

	rc.Start()
	// Explicitly delete pods to prevent the namespace controller cleanup from timing out.
	ginkgo.DeferCleanup(deletePodsSync, f, append(pods, getCadvisorPod()))
	ginkgo.DeferCleanup(rc.Stop)

	ginkgo.By("Creating a batch of Pods")
	e2epod.NewPodClient(f).CreateBatch(ctx, pods)

	// Wait for a while to let the node reach a steady state.
	time.Sleep(sleepAfterCreatePods)

	// Log once and flush the stats.
	rc.LogLatest()
	rc.Reset()

	ginkgo.By("Start monitoring resource usage")
	// Periodically dump the CPU summary until the deadline is met.
	// Note that without calling e2ekubelet.ResourceMonitor.Reset(), the stats
	// would occupy increasingly more memory. This should be fine
	// for the current test duration, but we should reclaim the
	// entries if we plan to monitor longer (e.g., 8 hours).
	deadline := time.Now().Add(monitoringTime)
	for time.Now().Before(deadline) && ctx.Err() == nil {
		timeLeft := time.Until(deadline)
		framework.Logf("Still running...%v left", timeLeft)
		if timeLeft < reportingPeriod {
			time.Sleep(timeLeft)
		} else {
			time.Sleep(reportingPeriod)
		}
		logPods(ctx, f.ClientSet)
	}

	ginkgo.By("Reporting overall resource usage")
	logPods(ctx, f.ClientSet)
}

// logAndVerifyResource prints the resource usage as perf data and verifies whether resource usage satisfies the limit.
func logAndVerifyResource(ctx context.Context, f *framework.Framework, rc *ResourceCollector, cpuLimits e2ekubelet.ContainersCPUSummary,
	memLimits e2ekubelet.ResourceUsagePerContainer, testInfo map[string]string, isVerify bool) {
	nodeName := framework.TestContext.NodeName

	// Obtain memory PerfData
	usagePerContainer, err := rc.GetLatest()
	framework.ExpectNoError(err)
	framework.Logf("%s", formatResourceUsageStats(usagePerContainer))

	usagePerNode := make(e2ekubelet.ResourceUsagePerNode)
	usagePerNode[nodeName] = usagePerContainer

	// Obtain CPU PerfData
	cpuSummary := rc.GetCPUSummary()
	framework.Logf("%s", formatCPUSummary(cpuSummary))

	cpuSummaryPerNode := make(e2ekubelet.NodesCPUSummary)
	cpuSummaryPerNode[nodeName] = cpuSummary

	// Print resource usage
	logPerfData(e2eperf.ResourceUsageToPerfDataWithLabels(usagePerNode, testInfo), "memory")
	logPerfData(e2eperf.CPUUsageToPerfDataWithLabels(cpuSummaryPerNode, testInfo), "cpu")

	// Verify resource usage
	if isVerify {
		verifyMemoryLimits(ctx, f.ClientSet, memLimits, usagePerNode)
		verifyCPULimits(cpuLimits, cpuSummaryPerNode)
	}
}

// verifyMemoryLimits checks that the latest RSS of each listed system container stays below its
// expected limit, and dumps kubelet heap stats for any node that exceeds a limit.
func verifyMemoryLimits(ctx context.Context, c clientset.Interface, expected e2ekubelet.ResourceUsagePerContainer, actual e2ekubelet.ResourceUsagePerNode) {
	if expected == nil {
		return
	}
	var errList []string
	for nodeName, nodeSummary := range actual {
		var nodeErrs []string
		for cName, expectedResult := range expected {
			container, ok := nodeSummary[cName]
			if !ok {
				nodeErrs = append(nodeErrs, fmt.Sprintf("container %q: missing", cName))
				continue
			}

			expectedValue := expectedResult.MemoryRSSInBytes
			actualValue := container.MemoryRSSInBytes
			if expectedValue != 0 && actualValue > expectedValue {
				nodeErrs = append(nodeErrs, fmt.Sprintf("container %q: expected RSS memory (bytes) < %d; got %d",
					cName, expectedValue, actualValue))
			}
		}
		if len(nodeErrs) > 0 {
			errList = append(errList, fmt.Sprintf("node %v:\n %s", nodeName, strings.Join(nodeErrs, ", ")))
			heapStats, err := e2ekubelet.GetKubeletHeapStats(ctx, c, nodeName)
			if err != nil {
				framework.Logf("Unable to get heap stats from %q", nodeName)
			} else {
				framework.Logf("Heap stats on %q:\n%v", nodeName, heapStats)
			}
		}
	}
	if len(errList) > 0 {
		framework.Failf("Memory usage exceeding limits:\n %s", strings.Join(errList, "\n"))
	}
}

// verifyCPULimits checks that the observed CPU usage of each listed system container stays below
// its expected limit at every expected percentile.
func verifyCPULimits(expected e2ekubelet.ContainersCPUSummary, actual e2ekubelet.NodesCPUSummary) {
	if expected == nil {
		return
	}
	var errList []string
	for nodeName, perNodeSummary := range actual {
		var nodeErrs []string
		for cName, expectedResult := range expected {
			perContainerSummary, ok := perNodeSummary[cName]
			if !ok {
				nodeErrs = append(nodeErrs, fmt.Sprintf("container %q: missing", cName))
				continue
			}
			for p, expectedValue := range expectedResult {
				actualValue, ok := perContainerSummary[p]
				if !ok {
					nodeErrs = append(nodeErrs, fmt.Sprintf("container %q: missing percentile %v", cName, p))
					continue
				}
				if actualValue > expectedValue {
					nodeErrs = append(nodeErrs, fmt.Sprintf("container %q: expected %.0fth%% usage < %.3f; got %.3f",
						cName, p*100, expectedValue, actualValue))
				}
			}
		}
		if len(nodeErrs) > 0 {
			errList = append(errList, fmt.Sprintf("node %v:\n %s", nodeName, strings.Join(nodeErrs, ", ")))
		}
	}
	if len(errList) > 0 {
		framework.Failf("CPU usage exceeding limits:\n %s", strings.Join(errList, "\n"))
	}
}

// logPods logs how many pods the kubelet reports as running on this node.
func logPods(ctx context.Context, c clientset.Interface) {
	nodeName := framework.TestContext.NodeName
	podList, err := e2ekubelet.GetKubeletRunningPods(ctx, c, nodeName)
	if err != nil {
		framework.Logf("Unable to retrieve kubelet pods for node %v", nodeName)
		return
	}
	framework.Logf("%d pods are running on node %v", len(podList.Items), nodeName)
}
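
// The sketch below is illustrative only and is not referenced by the tests above.
// It shows, in isolation, how the percentile comparison used by verifyCPULimits
// works: limits and observations are both e2ekubelet.ContainersCPUSummary values
// (container name -> percentile -> CPU usage in cores), and any observed value
// above its limit produces an error string. The function name and the concrete
// numbers are hypothetical, chosen only for demonstration.
func examplePercentileCheck() []string {
	limits := e2ekubelet.ContainersCPUSummary{
		kubeletstatsv1alpha1.SystemContainerKubelet: {0.50: 0.30, 0.95: 0.35},
	}
	observed := e2ekubelet.ContainersCPUSummary{
		// 50th percentile within the limit, 95th percentile above it.
		kubeletstatsv1alpha1.SystemContainerKubelet: {0.50: 0.28, 0.95: 0.41},
	}

	var violations []string
	for cName, perPercentileLimit := range limits {
		for p, limit := range perPercentileLimit {
			actual, ok := observed[cName][p]
			if !ok {
				violations = append(violations, fmt.Sprintf("container %q: missing percentile %v", cName, p))
				continue
			}
			if actual > limit {
				violations = append(violations, fmt.Sprintf("container %q: expected %.0fth%% usage < %.3f; got %.3f",
					cName, p*100, limit, actual))
			}
		}
	}
	return violations
}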