k8s.io/kubernetes@v1.29.3/test/e2e_node/podresources_test.go

/*
Copyright 2020 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2enode

import (
	"context"
	"errors"
	"fmt"
	"os"
	"strings"
	"time"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	kubeletdevicepluginv1beta1 "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
	kubeletpodresourcesv1 "k8s.io/kubelet/pkg/apis/podresources/v1"
	kubefeatures "k8s.io/kubernetes/pkg/features"
	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
	apisgrpc "k8s.io/kubernetes/pkg/kubelet/apis/grpc"
	"k8s.io/kubernetes/pkg/kubelet/apis/podresources"
	"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
	"k8s.io/kubernetes/pkg/kubelet/util"
	testutils "k8s.io/kubernetes/test/utils"
	admissionapi "k8s.io/pod-security-admission/api"
	"k8s.io/utils/cpuset"

	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
	"github.com/onsi/gomega/gstruct"
	"github.com/onsi/gomega/types"
	"k8s.io/kubernetes/test/e2e/feature"
	"k8s.io/kubernetes/test/e2e/framework"
	e2emetrics "k8s.io/kubernetes/test/e2e/framework/metrics"
	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
	"k8s.io/kubernetes/test/e2e/nodefeature"
)

const (
	defaultTopologyUnawareResourceName = "example.com/resource"
)

type podDesc struct {
	podName        string
	cntName        string
	resourceName   string
	resourceAmount int
	cpuRequest     int // cpuRequest is in millicores
}

func (desc podDesc) CpuRequestQty() resource.Quantity {
	qty := resource.NewMilliQuantity(int64(desc.cpuRequest), resource.DecimalSI)
	return *qty
}

func (desc podDesc) CpuRequestExclusive() int {
	if (desc.cpuRequest % 1000) != 0 {
		// exclusive cpus are requested only if the quantity is integral;
		// hence, explicitly rule out non-integral requests
		return 0
	}
	return desc.cpuRequest / 1000
}

func (desc podDesc) RequiresCPU() bool {
	return desc.cpuRequest > 0
}

func (desc podDesc) RequiresDevices() bool {
	return desc.resourceName != "" && desc.resourceAmount > 0
}

func makePodResourcesTestPod(desc podDesc) *v1.Pod {
	cnt := v1.Container{
		Name:  desc.cntName,
		Image: busyboxImage,
		Resources: v1.ResourceRequirements{
			Requests: v1.ResourceList{},
			Limits:   v1.ResourceList{},
		},
		Command: []string{"sh", "-c", "sleep 1d"},
	}
	if desc.RequiresCPU() {
		cpuRequestQty := desc.CpuRequestQty()
		cnt.Resources.Requests[v1.ResourceCPU] = cpuRequestQty
		cnt.Resources.Limits[v1.ResourceCPU] = cpuRequestQty
		// we don't really care about the memory amount; we only need the pod to be in the Guaranteed QoS class
		cnt.Resources.Requests[v1.ResourceMemory] = resource.MustParse("100Mi")
		cnt.Resources.Limits[v1.ResourceMemory] = resource.MustParse("100Mi")
	}
	if desc.RequiresDevices() {
		cnt.Resources.Requests[v1.ResourceName(desc.resourceName)] = resource.MustParse(fmt.Sprintf("%d", desc.resourceAmount))
		cnt.Resources.Limits[v1.ResourceName(desc.resourceName)] = resource.MustParse(fmt.Sprintf("%d", desc.resourceAmount))
	}
	return &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name: desc.podName,
		},
		Spec: v1.PodSpec{
			RestartPolicy: v1.RestartPolicyNever,
			Containers: []v1.Container{
				cnt,
			},
		},
	}
}

func logPodResources(podIdx int, pr *kubeletpodresourcesv1.PodResources) {
	ns := pr.GetNamespace()
	cnts := pr.GetContainers()
	if len(cnts) == 0 {
		framework.Logf("#%02d/%02d/%02d - %s/%s/%s No containers", podIdx, 0, 0, ns, pr.GetName(), "_")
		return
	}

	for cntIdx, cnt := range cnts {
		if len(cnt.Devices) == 0 {
			framework.Logf("#%02d/%02d/%02d - %s/%s/%s cpus -> %v resources -> none", podIdx, cntIdx, 0, ns, pr.GetName(), cnt.Name, cnt.CpuIds)
			continue
		}

		for devIdx, dev := range cnt.Devices {
			framework.Logf("#%02d/%02d/%02d - %s/%s/%s cpus -> %v %s -> %s", podIdx, cntIdx, devIdx, ns, pr.GetName(), cnt.Name, cnt.CpuIds, dev.ResourceName, strings.Join(dev.DeviceIds, ", "))
		}
	}
}

type podResMap map[string]map[string]kubeletpodresourcesv1.ContainerResources

func convertToMap(podsResources []*kubeletpodresourcesv1.PodResources) podResMap {
	res := make(map[string]map[string]kubeletpodresourcesv1.ContainerResources)
	for idx, podResource := range podsResources {
		// to make troubleshooting easier
		logPodResources(idx, podResource)

		cnts := make(map[string]kubeletpodresourcesv1.ContainerResources)
		for _, cnt := range podResource.GetContainers() {
			cnts[cnt.GetName()] = *cnt
		}
		res[podResource.GetName()] = cnts
	}
	return res
}

func getPodResourcesValues(ctx context.Context, cli kubeletpodresourcesv1.PodResourcesListerClient) (podResMap, error) {
	resp, err := cli.List(ctx, &kubeletpodresourcesv1.ListPodResourcesRequest{})
	if err != nil {
		return nil, err
	}
	return convertToMap(resp.GetPodResources()), nil
}

type testPodData struct {
	PodMap map[string]*v1.Pod
}

func newTestPodData() *testPodData {
	return &testPodData{
		PodMap: make(map[string]*v1.Pod),
	}
}

func (tpd *testPodData) createPodsForTest(ctx context.Context, f *framework.Framework, podReqs []podDesc) {
	for _, podReq := range podReqs {
		pod := makePodResourcesTestPod(podReq)
		pod = e2epod.NewPodClient(f).CreateSync(ctx, pod)

		framework.Logf("created pod %s", podReq.podName)
		tpd.PodMap[podReq.podName] = pod
	}
}

/* deletePodsForTest cleans up all the pods run for a testcase. Must ensure proper cleanup */
func (tpd *testPodData) deletePodsForTest(ctx context.Context, f *framework.Framework) {
	deletePodsAsync(ctx, f, tpd.PodMap)
}

/* deletePod removes a pod during a test. Should do a best-effort clean up */
func (tpd *testPodData) deletePod(ctx context.Context, f *framework.Framework, podName string) {
	_, ok := tpd.PodMap[podName]
	if !ok {
		return
	}
	deletePodSyncByName(ctx, f, podName)
	delete(tpd.PodMap, podName)
}

func findContainerDeviceByName(devs []*kubeletpodresourcesv1.ContainerDevices, resourceName string) *kubeletpodresourcesv1.ContainerDevices {
	for _, dev := range devs {
		if dev.ResourceName == resourceName {
			return dev
		}
	}
	return nil
}

func matchPodDescWithResources(expected []podDesc, found podResMap) error {
	for _, podReq := range expected {
		framework.Logf("matching: %#v", podReq)

		podInfo, ok := found[podReq.podName]
		if !ok {
			return fmt.Errorf("no pod resources for pod %q", podReq.podName)
		}
		cntInfo, ok := podInfo[podReq.cntName]
		if !ok {
			return fmt.Errorf("no container resources for pod %q container %q", podReq.podName, podReq.cntName)
		}
		if podReq.RequiresCPU() {
			if exclusiveCpus := podReq.CpuRequestExclusive(); exclusiveCpus != len(cntInfo.CpuIds) {
				if exclusiveCpus == 0 {
					return fmt.Errorf("pod %q container %q requested %d millicores, expected CPUs from the shared pool, got %v", podReq.podName, podReq.cntName, podReq.cpuRequest, cntInfo.CpuIds)
				}
				return fmt.Errorf("pod %q container %q expected %d exclusive cpus, got %v", podReq.podName, podReq.cntName, exclusiveCpus, cntInfo.CpuIds)
			}
		}
		if podReq.RequiresDevices() {
			dev := findContainerDeviceByName(cntInfo.GetDevices(), podReq.resourceName)
			if dev == nil {
				return fmt.Errorf("pod %q container %q expected data for resource %q not found", podReq.podName, podReq.cntName, podReq.resourceName)
			}
			if len(dev.DeviceIds) != podReq.resourceAmount {
				return fmt.Errorf("pod %q container %q resource %q expected %d items, got %v", podReq.podName, podReq.cntName, podReq.resourceName, podReq.resourceAmount, dev.DeviceIds)
			}
		} else {
			devs := cntInfo.GetDevices()
			if len(devs) > 0 {
				return fmt.Errorf("pod %q container %q expected no resources, got %v", podReq.podName, podReq.cntName, devs)
			}
		}
		if cnts, ok := found[defaultTopologyUnawareResourceName]; ok {
			for _, cnt := range cnts {
				for _, cd := range cnt.GetDevices() {
					if cd.ResourceName != defaultTopologyUnawareResourceName {
						continue
					}
					if cd.Topology != nil {
						// we expect nil topology for topology-unaware resources
						return fmt.Errorf("expected nil topology for topology-unaware resource %q", defaultTopologyUnawareResourceName)
					}
				}
			}
		}
	}
	return nil
}

func expectPodResources(ctx context.Context, offset int, cli kubeletpodresourcesv1.PodResourcesListerClient, expected []podDesc) {
	gomega.EventuallyWithOffset(1+offset, ctx, func(ctx context.Context) error {
		found, err := getPodResourcesValues(ctx, cli)
		if err != nil {
			return err
		}
		return matchPodDescWithResources(expected, found)
	}, time.Minute, 10*time.Second).Should(gomega.Succeed())
}

func filterOutDesc(descs []podDesc, name string) []podDesc {
	var ret []podDesc
	for _, desc := range descs {
		if desc.podName == name {
			continue
		}
		ret = append(ret, desc)
	}
	return ret
}

func podresourcesListTests(ctx context.Context, f *framework.Framework, cli kubeletpodresourcesv1.PodResourcesListerClient, sd *sriovData) {
	var tpd *testPodData

	var found podResMap
	var expected []podDesc
	var extra podDesc

	expectedBasePods := 0 /* nothing but pods we create */
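	// When an SR-IOV device plugin is deployed (sd != nil), the plugin itself runs as a pod on the node,
	// so it also shows up in the List() response and must be counted in the baseline below.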
	if sd != nil {
		expectedBasePods = 1 // sriovdp
	}

	var err error
	ginkgo.By("checking the output when no pods are present")
	found, err = getPodResourcesValues(ctx, cli)
	framework.ExpectNoError(err, "getPodResourcesValues() failed err: %v", err)
	gomega.ExpectWithOffset(1, found).To(gomega.HaveLen(expectedBasePods), "base pod expectation mismatch")

	tpd = newTestPodData()
	ginkgo.By("checking the output when only pods which don't require resources are present")
	expected = []podDesc{
		{
			podName: "pod-00",
			cntName: "cnt-00",
		},
		{
			podName: "pod-01",
			cntName: "cnt-00",
		},
	}
	tpd.createPodsForTest(ctx, f, expected)
	expectPodResources(ctx, 1, cli, expected)
	tpd.deletePodsForTest(ctx, f)

	tpd = newTestPodData()
	ginkgo.By("checking the output when only a subset of pods require resources")
	if sd != nil {
		expected = []podDesc{
			{
				podName: "pod-00",
				cntName: "cnt-00",
			},
			{
				podName:        "pod-01",
				cntName:        "cnt-00",
				resourceName:   sd.resourceName,
				resourceAmount: 1,
				cpuRequest:     2000,
			},
			{
				podName:    "pod-02",
				cntName:    "cnt-00",
				cpuRequest: 2000,
			},
			{
				podName:        "pod-03",
				cntName:        "cnt-00",
				resourceName:   sd.resourceName,
				resourceAmount: 1,
				cpuRequest:     1000,
			},
		}
	} else {
		expected = []podDesc{
			{
				podName: "pod-00",
				cntName: "cnt-00",
			},
			{
				podName:    "pod-01",
				cntName:    "cnt-00",
				cpuRequest: 2000,
			},
			{
				podName:    "pod-02",
				cntName:    "cnt-00",
				cpuRequest: 2000,
			},
			{
				podName:    "pod-03",
				cntName:    "cnt-00",
				cpuRequest: 1000,
			},
		}
	}
	tpd.createPodsForTest(ctx, f, expected)
	expectPodResources(ctx, 1, cli, expected)
	tpd.deletePodsForTest(ctx, f)

	tpd = newTestPodData()
	ginkgo.By("checking the output when creating pods which require resources between calls")
	if sd != nil {
		expected = []podDesc{
			{
				podName: "pod-00",
				cntName: "cnt-00",
			},
			{
				podName:        "pod-01",
				cntName:        "cnt-00",
				resourceName:   sd.resourceName,
				resourceAmount: 1,
				cpuRequest:     2000,
			},
			{
				podName:    "pod-02",
				cntName:    "cnt-00",
				cpuRequest: 2000,
			},
		}
	} else {
		expected = []podDesc{
			{
				podName: "pod-00",
				cntName: "cnt-00",
			},
			{
				podName:    "pod-01",
				cntName:    "cnt-00",
				cpuRequest: 2000,
			},
			{
				podName:    "pod-02",
				cntName:    "cnt-00",
				cpuRequest: 2000,
			},
		}
	}

	tpd.createPodsForTest(ctx, f, expected)
	expectPodResources(ctx, 1, cli, expected)

	if sd != nil {
		extra = podDesc{
			podName:        "pod-03",
			cntName:        "cnt-00",
			resourceName:   sd.resourceName,
			resourceAmount: 1,
			cpuRequest:     1000,
		}
	} else {
		extra = podDesc{
			podName:    "pod-03",
			cntName:    "cnt-00",
			cpuRequest: 1000,
		}
	}

	tpd.createPodsForTest(ctx, f, []podDesc{
		extra,
	})

	expected = append(expected, extra)
	expectPodResources(ctx, 1, cli, expected)
	tpd.deletePodsForTest(ctx, f)

	tpd = newTestPodData()
	ginkgo.By("checking the output when deleting pods which require resources between calls")

	if sd != nil {
		expected = []podDesc{
			{
				podName:    "pod-00",
				cntName:    "cnt-00",
				cpuRequest: 1000,
			},
			{
				podName:        "pod-01",
				cntName:        "cnt-00",
				resourceName:   sd.resourceName,
				resourceAmount: 1,
				cpuRequest:     2000,
			},
			{
				podName: "pod-02",
				cntName: "cnt-00",
			},
			{
				podName:        "pod-03",
				cntName:        "cnt-00",
				resourceName:   sd.resourceName,
				resourceAmount: 1,
				cpuRequest:     1000,
			},
		}
	} else {
		expected = []podDesc{
			{
				podName:    "pod-00",
				cntName:    "cnt-00",
				cpuRequest: 1000,
			},
			{
				podName:    "pod-01",
				cntName:    "cnt-00",
				cpuRequest: 1000,
			},
			{
				podName: "pod-02",
				cntName: "cnt-00",
			},
			{
				podName:    "pod-03",
				cntName:    "cnt-00",
				cpuRequest: 1000,
			},
		}
	}
	tpd.createPodsForTest(ctx, f, expected)
	expectPodResources(ctx, 1, cli, expected)

	tpd.deletePod(ctx, f, "pod-01")
	expectedPostDelete := filterOutDesc(expected, "pod-01")
	expectPodResources(ctx, 1, cli, expectedPostDelete)
	tpd.deletePodsForTest(ctx, f)

	tpd = newTestPodData()
	ginkgo.By("checking the output when pods request non integral CPUs")
	if sd != nil {
		expected = []podDesc{
			{
				podName:    "pod-00",
				cntName:    "cnt-00",
				cpuRequest: 1500,
			},
			{
				podName:        "pod-01",
				cntName:        "cnt-00",
				resourceName:   sd.resourceName,
				resourceAmount: 1,
				cpuRequest:     1500,
			},
		}
	} else {
		expected = []podDesc{
			{
				podName:    "pod-00",
				cntName:    "cnt-00",
				cpuRequest: 1500,
			},
		}
	}
	tpd.createPodsForTest(ctx, f, expected)
	expectPodResources(ctx, 1, cli, expected)
	tpd.deletePodsForTest(ctx, f)
}

func podresourcesGetAllocatableResourcesTests(ctx context.Context, cli kubeletpodresourcesv1.PodResourcesListerClient, sd *sriovData, onlineCPUs, reservedSystemCPUs cpuset.CPUSet) {
	ginkgo.By("checking the devices known to the kubelet")
	resp, err := cli.GetAllocatableResources(ctx, &kubeletpodresourcesv1.AllocatableResourcesRequest{})
	framework.ExpectNoErrorWithOffset(1, err)
	devs := resp.GetDevices()
	var cpus []int
	for _, cpuid := range resp.GetCpuIds() {
		cpus = append(cpus, int(cpuid))
	}
	allocatableCPUs := cpuset.New(cpus...)
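
	// With the static CPU manager policy, the allocatable CPUs reported by GetAllocatableResources()
	// are expected to be the online CPUs minus the reserved system CPUs; the checks below verify
	// exactly that relationship.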
	if onlineCPUs.Size() == 0 {
		ginkgo.By("expecting no CPUs reported")
		gomega.ExpectWithOffset(1, onlineCPUs.Size()).To(gomega.Equal(reservedSystemCPUs.Size()), "with no online CPUs, no CPUs should be reserved")
	} else {
		ginkgo.By(fmt.Sprintf("expecting online CPUs reported - online=%v (%d) reserved=%v (%d)", onlineCPUs, onlineCPUs.Size(), reservedSystemCPUs, reservedSystemCPUs.Size()))
		if reservedSystemCPUs.Size() > onlineCPUs.Size() {
			ginkgo.Fail("more reserved CPUs than online")
		}
		expectedCPUs := onlineCPUs.Difference(reservedSystemCPUs)

		ginkgo.By(fmt.Sprintf("expecting CPUs '%v'='%v'", allocatableCPUs, expectedCPUs))
		gomega.ExpectWithOffset(1, allocatableCPUs.Equals(expectedCPUs)).To(gomega.BeTrue(), "mismatch expecting CPUs")
	}

	if sd == nil { // no devices in the environment, so expect no devices
		ginkgo.By("expecting no devices reported")
		gomega.ExpectWithOffset(1, devs).To(gomega.BeEmpty(), fmt.Sprintf("got unexpected devices %#v", devs))
		return
	}

	ginkgo.By(fmt.Sprintf("expecting some %q devices reported", sd.resourceName))
	gomega.ExpectWithOffset(1, devs).ToNot(gomega.BeEmpty())
	for _, dev := range devs {
		gomega.Expect(dev.ResourceName).To(gomega.Equal(sd.resourceName))
		gomega.ExpectWithOffset(1, dev.DeviceIds).ToNot(gomega.BeEmpty())
	}
}

func podresourcesGetTests(ctx context.Context, f *framework.Framework, cli kubeletpodresourcesv1.PodResourcesListerClient) {
	ginkgo.By("checking the output when no pods are present")
	expected := []podDesc{}
	resp, err := cli.Get(ctx, &kubeletpodresourcesv1.GetPodResourcesRequest{PodName: "test", PodNamespace: f.Namespace.Name})
	podResourceList := []*kubeletpodresourcesv1.PodResources{resp.GetPodResources()}
	gomega.Expect(err).To(gomega.HaveOccurred(), "Get() should fail when the pod does not exist")
	res := convertToMap(podResourceList)
	err = matchPodDescWithResources(expected, res)
	framework.ExpectNoError(err, "matchPodDescWithResources() failed err %v", err)

	tpd := newTestPodData()
	ginkgo.By("checking the output when only pods which don't require resources are present")
	expected = []podDesc{
		{
			podName: "pod-00",
			cntName: "cnt-00",
		},
	}
	tpd.createPodsForTest(ctx, f, expected)
	resp, err = cli.Get(ctx, &kubeletpodresourcesv1.GetPodResourcesRequest{PodName: "pod-00", PodNamespace: f.Namespace.Name})
	framework.ExpectNoError(err, "Get() call failed for pod %s/%s", f.Namespace.Name, "pod-00")
	podResourceList = []*kubeletpodresourcesv1.PodResources{resp.GetPodResources()}
	res = convertToMap(podResourceList)
	err = matchPodDescWithResources(expected, res)
	framework.ExpectNoError(err, "matchPodDescWithResources() failed err %v", err)
	tpd.deletePodsForTest(ctx, f)

	tpd = newTestPodData()
	ginkgo.By("checking the output when only a pod which requires CPU is present")
	expected = []podDesc{
		{
			podName:    "pod-01",
			cntName:    "cnt-00",
			cpuRequest: 2000,
		},
	}
	tpd.createPodsForTest(ctx, f, expected)
	resp, err = cli.Get(ctx, &kubeletpodresourcesv1.GetPodResourcesRequest{PodName: "pod-01", PodNamespace: f.Namespace.Name})
	framework.ExpectNoError(err, "Get() call failed for pod %s/%s", f.Namespace.Name, "pod-01")
	podResourceList = []*kubeletpodresourcesv1.PodResources{resp.GetPodResources()}
	res = convertToMap(podResourceList)
	err = matchPodDescWithResources(expected, res)
	framework.ExpectNoError(err, "matchPodDescWithResources() failed err %v", err)
"matchPodDescWithResources() failed err %v", err) 620 tpd.deletePodsForTest(ctx, f) 621 } 622 623 // Serial because the test updates kubelet configuration. 624 var _ = SIGDescribe("POD Resources", framework.WithSerial(), feature.PodResources, nodefeature.PodResources, func() { 625 f := framework.NewDefaultFramework("podresources-test") 626 f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged 627 628 reservedSystemCPUs := cpuset.New(1) 629 630 ginkgo.Context("with SRIOV devices in the system", func() { 631 ginkgo.BeforeEach(func() { 632 requireSRIOVDevices() 633 }) 634 635 ginkgo.Context("with CPU manager Static policy", func() { 636 ginkgo.BeforeEach(func(ctx context.Context) { 637 // this is a very rough check. We just want to rule out system that does NOT have enough resources 638 _, cpuAlloc, _ := getLocalNodeCPUDetails(ctx, f) 639 640 if cpuAlloc < minCoreCount { 641 e2eskipper.Skipf("Skipping CPU Manager tests since the CPU allocatable < %d", minCoreCount) 642 } 643 }) 644 645 // empty context to apply kubelet config changes 646 ginkgo.Context("", func() { 647 tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) { 648 // Set the CPU Manager policy to static. 649 initialConfig.CPUManagerPolicy = string(cpumanager.PolicyStatic) 650 651 // Set the CPU Manager reconcile period to 1 second. 652 initialConfig.CPUManagerReconcilePeriod = metav1.Duration{Duration: 1 * time.Second} 653 654 cpus := reservedSystemCPUs.String() 655 framework.Logf("configurePodResourcesInKubelet: using reservedSystemCPUs=%q", cpus) 656 initialConfig.ReservedSystemCPUs = cpus 657 }) 658 659 ginkgo.It("should return the expected responses", func(ctx context.Context) { 660 onlineCPUs, err := getOnlineCPUs() 661 framework.ExpectNoError(err, "getOnlineCPUs() failed err: %v", err) 662 663 configMap := getSRIOVDevicePluginConfigMap(framework.TestContext.SriovdpConfigMapFile) 664 sd := setupSRIOVConfigOrFail(ctx, f, configMap) 665 ginkgo.DeferCleanup(teardownSRIOVConfigOrFail, f, sd) 666 667 waitForSRIOVResources(ctx, f, sd) 668 669 endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket) 670 framework.ExpectNoError(err, "LocalEndpoint() failed err: %v", err) 671 672 cli, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize) 673 framework.ExpectNoError(err, "GetV1Client() failed err: %v", err) 674 defer conn.Close() 675 676 waitForSRIOVResources(ctx, f, sd) 677 678 ginkgo.By("checking List()") 679 podresourcesListTests(ctx, f, cli, sd) 680 ginkgo.By("checking GetAllocatableResources()") 681 podresourcesGetAllocatableResourcesTests(ctx, cli, sd, onlineCPUs, reservedSystemCPUs) 682 }) 683 }) 684 }) 685 686 ginkgo.Context("with CPU manager None policy", func() { 687 ginkgo.It("should return the expected responses", func(ctx context.Context) { 688 // current default is "none" policy - no need to restart the kubelet 689 690 requireSRIOVDevices() 691 692 configMap := getSRIOVDevicePluginConfigMap(framework.TestContext.SriovdpConfigMapFile) 693 sd := setupSRIOVConfigOrFail(ctx, f, configMap) 694 ginkgo.DeferCleanup(teardownSRIOVConfigOrFail, f, sd) 695 696 waitForSRIOVResources(ctx, f, sd) 697 698 endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket) 699 framework.ExpectNoError(err, "LocalEndpoint() failed err: %v", err) 700 701 cli, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize) 702 framework.ExpectNoError(err, 
"GetV1Client() failed err: %v", err) 703 defer conn.Close() 704 705 waitForSRIOVResources(ctx, f, sd) 706 707 // intentionally passing empty cpuset instead of onlineCPUs because with none policy 708 // we should get no allocatable cpus - no exclusively allocatable CPUs, depends on policy static 709 podresourcesGetAllocatableResourcesTests(ctx, cli, sd, cpuset.CPUSet{}, cpuset.CPUSet{}) 710 }) 711 }) 712 }) 713 714 ginkgo.Context("without SRIOV devices in the system", func() { 715 ginkgo.BeforeEach(func() { 716 requireLackOfSRIOVDevices() 717 }) 718 719 ginkgo.Context("with CPU manager Static policy", func() { 720 ginkgo.BeforeEach(func(ctx context.Context) { 721 // this is a very rough check. We just want to rule out system that does NOT have enough resources 722 _, cpuAlloc, _ := getLocalNodeCPUDetails(ctx, f) 723 724 if cpuAlloc < minCoreCount { 725 e2eskipper.Skipf("Skipping CPU Manager tests since the CPU allocatable < %d", minCoreCount) 726 } 727 }) 728 729 // empty context to apply kubelet config changes 730 ginkgo.Context("", func() { 731 tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) { 732 // Set the CPU Manager policy to static. 733 initialConfig.CPUManagerPolicy = string(cpumanager.PolicyStatic) 734 735 // Set the CPU Manager reconcile period to 1 second. 736 initialConfig.CPUManagerReconcilePeriod = metav1.Duration{Duration: 1 * time.Second} 737 738 cpus := reservedSystemCPUs.String() 739 framework.Logf("configurePodResourcesInKubelet: using reservedSystemCPUs=%q", cpus) 740 initialConfig.ReservedSystemCPUs = cpus 741 if initialConfig.FeatureGates == nil { 742 initialConfig.FeatureGates = make(map[string]bool) 743 } 744 initialConfig.FeatureGates[string(kubefeatures.KubeletPodResourcesGet)] = true 745 }) 746 747 ginkgo.It("should return the expected responses", func(ctx context.Context) { 748 onlineCPUs, err := getOnlineCPUs() 749 framework.ExpectNoError(err, "getOnlineCPUs() failed err: %v", err) 750 751 endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket) 752 framework.ExpectNoError(err, "LocalEndpoint() failed err: %v", err) 753 754 cli, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize) 755 framework.ExpectNoError(err, "GetV1Client() failed err: %v", err) 756 defer conn.Close() 757 758 podresourcesListTests(ctx, f, cli, nil) 759 podresourcesGetAllocatableResourcesTests(ctx, cli, nil, onlineCPUs, reservedSystemCPUs) 760 podresourcesGetTests(ctx, f, cli) 761 }) 762 ginkgo.It("should account for resources of pods in terminal phase", func(ctx context.Context) { 763 pd := podDesc{ 764 cntName: "e2e-test-cnt", 765 podName: "e2e-test-pod", 766 cpuRequest: 1000, 767 } 768 pod := makePodResourcesTestPod(pd) 769 pod.Spec.Containers[0].Command = []string{"sh", "-c", "/bin/true"} 770 pod = e2epod.NewPodClient(f).CreateSync(ctx, pod) 771 defer e2epod.NewPodClient(f).DeleteSync(ctx, pod.Name, metav1.DeleteOptions{}, time.Minute) 772 err := e2epod.WaitForPodCondition(ctx, f.ClientSet, pod.Namespace, pod.Name, "Pod Succeeded", time.Minute*2, testutils.PodSucceeded) 773 framework.ExpectNoError(err) 774 endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket) 775 framework.ExpectNoError(err) 776 cli, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize) 777 framework.ExpectNoError(err) 778 defer conn.Close() 779 // although the pod moved into terminal state, PodResourcesAPI 
					expectPodResources(ctx, 1, cli, []podDesc{pd})

				})
			})
		})

		ginkgo.Context("with CPU manager None policy", func() {
			ginkgo.It("should return the expected responses", func(ctx context.Context) {
				endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket)
				framework.ExpectNoError(err, "LocalEndpoint() failed err: %v", err)

				cli, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize)
				framework.ExpectNoError(err, "GetV1Client() failed err: %v", err)
				defer conn.Close()

				// intentionally passing an empty cpuset instead of onlineCPUs because with the none policy
				// we should get no allocatable cpus - exclusive CPU allocation requires the static policy
				podresourcesGetAllocatableResourcesTests(ctx, cli, nil, cpuset.CPUSet{}, cpuset.CPUSet{})
			})
		})

		ginkgo.Context("with disabled KubeletPodResourcesGet feature gate", func() {

			ginkgo.It("should return the expected error with the feature gate disabled", func(ctx context.Context) {
				endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket)
				framework.ExpectNoError(err, "LocalEndpoint() failed err %v", err)

				cli, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize)
				framework.ExpectNoError(err, "GetV1Client() failed err %v", err)
				defer conn.Close()

				ginkgo.By("checking that Get() fails if the feature gate is not enabled")
				getRes, err := cli.Get(ctx, &kubeletpodresourcesv1.GetPodResourcesRequest{PodName: "test", PodNamespace: f.Namespace.Name})
				framework.Logf("Get result: %v, err: %v", getRes, err)
				gomega.Expect(err).To(gomega.HaveOccurred(), "With feature gate disabled, the call must fail")
			})
		})
	})

	ginkgo.Context("with a topology-unaware device plugin, which reports resources w/o hardware topology", func() {
		ginkgo.Context("with CPU manager Static policy", func() {
			ginkgo.BeforeEach(func(ctx context.Context) {
				// this is a very rough check. We just want to rule out systems that do NOT have enough resources
				_, cpuAlloc, _ := getLocalNodeCPUDetails(ctx, f)

				if cpuAlloc < minCoreCount {
					e2eskipper.Skipf("Skipping CPU Manager tests since the CPU allocatable < %d", minCoreCount)
				}
			})

			// empty context to apply kubelet config changes
			ginkgo.Context("", func() {
				tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
					// Set the CPU Manager policy to static.
					initialConfig.CPUManagerPolicy = string(cpumanager.PolicyStatic)

					// Set the CPU Manager reconcile period to 1 second.
					initialConfig.CPUManagerReconcilePeriod = metav1.Duration{Duration: 1 * time.Second}

					cpus := reservedSystemCPUs.String()
					framework.Logf("configurePodResourcesInKubelet: using reservedSystemCPUs=%q", cpus)
					initialConfig.ReservedSystemCPUs = cpus
				})

				ginkgo.It("should return proper podresources the same as before the restart of kubelet", func(ctx context.Context) {
					dpPod := setupSampleDevicePluginOrFail(ctx, f)
					ginkgo.DeferCleanup(teardownSampleDevicePluginOrFail, f, dpPod)

					waitForTopologyUnawareResources(ctx, f)

					endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket)
					framework.ExpectNoError(err, "LocalEndpoint() failed err: %v", err)

					cli, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize)
					framework.ExpectNoError(err, "GetV1Client() failed err: %v", err)
					defer conn.Close()

					ginkgo.By("checking that List() reports the topology-unaware resource without topology info")

					allocatableResponse, _ := cli.GetAllocatableResources(ctx, &kubeletpodresourcesv1.AllocatableResourcesRequest{})
					for _, dev := range allocatableResponse.GetDevices() {
						if dev.ResourceName != defaultTopologyUnawareResourceName {
							continue
						}
						gomega.Expect(dev.Topology).To(gomega.BeNil(), "Topology is expected to be empty for topology unaware resources")
					}

					desc := podDesc{
						podName:        "pod-01",
						cntName:        "cnt-01",
						resourceName:   defaultTopologyUnawareResourceName,
						resourceAmount: 1,
						cpuRequest:     1000,
					}

					tpd := newTestPodData()
					tpd.createPodsForTest(ctx, f, []podDesc{
						desc,
					})

					expectPodResources(ctx, 1, cli, []podDesc{desc})

					ginkgo.By("Restarting Kubelet")
					restartKubelet(true)

					// we need to wait for the node to be reported ready before we can safely query
					// the podresources endpoint again. Otherwise we will have false negatives.
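					// Note that the same podresources client created before the restart is reused below;
					// the expectation verifies the previously created pod is still reported once the
					// kubelet is back up.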
					ginkgo.By("Wait for node to be ready")
					waitForTopologyUnawareResources(ctx, f)

					expectPodResources(ctx, 1, cli, []podDesc{desc})
					tpd.deletePodsForTest(ctx, f)
				})
			})
		})
	})

	f.Context("when querying /metrics", f.WithNodeConformance(), func() {
		tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
			if initialConfig.FeatureGates == nil {
				initialConfig.FeatureGates = make(map[string]bool)
			}
			initialConfig.FeatureGates[string(kubefeatures.KubeletPodResourcesGet)] = true
		})
		ginkgo.BeforeEach(func(ctx context.Context) {
			// ensure the APIs have been called at least once
			endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket)
			framework.ExpectNoError(err, "LocalEndpoint() failed err %v", err)

			cli, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize)
			framework.ExpectNoError(err, "GetV1Client() failed err %v", err)
			defer conn.Close()

			_, err = cli.List(ctx, &kubeletpodresourcesv1.ListPodResourcesRequest{})
			framework.ExpectNoError(err, "List() failed err %v", err)

			_, err = cli.GetAllocatableResources(ctx, &kubeletpodresourcesv1.AllocatableResourcesRequest{})
			framework.ExpectNoError(err, "GetAllocatableResources() failed err %v", err)

			desc := podDesc{
				podName: "pod-01",
				cntName: "cnt-01",
			}
			tpd := newTestPodData()
			tpd.createPodsForTest(ctx, f, []podDesc{
				desc,
			})
			expectPodResources(ctx, 1, cli, []podDesc{desc})

			expected := []podDesc{}
			resp, err := cli.Get(ctx, &kubeletpodresourcesv1.GetPodResourcesRequest{PodName: "pod-01", PodNamespace: f.Namespace.Name})
			framework.ExpectNoError(err, "Get() call failed for pod %s/%s", f.Namespace.Name, "pod-01")
			podResourceList := []*kubeletpodresourcesv1.PodResources{resp.GetPodResources()}
			res := convertToMap(podResourceList)
			err = matchPodDescWithResources(expected, res)
			framework.ExpectNoError(err, "matchPodDescWithResources() failed err %v", err)
			tpd.deletePodsForTest(ctx, f)
		})

		ginkgo.It("should report the values for the podresources metrics", func(ctx context.Context) {
			// we updated the kubelet config in BeforeEach, so we can assume we start fresh.
			// being [Serial], we can also assume no one else but us is running pods.
			ginkgo.By("Checking the value of the podresources metrics")

			matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
				"kubelet_pod_resources_endpoint_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
					"": timelessSampleAtLeast(1),
				}),
				"kubelet_pod_resources_endpoint_requests_list": gstruct.MatchAllElements(nodeID, gstruct.Elements{
					"": timelessSampleAtLeast(1),
				}),
				"kubelet_pod_resources_endpoint_requests_get_allocatable": gstruct.MatchAllElements(nodeID, gstruct.Elements{
					"": timelessSampleAtLeast(1),
				}),
				"kubelet_pod_resources_endpoint_requests_get": gstruct.MatchAllElements(nodeID, gstruct.Elements{
					"": timelessSampleAtLeast(1),
				}),
				// not checking errors: the calls don't have non-catastrophic (e.g. out of memory) error conditions yet.
			})

			ginkgo.By("Giving the Kubelet time to start up and produce metrics")
			gomega.Eventually(ctx, getPodResourcesMetrics, 1*time.Minute, 15*time.Second).Should(matchResourceMetrics)
			ginkgo.By("Ensuring the metrics match the expectations a few more times")
			gomega.Consistently(ctx, getPodResourcesMetrics, 1*time.Minute, 15*time.Second).Should(matchResourceMetrics)
		})
	})

	ginkgo.Context("with the builtin rate limit values", func() {
		ginkgo.It("should hit throttling when calling podresources List in a tight loop", func(ctx context.Context) {
			// ensure the APIs have been called at least once
			endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket)
			framework.ExpectNoError(err, "LocalEndpoint() failed err %v", err)

			ginkgo.By("Connecting to the kubelet endpoint")
			cli, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize)
			framework.ExpectNoError(err, "GetV1Client() failed err %v", err)
			defer conn.Close()

			tries := podresources.DefaultQPS * 2 // This should also be greater than DefaultBurstTokens
			errs := []error{}

			ginkgo.By(fmt.Sprintf("Issuing %d List() calls in a tight loop", tries))
			startTime := time.Now()
			for try := 0; try < tries; try++ {
				_, err = cli.List(ctx, &kubeletpodresourcesv1.ListPodResourcesRequest{})
				errs = append(errs, err)
			}
			elapsed := time.Since(startTime)

			ginkgo.By(fmt.Sprintf("Checking return codes for %d List() calls in %v", tries, elapsed))

			framework.ExpectNoError(errs[0], "the first List() call unexpectedly failed with %v", errs[0])
			// we would expect (burst) successes and then (tries-burst) errors on a clean test environment running with
			// enough CPU power. CI is usually harsher. So we relax constraints, expecting at least _a_ failure, while
			// we are likely to get many more. But we can't yet predict how many more we should expect, so we prefer to
			// relax constraints rather than risk flakes at this stage.
			errLimitExceededCount := 0
			for _, err := range errs[1:] {
				if errors.Is(err, apisgrpc.ErrorLimitExceeded) {
					errLimitExceededCount++
				}
			}
			gomega.Expect(errLimitExceededCount).ToNot(gomega.BeZero(), "never hit the rate limit trying %d calls in %v", tries, elapsed)

			framework.Logf("got %d/%d rate limit errors, at least one needed, the more the better", errLimitExceededCount, tries)

			// this is not needed for this test. We're done. But we need to play nice with *other* tests which may run just after,
			// and which need to query the API. If they run "too fast", they can still be throttled because the throttling period
			// is not exhausted yet, yielding false negatives, leading to flakes.
			// We can't reset the period for the rate limit, so we just wait "long enough" to make sure we absorb the burst
			// and other queries are not rejected because they happen too soon.
			ginkgo.By("Cooling down to reset the podresources API rate limit")
			time.Sleep(5 * time.Second)
		})
	})
})

func requireLackOfSRIOVDevices() {
	if sriovdevCount, err := countSRIOVDevices(); err != nil || sriovdevCount > 0 {
		e2eskipper.Skipf("this test is meant to run on a system with no configured VF from SRIOV device")
	}
}

func getOnlineCPUs() (cpuset.CPUSet, error) {
	onlineCPUList, err := os.ReadFile("/sys/devices/system/cpu/online")
	if err != nil {
		return cpuset.CPUSet{}, err
	}
	return cpuset.Parse(strings.TrimSpace(string(onlineCPUList)))
}

func setupSampleDevicePluginOrFail(ctx context.Context, f *framework.Framework) *v1.Pod {
	e2enode.WaitForNodeToBeReady(ctx, f.ClientSet, framework.TestContext.NodeName, 5*time.Minute)

	dp := getSampleDevicePluginPod(kubeletdevicepluginv1beta1.DevicePluginPath)
	dp.Spec.NodeName = framework.TestContext.NodeName

	ginkgo.By("Create the sample device plugin pod")

	dpPod := e2epod.NewPodClient(f).CreateSync(ctx, dp)

	err := e2epod.WaitForPodCondition(ctx, f.ClientSet, dpPod.Namespace, dpPod.Name, "Ready", 120*time.Second, testutils.PodRunningReady)
	if err != nil {
		framework.Logf("Sample Device Pod %v took too long to enter running/ready: %v", dp.Name, err)
	}
	framework.ExpectNoError(err, "WaitForPodCondition() failed err: %v", err)

	return dpPod
}

func teardownSampleDevicePluginOrFail(ctx context.Context, f *framework.Framework, pod *v1.Pod) {
	gp := int64(0)
	deleteOptions := metav1.DeleteOptions{
		GracePeriodSeconds: &gp,
	}
	ginkgo.By(fmt.Sprintf("Delete sample device plugin pod %s/%s", pod.Namespace, pod.Name))
	err := f.ClientSet.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, deleteOptions)

	framework.ExpectNoError(err, "Failed to delete Pod %v in Namespace %v", pod.Name, pod.Namespace)
	waitForAllContainerRemoval(ctx, pod.Name, pod.Namespace)
}

func waitForTopologyUnawareResources(ctx context.Context, f *framework.Framework) {
	ginkgo.By(fmt.Sprintf("Waiting for %q resources to become available on the local node", defaultTopologyUnawareResourceName))

	gomega.Eventually(ctx, func(ctx context.Context) bool {
		node := getLocalNode(ctx, f)
		resourceAmount := CountSampleDeviceAllocatable(node)
		return resourceAmount > 0
	}, 2*time.Minute, framework.Poll).Should(gomega.BeTrue())
}

func getPodResourcesMetrics(ctx context.Context) (e2emetrics.KubeletMetrics, error) {
	// we are running out of good names, so we need to be unnecessarily specific to avoid clashes
	ginkgo.By("getting Pod Resources metrics from the metrics API")
	return e2emetrics.GrabKubeletMetricsWithoutProxy(ctx, nodeNameOrIP()+":10255", "/metrics")
}

func timelessSampleAtLeast(lower interface{}) types.GomegaMatcher {
	return gstruct.PointTo(gstruct.MatchAllFields(gstruct.Fields{
		// We already check Metric when matching the Id
		"Metric":    gstruct.Ignore(),
		"Value":     gomega.BeNumerically(">=", lower),
		"Timestamp": gstruct.Ignore(),
		"Histogram": gstruct.Ignore(),
	}))
}