/*
Copyright 2020 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2enode

import (
	"context"
	"errors"
	"fmt"
	"os"
	"strings"
	"time"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	kubeletdevicepluginv1beta1 "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
	kubeletpodresourcesv1 "k8s.io/kubelet/pkg/apis/podresources/v1"
	kubefeatures "k8s.io/kubernetes/pkg/features"
	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
	apisgrpc "k8s.io/kubernetes/pkg/kubelet/apis/grpc"
	"k8s.io/kubernetes/pkg/kubelet/apis/podresources"
	"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
	"k8s.io/kubernetes/pkg/kubelet/util"
	testutils "k8s.io/kubernetes/test/utils"
	admissionapi "k8s.io/pod-security-admission/api"
	"k8s.io/utils/cpuset"

	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
	"github.com/onsi/gomega/gstruct"
	"github.com/onsi/gomega/types"
	"k8s.io/kubernetes/test/e2e/feature"
	"k8s.io/kubernetes/test/e2e/framework"
	e2emetrics "k8s.io/kubernetes/test/e2e/framework/metrics"
	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
	"k8s.io/kubernetes/test/e2e/nodefeature"
)

const (
	defaultTopologyUnawareResourceName = "example.com/resource"
)

type podDesc struct {
	podName        string
	cntName        string
	resourceName   string
	resourceAmount int
	cpuRequest     int // cpuRequest is in millicores
	initContainers []initContainerDesc
}

func (desc podDesc) CpuRequestQty() resource.Quantity {
	qty := resource.NewMilliQuantity(int64(desc.cpuRequest), resource.DecimalSI)
	return *qty
}

func (desc podDesc) CpuRequestExclusive() int {
	if (desc.cpuRequest % 1000) != 0 {
		// exclusive cpus are requested only if the quantity is integral;
		// hence, explicitly rule out non-integral requests
		return 0
	}
	return desc.cpuRequest / 1000
}

func (desc podDesc) RequiresCPU() bool {
	return desc.cpuRequest > 0
}

func (desc podDesc) RequiresDevices() bool {
	return desc.resourceName != "" && desc.resourceAmount > 0
}

type initContainerDesc struct {
	cntName        string
	resourceName   string
	resourceAmount int
	cpuRequest     int // cpuRequest is in millicores
	restartPolicy  *v1.ContainerRestartPolicy
}

func (desc initContainerDesc) CPURequestQty() resource.Quantity {
	qty := resource.NewMilliQuantity(int64(desc.cpuRequest), resource.DecimalSI)
	return *qty
}

func (desc initContainerDesc) CPURequestExclusive() int {
	if (desc.cpuRequest % 1000) != 0 {
		// exclusive cpus are requested only if the quantity is integral;
		// hence, explicitly rule out non-integral requests
		return 0
	}
	return desc.cpuRequest / 1000
}
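// Illustrative sketch (not invoked by any test in this file): how a cpuRequest in
// millicores maps to exclusive CPUs for the assertions below. An integral request
// (a multiple of 1000m) on a Guaranteed pod is expected to be granted exclusive
// CPUs, while a fractional request keeps the container in the shared pool. The
// descriptor values here are assumptions chosen purely for illustration.
func exampleExclusiveCPUAccounting() {
	integral := podDesc{podName: "pod-example", cntName: "cnt-example", cpuRequest: 2000}
	fractional := podDesc{podName: "pod-example", cntName: "cnt-example", cpuRequest: 1500}

	// 2000m is integral: two exclusive CPUs are expected in ContainerResources.CpuIds.
	framework.Logf("integral request: %d exclusive CPUs expected", integral.CpuRequestExclusive())
	// 1500m is not integral: zero exclusive CPUs, the container runs on the shared pool.
	framework.Logf("fractional request: %d exclusive CPUs expected", fractional.CpuRequestExclusive())
}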
func (desc initContainerDesc) RequiresCPU() bool {
	return desc.cpuRequest > 0
}

func (desc initContainerDesc) RequiresDevices() bool {
	return desc.resourceName != "" && desc.resourceAmount > 0
}

func makePodResourcesTestPod(desc podDesc) *v1.Pod {
	cnt := v1.Container{
		Name:  desc.cntName,
		Image: busyboxImage,
		Resources: v1.ResourceRequirements{
			Requests: v1.ResourceList{},
			Limits:   v1.ResourceList{},
		},
		Command: []string{"sh", "-c", "sleep 1d"},
	}
	if desc.RequiresCPU() {
		cpuRequestQty := desc.CpuRequestQty()
		cnt.Resources.Requests[v1.ResourceCPU] = cpuRequestQty
		cnt.Resources.Limits[v1.ResourceCPU] = cpuRequestQty
		// we don't really care, we only need to be in guaranteed QoS
		cnt.Resources.Requests[v1.ResourceMemory] = resource.MustParse("100Mi")
		cnt.Resources.Limits[v1.ResourceMemory] = resource.MustParse("100Mi")
	}
	if desc.RequiresDevices() {
		cnt.Resources.Requests[v1.ResourceName(desc.resourceName)] = resource.MustParse(fmt.Sprintf("%d", desc.resourceAmount))
		cnt.Resources.Limits[v1.ResourceName(desc.resourceName)] = resource.MustParse(fmt.Sprintf("%d", desc.resourceAmount))
	}

	var initCnts []v1.Container
	for _, cntDesc := range desc.initContainers {
		initCnt := v1.Container{
			Name:  cntDesc.cntName,
			Image: busyboxImage,
			Resources: v1.ResourceRequirements{
				Requests: v1.ResourceList{},
				Limits:   v1.ResourceList{},
			},
			Command:       []string{"sh", "-c", "sleep 5s"},
			RestartPolicy: cntDesc.restartPolicy,
		}
		if cntDesc.restartPolicy != nil && *cntDesc.restartPolicy == v1.ContainerRestartPolicyAlways {
			initCnt.Command = []string{"sh", "-c", "sleep 1d"}
		}
		if cntDesc.RequiresCPU() {
			cpuRequestQty := cntDesc.CPURequestQty()
			initCnt.Resources.Requests[v1.ResourceCPU] = cpuRequestQty
			initCnt.Resources.Limits[v1.ResourceCPU] = cpuRequestQty
			// we don't really care, we only need to be in guaranteed QoS
			initCnt.Resources.Requests[v1.ResourceMemory] = resource.MustParse("100Mi")
			initCnt.Resources.Limits[v1.ResourceMemory] = resource.MustParse("100Mi")
		}
		if cntDesc.RequiresDevices() {
			initCnt.Resources.Requests[v1.ResourceName(cntDesc.resourceName)] = resource.MustParse(fmt.Sprintf("%d", cntDesc.resourceAmount))
			initCnt.Resources.Limits[v1.ResourceName(cntDesc.resourceName)] = resource.MustParse(fmt.Sprintf("%d", cntDesc.resourceAmount))
		}
		initCnts = append(initCnts, initCnt)
	}

	return &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name: desc.podName,
		},
		Spec: v1.PodSpec{
			RestartPolicy:  v1.RestartPolicyNever,
			InitContainers: initCnts,
			Containers: []v1.Container{
				cnt,
			},
		},
	}
}

func logPodResources(podIdx int, pr *kubeletpodresourcesv1.PodResources) {
	ns := pr.GetNamespace()
	cnts := pr.GetContainers()
	if len(cnts) == 0 {
		framework.Logf("#%02d/%02d/%02d - %s/%s/%s No containers", podIdx, 0, 0, ns, pr.GetName(), "_")
		return
	}

	for cntIdx, cnt := range cnts {
		if len(cnt.Devices) == 0 {
			framework.Logf("#%02d/%02d/%02d - %s/%s/%s cpus -> %v resources -> none", podIdx, cntIdx, 0, ns, pr.GetName(), cnt.Name, cnt.CpuIds)
			continue
		}

		for devIdx, dev := range cnt.Devices {
			framework.Logf("#%02d/%02d/%02d - %s/%s/%s cpus -> %v %s -> %s", podIdx, cntIdx, devIdx, ns, pr.GetName(), cnt.Name, cnt.CpuIds, dev.ResourceName, strings.Join(dev.DeviceIds, ", "))
		}
	}
}
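// Illustrative sketch (never invoked by the specs): a descriptor for a Guaranteed
// pod with a restartable ("sidecar") init container. makePodResourcesTestPod gives
// such an init container a long-running command because, with restartPolicy Always,
// it keeps running next to the regular container and is therefore expected to show
// up in the podresources List() response. All names and values are assumptions used
// only for illustration.
func exampleSidecarPodDesc() *v1.Pod {
	always := v1.ContainerRestartPolicyAlways
	return makePodResourcesTestPod(podDesc{
		podName:    "pod-sidecar-example",
		cntName:    "regular-00",
		cpuRequest: 1000, // 1 exclusive CPU for the regular container
		initContainers: []initContainerDesc{
			{
				cntName:       "restartable-init-00",
				cpuRequest:    1000, // 1 exclusive CPU for the sidecar
				restartPolicy: &always,
			},
		},
	})
}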
type podResMap map[string]map[string]kubeletpodresourcesv1.ContainerResources

func convertToMap(podsResources []*kubeletpodresourcesv1.PodResources) podResMap {
	res := make(map[string]map[string]kubeletpodresourcesv1.ContainerResources)
	for idx, podResource := range podsResources {
		// to make troubleshooting easier
		logPodResources(idx, podResource)

		cnts := make(map[string]kubeletpodresourcesv1.ContainerResources)
		for _, cnt := range podResource.GetContainers() {
			cnts[cnt.GetName()] = *cnt
		}
		res[podResource.GetName()] = cnts
	}
	return res
}

func getPodResourcesValues(ctx context.Context, cli kubeletpodresourcesv1.PodResourcesListerClient) (podResMap, error) {
	resp, err := cli.List(ctx, &kubeletpodresourcesv1.ListPodResourcesRequest{})
	if err != nil {
		return nil, err
	}
	return convertToMap(resp.GetPodResources()), nil
}

type testPodData struct {
	PodMap map[string]*v1.Pod
}

func newTestPodData() *testPodData {
	return &testPodData{
		PodMap: make(map[string]*v1.Pod),
	}
}

func (tpd *testPodData) createPodsForTest(ctx context.Context, f *framework.Framework, podReqs []podDesc) {
	for _, podReq := range podReqs {
		pod := makePodResourcesTestPod(podReq)
		pod = e2epod.NewPodClient(f).CreateSync(ctx, pod)

		framework.Logf("created pod %s", podReq.podName)
		tpd.PodMap[podReq.podName] = pod
	}
}

/* deletePodsForTest cleans up all the pods run for a test case. Must ensure proper cleanup */
func (tpd *testPodData) deletePodsForTest(ctx context.Context, f *framework.Framework) {
	deletePodsAsync(ctx, f, tpd.PodMap)
}

/* deletePod removes a pod during a test. Should do a best-effort cleanup */
func (tpd *testPodData) deletePod(ctx context.Context, f *framework.Framework, podName string) {
	_, ok := tpd.PodMap[podName]
	if !ok {
		return
	}
	deletePodSyncByName(ctx, f, podName)
	delete(tpd.PodMap, podName)
}

func findContainerDeviceByName(devs []*kubeletpodresourcesv1.ContainerDevices, resourceName string) *kubeletpodresourcesv1.ContainerDevices {
	for _, dev := range devs {
		if dev.ResourceName == resourceName {
			return dev
		}
	}
	return nil
}

func matchPodDescWithResources(expected []podDesc, found podResMap) error {
	for _, podReq := range expected {
		framework.Logf("matching: %#v", podReq)

		podInfo, ok := found[podReq.podName]
		if !ok {
			return fmt.Errorf("no pod resources for pod %q", podReq.podName)
		}
		cntInfo, ok := podInfo[podReq.cntName]
		if !ok {
			return fmt.Errorf("no container resources for pod %q container %q", podReq.podName, podReq.cntName)
		}
		if podReq.RequiresCPU() {
			if exclusiveCpus := podReq.CpuRequestExclusive(); exclusiveCpus != len(cntInfo.CpuIds) {
				if exclusiveCpus == 0 {
					return fmt.Errorf("pod %q container %q requested %d, expected to be allocated CPUs from the shared pool, got %v", podReq.podName, podReq.cntName, podReq.cpuRequest, cntInfo.CpuIds)
				}
				return fmt.Errorf("pod %q container %q expected %d cpus, got %v", podReq.podName, podReq.cntName, exclusiveCpus, cntInfo.CpuIds)
			}
		}
		if podReq.RequiresDevices() {
			dev := findContainerDeviceByName(cntInfo.GetDevices(), podReq.resourceName)
			if dev == nil {
				return fmt.Errorf("pod %q container %q expected data for resource %q not found", podReq.podName, podReq.cntName, podReq.resourceName)
			}
			if len(dev.DeviceIds) != podReq.resourceAmount {
				return fmt.Errorf("pod %q container %q resource %q expected %d items, got %v", podReq.podName, podReq.cntName, podReq.resourceName, podReq.resourceAmount, dev.DeviceIds)
			}
		} else {
			devs := cntInfo.GetDevices()
			if len(devs) > 0 {
				return fmt.Errorf("pod %q container %q expected no resources, got %v", podReq.podName, podReq.cntName, devs)
			}
		}
		if cnts, ok := found[defaultTopologyUnawareResourceName]; ok {
			for _, cnt := range cnts {
				for _, cd := range cnt.GetDevices() {
					if cd.ResourceName != defaultTopologyUnawareResourceName {
						continue
					}
					if cd.Topology != nil {
						// we expect nil topology
						return fmt.Errorf("expected nil topology")
					}
				}
			}
		}

		// check init containers
		for _, initCntDesc := range podReq.initContainers {
			if initCntDesc.restartPolicy == nil || *initCntDesc.restartPolicy != v1.ContainerRestartPolicyAlways {
				// If the init container is not restartable, we don't expect it
				// to be reported.
				_, ok := podInfo[initCntDesc.cntName]
				if ok {
					return fmt.Errorf("pod %q regular init container %q should not be reported", podReq.podName, initCntDesc.cntName)
				}
				continue
			}

			cntInfo, ok := podInfo[initCntDesc.cntName]
			if !ok {
				return fmt.Errorf("no container resources for pod %q container %q", podReq.podName, initCntDesc.cntName)
			}
			if initCntDesc.RequiresCPU() {
				if exclusiveCpus := initCntDesc.CPURequestExclusive(); exclusiveCpus != len(cntInfo.CpuIds) {
					if exclusiveCpus == 0 {
						return fmt.Errorf("pod %q container %q requested %d, expected to be allocated CPUs from the shared pool, got %v", podReq.podName, initCntDesc.cntName, initCntDesc.cpuRequest, cntInfo.CpuIds)
					}
					return fmt.Errorf("pod %q container %q expected %d cpus, got %v", podReq.podName, initCntDesc.cntName, exclusiveCpus, cntInfo.CpuIds)
				}
			}
			if initCntDesc.RequiresDevices() {
				dev := findContainerDeviceByName(cntInfo.GetDevices(), initCntDesc.resourceName)
				if dev == nil {
					return fmt.Errorf("pod %q container %q expected data for resource %q not found", podReq.podName, initCntDesc.cntName, initCntDesc.resourceName)
				}
				if len(dev.DeviceIds) != initCntDesc.resourceAmount {
					return fmt.Errorf("pod %q container %q resource %q expected %d items, got %v", podReq.podName, initCntDesc.cntName, initCntDesc.resourceName, initCntDesc.resourceAmount, dev.DeviceIds)
				}
			} else {
				devs := cntInfo.GetDevices()
				if len(devs) > 0 {
					return fmt.Errorf("pod %q container %q expected no resources, got %v", podReq.podName, initCntDesc.cntName, devs)
				}
			}
			if cnts, ok := found[defaultTopologyUnawareResourceName]; ok {
				for _, cnt := range cnts {
					for _, cd := range cnt.GetDevices() {
						if cd.ResourceName != defaultTopologyUnawareResourceName {
							continue
						}
						if cd.Topology != nil {
							// we expect nil topology
							return fmt.Errorf("expected nil topology")
						}
					}
				}
			}
		}
	}
	return nil
}

func expectPodResources(ctx context.Context, offset int, cli kubeletpodresourcesv1.PodResourcesListerClient, expected []podDesc) {
	gomega.EventuallyWithOffset(1+offset, ctx, func(ctx context.Context) error {
		found, err := getPodResourcesValues(ctx, cli)
		if err != nil {
			return err
		}
		return matchPodDescWithResources(expected, found)
	}, time.Minute, 10*time.Second).Should(gomega.Succeed())
}
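// Illustrative sketch (not called by the specs): how the tests in this file obtain a
// podresources API client before handing it to the helpers above. It reuses the same
// package-level endpoint constants (defaultPodResourcesPath, defaultPodResourcesTimeout,
// defaultPodResourcesMaxSize) used elsewhere in this file; everything else is just a
// condensed restatement of the connection flow, not an additional test step.
func exampleGetPodResourcesClient(ctx context.Context) (podResMap, error) {
	endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket)
	if err != nil {
		return nil, err
	}
	cli, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize)
	if err != nil {
		return nil, err
	}
	defer conn.Close()
	// One List() call, converted to a podResMap, exactly as getPodResourcesValues does.
	return getPodResourcesValues(ctx, cli)
}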
func filterOutDesc(descs []podDesc, name string) []podDesc {
	var ret []podDesc
	for _, desc := range descs {
		if desc.podName == name {
			continue
		}
		ret = append(ret, desc)
	}
	return ret
}

func podresourcesListTests(ctx context.Context, f *framework.Framework, cli kubeletpodresourcesv1.PodResourcesListerClient, sd *sriovData, sidecarContainersEnabled bool) {
	var tpd *testPodData

	var found podResMap
	var expected []podDesc
	var extra podDesc

	expectedBasePods := 0 /* nothing but pods we create */
	if sd != nil {
		expectedBasePods = 1 // sriovdp
	}

	var err error
	ginkgo.By("checking the output when no pods are present")
	found, err = getPodResourcesValues(ctx, cli)
	framework.ExpectNoError(err, "getPodResourcesValues() failed err: %v", err)
	gomega.ExpectWithOffset(1, found).To(gomega.HaveLen(expectedBasePods), "base pod expectation mismatch")

	tpd = newTestPodData()
	ginkgo.By("checking the output when only pods which don't require resources are present")
	expected = []podDesc{
		{
			podName: "pod-00",
			cntName: "cnt-00",
		},
		{
			podName: "pod-01",
			cntName: "cnt-00",
		},
	}

	tpd.createPodsForTest(ctx, f, expected)
	expectPodResources(ctx, 1, cli, expected)
	tpd.deletePodsForTest(ctx, f)

	tpd = newTestPodData()
	ginkgo.By("checking the output when only a subset of pods require resources")
	if sd != nil {
		expected = []podDesc{
			{
				podName: "pod-00",
				cntName: "cnt-00",
			},
			{
				podName:        "pod-01",
				cntName:        "cnt-00",
				resourceName:   sd.resourceName,
				resourceAmount: 1,
				cpuRequest:     1000,
			},
			{
				podName:    "pod-02",
				cntName:    "cnt-00",
				cpuRequest: 1000,
			},
			{
				podName:        "pod-03",
				cntName:        "cnt-00",
				resourceName:   sd.resourceName,
				resourceAmount: 1,
				cpuRequest:     1000,
			},
		}
	} else {
		expected = []podDesc{
			{
				podName: "pod-00",
				cntName: "cnt-00",
			},
			{
				podName:    "pod-01",
				cntName:    "cnt-00",
				cpuRequest: 1000,
			},
			{
				podName:    "pod-02",
				cntName:    "cnt-00",
				cpuRequest: 1000,
			},
			{
				podName:    "pod-03",
				cntName:    "cnt-00",
				cpuRequest: 1000,
			},
		}
	}
	tpd.createPodsForTest(ctx, f, expected)
	expectPodResources(ctx, 1, cli, expected)
	tpd.deletePodsForTest(ctx, f)

	tpd = newTestPodData()
	ginkgo.By("checking the output when creating pods which require resources between calls")
	if sd != nil {
		expected = []podDesc{
			{
				podName: "pod-00",
				cntName: "cnt-00",
			},
			{
				podName:        "pod-01",
				cntName:        "cnt-00",
				resourceName:   sd.resourceName,
				resourceAmount: 1,
				cpuRequest:     1000,
			},
			{
				podName:    "pod-02",
				cntName:    "cnt-00",
				cpuRequest: 1000,
			},
		}
	} else {
		expected = []podDesc{
			{
				podName: "pod-00",
				cntName: "cnt-00",
			},
			{
				podName:    "pod-01",
				cntName:    "cnt-00",
				cpuRequest: 1000,
			},
			{
				podName:    "pod-02",
				cntName:    "cnt-00",
				cpuRequest: 1000,
			},
		}
	}

	tpd.createPodsForTest(ctx, f, expected)
	expectPodResources(ctx, 1, cli, expected)

	if sd != nil {
		extra = podDesc{
			podName:        "pod-03",
			cntName:        "cnt-00",
			resourceName:   sd.resourceName,
			resourceAmount: 1,
			cpuRequest:     1000,
		}
	} else {
		extra = podDesc{
			podName:    "pod-03",
			cntName:    "cnt-00",
			cpuRequest: 1000,
		}
	}
	tpd.createPodsForTest(ctx, f, []podDesc{
		extra,
	})

	expected = append(expected, extra)
	expectPodResources(ctx, 1, cli, expected)
	tpd.deletePodsForTest(ctx, f)

	tpd = newTestPodData()
	ginkgo.By("checking the output when deleting pods which require resources between calls")

	if sd != nil {
		expected = []podDesc{
			{
				podName:    "pod-00",
				cntName:    "cnt-00",
				cpuRequest: 1000,
			},
			{
				podName:        "pod-01",
				cntName:        "cnt-00",
				resourceName:   sd.resourceName,
				resourceAmount: 1,
				cpuRequest:     2000,
			},
			{
				podName: "pod-02",
				cntName: "cnt-00",
			},
			{
				podName:        "pod-03",
				cntName:        "cnt-00",
				resourceName:   sd.resourceName,
				resourceAmount: 1,
				cpuRequest:     1000,
			},
		}
	} else {
		expected = []podDesc{
			{
				podName:    "pod-00",
				cntName:    "cnt-00",
				cpuRequest: 1000,
			},
			{
				podName:    "pod-01",
				cntName:    "cnt-00",
				cpuRequest: 1000,
			},
			{
				podName: "pod-02",
				cntName: "cnt-00",
			},
			{
				podName:    "pod-03",
				cntName:    "cnt-00",
				cpuRequest: 1000,
			},
		}
	}
	tpd.createPodsForTest(ctx, f, expected)
	expectPodResources(ctx, 1, cli, expected)

	tpd.deletePod(ctx, f, "pod-01")
	expectedPostDelete := filterOutDesc(expected, "pod-01")
	expectPodResources(ctx, 1, cli, expectedPostDelete)
	tpd.deletePodsForTest(ctx, f)

	tpd = newTestPodData()
	ginkgo.By("checking the output when pods request non-integral CPUs")
	if sd != nil {
		expected = []podDesc{
			{
				podName:    "pod-00",
				cntName:    "cnt-00",
				cpuRequest: 1500,
			},
			{
				podName:        "pod-01",
				cntName:        "cnt-00",
				resourceName:   sd.resourceName,
				resourceAmount: 1,
				cpuRequest:     1500,
			},
		}
	} else {
		expected = []podDesc{
			{
				podName:    "pod-00",
				cntName:    "cnt-00",
				cpuRequest: 1500,
			},
		}
	}
	tpd.createPodsForTest(ctx, f, expected)
	expectPodResources(ctx, 1, cli, expected)
	tpd.deletePodsForTest(ctx, f)

	if sidecarContainersEnabled {
		containerRestartPolicyAlways := v1.ContainerRestartPolicyAlways

		tpd = newTestPodData()
		ginkgo.By("checking the output when pods have init containers")
		if sd != nil {
			expected = []podDesc{
				{
					podName:    "pod-00",
					cntName:    "regular-00",
					cpuRequest: 1000,
					initContainers: []initContainerDesc{
						{
							cntName:        "init-00",
							resourceName:   sd.resourceName,
							resourceAmount: 1,
							cpuRequest:     1000,
						},
					},
				},
				{
					podName:    "pod-01",
					cntName:    "regular-00",
					cpuRequest: 1000,
					initContainers: []initContainerDesc{
						{
							cntName:        "restartable-init-00",
							resourceName:   sd.resourceName,
							resourceAmount: 1,
							cpuRequest:     1000,
							restartPolicy:  &containerRestartPolicyAlways,
						},
					},
				},
			}
		} else {
			expected = []podDesc{
				{
					podName:    "pod-00",
					cntName:    "regular-00",
					cpuRequest: 1000,
					initContainers: []initContainerDesc{
						{
							cntName:    "init-00",
							cpuRequest: 1000,
						},
					},
				},
				{
					podName:    "pod-01",
					cntName:    "regular-00",
					cpuRequest: 1000,
					initContainers: []initContainerDesc{
						{
							cntName:       "restartable-init-00",
							cpuRequest:    1000,
							restartPolicy: &containerRestartPolicyAlways,
						},
					},
				},
			}
		}

		tpd.createPodsForTest(ctx, f, expected)
		expectPodResources(ctx, 1, cli, expected)
		tpd.deletePodsForTest(ctx, f)
	}
}
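// Illustrative sketch of the CPU accounting performed by
// podresourcesGetAllocatableResourcesTests below: with the static CPU manager policy,
// the set reported by GetAllocatableResources() is expected to be the online CPUs
// minus the reserved system CPUs. The CPU IDs here are assumptions chosen only for
// illustration.
func exampleAllocatableCPUAccounting() {
	online := cpuset.New(0, 1, 2, 3)
	reserved := cpuset.New(1)
	allocatable := online.Difference(reserved) // CPUs 0, 2 and 3 remain allocatable
	framework.Logf("online=%v reserved=%v expected allocatable=%v", online, reserved, allocatable)
}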
func podresourcesGetAllocatableResourcesTests(ctx context.Context, cli kubeletpodresourcesv1.PodResourcesListerClient, sd *sriovData, onlineCPUs, reservedSystemCPUs cpuset.CPUSet) {
	ginkgo.By("checking the devices known to the kubelet")
	resp, err := cli.GetAllocatableResources(ctx, &kubeletpodresourcesv1.AllocatableResourcesRequest{})
	framework.ExpectNoErrorWithOffset(1, err)
	devs := resp.GetDevices()
	var cpus []int
	for _, cpuid := range resp.GetCpuIds() {
		cpus = append(cpus, int(cpuid))
	}
	allocatableCPUs := cpuset.New(cpus...)

	if onlineCPUs.Size() == 0 {
		ginkgo.By("expecting no CPUs reported")
		gomega.ExpectWithOffset(1, onlineCPUs.Size()).To(gomega.Equal(reservedSystemCPUs.Size()), "with no online CPUs, no CPUs should be reserved")
	} else {
		ginkgo.By(fmt.Sprintf("expecting online CPUs reported - online=%v (%d) reserved=%v (%d)", onlineCPUs, onlineCPUs.Size(), reservedSystemCPUs, reservedSystemCPUs.Size()))
		if reservedSystemCPUs.Size() > onlineCPUs.Size() {
			ginkgo.Fail("more reserved CPUs than online")
		}
		expectedCPUs := onlineCPUs.Difference(reservedSystemCPUs)

		ginkgo.By(fmt.Sprintf("expecting CPUs '%v'='%v'", allocatableCPUs, expectedCPUs))
		gomega.ExpectWithOffset(1, allocatableCPUs.Equals(expectedCPUs)).To(gomega.BeTrue(), "mismatch expecting CPUs")
	}

	if sd == nil { // no devices in the environment, so expect no devices
		ginkgo.By("expecting no devices reported")
		gomega.ExpectWithOffset(1, devs).To(gomega.BeEmpty(), fmt.Sprintf("got unexpected devices %#v", devs))
		return
	}

	ginkgo.By(fmt.Sprintf("expecting some %q devices reported", sd.resourceName))
	gomega.ExpectWithOffset(1, devs).ToNot(gomega.BeEmpty())
	for _, dev := range devs {
		gomega.Expect(dev.ResourceName).To(gomega.Equal(sd.resourceName))
		gomega.ExpectWithOffset(1, dev.DeviceIds).ToNot(gomega.BeEmpty())
	}
}
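// Illustrative sketch (not called by the specs): the single-pod Get() flow exercised
// by podresourcesGetTests below - asking the kubelet for one pod by name and namespace
// and reusing the same conversion and matching helpers used for List(). The pod and
// container names are placeholders chosen only for illustration.
func exampleGetSinglePod(ctx context.Context, cli kubeletpodresourcesv1.PodResourcesListerClient, namespace string) error {
	resp, err := cli.Get(ctx, &kubeletpodresourcesv1.GetPodResourcesRequest{PodName: "pod-00", PodNamespace: namespace})
	if err != nil {
		return err
	}
	res := convertToMap([]*kubeletpodresourcesv1.PodResources{resp.GetPodResources()})
	// A pod with no CPU or device requests should be reported with empty resources.
	return matchPodDescWithResources([]podDesc{{podName: "pod-00", cntName: "cnt-00"}}, res)
}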
func podresourcesGetTests(ctx context.Context, f *framework.Framework, cli kubeletpodresourcesv1.PodResourcesListerClient, sidecarContainersEnabled bool) {
	ginkgo.By("checking the output when no pods are present")
	expected := []podDesc{}
	resp, err := cli.Get(ctx, &kubeletpodresourcesv1.GetPodResourcesRequest{PodName: "test", PodNamespace: f.Namespace.Name})
	podResourceList := []*kubeletpodresourcesv1.PodResources{resp.GetPodResources()}
	gomega.Expect(err).To(gomega.HaveOccurred(), "pod not found")
	res := convertToMap(podResourceList)
	err = matchPodDescWithResources(expected, res)
	framework.ExpectNoError(err, "matchPodDescWithResources() failed err %v", err)

	tpd := newTestPodData()
	ginkgo.By("checking the output when only pods which don't require resources are present")
	expected = []podDesc{
		{
			podName: "pod-00",
			cntName: "cnt-00",
		},
	}
	tpd.createPodsForTest(ctx, f, expected)
	resp, err = cli.Get(ctx, &kubeletpodresourcesv1.GetPodResourcesRequest{PodName: "pod-00", PodNamespace: f.Namespace.Name})
	framework.ExpectNoError(err, "Get() call failed for pod %s/%s", f.Namespace.Name, "pod-00")
	podResourceList = []*kubeletpodresourcesv1.PodResources{resp.GetPodResources()}
	res = convertToMap(podResourceList)
	err = matchPodDescWithResources(expected, res)
	framework.ExpectNoError(err, "matchPodDescWithResources() failed err %v", err)
	tpd.deletePodsForTest(ctx, f)

	tpd = newTestPodData()
	ginkgo.By("checking the output when only a pod requiring CPU is present")
	expected = []podDesc{
		{
			podName:    "pod-01",
			cntName:    "cnt-00",
			cpuRequest: 1000,
		},
	}
	tpd.createPodsForTest(ctx, f, expected)
	resp, err = cli.Get(ctx, &kubeletpodresourcesv1.GetPodResourcesRequest{PodName: "pod-01", PodNamespace: f.Namespace.Name})
	framework.ExpectNoError(err, "Get() call failed for pod %s/%s", f.Namespace.Name, "pod-01")
	podResourceList = []*kubeletpodresourcesv1.PodResources{resp.GetPodResources()}
	res = convertToMap(podResourceList)
	err = matchPodDescWithResources(expected, res)
	framework.ExpectNoError(err, "matchPodDescWithResources() failed err %v", err)
	tpd.deletePodsForTest(ctx, f)

	if sidecarContainersEnabled {
		containerRestartPolicyAlways := v1.ContainerRestartPolicyAlways

		tpd = newTestPodData()
		ginkgo.By("checking the output when only a pod with init containers requiring CPU is present")
		expected = []podDesc{
			{
				podName:    "pod-01",
				cntName:    "cnt-00",
				cpuRequest: 1000,
				initContainers: []initContainerDesc{
					{
						cntName:    "init-00",
						cpuRequest: 1000,
					},
					{
						cntName:       "restartable-init-01",
						cpuRequest:    1000,
						restartPolicy: &containerRestartPolicyAlways,
					},
				},
			},
		}
		tpd.createPodsForTest(ctx, f, expected)
		resp, err = cli.Get(ctx, &kubeletpodresourcesv1.GetPodResourcesRequest{PodName: "pod-01", PodNamespace: f.Namespace.Name})
		framework.ExpectNoError(err, "Get() call failed for pod %s/%s", f.Namespace.Name, "pod-01")
		podResourceList = []*kubeletpodresourcesv1.PodResources{resp.GetPodResources()}
		res = convertToMap(podResourceList)
		err = matchPodDescWithResources(expected, res)
		framework.ExpectNoError(err, "matchPodDescWithResources() failed err %v", err)
		tpd.deletePodsForTest(ctx, f)
	}
}

// Serial because the test updates kubelet configuration.
var _ = SIGDescribe("POD Resources", framework.WithSerial(), feature.PodResources, nodefeature.PodResources, func() {
	f := framework.NewDefaultFramework("podresources-test")
	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged

	reservedSystemCPUs := cpuset.New(1)

	ginkgo.Context("with SRIOV devices in the system", func() {
		ginkgo.BeforeEach(func() {
			requireSRIOVDevices()
		})

		ginkgo.Context("with CPU manager Static policy", func() {
			ginkgo.BeforeEach(func(ctx context.Context) {
				// this is a very rough check. We just want to rule out systems that do NOT have enough resources
				_, cpuAlloc, _ := getLocalNodeCPUDetails(ctx, f)

				if cpuAlloc < minCoreCount {
					e2eskipper.Skipf("Skipping CPU Manager tests since the CPU allocatable < %d", minCoreCount)
				}
			})

			// empty context to apply kubelet config changes
			ginkgo.Context("", func() {
				tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
					// Set the CPU Manager policy to static.
					initialConfig.CPUManagerPolicy = string(cpumanager.PolicyStatic)

					// Set the CPU Manager reconcile period to 1 second.
					initialConfig.CPUManagerReconcilePeriod = metav1.Duration{Duration: 1 * time.Second}

					cpus := reservedSystemCPUs.String()
					framework.Logf("configurePodResourcesInKubelet: using reservedSystemCPUs=%q", cpus)
					initialConfig.ReservedSystemCPUs = cpus
				})

				ginkgo.It("should return the expected responses", func(ctx context.Context) {
					onlineCPUs, err := getOnlineCPUs()
					framework.ExpectNoError(err, "getOnlineCPUs() failed err: %v", err)

					configMap := getSRIOVDevicePluginConfigMap(framework.TestContext.SriovdpConfigMapFile)
					sd := setupSRIOVConfigOrFail(ctx, f, configMap)
					ginkgo.DeferCleanup(teardownSRIOVConfigOrFail, f, sd)

					waitForSRIOVResources(ctx, f, sd)

					endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket)
					framework.ExpectNoError(err, "LocalEndpoint() failed err: %v", err)

					cli, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize)
					framework.ExpectNoError(err, "GetV1Client() failed err: %v", err)
					defer conn.Close()

					waitForSRIOVResources(ctx, f, sd)

					ginkgo.By("checking List()")
					podresourcesListTests(ctx, f, cli, sd, false)
					ginkgo.By("checking GetAllocatableResources()")
					podresourcesGetAllocatableResourcesTests(ctx, cli, sd, onlineCPUs, reservedSystemCPUs)
				})

				framework.It("should return the expected responses", nodefeature.SidecarContainers, func(ctx context.Context) {
					onlineCPUs, err := getOnlineCPUs()
					framework.ExpectNoError(err, "getOnlineCPUs() failed err: %v", err)

					configMap := getSRIOVDevicePluginConfigMap(framework.TestContext.SriovdpConfigMapFile)
					sd := setupSRIOVConfigOrFail(ctx, f, configMap)
					ginkgo.DeferCleanup(teardownSRIOVConfigOrFail, f, sd)

					waitForSRIOVResources(ctx, f, sd)

					endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket)
					framework.ExpectNoError(err, "LocalEndpoint() failed err: %v", err)

					cli, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize)
					framework.ExpectNoError(err, "GetV1Client() failed err: %v", err)
					defer func() {
						framework.ExpectNoError(conn.Close())
					}()

					waitForSRIOVResources(ctx, f, sd)

					ginkgo.By("checking List()")
					podresourcesListTests(ctx, f, cli, sd, true)
					ginkgo.By("checking GetAllocatableResources()")
					podresourcesGetAllocatableResourcesTests(ctx, cli, sd, onlineCPUs, reservedSystemCPUs)
				})
			})
		})

		ginkgo.Context("with CPU manager None policy", func() {
			ginkgo.It("should return the expected responses", func(ctx context.Context) {
				// the current default is the "none" policy - no need to restart the kubelet

				requireSRIOVDevices()

				configMap := getSRIOVDevicePluginConfigMap(framework.TestContext.SriovdpConfigMapFile)
				sd := setupSRIOVConfigOrFail(ctx, f, configMap)
				ginkgo.DeferCleanup(teardownSRIOVConfigOrFail, f, sd)

				waitForSRIOVResources(ctx, f, sd)

				endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket)
				framework.ExpectNoError(err, "LocalEndpoint() failed err: %v", err)

				cli, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize)
				framework.ExpectNoError(err, "GetV1Client() failed err: %v", err)
				defer conn.Close()

				waitForSRIOVResources(ctx, f, sd)

				// intentionally passing an empty cpuset instead of onlineCPUs: with the none policy
				// we expect no exclusively allocatable CPUs to be reported (that only happens with the static policy)
				podresourcesGetAllocatableResourcesTests(ctx, cli, sd, cpuset.CPUSet{}, cpuset.CPUSet{})
			})
		})
	})

	framework.Context("without SRIOV devices in the system", framework.WithFlaky(), func() {
		ginkgo.BeforeEach(func() {
			requireLackOfSRIOVDevices()
		})

		ginkgo.Context("with CPU manager Static policy", func() {
			ginkgo.BeforeEach(func(ctx context.Context) {
				// this is a very rough check. We just want to rule out systems that do NOT have enough resources
				_, cpuAlloc, _ := getLocalNodeCPUDetails(ctx, f)

				if cpuAlloc < minCoreCount {
					e2eskipper.Skipf("Skipping CPU Manager tests since the CPU allocatable < %d", minCoreCount)
				}
			})

			// empty context to apply kubelet config changes
			ginkgo.Context("", func() {
				tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
					// Set the CPU Manager policy to static.
					initialConfig.CPUManagerPolicy = string(cpumanager.PolicyStatic)

					// Set the CPU Manager reconcile period to 1 second.
					initialConfig.CPUManagerReconcilePeriod = metav1.Duration{Duration: 1 * time.Second}

					cpus := reservedSystemCPUs.String()
					framework.Logf("configurePodResourcesInKubelet: using reservedSystemCPUs=%q", cpus)
					initialConfig.ReservedSystemCPUs = cpus
					if initialConfig.FeatureGates == nil {
						initialConfig.FeatureGates = make(map[string]bool)
					}
					initialConfig.FeatureGates[string(kubefeatures.KubeletPodResourcesGet)] = true
				})

				ginkgo.It("should return the expected responses", func(ctx context.Context) {
					onlineCPUs, err := getOnlineCPUs()
					framework.ExpectNoError(err, "getOnlineCPUs() failed err: %v", err)

					endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket)
					framework.ExpectNoError(err, "LocalEndpoint() failed err: %v", err)

					cli, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize)
					framework.ExpectNoError(err, "GetV1Client() failed err: %v", err)
					defer conn.Close()

					podresourcesListTests(ctx, f, cli, nil, false)
					podresourcesGetAllocatableResourcesTests(ctx, cli, nil, onlineCPUs, reservedSystemCPUs)
					podresourcesGetTests(ctx, f, cli, false)
				})

				framework.It("should return the expected responses", nodefeature.SidecarContainers, func(ctx context.Context) {
					onlineCPUs, err := getOnlineCPUs()
					framework.ExpectNoError(err, "getOnlineCPUs() failed err: %v", err)

					endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket)
					framework.ExpectNoError(err, "LocalEndpoint() failed err: %v", err)

					cli, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize)
					framework.ExpectNoError(err, "GetV1Client() failed err: %v", err)
					defer func() {
						framework.ExpectNoError(conn.Close())
					}()

					podresourcesListTests(ctx, f, cli, nil, true)
					podresourcesGetAllocatableResourcesTests(ctx, cli, nil, onlineCPUs, reservedSystemCPUs)
					podresourcesGetTests(ctx, f, cli, true)
				})
				ginkgo.It("should account for resources of pods in terminal phase", func(ctx context.Context) {
					pd := podDesc{
						cntName:    "e2e-test-cnt",
						podName:    "e2e-test-pod",
						cpuRequest: 1000,
					}
					pod := makePodResourcesTestPod(pd)
					pod.Spec.Containers[0].Command = []string{"sh", "-c", "/bin/true"}
[]string{"sh", "-c", "/bin/true"} 1030 pod = e2epod.NewPodClient(f).Create(ctx, pod) 1031 defer e2epod.NewPodClient(f).DeleteSync(ctx, pod.Name, metav1.DeleteOptions{}, time.Minute) 1032 err := e2epod.WaitForPodCondition(ctx, f.ClientSet, pod.Namespace, pod.Name, "Pod Succeeded", time.Minute*2, testutils.PodSucceeded) 1033 framework.ExpectNoError(err) 1034 endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket) 1035 framework.ExpectNoError(err) 1036 cli, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize) 1037 framework.ExpectNoError(err) 1038 defer conn.Close() 1039 // although the pod moved into terminal state, PodResourcesAPI still list its cpus 1040 expectPodResources(ctx, 1, cli, []podDesc{pd}) 1041 1042 }) 1043 }) 1044 }) 1045 1046 ginkgo.Context("with CPU manager None policy", func() { 1047 ginkgo.It("should return the expected responses", func(ctx context.Context) { 1048 endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket) 1049 framework.ExpectNoError(err, "LocalEndpoint() failed err: %v", err) 1050 1051 cli, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize) 1052 framework.ExpectNoError(err, "GetV1Client() failed err: %v", err) 1053 defer conn.Close() 1054 1055 // intentionally passing empty cpuset instead of onlineCPUs because with none policy 1056 // we should get no allocatable cpus - no exclusively allocatable CPUs, depends on policy static 1057 podresourcesGetAllocatableResourcesTests(ctx, cli, nil, cpuset.CPUSet{}, cpuset.CPUSet{}) 1058 }) 1059 }) 1060 1061 ginkgo.Context("with disabled KubeletPodResourcesGet feature gate", func() { 1062 1063 ginkgo.It("should return the expected error with the feature gate disabled", func(ctx context.Context) { 1064 endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket) 1065 framework.ExpectNoError(err, "LocalEndpoint() faild err %v", err) 1066 1067 cli, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize) 1068 framework.ExpectNoError(err, "GetV1Client() failed err %v", err) 1069 defer conn.Close() 1070 1071 ginkgo.By("checking Get fail if the feature gate is not enabled") 1072 getRes, err := cli.Get(ctx, &kubeletpodresourcesv1.GetPodResourcesRequest{PodName: "test", PodNamespace: f.Namespace.Name}) 1073 framework.Logf("Get result: %v, err: %v", getRes, err) 1074 gomega.Expect(err).To(gomega.HaveOccurred(), "With feature gate disabled, the call must fail") 1075 }) 1076 }) 1077 }) 1078 1079 ginkgo.Context("with a topology-unaware device plugin, which reports resources w/o hardware topology", func() { 1080 ginkgo.Context("with CPU manager Static policy", func() { 1081 ginkgo.BeforeEach(func(ctx context.Context) { 1082 // this is a very rough check. We just want to rule out system that does NOT have enough resources 1083 _, cpuAlloc, _ := getLocalNodeCPUDetails(ctx, f) 1084 1085 if cpuAlloc < minCoreCount { 1086 e2eskipper.Skipf("Skipping CPU Manager tests since the CPU allocatable < %d", minCoreCount) 1087 } 1088 }) 1089 1090 // empty context to apply kubelet config changes 1091 ginkgo.Context("", func() { 1092 tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) { 1093 // Set the CPU Manager policy to static. 1094 initialConfig.CPUManagerPolicy = string(cpumanager.PolicyStatic) 1095 1096 // Set the CPU Manager reconcile period to 1 second. 
					initialConfig.CPUManagerReconcilePeriod = metav1.Duration{Duration: 1 * time.Second}

					cpus := reservedSystemCPUs.String()
					framework.Logf("configurePodResourcesInKubelet: using reservedSystemCPUs=%q", cpus)
					initialConfig.ReservedSystemCPUs = cpus
				})

				ginkgo.It("should return proper podresources the same as before the restart of kubelet", func(ctx context.Context) {
					dpPod := setupSampleDevicePluginOrFail(ctx, f)
					ginkgo.DeferCleanup(teardownSampleDevicePluginOrFail, f, dpPod)

					waitForTopologyUnawareResources(ctx, f)

					endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket)
					framework.ExpectNoError(err, "LocalEndpoint() failed err: %v", err)

					cli, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize)
					framework.ExpectNoError(err, "GetV1Client() failed err: %v", err)
					defer conn.Close()

					ginkgo.By("checking that List() reports the topology-unaware resource without topology info")

					allocatableResponse, _ := cli.GetAllocatableResources(ctx, &kubeletpodresourcesv1.AllocatableResourcesRequest{})
					for _, dev := range allocatableResponse.GetDevices() {
						if dev.ResourceName != defaultTopologyUnawareResourceName {
							continue
						}
						gomega.Expect(dev.Topology).To(gomega.BeNil(), "Topology is expected to be empty for topology-unaware resources")
					}

					desc := podDesc{
						podName:        "pod-01",
						cntName:        "cnt-01",
						resourceName:   defaultTopologyUnawareResourceName,
						resourceAmount: 1,
						cpuRequest:     1000,
					}

					tpd := newTestPodData()
					tpd.createPodsForTest(ctx, f, []podDesc{
						desc,
					})

					expectPodResources(ctx, 1, cli, []podDesc{desc})

					ginkgo.By("Restarting Kubelet")
					restartKubelet(true)

					// we need to wait for the node to be reported ready before we can safely query
					// the podresources endpoint again. Otherwise we will have false negatives.
					ginkgo.By("Wait for node to be ready")
					waitForTopologyUnawareResources(ctx, f)

					expectPodResources(ctx, 1, cli, []podDesc{desc})
					tpd.deletePodsForTest(ctx, f)
				})
			})
		})
	})

	f.Context("when querying /metrics", f.WithNodeConformance(), func() {
		tempSetCurrentKubeletConfig(f, func(ctx context.Context, initialConfig *kubeletconfig.KubeletConfiguration) {
			if initialConfig.FeatureGates == nil {
				initialConfig.FeatureGates = make(map[string]bool)
			}
			initialConfig.FeatureGates[string(kubefeatures.KubeletPodResourcesGet)] = true
		})
		ginkgo.BeforeEach(func(ctx context.Context) {
			// ensure the APIs have been called at least once
			endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket)
			framework.ExpectNoError(err, "LocalEndpoint() failed err %v", err)

			cli, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize)
			framework.ExpectNoError(err, "GetV1Client() failed err %v", err)
			defer conn.Close()

			_, err = cli.List(ctx, &kubeletpodresourcesv1.ListPodResourcesRequest{})
			framework.ExpectNoError(err, "List() failed err %v", err)

			_, err = cli.GetAllocatableResources(ctx, &kubeletpodresourcesv1.AllocatableResourcesRequest{})
			framework.ExpectNoError(err, "GetAllocatableResources() failed err %v", err)

			desc := podDesc{
				podName: "pod-01",
				cntName: "cnt-01",
			}
			tpd := newTestPodData()
			tpd.createPodsForTest(ctx, f, []podDesc{
				desc,
			})
			expectPodResources(ctx, 1, cli, []podDesc{desc})

			expected := []podDesc{}
			resp, err := cli.Get(ctx, &kubeletpodresourcesv1.GetPodResourcesRequest{PodName: "pod-01", PodNamespace: f.Namespace.Name})
			framework.ExpectNoError(err, "Get() call failed for pod %s/%s", f.Namespace.Name, "pod-01")
			podResourceList := []*kubeletpodresourcesv1.PodResources{resp.GetPodResources()}
			res := convertToMap(podResourceList)
			err = matchPodDescWithResources(expected, res)
			framework.ExpectNoError(err, "matchPodDescWithResources() failed err %v", err)
			tpd.deletePodsForTest(ctx, f)
		})

		ginkgo.It("should report the values for the podresources metrics", func(ctx context.Context) {
			// we updated the kubelet config in BeforeEach, so we can assume we start fresh.
			// being [Serial], we can also assume no one else but us is running pods.
			ginkgo.By("Checking the value of the podresources metrics")

			matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
				"kubelet_pod_resources_endpoint_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
					"": timelessSampleAtLeast(1),
				}),
				"kubelet_pod_resources_endpoint_requests_list": gstruct.MatchAllElements(nodeID, gstruct.Elements{
					"": timelessSampleAtLeast(1),
				}),
				"kubelet_pod_resources_endpoint_requests_get_allocatable": gstruct.MatchAllElements(nodeID, gstruct.Elements{
					"": timelessSampleAtLeast(1),
				}),
				"kubelet_pod_resources_endpoint_requests_get": gstruct.MatchAllElements(nodeID, gstruct.Elements{
					"": timelessSampleAtLeast(1),
				}),
				// not checking errors: the calls don't have non-catastrophic (e.g. out of memory) error conditions yet.
			})

			ginkgo.By("Giving the Kubelet time to start up and produce metrics")
			gomega.Eventually(ctx, getPodResourcesMetrics, 1*time.Minute, 15*time.Second).Should(matchResourceMetrics)
			ginkgo.By("Ensuring the metrics match the expectations a few more times")
			gomega.Consistently(ctx, getPodResourcesMetrics, 1*time.Minute, 15*time.Second).Should(matchResourceMetrics)
		})
	})

	framework.Context("with the builtin rate limit values", framework.WithFlaky(), func() {
		ginkgo.It("should hit throttling when calling podresources List in a tight loop", func(ctx context.Context) {
			// ensure the APIs have been called at least once
			endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket)
			framework.ExpectNoError(err, "LocalEndpoint() failed err %v", err)

			ginkgo.By("Connecting to the kubelet endpoint")
			cli, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize)
			framework.ExpectNoError(err, "GetV1Client() failed err %v", err)
			defer conn.Close()

			tries := podresources.DefaultQPS * 2 // This should also be greater than DefaultBurstTokens
			errs := []error{}

			ginkgo.By(fmt.Sprintf("Issuing %d List() calls in a tight loop", tries))
			startTime := time.Now()
			for try := 0; try < tries; try++ {
				_, err = cli.List(ctx, &kubeletpodresourcesv1.ListPodResourcesRequest{})
				errs = append(errs, err)
			}
			elapsed := time.Since(startTime)

			ginkgo.By(fmt.Sprintf("Checking return codes for %d List() calls in %v", tries, elapsed))

			framework.ExpectNoError(errs[0], "the first List() call unexpectedly failed with %v", errs[0])
			// we would expect (burst) successes and then (tries-burst) errors on a clean test environment running with
			// enough CPU power. CI is usually harsher. So we relax the constraints, expecting at least _a_ failure, while
			// we are likely to get many more. But we can't yet predict how many more we should expect, so we prefer to relax
			// the constraints rather than risk flakes at this stage.
			errLimitExceededCount := 0
			for _, err := range errs[1:] {
				if errors.Is(err, apisgrpc.ErrorLimitExceeded) {
					errLimitExceededCount++
				}
			}
			gomega.Expect(errLimitExceededCount).ToNot(gomega.BeZero(), "never hit the rate limit trying %d calls in %v", tries, elapsed)

			framework.Logf("got %d/%d rate limit errors, at least one needed, the more the better", errLimitExceededCount, tries)

			// this is not needed for this test. We're done. But we need to play nice with *other* tests which may run just after,
			// and which need to query the API. If they run "too fast", they can still be throttled because the throttling period
			// is not exhausted yet, yielding false negatives, leading to flakes.
			// We can't reset the period of the rate limit, so we just wait "long enough" to make sure we absorb the burst
			// and other queries are not rejected because they happen too soon.
			ginkgo.By("Cooling down to reset the podresources API rate limit")
			time.Sleep(5 * time.Second)
		})
	})
})

func requireLackOfSRIOVDevices() {
	if sriovdevCount, err := countSRIOVDevices(); err != nil || sriovdevCount > 0 {
		e2eskipper.Skipf("this test is meant to run on a system with no SRIOV VFs configured")
	}
}

func getOnlineCPUs() (cpuset.CPUSet, error) {
	onlineCPUList, err := os.ReadFile("/sys/devices/system/cpu/online")
	if err != nil {
		return cpuset.CPUSet{}, err
	}
	return cpuset.Parse(strings.TrimSpace(string(onlineCPUList)))
}

func setupSampleDevicePluginOrFail(ctx context.Context, f *framework.Framework) *v1.Pod {
	e2enode.WaitForNodeToBeReady(ctx, f.ClientSet, framework.TestContext.NodeName, 5*time.Minute)

	dp := getSampleDevicePluginPod(kubeletdevicepluginv1beta1.DevicePluginPath)
	dp.Spec.NodeName = framework.TestContext.NodeName

	ginkgo.By("Create the sample device plugin pod")

	dpPod := e2epod.NewPodClient(f).CreateSync(ctx, dp)

	err := e2epod.WaitForPodCondition(ctx, f.ClientSet, dpPod.Namespace, dpPod.Name, "Ready", 120*time.Second, testutils.PodRunningReady)
	if err != nil {
		framework.Logf("Sample Device Pod %v took too long to enter running/ready: %v", dp.Name, err)
	}
	framework.ExpectNoError(err, "WaitForPodCondition() failed err: %v", err)

	return dpPod
}

func teardownSampleDevicePluginOrFail(ctx context.Context, f *framework.Framework, pod *v1.Pod) {
	gp := int64(0)
	deleteOptions := metav1.DeleteOptions{
		GracePeriodSeconds: &gp,
	}
	ginkgo.By(fmt.Sprintf("Delete sample device plugin pod %s/%s", pod.Namespace, pod.Name))
	err := f.ClientSet.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, deleteOptions)

	framework.ExpectNoError(err, "Failed to delete Pod %v in Namespace %v", pod.Name, pod.Namespace)
	waitForAllContainerRemoval(ctx, pod.Name, pod.Namespace)
}

func waitForTopologyUnawareResources(ctx context.Context, f *framework.Framework) {
	ginkgo.By(fmt.Sprintf("Waiting for %q resources to become available on the local node", defaultTopologyUnawareResourceName))

	gomega.Eventually(ctx, func(ctx context.Context) bool {
		node := getLocalNode(ctx, f)
		resourceAmount := CountSampleDeviceAllocatable(node)
		return resourceAmount > 0
	}, 2*time.Minute, framework.Poll).Should(gomega.BeTrue())
}

func getPodResourcesMetrics(ctx context.Context) (e2emetrics.KubeletMetrics, error) {
	// we are running out of good names, so we need to be unnecessarily specific to avoid clashes
	ginkgo.By("getting Pod Resources metrics from the metrics API")
	return e2emetrics.GrabKubeletMetricsWithoutProxy(ctx, nodeNameOrIP()+":10255", "/metrics")
}

func timelessSampleAtLeast(lower interface{}) types.GomegaMatcher {
	return gstruct.PointTo(gstruct.MatchAllFields(gstruct.Fields{
		// We already check the Metric when matching the Id
		"Metric":    gstruct.Ignore(),
		"Value":     gomega.BeNumerically(">=", lower),
		"Timestamp": gstruct.Ignore(),
		"Histogram": gstruct.Ignore(),
	}))
}
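// Illustrative sketch (not called by any test): the kind of input getOnlineCPUs above
// parses. /sys/devices/system/cpu/online holds a range list such as "0-3,8-11", which
// cpuset.Parse turns into an explicit CPU set. The literal below is an assumption used
// purely for illustration.
func exampleParseOnlineCPUList() {
	cpus, err := cpuset.Parse("0-3,8-11")
	if err != nil {
		framework.Logf("unexpected parse error: %v", err)
		return
	}
	framework.Logf("parsed %d online CPUs: %v", cpus.Size(), cpus)
}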