k8s.io/kubernetes@v1.29.3/test/e2e_node/topology_manager_test.go

/*
Copyright 2019 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2enode

import (
	"context"
	"fmt"
	"os"
	"os/exec"
	"regexp"
	"strconv"
	"strings"
	"sync"
	"time"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
	"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
	admissionapi "k8s.io/pod-security-admission/api"

	"k8s.io/kubernetes/test/e2e/feature"
	"k8s.io/kubernetes/test/e2e/framework"
	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
	e2etestfiles "k8s.io/kubernetes/test/e2e/framework/testfiles"
	testutils "k8s.io/kubernetes/test/utils"

	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
)

const (
	numaAlignmentCommand      = `export CPULIST_ALLOWED=$( awk -F":\t*" '/Cpus_allowed_list/ { print $2 }' /proc/self/status); env;`
	numaAlignmentSleepCommand = numaAlignmentCommand + `sleep 1d;`
	podScopeTopology          = "pod"
	containerScopeTopology    = "container"

	minNumaNodes     = 2
	minCoreCount     = 4
	minSriovResource = 7 // This is the min number of SRIOV VFs needed on the system under test.
)

// Helper for makeTopologyManagerTestPod().
type tmCtnAttribute struct {
	ctnName       string
	cpuRequest    string
	cpuLimit      string
	deviceName    string
	deviceRequest string
	deviceLimit   string
}

func detectNUMANodes() int {
	outData, err := exec.Command("/bin/sh", "-c", "lscpu | grep \"NUMA node(s):\" | cut -d \":\" -f 2").Output()
	framework.ExpectNoError(err)

	numaNodes, err := strconv.Atoi(strings.TrimSpace(string(outData)))
	framework.ExpectNoError(err)

	return numaNodes
}

func detectCoresPerSocket() int {
	outData, err := exec.Command("/bin/sh", "-c", "lscpu | grep \"Core(s) per socket:\" | cut -d \":\" -f 2").Output()
	framework.ExpectNoError(err)

	coreCount, err := strconv.Atoi(strings.TrimSpace(string(outData)))
	framework.ExpectNoError(err)

	return coreCount
}

func detectThreadPerCore() int {
	outData, err := exec.Command("/bin/sh", "-c", "lscpu | grep \"Thread(s) per core:\" | cut -d \":\" -f 2").Output()
	framework.ExpectNoError(err)

	threadCount, err := strconv.Atoi(strings.TrimSpace(string(outData)))
	framework.ExpectNoError(err)

	return threadCount
}

func makeContainers(ctnCmd string, ctnAttributes []tmCtnAttribute) (ctns []v1.Container) {
	for _, ctnAttr := range ctnAttributes {
		ctn := v1.Container{
			Name:  ctnAttr.ctnName,
			Image: busyboxImage,
			Resources: v1.ResourceRequirements{
				Requests: v1.ResourceList{
					v1.ResourceName(v1.ResourceCPU):    resource.MustParse(ctnAttr.cpuRequest),
					v1.ResourceName(v1.ResourceMemory): resource.MustParse("100Mi"),
				},
				Limits: v1.ResourceList{
					v1.ResourceName(v1.ResourceCPU):    resource.MustParse(ctnAttr.cpuLimit),
					v1.ResourceName(v1.ResourceMemory): resource.MustParse("100Mi"),
				},
			},
			Command: []string{"sh", "-c", ctnCmd},
		}
		if ctnAttr.deviceName != "" {
			ctn.Resources.Requests[v1.ResourceName(ctnAttr.deviceName)] = resource.MustParse(ctnAttr.deviceRequest)
			ctn.Resources.Limits[v1.ResourceName(ctnAttr.deviceName)] = resource.MustParse(ctnAttr.deviceLimit)
		}
		ctns = append(ctns, ctn)
	}
	return
}

func makeTopologyManagerTestPod(podName string, tmCtnAttributes, tmInitCtnAttributes []tmCtnAttribute) *v1.Pod {
	var containers, initContainers []v1.Container
	if len(tmInitCtnAttributes) > 0 {
		initContainers = makeContainers(numaAlignmentCommand, tmInitCtnAttributes)
	}
	containers = makeContainers(numaAlignmentSleepCommand, tmCtnAttributes)

	return &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name: podName,
		},
		Spec: v1.PodSpec{
			RestartPolicy:  v1.RestartPolicyNever,
			InitContainers: initContainers,
			Containers:     containers,
		},
	}
}

func findNUMANodeWithoutSRIOVDevicesFromConfigMap(configMap *v1.ConfigMap, numaNodes int) (int, bool) {
	for nodeNum := 0; nodeNum < numaNodes; nodeNum++ {
		value, ok := configMap.Annotations[fmt.Sprintf("pcidevice_node%d", nodeNum)]
		if !ok {
			framework.Logf("missing pcidevice annotation for NUMA node %d", nodeNum)
			return -1, false
		}
		v, err := strconv.Atoi(value)
		if err != nil {
			framework.Failf("error getting the PCI device count on NUMA node %d: %v", nodeNum, err)
		}
		if v == 0 {
			framework.Logf("NUMA node %d has no SRIOV devices attached", nodeNum)
			return nodeNum, true
		}
		framework.Logf("NUMA node %d has %d SRIOV devices attached", nodeNum, v)
	}
	return -1, false
}

func findNUMANodeWithoutSRIOVDevicesFromSysfs(numaNodes int) (int, bool) {
	pciDevs, err := getPCIDeviceInfo("/sys/bus/pci/devices")
	if err != nil {
		framework.Failf("error detecting the PCI device NUMA node: %v", err)
	}

	pciPerNuma := make(map[int]int)
	for _, pciDev := range pciDevs {
		if pciDev.IsVFn {
			pciPerNuma[pciDev.NUMANode]++
		}
	}

	if len(pciPerNuma) == 0 {
		framework.Logf("failed to find any VF device from %v", pciDevs)
		return -1, false
	}

	for nodeNum := 0; nodeNum < numaNodes; nodeNum++ {
		v := pciPerNuma[nodeNum]
		if v == 0 {
			framework.Logf("NUMA node %d has no SRIOV devices attached", nodeNum)
			return nodeNum, true
		}
		framework.Logf("NUMA node %d has %d SRIOV devices attached", nodeNum, v)
	}
	return -1, false
}

func findNUMANodeWithoutSRIOVDevices(configMap *v1.ConfigMap, numaNodes int) (int, bool) {
	// if someone annotated the configMap, let's use this information
	if nodeNum, found := findNUMANodeWithoutSRIOVDevicesFromConfigMap(configMap, numaNodes); found {
		return nodeNum, found
	}
	// no annotations, try to autodetect
	// NOTE: this assumes all the VFs in the box can be used for the tests.
	return findNUMANodeWithoutSRIOVDevicesFromSysfs(numaNodes)
}

func configureTopologyManagerInKubelet(oldCfg *kubeletconfig.KubeletConfiguration, policy, scope string, configMap *v1.ConfigMap, numaNodes int) (*kubeletconfig.KubeletConfiguration, string) {
	// Configure Topology Manager in Kubelet with policy.
	newCfg := oldCfg.DeepCopy()
	if newCfg.FeatureGates == nil {
		newCfg.FeatureGates = make(map[string]bool)
	}

	// Set the Topology Manager policy
	newCfg.TopologyManagerPolicy = policy

	newCfg.TopologyManagerScope = scope

	// Set the CPU Manager policy to static.
	newCfg.CPUManagerPolicy = string(cpumanager.PolicyStatic)

	// Set the CPU Manager reconcile period to 1 second.
	newCfg.CPUManagerReconcilePeriod = metav1.Duration{Duration: 1 * time.Second}

	if nodeNum, ok := findNUMANodeWithoutSRIOVDevices(configMap, numaNodes); ok {
		cpus, err := getCPUsPerNUMANode(nodeNum)
		framework.Logf("NUMA Node %d doesn't seem to have attached SRIOV devices and has cpus=%v", nodeNum, cpus)
		framework.ExpectNoError(err)
		newCfg.ReservedSystemCPUs = fmt.Sprintf("%d", cpus[len(cpus)-1])
	} else {
		// The Kubelet panics if either kube-reserved or system-reserved is not set
		// when CPU Manager is enabled. Set cpu in kube-reserved > 0 so that
		// kubelet doesn't panic.
		if newCfg.KubeReserved == nil {
			newCfg.KubeReserved = map[string]string{}
		}

		if _, ok := newCfg.KubeReserved["cpu"]; !ok {
			newCfg.KubeReserved["cpu"] = "200m"
		}
	}
	// Dump the config -- debug
	framework.Logf("New kubelet config is %s", *newCfg)

	return newCfg, newCfg.ReservedSystemCPUs
}

// getSRIOVDevicePluginPod returns the Device Plugin pod for sriov resources in e2e tests.
func getSRIOVDevicePluginPod() *v1.Pod {
	data, err := e2etestfiles.Read(SRIOVDevicePluginDSYAML)
	if err != nil {
		framework.Fail(err.Error())
	}

	ds := readDaemonSetV1OrDie(data)
	p := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:      SRIOVDevicePluginName,
			Namespace: metav1.NamespaceSystem,
		},

		Spec: ds.Spec.Template.Spec,
	}

	return p
}

func readConfigMapV1OrDie(objBytes []byte) *v1.ConfigMap {
	v1.AddToScheme(appsScheme)
	requiredObj, err := runtime.Decode(appsCodecs.UniversalDecoder(v1.SchemeGroupVersion), objBytes)
	if err != nil {
		panic(err)
	}
	return requiredObj.(*v1.ConfigMap)
}

func readServiceAccountV1OrDie(objBytes []byte) *v1.ServiceAccount {
	v1.AddToScheme(appsScheme)
	requiredObj, err := runtime.Decode(appsCodecs.UniversalDecoder(v1.SchemeGroupVersion), objBytes)
	if err != nil {
		panic(err)
	}
	return requiredObj.(*v1.ServiceAccount)
}

func findSRIOVResource(node *v1.Node) (string, int64) {
	framework.Logf("Node status allocatable: %v", node.Status.Allocatable)
	re := regexp.MustCompile(`^intel.com/.*sriov.*`)
	for key, val := range node.Status.Allocatable {
		resource := string(key)
		if re.MatchString(resource) {
			v := val.Value()
			if v > 0 {
				return resource, v
			}
		}
	}
	return "", 0
}

func validatePodAlignment(ctx context.Context, f *framework.Framework, pod *v1.Pod, envInfo *testEnvInfo) {
	for _, cnt := range pod.Spec.Containers {
		ginkgo.By(fmt.Sprintf("validating the container %s on Gu pod %s", cnt.Name, pod.Name))

		logs, err := e2epod.GetPodLogs(ctx, f.ClientSet, f.Namespace.Name, pod.Name, cnt.Name)
		framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]", cnt.Name, pod.Name)

		framework.Logf("got pod logs: %v", logs)
		numaRes, err := checkNUMAAlignment(f, pod, &cnt, logs, envInfo)
		framework.ExpectNoError(err, "NUMA Alignment check failed for [%s] of pod [%s]", cnt.Name, pod.Name)
		if numaRes != nil {
			framework.Logf("NUMA resources for %s/%s: %s", pod.Name, cnt.Name, numaRes.String())
		}
	}
}

// validatePodAlignmentWithPodScope validates whether all of a pod's CPUs are affined to the same NUMA node.
func validatePodAlignmentWithPodScope(ctx context.Context, f *framework.Framework, pod *v1.Pod, envInfo *testEnvInfo) error {
	// Mapping between CPU IDs and NUMA node IDs.
	podsNUMA := make(map[int]int)

	ginkgo.By(fmt.Sprintf("validate pod scope alignment for %s pod", pod.Name))
	for _, cnt := range pod.Spec.Containers {
		logs, err := e2epod.GetPodLogs(ctx, f.ClientSet, f.Namespace.Name, pod.Name, cnt.Name)
		framework.ExpectNoError(err, "NUMA alignment failed for container [%s] of pod [%s]", cnt.Name, pod.Name)
		envMap, err := makeEnvMap(logs)
		framework.ExpectNoError(err, "NUMA alignment failed for container [%s] of pod [%s]", cnt.Name, pod.Name)
		cpuToNUMA, err := getCPUToNUMANodeMapFromEnv(f, pod, &cnt, envMap, envInfo.numaNodes)
		framework.ExpectNoError(err, "NUMA alignment failed for container [%s] of pod [%s]", cnt.Name, pod.Name)
		for cpuID, numaID := range cpuToNUMA {
			podsNUMA[cpuID] = numaID
		}
	}

	numaRes := numaPodResources{
		CPUToNUMANode: podsNUMA,
	}
	aligned := numaRes.CheckAlignment()
	if !aligned {
		return fmt.Errorf("resources were assigned from different NUMA nodes")
	}

	framework.Logf("NUMA locality confirmed: all of the pod's CPUs are aligned to the same NUMA node")
	return nil
}

func runTopologyManagerPolicySuiteTests(ctx context.Context, f *framework.Framework) {
	var cpuCap, cpuAlloc int64

	cpuCap, cpuAlloc, _ = getLocalNodeCPUDetails(ctx, f)
	ginkgo.By(fmt.Sprintf("checking node CPU capacity (%d) and allocatable CPUs (%d)", cpuCap, cpuAlloc))

	// Although even the weakest CI machines usually have 2 CPUs, let's be extra careful and
	// check explicitly. We prefer to skip rather than risk a false negative (and a failed test).
	if cpuAlloc < 1 {
		e2eskipper.Skipf("Skipping basic CPU Manager tests since CPU allocatable < 1")
	}

	ginkgo.By("running a non-Gu pod")
	runNonGuPodTest(ctx, f, cpuCap)

	ginkgo.By("running a Gu pod")
	runGuPodTest(ctx, f, 1)

	// Skip the rest of the tests if CPU allocatable < 3.
	if cpuAlloc < 3 {
		e2eskipper.Skipf("Skipping rest of the CPU Manager tests since CPU allocatable < 3")
	}

	ginkgo.By("running multiple Gu and non-Gu pods")
	runMultipleGuNonGuPods(ctx, f, cpuCap, cpuAlloc)

	ginkgo.By("running a Gu pod requesting multiple CPUs")
	runMultipleCPUGuPod(ctx, f)

	ginkgo.By("running a Gu pod with multiple containers requesting integer CPUs")
	runMultipleCPUContainersGuPod(ctx, f)

	ginkgo.By("running multiple Gu pods")
	runMultipleGuPods(ctx, f)
}

func runTopologyManagerPositiveTest(ctx context.Context, f *framework.Framework, numPods int, ctnAttrs, initCtnAttrs []tmCtnAttribute, envInfo *testEnvInfo) {
	podMap := make(map[string]*v1.Pod)

	for podID := 0; podID < numPods; podID++ {
		podName := fmt.Sprintf("gu-pod-%d", podID)
		framework.Logf("creating pod %s attrs %v", podName, ctnAttrs)
		pod := makeTopologyManagerTestPod(podName, ctnAttrs, initCtnAttrs)
		pod = e2epod.NewPodClient(f).CreateSync(ctx, pod)
		framework.Logf("created pod %s", podName)
		podMap[podName] = pod
	}

	// per https://github.com/kubernetes/enhancements/blob/master/keps/sig-node/693-topology-manager/README.md#multi-numa-systems-tests
	// we can do a meaningful validation only when using the single-numa-node policy
	if envInfo.policy == topologymanager.PolicySingleNumaNode {
		for _, pod := range podMap {
			validatePodAlignment(ctx, f, pod, envInfo)
		}
		if envInfo.scope == podScopeTopology {
			for _, pod := range podMap {
				err := validatePodAlignmentWithPodScope(ctx, f, pod, envInfo)
				framework.ExpectNoError(err)
			}
		}
	}

	deletePodsAsync(ctx, f, podMap)
}

func deletePodsAsync(ctx context.Context, f *framework.Framework, podMap map[string]*v1.Pod) {
	var wg sync.WaitGroup
	for _, pod := range podMap {
		wg.Add(1)
		go func(podNS, podName string) {
			defer ginkgo.GinkgoRecover()
			defer wg.Done()

			deletePodSyncByName(ctx, f, podName)
			waitForAllContainerRemoval(ctx, podName, podNS)
		}(pod.Namespace, pod.Name)
	}
	wg.Wait()
}

func runTopologyManagerNegativeTest(ctx context.Context, f *framework.Framework, ctnAttrs, initCtnAttrs []tmCtnAttribute, envInfo *testEnvInfo) {
	podName := "gu-pod"
	framework.Logf("creating pod %s attrs %v", podName, ctnAttrs)
	pod := makeTopologyManagerTestPod(podName, ctnAttrs, initCtnAttrs)

	pod = e2epod.NewPodClient(f).Create(ctx, pod)
	err := e2epod.WaitForPodCondition(ctx, f.ClientSet, f.Namespace.Name, pod.Name, "Failed", 30*time.Second, func(pod *v1.Pod) (bool, error) {
		if pod.Status.Phase != v1.PodPending {
			return true, nil
		}
		return false, nil
	})
	framework.ExpectNoError(err)
	pod, err = e2epod.NewPodClient(f).Get(ctx, pod.Name, metav1.GetOptions{})
	framework.ExpectNoError(err)

	if pod.Status.Phase != v1.PodFailed {
		framework.Failf("pod %s not failed: %v", pod.Name, pod.Status)
	}
	if !isTopologyAffinityError(pod) {
		framework.Failf("pod %s failed for wrong reason: %q", pod.Name, pod.Status.Reason)
	}

	deletePodSyncByName(ctx, f, pod.Name)
}

func isTopologyAffinityError(pod *v1.Pod) bool {
	re := regexp.MustCompile(`Topology.*Affinity.*Error`)
	return re.MatchString(pod.Status.Reason)
}

func getSRIOVDevicePluginConfigMap(cmFile string) *v1.ConfigMap {
	data, err := e2etestfiles.Read(SRIOVDevicePluginCMYAML)
	if err != nil {
		framework.Fail(err.Error())
	}

	// the SRIOVDP configuration is hw-dependent, so we allow per-test-host customization.
	framework.Logf("host-local SRIOV Device Plugin Config Map %q", cmFile)
	if cmFile != "" {
		data, err = os.ReadFile(cmFile)
		if err != nil {
			framework.Failf("unable to load the SRIOV Device Plugin ConfigMap: %v", err)
		}
	} else {
		framework.Logf("Using built-in SRIOV Device Plugin Config Map")
	}

	return readConfigMapV1OrDie(data)
}

type sriovData struct {
	configMap      *v1.ConfigMap
	serviceAccount *v1.ServiceAccount
	pod            *v1.Pod

	resourceName   string
	resourceAmount int64
}

func setupSRIOVConfigOrFail(ctx context.Context, f *framework.Framework, configMap *v1.ConfigMap) *sriovData {
	sd := createSRIOVConfigOrFail(ctx, f, configMap)

	e2enode.WaitForNodeToBeReady(ctx, f.ClientSet, framework.TestContext.NodeName, 5*time.Minute)

	sd.pod = createSRIOVPodOrFail(ctx, f)
	return sd
}

func createSRIOVConfigOrFail(ctx context.Context, f *framework.Framework, configMap *v1.ConfigMap) *sriovData {
	var err error

	ginkgo.By(fmt.Sprintf("Creating configMap %v/%v", metav1.NamespaceSystem, configMap.Name))
	if _, err = f.ClientSet.CoreV1().ConfigMaps(metav1.NamespaceSystem).Create(ctx, configMap, metav1.CreateOptions{}); err != nil {
		framework.Failf("unable to create test configMap %s: %v", configMap.Name, err)
	}

	data, err := e2etestfiles.Read(SRIOVDevicePluginSAYAML)
	if err != nil {
		framework.Fail(err.Error())
	}
	serviceAccount := readServiceAccountV1OrDie(data)
	ginkgo.By(fmt.Sprintf("Creating serviceAccount %v/%v", metav1.NamespaceSystem, serviceAccount.Name))
	if _, err = f.ClientSet.CoreV1().ServiceAccounts(metav1.NamespaceSystem).Create(ctx, serviceAccount, metav1.CreateOptions{}); err != nil {
		framework.Failf("unable to create test serviceAccount %s: %v", serviceAccount.Name, err)
	}

	return &sriovData{
		configMap:      configMap,
		serviceAccount: serviceAccount,
	}
}

func createSRIOVPodOrFail(ctx context.Context, f *framework.Framework) *v1.Pod {
	dp := getSRIOVDevicePluginPod()
	dp.Spec.NodeName = framework.TestContext.NodeName

	ginkgo.By("Create SRIOV device plugin pod")
	dpPod, err := f.ClientSet.CoreV1().Pods(metav1.NamespaceSystem).Create(ctx, dp, metav1.CreateOptions{})
	framework.ExpectNoError(err)

	if err = e2epod.WaitForPodCondition(ctx, f.ClientSet, metav1.NamespaceSystem, dp.Name, "Ready", 120*time.Second, testutils.PodRunningReady); err != nil {
		framework.Logf("SRIOV Pod %v took too long to enter running/ready: %v", dp.Name, err)
	}
	framework.ExpectNoError(err)

	return dpPod
}

// waitForSRIOVResources waits until enough SRIOV resources are available, expecting to complete within the timeout.
// If it exits successfully, it updates the sriovData with the resources which were found.
func waitForSRIOVResources(ctx context.Context, f *framework.Framework, sd *sriovData) {
	sriovResourceName := ""
	var sriovResourceAmount int64
	ginkgo.By("Waiting for devices to become available on the local node")
	gomega.Eventually(ctx, func(ctx context.Context) bool {
		node := getLocalNode(ctx, f)
		sriovResourceName, sriovResourceAmount = findSRIOVResource(node)
		return sriovResourceAmount > minSriovResource
	}, 2*time.Minute, framework.Poll).Should(gomega.BeTrue())

	sd.resourceName = sriovResourceName
	sd.resourceAmount = sriovResourceAmount
	framework.Logf("Detected SRIOV allocatable devices name=%q amount=%d", sd.resourceName, sd.resourceAmount)
}

func deleteSRIOVPodOrFail(ctx context.Context, f *framework.Framework, sd *sriovData) {
	var err error
	gp := int64(0)
	deleteOptions := metav1.DeleteOptions{
		GracePeriodSeconds: &gp,
	}

	ginkgo.By(fmt.Sprintf("Delete SRIOV device plugin pod %s/%s", sd.pod.Namespace, sd.pod.Name))
	err = f.ClientSet.CoreV1().Pods(sd.pod.Namespace).Delete(ctx, sd.pod.Name, deleteOptions)
	framework.ExpectNoError(err)
	waitForAllContainerRemoval(ctx, sd.pod.Name, sd.pod.Namespace)
}

func removeSRIOVConfigOrFail(ctx context.Context, f *framework.Framework, sd *sriovData) {
	var err error
	gp := int64(0)
	deleteOptions := metav1.DeleteOptions{
		GracePeriodSeconds: &gp,
	}

	ginkgo.By(fmt.Sprintf("Deleting configMap %v/%v", metav1.NamespaceSystem, sd.configMap.Name))
	err = f.ClientSet.CoreV1().ConfigMaps(metav1.NamespaceSystem).Delete(ctx, sd.configMap.Name, deleteOptions)
	framework.ExpectNoError(err)

	ginkgo.By(fmt.Sprintf("Deleting serviceAccount %v/%v", metav1.NamespaceSystem, sd.serviceAccount.Name))
	err = f.ClientSet.CoreV1().ServiceAccounts(metav1.NamespaceSystem).Delete(ctx, sd.serviceAccount.Name, deleteOptions)
	framework.ExpectNoError(err)
}

func teardownSRIOVConfigOrFail(ctx context.Context, f *framework.Framework, sd *sriovData) {
	deleteSRIOVPodOrFail(ctx, f, sd)
	removeSRIOVConfigOrFail(ctx, f, sd)
}

func runTMScopeResourceAlignmentTestSuite(ctx context.Context, f *framework.Framework, configMap *v1.ConfigMap, reservedSystemCPUs, policy string, numaNodes, coreCount int) {
	threadsPerCore := getSMTLevel()
	sd := setupSRIOVConfigOrFail(ctx, f, configMap)
	var ctnAttrs, initCtnAttrs []tmCtnAttribute

	waitForSRIOVResources(ctx, f, sd)

	envInfo := &testEnvInfo{
		numaNodes:         numaNodes,
		sriovResourceName: sd.resourceName,
		policy:            policy,
		scope:             podScopeTopology,
	}

	ginkgo.By(fmt.Sprintf("Admit two guaranteed pods. Both consist of 2 containers, each container with 1 CPU core. Use 1 %s device.", sd.resourceName))
	ctnAttrs = []tmCtnAttribute{
		{
			ctnName:       "ps-container-0",
			cpuRequest:    "1000m",
			cpuLimit:      "1000m",
			deviceName:    sd.resourceName,
			deviceRequest: "1",
			deviceLimit:   "1",
		},
		{
			ctnName:       "ps-container-1",
			cpuRequest:    "1000m",
			cpuLimit:      "1000m",
			deviceName:    sd.resourceName,
			deviceRequest: "1",
			deviceLimit:   "1",
		},
	}
	runTopologyManagerPositiveTest(ctx, f, 2, ctnAttrs, initCtnAttrs, envInfo)

	numCores := threadsPerCore * coreCount
	coresReq := fmt.Sprintf("%dm", numCores*1000)
	ginkgo.By(fmt.Sprintf("Admit a guaranteed pod requesting %d CPU cores, i.e., more than can be provided by any single NUMA node. Therefore, the request should be rejected.", numCores+1))
	ctnAttrs = []tmCtnAttribute{
		{
			ctnName:       "gu-container-1",
			cpuRequest:    coresReq,
			cpuLimit:      coresReq,
			deviceRequest: "1",
			deviceLimit:   "1",
		},
		{
			ctnName:       "gu-container-2",
			cpuRequest:    "1000m",
			cpuLimit:      "1000m",
			deviceRequest: "1",
			deviceLimit:   "1",
		},
	}
	runTopologyManagerNegativeTest(ctx, f, ctnAttrs, initCtnAttrs, envInfo)

	// With pod scope, the Topology Manager calculates how many CPUs it needs to admit a pod based on two values:
	// the maximum CPU request among init containers and the sum of the app containers' CPU requests.
	// The Topology Manager should use the higher of the two. Therefore, both pods in the test case below should get
	// the number of CPUs requested by the most demanding init container. Since the CPU demand of each pod is slightly
	// higher than half of the resources available on one NUMA node, both pods should be placed on distinct NUMA nodes.
	coresReq = fmt.Sprintf("%dm", (numCores/2+1)*1000)
	ginkgo.By(fmt.Sprintf("Admit two guaranteed pods, each pod requests %d cores - the pods should be placed on different NUMA nodes", numCores/2+1))
	initCtnAttrs = []tmCtnAttribute{
		{
			ctnName:       "init-container-1",
			cpuRequest:    coresReq,
			cpuLimit:      coresReq,
			deviceRequest: "1",
			deviceLimit:   "1",
		},
		{
			ctnName:       "init-container-2",
			cpuRequest:    "1000m",
			cpuLimit:      "1000m",
			deviceRequest: "1",
			deviceLimit:   "1",
		},
	}
	ctnAttrs = []tmCtnAttribute{
		{
			ctnName:       "gu-container-0",
			cpuRequest:    "1000m",
			cpuLimit:      "1000m",
			deviceRequest: "1",
			deviceLimit:   "1",
		},
		{
			ctnName:       "gu-container-1",
			cpuRequest:    "1000m",
			cpuLimit:      "1000m",
			deviceRequest: "1",
			deviceLimit:   "1",
		},
	}
	runTopologyManagerPositiveTest(ctx, f, 2, ctnAttrs, initCtnAttrs, envInfo)

	teardownSRIOVConfigOrFail(ctx, f, sd)
}

func runTopologyManagerNodeAlignmentSuiteTests(ctx context.Context, f *framework.Framework, sd *sriovData, reservedSystemCPUs, policy string, numaNodes, coreCount int) {
	threadsPerCore := getSMTLevel()

	waitForSRIOVResources(ctx, f, sd)

	envInfo := &testEnvInfo{
		numaNodes:         numaNodes,
		sriovResourceName: sd.resourceName,
		policy:            policy,
	}

	// could have been a loop, we unroll it to explain the testcases
	var ctnAttrs, initCtnAttrs []tmCtnAttribute

	// simplest case
	ginkgo.By(fmt.Sprintf("Successfully admit one guaranteed pod with 1 core, 1 %s device", sd.resourceName))
	ctnAttrs = []tmCtnAttribute{
		{
			ctnName:       "gu-container",
			cpuRequest:    "1000m",
			cpuLimit:      "1000m",
			deviceName:    sd.resourceName,
			deviceRequest: "1",
			deviceLimit:   "1",
		},
	}
	runTopologyManagerPositiveTest(ctx, f, 1, ctnAttrs, initCtnAttrs, envInfo)

	ginkgo.By(fmt.Sprintf("Successfully admit one guaranteed pod with 2 cores, 1 %s device", sd.resourceName))
	ctnAttrs = []tmCtnAttribute{
		{
			ctnName:       "gu-container",
			cpuRequest:    "2000m",
			cpuLimit:      "2000m",
			deviceName:    sd.resourceName,
			deviceRequest: "1",
			deviceLimit:   "1",
		},
	}
	runTopologyManagerPositiveTest(ctx, f, 1, ctnAttrs, initCtnAttrs, envInfo)

	if reservedSystemCPUs != "" {
		// to avoid false negatives, we have placed the reserved CPUs in such a way that there is at least one NUMA node
		// with 1+ SRIOV devices and no reserved CPUs.
		numCores := threadsPerCore * coreCount
		allCoresReq := fmt.Sprintf("%dm", numCores*1000)
		ginkgo.By(fmt.Sprintf("Successfully admit an entire socket (%d cores), 1 %s device", numCores, sd.resourceName))
		ctnAttrs = []tmCtnAttribute{
			{
				ctnName:       "gu-container",
				cpuRequest:    allCoresReq,
				cpuLimit:      allCoresReq,
				deviceName:    sd.resourceName,
				deviceRequest: "1",
				deviceLimit:   "1",
			},
		}
		runTopologyManagerPositiveTest(ctx, f, 1, ctnAttrs, initCtnAttrs, envInfo)
	}

	if sd.resourceAmount > 1 {
		// no matter how buses are connected to NUMA nodes and how SRIOV devices are installed, this function's
		// preconditions must ensure the following can be fulfilled
		ginkgo.By(fmt.Sprintf("Successfully admit two guaranteed pods, each with 1 core, 1 %s device", sd.resourceName))
		ctnAttrs = []tmCtnAttribute{
			{
				ctnName:       "gu-container",
				cpuRequest:    "1000m",
				cpuLimit:      "1000m",
				deviceName:    sd.resourceName,
				deviceRequest: "1",
				deviceLimit:   "1",
			},
		}
		runTopologyManagerPositiveTest(ctx, f, 2, ctnAttrs, initCtnAttrs, envInfo)

		ginkgo.By(fmt.Sprintf("Successfully admit two guaranteed pods, each with 2 cores, 1 %s device", sd.resourceName))
		ctnAttrs = []tmCtnAttribute{
			{
				ctnName:       "gu-container",
				cpuRequest:    "2000m",
				cpuLimit:      "2000m",
				deviceName:    sd.resourceName,
				deviceRequest: "1",
				deviceLimit:   "1",
			},
		}
		runTopologyManagerPositiveTest(ctx, f, 2, ctnAttrs, initCtnAttrs, envInfo)

		// testing more complex conditions requires knowledge about the system cpu+bus topology
	}

	// multi-container tests
	if sd.resourceAmount >= 4 {
		ginkgo.By(fmt.Sprintf("Successfully admit a guaranteed pod requesting two containers, each with 2 cores, 1 %s device", sd.resourceName))
		ctnAttrs = []tmCtnAttribute{
			{
				ctnName:       "gu-container-0",
				cpuRequest:    "2000m",
				cpuLimit:      "2000m",
				deviceName:    sd.resourceName,
				deviceRequest: "1",
				deviceLimit:   "1",
			},
			{
				ctnName:       "gu-container-1",
				cpuRequest:    "2000m",
				cpuLimit:      "2000m",
				deviceName:    sd.resourceName,
				deviceRequest: "1",
				deviceLimit:   "1",
			},
		}
		runTopologyManagerPositiveTest(ctx, f, 1, ctnAttrs, initCtnAttrs, envInfo)

		ginkgo.By(fmt.Sprintf("Successfully admit two guaranteed pods, each with two containers, each with 1 core, 1 %s device", sd.resourceName))
		ctnAttrs = []tmCtnAttribute{
			{
				ctnName:       "gu-container-0",
				cpuRequest:    "1000m",
				cpuLimit:      "1000m",
				deviceName:    sd.resourceName,
				deviceRequest: "1",
				deviceLimit:   "1",
			},
			{
				ctnName:       "gu-container-1",
				cpuRequest:    "1000m",
				cpuLimit:      "1000m",
				deviceName:    sd.resourceName,
				deviceRequest: "1",
				deviceLimit:   "1",
			},
		}
		runTopologyManagerPositiveTest(ctx, f, 2, ctnAttrs, initCtnAttrs, envInfo)

		ginkgo.By(fmt.Sprintf("Successfully admit two guaranteed pods, each with two containers, both with 2 cores, one with 1 %s device", sd.resourceName))
		ctnAttrs = []tmCtnAttribute{
			{
				ctnName:       "gu-container-dev",
				cpuRequest:    "2000m",
				cpuLimit:      "2000m",
				deviceName:    sd.resourceName,
				deviceRequest: "1",
				deviceLimit:   "1",
			},
			{
				ctnName:    "gu-container-nodev",
				cpuRequest: "2000m",
				cpuLimit:   "2000m",
			},
		}
		runTopologyManagerPositiveTest(ctx, f, 2, ctnAttrs, initCtnAttrs, envInfo)
	}

	// this is the only policy that can guarantee reliable rejects
	if policy == topologymanager.PolicySingleNumaNode {
		// overflow NUMA node capacity: cores
		numCores := 1 + (threadsPerCore * coreCount)
		excessCoresReq := fmt.Sprintf("%dm", numCores*1000)
		ginkgo.By(fmt.Sprintf("Trying to admit a guaranteed pod, with %d cores, 1 %s device - and it should be rejected", numCores, sd.resourceName))
		ctnAttrs = []tmCtnAttribute{
			{
				ctnName:       "gu-container",
				cpuRequest:    excessCoresReq,
				cpuLimit:      excessCoresReq,
				deviceName:    sd.resourceName,
				deviceRequest: "1",
				deviceLimit:   "1",
			},
		}
		runTopologyManagerNegativeTest(ctx, f, ctnAttrs, initCtnAttrs, envInfo)
	}
}

func runTopologyManagerTests(f *framework.Framework) {
	var oldCfg *kubeletconfig.KubeletConfiguration
	var err error

	var policies = []string{
		topologymanager.PolicySingleNumaNode,
		topologymanager.PolicyRestricted,
		topologymanager.PolicyBestEffort,
		topologymanager.PolicyNone,
	}

	ginkgo.It("run Topology Manager policy test suite", func(ctx context.Context) {
		oldCfg, err = getCurrentKubeletConfig(ctx)
		framework.ExpectNoError(err)

		scope := containerScopeTopology
		for _, policy := range policies {
			// Configure Topology Manager
			ginkgo.By(fmt.Sprintf("by configuring Topology Manager policy to %s", policy))
			framework.Logf("Configuring Topology Manager policy to %s", policy)

			newCfg, _ := configureTopologyManagerInKubelet(oldCfg, policy, scope, nil, 0)
			updateKubeletConfig(ctx, f, newCfg, true)
			// Run the tests
			runTopologyManagerPolicySuiteTests(ctx, f)
		}
	})

	ginkgo.It("run Topology Manager node alignment test suite", func(ctx context.Context) {
		numaNodes, coreCount := hostPrecheck()

		configMap := getSRIOVDevicePluginConfigMap(framework.TestContext.SriovdpConfigMapFile)

		oldCfg, err = getCurrentKubeletConfig(ctx)
		framework.ExpectNoError(err)

		sd := setupSRIOVConfigOrFail(ctx, f, configMap)
		ginkgo.DeferCleanup(teardownSRIOVConfigOrFail, f, sd)

		scope := containerScopeTopology
		for _, policy := range policies {
			// Configure Topology Manager
			ginkgo.By(fmt.Sprintf("by configuring Topology Manager policy to %s", policy))
			framework.Logf("Configuring Topology Manager policy to %s", policy)

			newCfg, reservedSystemCPUs := configureTopologyManagerInKubelet(oldCfg, policy, scope, configMap, numaNodes)
			updateKubeletConfig(ctx, f, newCfg, true)

			runTopologyManagerNodeAlignmentSuiteTests(ctx, f, sd, reservedSystemCPUs, policy, numaNodes, coreCount)
		}
	})

	ginkgo.It("run the Topology Manager pod scope alignment test suite", func(ctx context.Context) {
		numaNodes, coreCount := hostPrecheck()

		configMap := getSRIOVDevicePluginConfigMap(framework.TestContext.SriovdpConfigMapFile)

		oldCfg, err = getCurrentKubeletConfig(ctx)
		framework.ExpectNoError(err)

		policy := topologymanager.PolicySingleNumaNode
		scope := podScopeTopology

		newCfg, reservedSystemCPUs := configureTopologyManagerInKubelet(oldCfg, policy, scope, configMap, numaNodes)
		updateKubeletConfig(ctx, f, newCfg, true)

		runTMScopeResourceAlignmentTestSuite(ctx, f, configMap, reservedSystemCPUs, policy, numaNodes, coreCount)
	})

	ginkgo.AfterEach(func(ctx context.Context) {
		if oldCfg != nil {
			// restore kubelet config
			updateKubeletConfig(ctx, f, oldCfg, true)
		}
	})
}

func hostPrecheck() (int, int) {
	// this is a very rough check. We just want to rule out systems that do NOT have
	// any SRIOV device. A more thorough check will be done in runTopologyManagerPositiveTest.

	numaNodes := detectNUMANodes()
	if numaNodes < minNumaNodes {
		e2eskipper.Skipf("this test is intended to be run on a multi-node NUMA system")
	}

	coreCount := detectCoresPerSocket()
	if coreCount < minCoreCount {
		e2eskipper.Skipf("this test is intended to be run on a system with at least %d cores per socket", minCoreCount)
	}

	requireSRIOVDevices()

	return numaNodes, coreCount
}

// Serial because the test updates kubelet configuration.
var _ = SIGDescribe("Topology Manager", framework.WithSerial(), feature.TopologyManager, func() {
	f := framework.NewDefaultFramework("topology-manager-test")
	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged

	ginkgo.Context("With kubeconfig updated to static CPU Manager policy run the Topology Manager tests", func() {
		runTopologyManagerTests(f)
	})
})