k8s.io/kubernetes@v1.29.3/test/e2e_node/topology_manager_test.go

     1  /*
     2  Copyright 2019 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package e2enode
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"os"
    23  	"os/exec"
    24  	"regexp"
    25  	"strconv"
    26  	"strings"
    27  	"sync"
    28  	"time"
    29  
    30  	v1 "k8s.io/api/core/v1"
    31  	"k8s.io/apimachinery/pkg/api/resource"
    32  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    33  	"k8s.io/apimachinery/pkg/runtime"
    34  	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
    35  	"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
    36  	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
    37  	admissionapi "k8s.io/pod-security-admission/api"
    38  
    39  	"k8s.io/kubernetes/test/e2e/feature"
    40  	"k8s.io/kubernetes/test/e2e/framework"
    41  	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
    42  	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
    43  	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
    44  	e2etestfiles "k8s.io/kubernetes/test/e2e/framework/testfiles"
    45  	testutils "k8s.io/kubernetes/test/utils"
    46  
    47  	"github.com/onsi/ginkgo/v2"
    48  	"github.com/onsi/gomega"
    49  )
    50  
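        // The alignment commands below export the container's allowed CPU list
        // (Cpus_allowed_list from /proc/self/status) as CPULIST_ALLOWED and dump the
        // environment, so the tests can later parse the pod logs and map the assigned
        // CPUs to NUMA nodes. The "sleep" variant keeps the container running so the
        // pod stays up while its alignment is validated.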
    51  const (
    52  	numaAlignmentCommand      = `export CPULIST_ALLOWED=$( awk -F":\t*" '/Cpus_allowed_list/ { print $2 }' /proc/self/status); env;`
    53  	numaAlignmentSleepCommand = numaAlignmentCommand + `sleep 1d;`
    54  	podScopeTopology          = "pod"
    55  	containerScopeTopology    = "container"
    56  
    57  	minNumaNodes     = 2
    58  	minCoreCount     = 4
    59  	minSriovResource = 7 // This is the min number of SRIOV VFs needed on the system under test.
    60  )
    61  
    62  // Helper for makeTopologyManagerTestPod().
    63  type tmCtnAttribute struct {
    64  	ctnName       string
    65  	cpuRequest    string
    66  	cpuLimit      string
    67  	deviceName    string
    68  	deviceRequest string
    69  	deviceLimit   string
    70  }
    71  
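        // detectNUMANodes, detectCoresPerSocket and detectThreadPerCore shell out to
        // lscpu on the node under test and parse the corresponding fields. The NUMA
        // node and core counts are used by hostPrecheck to decide whether the host is
        // suitable for these tests and to size the CPU requests of the test pods.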
    72  func detectNUMANodes() int {
    73  	outData, err := exec.Command("/bin/sh", "-c", "lscpu | grep \"NUMA node(s):\" | cut -d \":\" -f 2").Output()
    74  	framework.ExpectNoError(err)
    75  
    76  	numaNodes, err := strconv.Atoi(strings.TrimSpace(string(outData)))
    77  	framework.ExpectNoError(err)
    78  
    79  	return numaNodes
    80  }
    81  
    82  func detectCoresPerSocket() int {
    83  	outData, err := exec.Command("/bin/sh", "-c", "lscpu | grep \"Core(s) per socket:\" | cut -d \":\" -f 2").Output()
    84  	framework.ExpectNoError(err)
    85  
    86  	coreCount, err := strconv.Atoi(strings.TrimSpace(string(outData)))
    87  	framework.ExpectNoError(err)
    88  
    89  	return coreCount
    90  }
    91  
    92  func detectThreadPerCore() int {
    93  	outData, err := exec.Command("/bin/sh", "-c", "lscpu | grep \"Thread(s) per core:\" | cut -d \":\" -f 2").Output()
    94  	framework.ExpectNoError(err)
    95  
    96  	threadCount, err := strconv.Atoi(strings.TrimSpace(string(outData)))
    97  	framework.ExpectNoError(err)
    98  
    99  	return threadCount
   100  }
   101  
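        // makeContainers builds one container spec per attribute set. CPU requests and
        // limits are set equal (and memory is fixed at 100Mi) so the resulting pods are
        // in the Guaranteed QoS class and are eligible for exclusive CPU assignment by
        // the static CPU Manager; an optional device resource is added when deviceName
        // is set.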
   102  func makeContainers(ctnCmd string, ctnAttributes []tmCtnAttribute) (ctns []v1.Container) {
   103  	for _, ctnAttr := range ctnAttributes {
   104  		ctn := v1.Container{
   105  			Name:  ctnAttr.ctnName,
   106  			Image: busyboxImage,
   107  			Resources: v1.ResourceRequirements{
   108  				Requests: v1.ResourceList{
   109  					v1.ResourceName(v1.ResourceCPU):    resource.MustParse(ctnAttr.cpuRequest),
   110  					v1.ResourceName(v1.ResourceMemory): resource.MustParse("100Mi"),
   111  				},
   112  				Limits: v1.ResourceList{
   113  					v1.ResourceName(v1.ResourceCPU):    resource.MustParse(ctnAttr.cpuLimit),
   114  					v1.ResourceName(v1.ResourceMemory): resource.MustParse("100Mi"),
   115  				},
   116  			},
   117  			Command: []string{"sh", "-c", ctnCmd},
   118  		}
   119  		if ctnAttr.deviceName != "" {
   120  			ctn.Resources.Requests[v1.ResourceName(ctnAttr.deviceName)] = resource.MustParse(ctnAttr.deviceRequest)
   121  			ctn.Resources.Limits[v1.ResourceName(ctnAttr.deviceName)] = resource.MustParse(ctnAttr.deviceLimit)
   122  		}
   123  		ctns = append(ctns, ctn)
   124  	}
   125  	return
   126  }
   127  
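        // makeTopologyManagerTestPod assembles a never-restarting pod: init containers
        // (if any) run numaAlignmentCommand and exit, while app containers run
        // numaAlignmentSleepCommand so their logs stay available while the pod keeps
        // running.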
   128  func makeTopologyManagerTestPod(podName string, tmCtnAttributes, tmInitCtnAttributes []tmCtnAttribute) *v1.Pod {
   129  	var containers, initContainers []v1.Container
   130  	if len(tmInitCtnAttributes) > 0 {
   131  		initContainers = makeContainers(numaAlignmentCommand, tmInitCtnAttributes)
   132  	}
   133  	containers = makeContainers(numaAlignmentSleepCommand, tmCtnAttributes)
   134  
   135  	return &v1.Pod{
   136  		ObjectMeta: metav1.ObjectMeta{
   137  			Name: podName,
   138  		},
   139  		Spec: v1.PodSpec{
   140  			RestartPolicy:  v1.RestartPolicyNever,
   141  			InitContainers: initContainers,
   142  			Containers:     containers,
   143  		},
   144  	}
   145  }
   146  
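        // findNUMANodeWithoutSRIOVDevicesFromConfigMap looks for per-NUMA-node PCI
        // device counts published as "pcidevice_node<N>" annotations on the SRIOV
        // device plugin ConfigMap, and returns the first node with a count of zero.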
   147  func findNUMANodeWithoutSRIOVDevicesFromConfigMap(configMap *v1.ConfigMap, numaNodes int) (int, bool) {
   148  	for nodeNum := 0; nodeNum < numaNodes; nodeNum++ {
   149  		value, ok := configMap.Annotations[fmt.Sprintf("pcidevice_node%d", nodeNum)]
   150  		if !ok {
   151  			framework.Logf("missing pcidevice annotation for NUMA node %d", nodeNum)
   152  			return -1, false
   153  		}
   154  		v, err := strconv.Atoi(value)
   155  		if err != nil {
   156  			framework.Failf("error getting the PCI device count on NUMA node %d: %v", nodeNum, err)
   157  		}
   158  		if v == 0 {
   159  			framework.Logf("NUMA node %d has no SRIOV devices attached", nodeNum)
   160  			return nodeNum, true
   161  		}
   162  		framework.Logf("NUMA node %d has %d SRIOV devices attached", nodeNum, v)
   163  	}
   164  	return -1, false
   165  }
   166  
   167  func findNUMANodeWithoutSRIOVDevicesFromSysfs(numaNodes int) (int, bool) {
   168  	pciDevs, err := getPCIDeviceInfo("/sys/bus/pci/devices")
   169  	if err != nil {
   170  		framework.Failf("error detecting the PCI device NUMA node: %v", err)
   171  	}
   172  
   173  	pciPerNuma := make(map[int]int)
   174  	for _, pciDev := range pciDevs {
   175  		if pciDev.IsVFn {
   176  			pciPerNuma[pciDev.NUMANode]++
   177  		}
   178  	}
   179  
   180  	if len(pciPerNuma) == 0 {
   181  		framework.Logf("failed to find any VF device from %v", pciDevs)
   182  		return -1, false
   183  	}
   184  
   185  	for nodeNum := 0; nodeNum < numaNodes; nodeNum++ {
   186  		v := pciPerNuma[nodeNum]
   187  		if v == 0 {
   188  			framework.Logf("NUMA node %d has no SRIOV devices attached", nodeNum)
   189  			return nodeNum, true
   190  		}
   191  		framework.Logf("NUMA node %d has %d SRIOV devices attached", nodeNum, v)
   192  	}
   193  	return -1, false
   194  }
   195  
   196  func findNUMANodeWithoutSRIOVDevices(configMap *v1.ConfigMap, numaNodes int) (int, bool) {
   197  	// if someone annotated the configMap, let's use this information
   198  	if nodeNum, found := findNUMANodeWithoutSRIOVDevicesFromConfigMap(configMap, numaNodes); found {
   199  		return nodeNum, found
   200  	}
   201  	// no annotations, try to autodetect
   202  	// NOTE: this assumes all the VFs in the box can be used for the tests.
   203  	return findNUMANodeWithoutSRIOVDevicesFromSysfs(numaNodes)
   204  }
   205  
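        // configureTopologyManagerInKubelet returns a copy of the kubelet configuration
        // with the requested Topology Manager policy and scope, the static CPU Manager
        // policy, and a CPU reservation. It also returns the value chosen for
        // reservedSystemCPUs, which is empty when the kube-reserved fallback is used.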
   206  func configureTopologyManagerInKubelet(oldCfg *kubeletconfig.KubeletConfiguration, policy, scope string, configMap *v1.ConfigMap, numaNodes int) (*kubeletconfig.KubeletConfiguration, string) {
   207  	// Configure Topology Manager in Kubelet with policy.
   208  	newCfg := oldCfg.DeepCopy()
   209  	if newCfg.FeatureGates == nil {
   210  		newCfg.FeatureGates = make(map[string]bool)
   211  	}
   212  
   213  	// Set the Topology Manager policy
   214  	newCfg.TopologyManagerPolicy = policy
   215  
   216  	newCfg.TopologyManagerScope = scope
   217  
   218  	// Set the CPU Manager policy to static.
   219  	newCfg.CPUManagerPolicy = string(cpumanager.PolicyStatic)
   220  
   221  	// Set the CPU Manager reconcile period to 1 second.
   222  	newCfg.CPUManagerReconcilePeriod = metav1.Duration{Duration: 1 * time.Second}
   223  
   224  	if nodeNum, ok := findNUMANodeWithoutSRIOVDevices(configMap, numaNodes); ok {
   225  		cpus, err := getCPUsPerNUMANode(nodeNum)
   226  		framework.ExpectNoError(err)
   227  		framework.Logf("NUMA Node %d doesn't seem to have attached SRIOV devices and has cpus=%v", nodeNum, cpus)
   228  		newCfg.ReservedSystemCPUs = fmt.Sprintf("%d", cpus[len(cpus)-1])
   229  	} else {
   230  		// The Kubelet panics if neither kube-reserved nor system-reserved provides
   231  		// any CPU reservation when the static CPU Manager policy is enabled. Set cpu
   232  		// in kube-reserved > 0 so that the kubelet doesn't panic.
   233  		if newCfg.KubeReserved == nil {
   234  			newCfg.KubeReserved = map[string]string{}
   235  		}
   236  
   237  		if _, ok := newCfg.KubeReserved["cpu"]; !ok {
   238  			newCfg.KubeReserved["cpu"] = "200m"
   239  		}
   240  	}
   241  	// Dump the config for debugging.
   242  	framework.Logf("New kubelet config is %+v", *newCfg)
   243  
   244  	return newCfg, newCfg.ReservedSystemCPUs
   245  }
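
        // For reference, a minimal sketch (field values are illustrative, not taken
        // from a real run) of the KubeletConfiguration fragment this function ends up
        // producing:
        //
        //	topologyManagerPolicy: single-numa-node
        //	topologyManagerScope: container
        //	cpuManagerPolicy: static
        //	cpuManagerReconcilePeriod: 1s
        //	reservedSystemCPUs: "7"    # or, alternatively, kubeReserved: {cpu: 200m}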
   246  
   247  // getSRIOVDevicePluginPod returns the Device Plugin pod for sriov resources in e2e tests.
   248  func getSRIOVDevicePluginPod() *v1.Pod {
   249  	data, err := e2etestfiles.Read(SRIOVDevicePluginDSYAML)
   250  	if err != nil {
   251  		framework.Fail(err.Error())
   252  	}
   253  
   254  	ds := readDaemonSetV1OrDie(data)
   255  	p := &v1.Pod{
   256  		ObjectMeta: metav1.ObjectMeta{
   257  			Name:      SRIOVDevicePluginName,
   258  			Namespace: metav1.NamespaceSystem,
   259  		},
   260  
   261  		Spec: ds.Spec.Template.Spec,
   262  	}
   263  
   264  	return p
   265  }
   266  
   267  func readConfigMapV1OrDie(objBytes []byte) *v1.ConfigMap {
   268  	v1.AddToScheme(appsScheme)
   269  	requiredObj, err := runtime.Decode(appsCodecs.UniversalDecoder(v1.SchemeGroupVersion), objBytes)
   270  	if err != nil {
   271  		panic(err)
   272  	}
   273  	return requiredObj.(*v1.ConfigMap)
   274  }
   275  
   276  func readServiceAccountV1OrDie(objBytes []byte) *v1.ServiceAccount {
   277  	v1.AddToScheme(appsScheme)
   278  	requiredObj, err := runtime.Decode(appsCodecs.UniversalDecoder(v1.SchemeGroupVersion), objBytes)
   279  	if err != nil {
   280  		panic(err)
   281  	}
   282  	return requiredObj.(*v1.ServiceAccount)
   283  }
   284  
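        // findSRIOVResource scans the node's allocatable resources for a name matching
        // intel.com/*sriov* and returns the first one with a positive quantity, together
        // with that quantity.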
   285  func findSRIOVResource(node *v1.Node) (string, int64) {
   286  	framework.Logf("Node status allocatable: %v", node.Status.Allocatable)
   287  	re := regexp.MustCompile(`^intel.com/.*sriov.*`)
   288  	for key, val := range node.Status.Allocatable {
   289  		resource := string(key)
   290  		if re.MatchString(resource) {
   291  			v := val.Value()
   292  			if v > 0 {
   293  				return resource, v
   294  			}
   295  		}
   296  	}
   297  	return "", 0
   298  }
   299  
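        // validatePodAlignment checks, for every app container of the pod, that its CPU
        // (and, where applicable, device) allocation is NUMA-aligned. It does so by
        // reading the container logs (the environment dumped by numaAlignmentCommand)
        // and feeding them to checkNUMAAlignment.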
   300  func validatePodAlignment(ctx context.Context, f *framework.Framework, pod *v1.Pod, envInfo *testEnvInfo) {
   301  	for _, cnt := range pod.Spec.Containers {
   302  		ginkgo.By(fmt.Sprintf("validating the container %s on Gu pod %s", cnt.Name, pod.Name))
   303  
   304  		logs, err := e2epod.GetPodLogs(ctx, f.ClientSet, f.Namespace.Name, pod.Name, cnt.Name)
   305  		framework.ExpectNoError(err, "expected log not found in container [%s] of pod [%s]", cnt.Name, pod.Name)
   306  
   307  		framework.Logf("got pod logs: %v", logs)
   308  		numaRes, err := checkNUMAAlignment(f, pod, &cnt, logs, envInfo)
   309  		framework.ExpectNoError(err, "NUMA Alignment check failed for [%s] of pod [%s]", cnt.Name, pod.Name)
   310  		if numaRes != nil {
   311  			framework.Logf("NUMA resources for %s/%s: %s", pod.Name, cnt.Name, numaRes.String())
   312  		}
   313  	}
   314  }
   315  
   316  // validatePodAlignmentWithPodScope validates whether all of the pod's CPUs are affined to the same NUMA node.
   317  func validatePodAlignmentWithPodScope(ctx context.Context, f *framework.Framework, pod *v1.Pod, envInfo *testEnvInfo) error {
   318  	// Mapping between CPU IDs and NUMA node IDs.
   319  	podsNUMA := make(map[int]int)
   320  
   321  	ginkgo.By(fmt.Sprintf("validate pod scope alignment for %s pod", pod.Name))
   322  	for _, cnt := range pod.Spec.Containers {
   323  		logs, err := e2epod.GetPodLogs(ctx, f.ClientSet, f.Namespace.Name, pod.Name, cnt.Name)
   324  		framework.ExpectNoError(err, "NUMA alignment failed for container [%s] of pod [%s]", cnt.Name, pod.Name)
   325  		envMap, err := makeEnvMap(logs)
   326  		framework.ExpectNoError(err, "NUMA alignment failed for container [%s] of pod [%s]", cnt.Name, pod.Name)
   327  		cpuToNUMA, err := getCPUToNUMANodeMapFromEnv(f, pod, &cnt, envMap, envInfo.numaNodes)
   328  		framework.ExpectNoError(err, "NUMA alignment failed for container [%s] of pod [%s]", cnt.Name, pod.Name)
   329  		for cpuID, numaID := range cpuToNUMA {
   330  			podsNUMA[cpuID] = numaID
   331  		}
   332  	}
   333  
   334  	numaRes := numaPodResources{
   335  		CPUToNUMANode: podsNUMA,
   336  	}
   337  	aligned := numaRes.CheckAlignment()
   338  	if !aligned {
   339  		return fmt.Errorf("resources were assigned from different NUMA nodes")
   340  	}
   341  
   342  	framework.Logf("NUMA locality confirmed: all of the pod's CPUs are aligned to the same NUMA node")
   343  	return nil
   344  }
   345  
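        // runTopologyManagerPolicySuiteTests reuses the CPU Manager suite helpers
        // (runGuPodTest and friends) to run a basic set of Guaranteed and non-Guaranteed
        // pod tests under the currently configured Topology Manager policy.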
   346  func runTopologyManagerPolicySuiteTests(ctx context.Context, f *framework.Framework) {
   347  	var cpuCap, cpuAlloc int64
   348  
   349  	cpuCap, cpuAlloc, _ = getLocalNodeCPUDetails(ctx, f)
   350  	ginkgo.By(fmt.Sprintf("checking node CPU capacity (%d) and allocatable CPUs (%d)", cpuCap, cpuAlloc))
   351  
   352  	// Even the weakest CI machines usually have 2 CPUs, but let's be extra careful and
   353  	// check explicitly. We prefer to skip rather than report a false negative (and a failed test).
   354  	if cpuAlloc < 1 {
   355  		e2eskipper.Skipf("Skipping basic CPU Manager tests since allocatable CPUs < 1")
   356  	}
   357  
   358  	ginkgo.By("running a non-Gu pod")
   359  	runNonGuPodTest(ctx, f, cpuCap)
   360  
   361  	ginkgo.By("running a Gu pod")
   362  	runGuPodTest(ctx, f, 1)
   363  
   364  	// Skip the rest of the tests if allocatable CPUs < 3.
   365  	if cpuAlloc < 3 {
   366  		e2eskipper.Skipf("Skipping the rest of the CPU Manager tests since allocatable CPUs < 3")
   367  	}
   368  
   369  	ginkgo.By("running multiple Gu and non-Gu pods")
   370  	runMultipleGuNonGuPods(ctx, f, cpuCap, cpuAlloc)
   371  
   372  	ginkgo.By("running a Gu pod requesting multiple CPUs")
   373  	runMultipleCPUGuPod(ctx, f)
   374  
   375  	ginkgo.By("running a Gu pod with multiple containers requesting integer CPUs")
   376  	runMultipleCPUContainersGuPod(ctx, f)
   377  
   378  	ginkgo.By("running multiple Gu pods")
   379  	runMultipleGuPods(ctx, f)
   380  }
   381  
   382  func runTopologyManagerPositiveTest(ctx context.Context, f *framework.Framework, numPods int, ctnAttrs, initCtnAttrs []tmCtnAttribute, envInfo *testEnvInfo) {
   383  	podMap := make(map[string]*v1.Pod)
   384  
   385  	for podID := 0; podID < numPods; podID++ {
   386  		podName := fmt.Sprintf("gu-pod-%d", podID)
   387  		framework.Logf("creating pod %s attrs %v", podName, ctnAttrs)
   388  		pod := makeTopologyManagerTestPod(podName, ctnAttrs, initCtnAttrs)
   389  		pod = e2epod.NewPodClient(f).CreateSync(ctx, pod)
   390  		framework.Logf("created pod %s", podName)
   391  		podMap[podName] = pod
   392  	}
   393  
   394  	// per https://github.com/kubernetes/enhancements/blob/master/keps/sig-node/693-topology-manager/README.md#multi-numa-systems-tests
   395  	// we can do a meaningful validation only when using the single-numa-node policy
   396  	if envInfo.policy == topologymanager.PolicySingleNumaNode {
   397  		for _, pod := range podMap {
   398  			validatePodAlignment(ctx, f, pod, envInfo)
   399  		}
   400  		if envInfo.scope == podScopeTopology {
   401  			for _, pod := range podMap {
   402  				err := validatePodAlignmentWithPodScope(ctx, f, pod, envInfo)
   403  				framework.ExpectNoError(err)
   404  			}
   405  		}
   406  	}
   407  
   408  	deletePodsAsync(ctx, f, podMap)
   409  }
   410  
   411  func deletePodsAsync(ctx context.Context, f *framework.Framework, podMap map[string]*v1.Pod) {
   412  	var wg sync.WaitGroup
   413  	for _, pod := range podMap {
   414  		wg.Add(1)
   415  		go func(podNS, podName string) {
   416  			defer ginkgo.GinkgoRecover()
   417  			defer wg.Done()
   418  
   419  			deletePodSyncByName(ctx, f, podName)
   420  			waitForAllContainerRemoval(ctx, podName, podNS)
   421  		}(pod.Namespace, pod.Name)
   422  	}
   423  	wg.Wait()
   424  }
   425  
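        // runTopologyManagerNegativeTest creates a guaranteed pod whose requests cannot be
        // aligned, waits for it to leave the Pending phase, and expects it to end up Failed
        // with a Topology Affinity error as the reason.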
   426  func runTopologyManagerNegativeTest(ctx context.Context, f *framework.Framework, ctnAttrs, initCtnAttrs []tmCtnAttribute, envInfo *testEnvInfo) {
   427  	podName := "gu-pod"
   428  	framework.Logf("creating pod %s attrs %v", podName, ctnAttrs)
   429  	pod := makeTopologyManagerTestPod(podName, ctnAttrs, initCtnAttrs)
   430  
   431  	pod = e2epod.NewPodClient(f).Create(ctx, pod)
   432  	err := e2epod.WaitForPodCondition(ctx, f.ClientSet, f.Namespace.Name, pod.Name, "Failed", 30*time.Second, func(pod *v1.Pod) (bool, error) {
   433  		if pod.Status.Phase != v1.PodPending {
   434  			return true, nil
   435  		}
   436  		return false, nil
   437  	})
   438  	framework.ExpectNoError(err)
   439  	pod, err = e2epod.NewPodClient(f).Get(ctx, pod.Name, metav1.GetOptions{})
   440  	framework.ExpectNoError(err)
   441  
   442  	if pod.Status.Phase != v1.PodFailed {
   443  		framework.Failf("pod %s not failed: %v", pod.Name, pod.Status)
   444  	}
   445  	if !isTopologyAffinityError(pod) {
   446  		framework.Failf("pod %s failed for wrong reason: %q", pod.Name, pod.Status.Reason)
   447  	}
   448  
   449  	deletePodSyncByName(ctx, f, pod.Name)
   450  }
   451  
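        // isTopologyAffinityError matches the pod status reason set by the kubelet when
        // Topology Manager admission fails (a "TopologyAffinityError"-style reason); the
        // regexp is kept loose on purpose.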
   452  func isTopologyAffinityError(pod *v1.Pod) bool {
   453  	re := regexp.MustCompile(`Topology.*Affinity.*Error`)
   454  	return re.MatchString(pod.Status.Reason)
   455  }
   456  
   457  func getSRIOVDevicePluginConfigMap(cmFile string) *v1.ConfigMap {
   458  	data, err := e2etestfiles.Read(SRIOVDevicePluginCMYAML)
   459  	if err != nil {
   460  		framework.Fail(err.Error())
   461  	}
   462  
   463  	// the SRIOVDP configuration is hw-dependent, so we allow per-test-host customization.
   464  	framework.Logf("host-local SRIOV Device Plugin Config Map %q", cmFile)
   465  	if cmFile != "" {
   466  		data, err = os.ReadFile(cmFile)
   467  		if err != nil {
   468  			framework.Failf("unable to load the SRIOV Device Plugin ConfigMap: %v", err)
   469  		}
   470  	} else {
   471  		framework.Logf("Using built-in SRIOV Device Plugin Config Map")
   472  	}
   473  
   474  	return readConfigMapV1OrDie(data)
   475  }
   476  
   477  type sriovData struct {
   478  	configMap      *v1.ConfigMap
   479  	serviceAccount *v1.ServiceAccount
   480  	pod            *v1.Pod
   481  
   482  	resourceName   string
   483  	resourceAmount int64
   484  }
   485  
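        // setupSRIOVConfigOrFail creates the SRIOV device plugin ConfigMap and
        // ServiceAccount in kube-system, waits for the node to be ready again, and then
        // starts the device plugin pod on the node under test.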
   486  func setupSRIOVConfigOrFail(ctx context.Context, f *framework.Framework, configMap *v1.ConfigMap) *sriovData {
   487  	sd := createSRIOVConfigOrFail(ctx, f, configMap)
   488  
   489  	e2enode.WaitForNodeToBeReady(ctx, f.ClientSet, framework.TestContext.NodeName, 5*time.Minute)
   490  
   491  	sd.pod = createSRIOVPodOrFail(ctx, f)
   492  	return sd
   493  }
   494  
   495  func createSRIOVConfigOrFail(ctx context.Context, f *framework.Framework, configMap *v1.ConfigMap) *sriovData {
   496  	var err error
   497  
   498  	ginkgo.By(fmt.Sprintf("Creating configMap %v/%v", metav1.NamespaceSystem, configMap.Name))
   499  	if _, err = f.ClientSet.CoreV1().ConfigMaps(metav1.NamespaceSystem).Create(ctx, configMap, metav1.CreateOptions{}); err != nil {
   500  		framework.Failf("unable to create test configMap %s: %v", configMap.Name, err)
   501  	}
   502  
   503  	data, err := e2etestfiles.Read(SRIOVDevicePluginSAYAML)
   504  	if err != nil {
   505  		framework.Fail(err.Error())
   506  	}
   507  	serviceAccount := readServiceAccountV1OrDie(data)
   508  	ginkgo.By(fmt.Sprintf("Creating serviceAccount %v/%v", metav1.NamespaceSystem, serviceAccount.Name))
   509  	if _, err = f.ClientSet.CoreV1().ServiceAccounts(metav1.NamespaceSystem).Create(ctx, serviceAccount, metav1.CreateOptions{}); err != nil {
   510  		framework.Failf("unable to create test serviceAccount %s: %v", serviceAccount.Name, err)
   511  	}
   512  
   513  	return &sriovData{
   514  		configMap:      configMap,
   515  		serviceAccount: serviceAccount,
   516  	}
   517  }
   518  
   519  func createSRIOVPodOrFail(ctx context.Context, f *framework.Framework) *v1.Pod {
   520  	dp := getSRIOVDevicePluginPod()
   521  	dp.Spec.NodeName = framework.TestContext.NodeName
   522  
   523  	ginkgo.By("Create SRIOV device plugin pod")
   524  	dpPod, err := f.ClientSet.CoreV1().Pods(metav1.NamespaceSystem).Create(ctx, dp, metav1.CreateOptions{})
   525  	framework.ExpectNoError(err)
   526  
   527  	if err = e2epod.WaitForPodCondition(ctx, f.ClientSet, metav1.NamespaceSystem, dp.Name, "Ready", 120*time.Second, testutils.PodRunningReady); err != nil {
   528  		framework.Logf("SRIOV Pod %v took too long to enter running/ready: %v", dp.Name, err)
   529  	}
   530  	framework.ExpectNoError(err)
   531  
   532  	return dpPod
   533  }
   534  
   535  // waitForSRIOVResources waits until enough SRIOV resources are available, expecting to complete within the timeout.
   536  // If it exits successfully, it updates the sriovData with the resources that were found.
   537  func waitForSRIOVResources(ctx context.Context, f *framework.Framework, sd *sriovData) {
   538  	sriovResourceName := ""
   539  	var sriovResourceAmount int64
   540  	ginkgo.By("Waiting for devices to become available on the local node")
   541  	gomega.Eventually(ctx, func(ctx context.Context) bool {
   542  		node := getLocalNode(ctx, f)
   543  		sriovResourceName, sriovResourceAmount = findSRIOVResource(node)
   544  		return sriovResourceAmount > minSriovResource
   545  	}, 2*time.Minute, framework.Poll).Should(gomega.BeTrue())
   546  
   547  	sd.resourceName = sriovResourceName
   548  	sd.resourceAmount = sriovResourceAmount
   549  	framework.Logf("Detected SRIOV allocatable devices name=%q amount=%d", sd.resourceName, sd.resourceAmount)
   550  }
   551  
   552  func deleteSRIOVPodOrFail(ctx context.Context, f *framework.Framework, sd *sriovData) {
   553  	var err error
   554  	gp := int64(0)
   555  	deleteOptions := metav1.DeleteOptions{
   556  		GracePeriodSeconds: &gp,
   557  	}
   558  
   559  	ginkgo.By(fmt.Sprintf("Delete SRIOV device plugin pod %s/%s", sd.pod.Namespace, sd.pod.Name))
   560  	err = f.ClientSet.CoreV1().Pods(sd.pod.Namespace).Delete(ctx, sd.pod.Name, deleteOptions)
   561  	framework.ExpectNoError(err)
   562  	waitForAllContainerRemoval(ctx, sd.pod.Name, sd.pod.Namespace)
   563  }
   564  
   565  func removeSRIOVConfigOrFail(ctx context.Context, f *framework.Framework, sd *sriovData) {
   566  	var err error
   567  	gp := int64(0)
   568  	deleteOptions := metav1.DeleteOptions{
   569  		GracePeriodSeconds: &gp,
   570  	}
   571  
   572  	ginkgo.By(fmt.Sprintf("Deleting configMap %v/%v", metav1.NamespaceSystem, sd.configMap.Name))
   573  	err = f.ClientSet.CoreV1().ConfigMaps(metav1.NamespaceSystem).Delete(ctx, sd.configMap.Name, deleteOptions)
   574  	framework.ExpectNoError(err)
   575  
   576  	ginkgo.By(fmt.Sprintf("Deleting serviceAccount %v/%v", metav1.NamespaceSystem, sd.serviceAccount.Name))
   577  	err = f.ClientSet.CoreV1().ServiceAccounts(metav1.NamespaceSystem).Delete(ctx, sd.serviceAccount.Name, deleteOptions)
   578  	framework.ExpectNoError(err)
   579  }
   580  
   581  func teardownSRIOVConfigOrFail(ctx context.Context, f *framework.Framework, sd *sriovData) {
   582  	deleteSRIOVPodOrFail(ctx, f, sd)
   583  	removeSRIOVConfigOrFail(ctx, f, sd)
   584  }
   585  
   586  func runTMScopeResourceAlignmentTestSuite(ctx context.Context, f *framework.Framework, configMap *v1.ConfigMap, reservedSystemCPUs, policy string, numaNodes, coreCount int) {
   587  	threadsPerCore := getSMTLevel()
   588  	sd := setupSRIOVConfigOrFail(ctx, f, configMap)
   589  	var ctnAttrs, initCtnAttrs []tmCtnAttribute
   590  
   591  	waitForSRIOVResources(ctx, f, sd)
   592  
   593  	envInfo := &testEnvInfo{
   594  		numaNodes:         numaNodes,
   595  		sriovResourceName: sd.resourceName,
   596  		policy:            policy,
   597  		scope:             podScopeTopology,
   598  	}
   599  
   600  	ginkgo.By(fmt.Sprintf("Admit two guaranteed pods, each with 2 containers; each container requests 1 CPU core and 1 %s device.", sd.resourceName))
   601  	ctnAttrs = []tmCtnAttribute{
   602  		{
   603  			ctnName:       "ps-container-0",
   604  			cpuRequest:    "1000m",
   605  			cpuLimit:      "1000m",
   606  			deviceName:    sd.resourceName,
   607  			deviceRequest: "1",
   608  			deviceLimit:   "1",
   609  		},
   610  		{
   611  			ctnName:       "ps-container-1",
   612  			cpuRequest:    "1000m",
   613  			cpuLimit:      "1000m",
   614  			deviceName:    sd.resourceName,
   615  			deviceRequest: "1",
   616  			deviceLimit:   "1",
   617  		},
   618  	}
   619  	runTopologyManagerPositiveTest(ctx, f, 2, ctnAttrs, initCtnAttrs, envInfo)
   620  
   621  	numCores := threadsPerCore * coreCount
   622  	coresReq := fmt.Sprintf("%dm", numCores*1000)
   623  	ginkgo.By(fmt.Sprintf("Admit a guaranteed pod requesting %d CPU cores in total, i.e., more than any single NUMA node can provide; the request should therefore be rejected.", numCores+1))
   624  	ctnAttrs = []tmCtnAttribute{
   625  		{
   626  			ctnName:       "gu-container-1",
   627  			cpuRequest:    coresReq,
   628  			cpuLimit:      coresReq,
   629  			deviceRequest: "1",
   630  			deviceLimit:   "1",
   631  		},
   632  		{
   633  			ctnName:       "gu-container-2",
   634  			cpuRequest:    "1000m",
   635  			cpuLimit:      "1000m",
   636  			deviceRequest: "1",
   637  			deviceLimit:   "1",
   638  		},
   639  	}
   640  	runTopologyManagerNegativeTest(ctx, f, ctnAttrs, initCtnAttrs, envInfo)
   641  
   642  	// With pod scope, the Topology Manager calculates how many CPUs it needs to admit a pod from two figures:
   643  	// the maximum CPU demand among the init containers and the sum of the app containers' CPU requests,
   644  	// and it uses the higher of the two. Both pods in the test case below should therefore be sized by the
   645  	// init container with the highest CPU demand. Since each pod's CPU demand is slightly more than half of
   646  	// the resources available on one NUMA node, the two pods should be placed on distinct NUMA nodes.
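        	// As a purely illustrative example, with numCores=16 the effective pod demand below is
        	// max(max(9, 1), 1+1) = 9 CPUs: one such pod fits on a single NUMA node, but two of them
        	// cannot share the same node.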
   647  	coresReq = fmt.Sprintf("%dm", (numCores/2+1)*1000)
   648  	ginkgo.By(fmt.Sprintf("Admit two guaranteed pods, each pod requests %d cores - the pods should be placed on different NUMA nodes", numCores/2+1))
   649  	initCtnAttrs = []tmCtnAttribute{
   650  		{
   651  			ctnName:       "init-container-1",
   652  			cpuRequest:    coresReq,
   653  			cpuLimit:      coresReq,
   654  			deviceRequest: "1",
   655  			deviceLimit:   "1",
   656  		},
   657  		{
   658  			ctnName:       "init-container-2",
   659  			cpuRequest:    "1000m",
   660  			cpuLimit:      "1000m",
   661  			deviceRequest: "1",
   662  			deviceLimit:   "1",
   663  		},
   664  	}
   665  	ctnAttrs = []tmCtnAttribute{
   666  		{
   667  			ctnName:       "gu-container-0",
   668  			cpuRequest:    "1000m",
   669  			cpuLimit:      "1000m",
   670  			deviceRequest: "1",
   671  			deviceLimit:   "1",
   672  		},
   673  		{
   674  			ctnName:       "gu-container-1",
   675  			cpuRequest:    "1000m",
   676  			cpuLimit:      "1000m",
   677  			deviceRequest: "1",
   678  			deviceLimit:   "1",
   679  		},
   680  	}
   681  	runTopologyManagerPositiveTest(ctx, f, 2, ctnAttrs, initCtnAttrs, envInfo)
   682  
   683  	teardownSRIOVConfigOrFail(ctx, f, sd)
   684  }
   685  
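        // runTopologyManagerNodeAlignmentSuiteTests exercises container-scope alignment with
        // SRIOV devices: a series of positive admission cases (single and multiple pods and
        // containers) plus, for the single-numa-node policy only, a guaranteed rejection case.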
   686  func runTopologyManagerNodeAlignmentSuiteTests(ctx context.Context, f *framework.Framework, sd *sriovData, reservedSystemCPUs, policy string, numaNodes, coreCount int) {
   687  	threadsPerCore := getSMTLevel()
   688  
   689  	waitForSRIOVResources(ctx, f, sd)
   690  
   691  	envInfo := &testEnvInfo{
   692  		numaNodes:         numaNodes,
   693  		sriovResourceName: sd.resourceName,
   694  		policy:            policy,
   695  	}
   696  
   697  	// this could have been a loop; we unroll it to make the test cases explicit
   698  	var ctnAttrs, initCtnAttrs []tmCtnAttribute
   699  
   700  	// simplest case
   701  	ginkgo.By(fmt.Sprintf("Successfully admit one guaranteed pod with 1 core, 1 %s device", sd.resourceName))
   702  	ctnAttrs = []tmCtnAttribute{
   703  		{
   704  			ctnName:       "gu-container",
   705  			cpuRequest:    "1000m",
   706  			cpuLimit:      "1000m",
   707  			deviceName:    sd.resourceName,
   708  			deviceRequest: "1",
   709  			deviceLimit:   "1",
   710  		},
   711  	}
   712  	runTopologyManagerPositiveTest(ctx, f, 1, ctnAttrs, initCtnAttrs, envInfo)
   713  
   714  	ginkgo.By(fmt.Sprintf("Successfully admit one guaranteed pod with 2 cores, 1 %s device", sd.resourceName))
   715  	ctnAttrs = []tmCtnAttribute{
   716  		{
   717  			ctnName:       "gu-container",
   718  			cpuRequest:    "2000m",
   719  			cpuLimit:      "2000m",
   720  			deviceName:    sd.resourceName,
   721  			deviceRequest: "1",
   722  			deviceLimit:   "1",
   723  		},
   724  	}
   725  	runTopologyManagerPositiveTest(ctx, f, 1, ctnAttrs, initCtnAttrs, envInfo)
   726  
   727  	if reservedSystemCPUs != "" {
   728  		// to avoid false negatives, the reserved CPUs are placed so that there is at least one NUMA node
   729  		// with 1+ SRIOV devices and no reserved CPUs.
   730  		numCores := threadsPerCore * coreCount
   731  		allCoresReq := fmt.Sprintf("%dm", numCores*1000)
   732  		ginkgo.By(fmt.Sprintf("Successfully admit an entire socket (%d cores), 1 %s device", numCores, sd.resourceName))
   733  		ctnAttrs = []tmCtnAttribute{
   734  			{
   735  				ctnName:       "gu-container",
   736  				cpuRequest:    allCoresReq,
   737  				cpuLimit:      allCoresReq,
   738  				deviceName:    sd.resourceName,
   739  				deviceRequest: "1",
   740  				deviceLimit:   "1",
   741  			},
   742  		}
   743  		runTopologyManagerPositiveTest(ctx, f, 1, ctnAttrs, initCtnAttrs, envInfo)
   744  	}
   745  
   746  	if sd.resourceAmount > 1 {
   747  		// no matter how buses are connected to NUMA nodes and how SRIOV devices are installed, this
   748  		// function's preconditions must ensure the following can be fulfilled
   749  		ginkgo.By(fmt.Sprintf("Successfully admit two guaranteed pods, each with 1 core, 1 %s device", sd.resourceName))
   750  		ctnAttrs = []tmCtnAttribute{
   751  			{
   752  				ctnName:       "gu-container",
   753  				cpuRequest:    "1000m",
   754  				cpuLimit:      "1000m",
   755  				deviceName:    sd.resourceName,
   756  				deviceRequest: "1",
   757  				deviceLimit:   "1",
   758  			},
   759  		}
   760  		runTopologyManagerPositiveTest(ctx, f, 2, ctnAttrs, initCtnAttrs, envInfo)
   761  
   762  		ginkgo.By(fmt.Sprintf("Successfully admit two guaranteed pods, each with 2 cores, 1 %s device", sd.resourceName))
   763  		ctnAttrs = []tmCtnAttribute{
   764  			{
   765  				ctnName:       "gu-container",
   766  				cpuRequest:    "2000m",
   767  				cpuLimit:      "2000m",
   768  				deviceName:    sd.resourceName,
   769  				deviceRequest: "1",
   770  				deviceLimit:   "1",
   771  			},
   772  		}
   773  		runTopologyManagerPositiveTest(ctx, f, 2, ctnAttrs, initCtnAttrs, envInfo)
   774  
   775  		// testing more complex conditions requires knowledge about the system CPU and bus topology
   776  	}
   777  
   778  	// multi-container tests
   779  	if sd.resourceAmount >= 4 {
   780  		ginkgo.By(fmt.Sprintf("Successfully admit a guaranteed pod requesting for two containers, each with 2 cores, 1 %s device", sd.resourceName))
   781  		ctnAttrs = []tmCtnAttribute{
   782  			{
   783  				ctnName:       "gu-container-0",
   784  				cpuRequest:    "2000m",
   785  				cpuLimit:      "2000m",
   786  				deviceName:    sd.resourceName,
   787  				deviceRequest: "1",
   788  				deviceLimit:   "1",
   789  			},
   790  			{
   791  				ctnName:       "gu-container-1",
   792  				cpuRequest:    "2000m",
   793  				cpuLimit:      "2000m",
   794  				deviceName:    sd.resourceName,
   795  				deviceRequest: "1",
   796  				deviceLimit:   "1",
   797  			},
   798  		}
   799  		runTopologyManagerPositiveTest(ctx, f, 1, ctnAttrs, initCtnAttrs, envInfo)
   800  
   801  		ginkgo.By(fmt.Sprintf("Successfully admit two guaranteed pods, each with two containers, each with 1 core, 1 %s device", sd.resourceName))
   802  		ctnAttrs = []tmCtnAttribute{
   803  			{
   804  				ctnName:       "gu-container-0",
   805  				cpuRequest:    "1000m",
   806  				cpuLimit:      "1000m",
   807  				deviceName:    sd.resourceName,
   808  				deviceRequest: "1",
   809  				deviceLimit:   "1",
   810  			},
   811  			{
   812  				ctnName:       "gu-container-1",
   813  				cpuRequest:    "1000m",
   814  				cpuLimit:      "1000m",
   815  				deviceName:    sd.resourceName,
   816  				deviceRequest: "1",
   817  				deviceLimit:   "1",
   818  			},
   819  		}
   820  		runTopologyManagerPositiveTest(ctx, f, 2, ctnAttrs, initCtnAttrs, envInfo)
   821  
   822  		ginkgo.By(fmt.Sprintf("Successfully admit two guaranteed pods, each with two containers, both with 2 cores, one with 1 %s device", sd.resourceName))
   823  		ctnAttrs = []tmCtnAttribute{
   824  			{
   825  				ctnName:       "gu-container-dev",
   826  				cpuRequest:    "2000m",
   827  				cpuLimit:      "2000m",
   828  				deviceName:    sd.resourceName,
   829  				deviceRequest: "1",
   830  				deviceLimit:   "1",
   831  			},
   832  			{
   833  				ctnName:    "gu-container-nodev",
   834  				cpuRequest: "2000m",
   835  				cpuLimit:   "2000m",
   836  			},
   837  		}
   838  		runTopologyManagerPositiveTest(ctx, f, 2, ctnAttrs, initCtnAttrs, envInfo)
   839  	}
   840  
   841  	// this is the only policy that can guarantee reliable rejects
   842  	if policy == topologymanager.PolicySingleNumaNode {
   843  		// overflow NUMA node capacity: cores
   844  		numCores := 1 + (threadsPerCore * coreCount)
   845  		excessCoresReq := fmt.Sprintf("%dm", numCores*1000)
   846  		ginkgo.By(fmt.Sprintf("Trying to admit a guaranteed pod with %d cores, 1 %s device - and it should be rejected", numCores, sd.resourceName))
   847  		ctnAttrs = []tmCtnAttribute{
   848  			{
   849  				ctnName:       "gu-container",
   850  				cpuRequest:    excessCoresReq,
   851  				cpuLimit:      excessCoresReq,
   852  				deviceName:    sd.resourceName,
   853  				deviceRequest: "1",
   854  				deviceLimit:   "1",
   855  			},
   856  		}
   857  		runTopologyManagerNegativeTest(ctx, f, ctnAttrs, initCtnAttrs, envInfo)
   858  	}
   859  }
   860  
   861  func runTopologyManagerTests(f *framework.Framework) {
   862  	var oldCfg *kubeletconfig.KubeletConfiguration
   863  	var err error
   864  
   865  	var policies = []string{
   866  		topologymanager.PolicySingleNumaNode,
   867  		topologymanager.PolicyRestricted,
   868  		topologymanager.PolicyBestEffort,
   869  		topologymanager.PolicyNone,
   870  	}
   871  
   872  	ginkgo.It("run Topology Manager policy test suite", func(ctx context.Context) {
   873  		oldCfg, err = getCurrentKubeletConfig(ctx)
   874  		framework.ExpectNoError(err)
   875  
   876  		scope := containerScopeTopology
   877  		for _, policy := range policies {
   878  			// Configure Topology Manager
   879  			ginkgo.By(fmt.Sprintf("by configuring Topology Manager policy to %s", policy))
   880  			framework.Logf("Configuring Topology Manager policy to %s", policy)
   881  
   882  			newCfg, _ := configureTopologyManagerInKubelet(oldCfg, policy, scope, nil, 0)
   883  			updateKubeletConfig(ctx, f, newCfg, true)
   884  			// Run the tests
   885  			runTopologyManagerPolicySuiteTests(ctx, f)
   886  		}
   887  	})
   888  
   889  	ginkgo.It("run Topology Manager node alignment test suite", func(ctx context.Context) {
   890  		numaNodes, coreCount := hostPrecheck()
   891  
   892  		configMap := getSRIOVDevicePluginConfigMap(framework.TestContext.SriovdpConfigMapFile)
   893  
   894  		oldCfg, err = getCurrentKubeletConfig(ctx)
   895  		framework.ExpectNoError(err)
   896  
   897  		sd := setupSRIOVConfigOrFail(ctx, f, configMap)
   898  		ginkgo.DeferCleanup(teardownSRIOVConfigOrFail, f, sd)
   899  
   900  		scope := containerScopeTopology
   901  		for _, policy := range policies {
   902  			// Configure Topology Manager
   903  			ginkgo.By(fmt.Sprintf("by configuring Topology Manager policy to %s", policy))
   904  			framework.Logf("Configuring Topology Manager policy to %s", policy)
   905  
   906  			newCfg, reservedSystemCPUs := configureTopologyManagerInKubelet(oldCfg, policy, scope, configMap, numaNodes)
   907  			updateKubeletConfig(ctx, f, newCfg, true)
   908  
   909  			runTopologyManagerNodeAlignmentSuiteTests(ctx, f, sd, reservedSystemCPUs, policy, numaNodes, coreCount)
   910  		}
   911  	})
   912  
   913  	ginkgo.It("run the Topology Manager pod scope alignment test suite", func(ctx context.Context) {
   914  		numaNodes, coreCount := hostPrecheck()
   915  
   916  		configMap := getSRIOVDevicePluginConfigMap(framework.TestContext.SriovdpConfigMapFile)
   917  
   918  		oldCfg, err = getCurrentKubeletConfig(ctx)
   919  		framework.ExpectNoError(err)
   920  
   921  		policy := topologymanager.PolicySingleNumaNode
   922  		scope := podScopeTopology
   923  
   924  		newCfg, reservedSystemCPUs := configureTopologyManagerInKubelet(oldCfg, policy, scope, configMap, numaNodes)
   925  		updateKubeletConfig(ctx, f, newCfg, true)
   926  
   927  		runTMScopeResourceAlignmentTestSuite(ctx, f, configMap, reservedSystemCPUs, policy, numaNodes, coreCount)
   928  	})
   929  
   930  	ginkgo.AfterEach(func(ctx context.Context) {
   931  		if oldCfg != nil {
   932  			// restore kubelet config
   933  			updateKubeletConfig(ctx, f, oldCfg, true)
   934  		}
   935  	})
   936  }
   937  
   938  func hostPrecheck() (int, int) {
   939  	// this is a very rough check. We just want to rule out systems that do NOT have
   940  	// any SRIOV device. A more thorough check is done in runTopologyManagerPositiveTest.
   941  
   942  	numaNodes := detectNUMANodes()
   943  	if numaNodes < minNumaNodes {
   944  		e2eskipper.Skipf("this test is intended to be run on a system with at least %d NUMA nodes", minNumaNodes)
   945  	}
   946  
   947  	coreCount := detectCoresPerSocket()
   948  	if coreCount < minCoreCount {
   949  		e2eskipper.Skipf("this test is intended to be run on a system with at least %d cores per socket", minCoreCount)
   950  	}
   951  
   952  	requireSRIOVDevices()
   953  
   954  	return numaNodes, coreCount
   955  }
   956  
   957  // Serial because the test updates kubelet configuration.
   958  var _ = SIGDescribe("Topology Manager", framework.WithSerial(), feature.TopologyManager, func() {
   959  	f := framework.NewDefaultFramework("topology-manager-test")
   960  	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
   961  
   962  	ginkgo.Context("With the kubelet config updated to the static CPU Manager policy, run the Topology Manager tests", func() {
   963  		runTopologyManagerTests(f)
   964  	})
   965  })