k8s.io/kubernetes@v1.29.3/test/e2e/scheduling/preemption.go

     1  /*
     2  Copyright 2017 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package scheduling
    18  
    19  import (
    20  	"context"
    21  	"encoding/json"
    22  	"fmt"
    23  	"strings"
    24  	"sync/atomic"
    25  	"time"
    26  
    27  	"github.com/google/uuid"
    28  	appsv1 "k8s.io/api/apps/v1"
    29  	v1 "k8s.io/api/core/v1"
    30  	schedulingv1 "k8s.io/api/scheduling/v1"
    31  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    32  	"k8s.io/apimachinery/pkg/api/resource"
    33  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    34  	"k8s.io/apimachinery/pkg/runtime"
    35  	"k8s.io/apimachinery/pkg/types"
    36  	"k8s.io/apimachinery/pkg/util/sets"
    37  	"k8s.io/apimachinery/pkg/util/strategicpatch"
    38  	"k8s.io/apimachinery/pkg/util/wait"
    39  	"k8s.io/apimachinery/pkg/watch"
    40  	clientset "k8s.io/client-go/kubernetes"
    41  	"k8s.io/client-go/tools/cache"
    42  	"k8s.io/kubernetes/pkg/apis/scheduling"
    43  	"k8s.io/kubernetes/test/e2e/framework"
    44  	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
    45  	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
    46  	e2ereplicaset "k8s.io/kubernetes/test/e2e/framework/replicaset"
    47  	admissionapi "k8s.io/pod-security-admission/api"
    48  
    49  	"github.com/onsi/ginkgo/v2"
    50  	"github.com/onsi/gomega"
    51  
    52  	// ensure libs have a chance to initialize
    53  	_ "github.com/stretchr/testify/assert"
    54  )
    55  
    56  type priorityPair struct {
    57  	name  string
    58  	value int32
    59  }
    60  
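         // testExtendedResource is an opaque extended resource that these tests advertise on nodes
         // (via the node status subresource, see patchNode below) and that pods then request in
         // integer quantities, e.g. (illustrative snippet, mirroring the pod configs used below):
         //
         //	Resources: &v1.ResourceRequirements{
         //		Requests: v1.ResourceList{testExtendedResource: resource.MustParse("2")},
         //		Limits:   v1.ResourceList{testExtendedResource: resource.MustParse("2")},
         //	}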
    61  var testExtendedResource = v1.ResourceName("scheduling.k8s.io/foo")
    62  
    63  const (
    64  	testFinalizer = "example.com/test-finalizer"
    65  )
    66  
    67  var _ = SIGDescribe("SchedulerPreemption", framework.WithSerial(), func() {
    68  	var cs clientset.Interface
    69  	var nodeList *v1.NodeList
    70  	var ns string
    71  	f := framework.NewDefaultFramework("sched-preemption")
    72  	f.NamespacePodSecurityLevel = admissionapi.LevelBaseline
    73  
    74  	lowPriority, mediumPriority, highPriority := int32(1), int32(100), int32(1000)
    75  	lowPriorityClassName := f.BaseName + "-low-priority"
    76  	mediumPriorityClassName := f.BaseName + "-medium-priority"
    77  	highPriorityClassName := f.BaseName + "-high-priority"
    78  	priorityPairs := []priorityPair{
    79  		{name: lowPriorityClassName, value: lowPriority},
    80  		{name: mediumPriorityClassName, value: mediumPriority},
    81  		{name: highPriorityClassName, value: highPriority},
    82  	}
    83  
    84  	ginkgo.AfterEach(func(ctx context.Context) {
    85  		for _, pair := range priorityPairs {
    86  			_ = cs.SchedulingV1().PriorityClasses().Delete(ctx, pair.name, *metav1.NewDeleteOptions(0))
    87  		}
    88  		for _, node := range nodeList.Items {
    89  			nodeCopy := node.DeepCopy()
    90  			delete(nodeCopy.Status.Capacity, testExtendedResource)
    91  			delete(nodeCopy.Status.Allocatable, testExtendedResource)
    92  			err := patchNode(ctx, cs, &node, nodeCopy)
    93  			framework.ExpectNoError(err)
    94  		}
    95  	})
    96  
    97  	ginkgo.BeforeEach(func(ctx context.Context) {
    98  		cs = f.ClientSet
    99  		ns = f.Namespace.Name
   100  		nodeList = &v1.NodeList{}
   101  		var err error
   102  		for _, pair := range priorityPairs {
   103  			_, err := f.ClientSet.SchedulingV1().PriorityClasses().Create(ctx, &schedulingv1.PriorityClass{ObjectMeta: metav1.ObjectMeta{Name: pair.name}, Value: pair.value}, metav1.CreateOptions{})
   104  			if err != nil && !apierrors.IsAlreadyExists(err) {
    105  				framework.Failf("expected success or 'AlreadyExists' error when creating PriorityClass %q, got instead: %v", pair.name, err)
   106  			}
   107  		}
   108  
   109  		e2enode.WaitForTotalHealthy(ctx, cs, time.Minute)
   110  		nodeList, err = e2enode.GetReadySchedulableNodes(ctx, cs)
   111  		if err != nil {
   112  			framework.Logf("Unexpected error occurred: %v", err)
   113  		}
   114  		framework.ExpectNoErrorWithOffset(0, err)
   115  		for _, n := range nodeList.Items {
   116  			workerNodes.Insert(n.Name)
   117  		}
   118  
   119  		err = framework.CheckTestingNSDeletedExcept(ctx, cs, ns)
   120  		framework.ExpectNoError(err)
   121  	})
   122  
   123  	/*
   124  		Release: v1.19
   125  		Testname: Scheduler, Basic Preemption
   126  		Description: When a higher priority pod is created and no node with enough
   127  		resources is found, the scheduler MUST preempt a lower priority pod and
   128  		schedule the high priority pod.
   129  	*/
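         	// In outline (matching the code below): advertise 5 units of the test extended resource on each
         	// node, fill 4/5 of them with one low-priority pod and otherwise medium-priority pods, then create
         	// a high-priority pod with the same request as the low-priority pod, which can only be scheduled
         	// by preempting that low-priority victim.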
   130  	framework.ConformanceIt("validates basic preemption works", func(ctx context.Context) {
   131  		var podRes v1.ResourceList
   132  
    133  		// Create two pods per node that together use a lot of the node's resources.
   134  		ginkgo.By("Create pods that use 4/5 of node resources.")
   135  		pods := make([]*v1.Pod, 0, 2*len(nodeList.Items))
   136  		// Create pods in the cluster.
   137  		// One of them has low priority, making it the victim for preemption.
   138  		for i, node := range nodeList.Items {
    139  			// Update each node to advertise 5 available extended resources
   140  			nodeCopy := node.DeepCopy()
   141  			nodeCopy.Status.Capacity[testExtendedResource] = resource.MustParse("5")
   142  			nodeCopy.Status.Allocatable[testExtendedResource] = resource.MustParse("5")
   143  			err := patchNode(ctx, cs, &node, nodeCopy)
   144  			framework.ExpectNoError(err)
   145  
   146  			for j := 0; j < 2; j++ {
   147  				// Request 2 of the available resources for the victim pods
   148  				podRes = v1.ResourceList{}
   149  				podRes[testExtendedResource] = resource.MustParse("2")
   150  
   151  				// make the first pod low priority and the rest medium priority.
   152  				priorityName := mediumPriorityClassName
   153  				if len(pods) == 0 {
   154  					priorityName = lowPriorityClassName
   155  				}
   156  				pausePod := createPausePod(ctx, f, pausePodConfig{
   157  					Name:              fmt.Sprintf("pod%d-%d-%v", i, j, priorityName),
   158  					PriorityClassName: priorityName,
   159  					Resources: &v1.ResourceRequirements{
   160  						Requests: podRes,
   161  						Limits:   podRes,
   162  					},
   163  					Affinity: &v1.Affinity{
   164  						NodeAffinity: &v1.NodeAffinity{
   165  							RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{
   166  								NodeSelectorTerms: []v1.NodeSelectorTerm{
   167  									{
   168  										MatchFields: []v1.NodeSelectorRequirement{
   169  											{Key: "metadata.name", Operator: v1.NodeSelectorOpIn, Values: []string{node.Name}},
   170  										},
   171  									},
   172  								},
   173  							},
   174  						},
   175  					},
   176  				})
   177  				pods = append(pods, pausePod)
   178  				framework.Logf("Created pod: %v", pausePod.Name)
   179  			}
   180  		}
   181  		if len(pods) < 2 {
   182  			framework.Failf("We need at least two pods to be created but " +
   183  				"all nodes are already heavily utilized, so preemption tests cannot be run")
   184  		}
   185  		ginkgo.By("Wait for pods to be scheduled.")
   186  		for _, pod := range pods {
   187  			framework.ExpectNoError(e2epod.WaitForPodRunningInNamespace(ctx, cs, pod))
   188  		}
   189  
    190  		// Set the pod request to the first pod's resources (which should be the low priority pod)
   191  		podRes = pods[0].Spec.Containers[0].Resources.Requests
   192  
   193  		ginkgo.By("Run a high priority pod that has same requirements as that of lower priority pod")
   194  		// Create a high priority pod and make sure it is scheduled on the same node as the low priority pod.
   195  		runPausePod(ctx, f, pausePodConfig{
   196  			Name:              "preemptor-pod",
   197  			PriorityClassName: highPriorityClassName,
   198  			Resources: &v1.ResourceRequirements{
   199  				Requests: podRes,
   200  				Limits:   podRes,
   201  			},
   202  		})
   203  
   204  		preemptedPod, err := cs.CoreV1().Pods(pods[0].Namespace).Get(ctx, pods[0].Name, metav1.GetOptions{})
   205  		podPreempted := (err != nil && apierrors.IsNotFound(err)) ||
   206  			(err == nil && preemptedPod.DeletionTimestamp != nil)
   207  		if !podPreempted {
   208  			framework.Failf("expected pod to be preempted, instead got pod %+v and error %v", preemptedPod, err)
   209  		}
   210  		for i := 1; i < len(pods); i++ {
   211  			livePod, err := cs.CoreV1().Pods(pods[i].Namespace).Get(ctx, pods[i].Name, metav1.GetOptions{})
   212  			framework.ExpectNoError(err)
   213  			gomega.Expect(livePod.DeletionTimestamp).To(gomega.BeNil())
   214  		}
   215  	})
   216  
   217  	/*
   218  		Release: v1.19
   219  		Testname: Scheduler, Preemption for critical pod
   220  		Description: When a critical pod is created and no node with enough
   221  		resources is found, the scheduler MUST preempt a lower priority pod to
   222  		schedule the critical pod.
   223  	*/
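         	// In outline: same node setup as the basic preemption test above, but the preemptor is a
         	// "critical-pod" created in the kube-system namespace with the system-cluster-critical
         	// PriorityClass instead of a test-defined PriorityClass.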
   224  	framework.ConformanceIt("validates lower priority pod preemption by critical pod", func(ctx context.Context) {
   225  		var podRes v1.ResourceList
   226  
   227  		ginkgo.By("Create pods that use 4/5 of node resources.")
    228  		pods := make([]*v1.Pod, 0, 2*len(nodeList.Items))
   229  		for i, node := range nodeList.Items {
    230  			// Update each node to advertise 5 available extended resources
   231  			nodeCopy := node.DeepCopy()
   232  			nodeCopy.Status.Capacity[testExtendedResource] = resource.MustParse("5")
   233  			nodeCopy.Status.Allocatable[testExtendedResource] = resource.MustParse("5")
   234  			err := patchNode(ctx, cs, &node, nodeCopy)
   235  			framework.ExpectNoError(err)
   236  
   237  			for j := 0; j < 2; j++ {
   238  				// Request 2 of the available resources for the victim pods
   239  				podRes = v1.ResourceList{}
   240  				podRes[testExtendedResource] = resource.MustParse("2")
   241  
   242  				// make the first pod low priority and the rest medium priority.
   243  				priorityName := mediumPriorityClassName
   244  				if len(pods) == 0 {
   245  					priorityName = lowPriorityClassName
   246  				}
   247  				pausePod := createPausePod(ctx, f, pausePodConfig{
   248  					Name:              fmt.Sprintf("pod%d-%d-%v", i, j, priorityName),
   249  					PriorityClassName: priorityName,
   250  					Resources: &v1.ResourceRequirements{
   251  						Requests: podRes,
   252  						Limits:   podRes,
   253  					},
   254  					Affinity: &v1.Affinity{
   255  						NodeAffinity: &v1.NodeAffinity{
   256  							RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{
   257  								NodeSelectorTerms: []v1.NodeSelectorTerm{
   258  									{
   259  										MatchFields: []v1.NodeSelectorRequirement{
   260  											{Key: "metadata.name", Operator: v1.NodeSelectorOpIn, Values: []string{node.Name}},
   261  										},
   262  									},
   263  								},
   264  							},
   265  						},
   266  					},
   267  				})
   268  				pods = append(pods, pausePod)
   269  				framework.Logf("Created pod: %v", pausePod.Name)
   270  			}
   271  		}
   272  		if len(pods) < 2 {
   273  			framework.Failf("We need at least two pods to be created but " +
   274  				"all nodes are already heavily utilized, so preemption tests cannot be run")
   275  		}
   276  		ginkgo.By("Wait for pods to be scheduled.")
   277  		for _, pod := range pods {
   278  			framework.ExpectNoError(e2epod.WaitForPodRunningInNamespace(ctx, cs, pod))
   279  		}
   280  
   281  		// We want this pod to be preempted
   282  		podRes = pods[0].Spec.Containers[0].Resources.Requests
    283  		ginkgo.By("Run a critical pod that uses the same resources as the lower priority pod")
   284  		// Create a critical pod and make sure it is scheduled.
   285  		defer func() {
   286  			// Clean-up the critical pod
   287  			// Always run cleanup to make sure the pod is properly cleaned up.
   288  			err := f.ClientSet.CoreV1().Pods(metav1.NamespaceSystem).Delete(ctx, "critical-pod", *metav1.NewDeleteOptions(0))
   289  			if err != nil && !apierrors.IsNotFound(err) {
    290  				framework.Failf("Error cleaning up pod `%s/%s`: %v", metav1.NamespaceSystem, "critical-pod", err)
   291  			}
   292  		}()
   293  		runPausePod(ctx, f, pausePodConfig{
   294  			Name:              "critical-pod",
   295  			Namespace:         metav1.NamespaceSystem,
   296  			PriorityClassName: scheduling.SystemClusterCritical,
   297  			Resources: &v1.ResourceRequirements{
   298  				Requests: podRes,
   299  				Limits:   podRes,
   300  			},
   301  		})
   302  
   308  		// Make sure that the lowest priority pod is deleted.
   309  		preemptedPod, err := cs.CoreV1().Pods(pods[0].Namespace).Get(ctx, pods[0].Name, metav1.GetOptions{})
   310  		podPreempted := (err != nil && apierrors.IsNotFound(err)) ||
   311  			(err == nil && preemptedPod.DeletionTimestamp != nil)
   312  		for i := 1; i < len(pods); i++ {
   313  			livePod, err := cs.CoreV1().Pods(pods[i].Namespace).Get(ctx, pods[i].Name, metav1.GetOptions{})
   314  			framework.ExpectNoError(err)
   315  			gomega.Expect(livePod.DeletionTimestamp).To(gomega.BeNil())
   316  		}
   317  
   318  		if !podPreempted {
   319  			framework.Failf("expected pod to be preempted, instead got pod %+v and error %v", preemptedPod, err)
   320  		}
   321  	})
   322  
    323  	// 1. Run a low priority pod with a finalizer which consumes 1/1 of node resources
    324  	// 2. Schedule a higher priority pod which also consumes 1/1 of node resources
    325  	// 3. See if the lower priority pod is preempted and has the pod disruption condition
    326  	// 4. Remove the finalizer so that the pod can be deleted by GC
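         	// The finalizer keeps the preempted victim around in a terminating state long enough for the test
         	// to read its status; the scheduler is expected to add a DisruptionTarget condition to the victim
         	// when it preempts it (checked via VerifyPodHasConditionWithType below).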
   327  	ginkgo.It("validates pod disruption condition is added to the preempted pod", func(ctx context.Context) {
   328  		podRes := v1.ResourceList{testExtendedResource: resource.MustParse("1")}
   329  
   330  		ginkgo.By("Select a node to run the lower and higher priority pods")
   331  		gomega.Expect(nodeList.Items).ToNot(gomega.BeEmpty(), "We need at least one node for the test to run")
   332  		node := nodeList.Items[0]
   333  		nodeCopy := node.DeepCopy()
   334  		nodeCopy.Status.Capacity[testExtendedResource] = resource.MustParse("1")
   335  		nodeCopy.Status.Allocatable[testExtendedResource] = resource.MustParse("1")
   336  		err := patchNode(ctx, cs, &node, nodeCopy)
   337  		framework.ExpectNoError(err)
   338  
   339  		// prepare node affinity to make sure both the lower and higher priority pods are scheduled on the same node
   340  		testNodeAffinity := v1.Affinity{
   341  			NodeAffinity: &v1.NodeAffinity{
   342  				RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{
   343  					NodeSelectorTerms: []v1.NodeSelectorTerm{
   344  						{
   345  							MatchFields: []v1.NodeSelectorRequirement{
   346  								{Key: "metadata.name", Operator: v1.NodeSelectorOpIn, Values: []string{node.Name}},
   347  							},
   348  						},
   349  					},
   350  				},
   351  			},
   352  		}
   353  
   354  		ginkgo.By("Create a low priority pod that consumes 1/1 of node resources")
   355  		victimPod := createPausePod(ctx, f, pausePodConfig{
   356  			Name:              "victim-pod",
   357  			PriorityClassName: lowPriorityClassName,
   358  			Resources: &v1.ResourceRequirements{
   359  				Requests: podRes,
   360  				Limits:   podRes,
   361  			},
   362  			Finalizers: []string{testFinalizer},
   363  			Affinity:   &testNodeAffinity,
   364  		})
   365  		framework.Logf("Created pod: %v", victimPod.Name)
   366  
   367  		ginkgo.By("Wait for the victim pod to be scheduled")
   368  		framework.ExpectNoError(e2epod.WaitForPodRunningInNamespace(ctx, cs, victimPod))
   369  
   370  		// Remove the finalizer so that the victim pod can be GCed
   371  		defer e2epod.NewPodClient(f).RemoveFinalizer(ctx, victimPod.Name, testFinalizer)
   372  
   373  		ginkgo.By("Create a high priority pod to trigger preemption of the lower priority pod")
   374  		preemptorPod := createPausePod(ctx, f, pausePodConfig{
   375  			Name:              "preemptor-pod",
   376  			PriorityClassName: highPriorityClassName,
   377  			Resources: &v1.ResourceRequirements{
   378  				Requests: podRes,
   379  				Limits:   podRes,
   380  			},
   381  			Affinity: &testNodeAffinity,
   382  		})
   383  		framework.Logf("Created pod: %v", preemptorPod.Name)
   384  
   385  		ginkgo.By("Waiting for the victim pod to be terminating")
   386  		err = e2epod.WaitForPodTerminatingInNamespaceTimeout(ctx, f.ClientSet, victimPod.Name, victimPod.Namespace, framework.PodDeleteTimeout)
   387  		framework.ExpectNoError(err)
   388  
   389  		ginkgo.By("Verifying the pod has the pod disruption condition")
   390  		e2epod.VerifyPodHasConditionWithType(ctx, f, victimPod, v1.DisruptionTarget)
   391  	})
   392  
   393  	ginkgo.Context("PodTopologySpread Preemption", func() {
   394  		var nodeNames []string
   395  		var nodes []*v1.Node
   396  		topologyKey := "kubernetes.io/e2e-pts-preemption"
   397  		var fakeRes v1.ResourceName = "example.com/fakePTSRes"
   398  
   399  		ginkgo.BeforeEach(func(ctx context.Context) {
   400  			if len(nodeList.Items) < 2 {
   401  				ginkgo.Skip("At least 2 nodes are required to run the test")
   402  			}
   403  			ginkgo.By("Trying to get 2 available nodes which can run pod")
   404  			nodeNames = Get2NodesThatCanRunPod(ctx, f)
   405  			ginkgo.By(fmt.Sprintf("Apply dedicated topologyKey %v for this test on the 2 nodes.", topologyKey))
   406  			for _, nodeName := range nodeNames {
   407  				e2enode.AddOrUpdateLabelOnNode(cs, nodeName, topologyKey, nodeName)
   408  
   409  				node, err := cs.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
   410  				framework.ExpectNoError(err)
   411  				// update Node API object with a fake resource
    412  				ginkgo.By(fmt.Sprintf("Apply 10 units of fake resource to node %v.", node.Name))
   413  				nodeCopy := node.DeepCopy()
   414  				nodeCopy.Status.Capacity[fakeRes] = resource.MustParse("10")
   415  				nodeCopy.Status.Allocatable[fakeRes] = resource.MustParse("10")
   416  				err = patchNode(ctx, cs, node, nodeCopy)
   417  				framework.ExpectNoError(err)
   418  				nodes = append(nodes, node)
   419  			}
   420  		})
   421  		ginkgo.AfterEach(func(ctx context.Context) {
   422  			for _, nodeName := range nodeNames {
   423  				e2enode.RemoveLabelOffNode(cs, nodeName, topologyKey)
   424  			}
   425  			for _, node := range nodes {
   426  				nodeCopy := node.DeepCopy()
   427  				delete(nodeCopy.Status.Capacity, fakeRes)
   428  				delete(nodeCopy.Status.Allocatable, fakeRes)
   429  				err := patchNode(ctx, cs, node, nodeCopy)
   430  				framework.ExpectNoError(err)
   431  			}
   432  		})
   433  
   434  		ginkgo.It("validates proper pods are preempted", func(ctx context.Context) {
   435  			podLabel := "e2e-pts-preemption"
   436  			nodeAffinity := &v1.Affinity{
   437  				NodeAffinity: &v1.NodeAffinity{
   438  					RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{
   439  						NodeSelectorTerms: []v1.NodeSelectorTerm{
   440  							{
   441  								MatchExpressions: []v1.NodeSelectorRequirement{
   442  									{
   443  										Key:      topologyKey,
   444  										Operator: v1.NodeSelectorOpIn,
   445  										Values:   nodeNames,
   446  									},
   447  								},
   448  							},
   449  						},
   450  					},
   451  				},
   452  			}
   453  			highPodCfg := pausePodConfig{
   454  				Name:              "high",
   455  				Namespace:         ns,
   456  				Labels:            map[string]string{podLabel: ""},
   457  				PriorityClassName: highPriorityClassName,
   458  				Affinity:          nodeAffinity,
   459  				Resources: &v1.ResourceRequirements{
   460  					Requests: v1.ResourceList{fakeRes: resource.MustParse("9")},
   461  					Limits:   v1.ResourceList{fakeRes: resource.MustParse("9")},
   462  				},
   463  			}
   464  			lowPodCfg := pausePodConfig{
   465  				Namespace:         ns,
   466  				Labels:            map[string]string{podLabel: ""},
   467  				PriorityClassName: lowPriorityClassName,
   468  				Affinity:          nodeAffinity,
   469  				Resources: &v1.ResourceRequirements{
   470  					Requests: v1.ResourceList{fakeRes: resource.MustParse("3")},
   471  					Limits:   v1.ResourceList{fakeRes: resource.MustParse("3")},
   472  				},
   473  			}
   474  
   475  			ginkgo.By("Create 1 High Pod and 3 Low Pods to occupy 9/10 of fake resources on both nodes.")
   476  			// Prepare 1 High Pod and 3 Low Pods
   477  			runPausePod(ctx, f, highPodCfg)
   478  			for i := 1; i <= 3; i++ {
   479  				lowPodCfg.Name = fmt.Sprintf("low-%v", i)
   480  				runPausePod(ctx, f, lowPodCfg)
   481  			}
   482  
   483  			ginkgo.By("Create 1 Medium Pod with TopologySpreadConstraints")
   484  			mediumPodCfg := pausePodConfig{
   485  				Name:              "medium",
   486  				Namespace:         ns,
   487  				Labels:            map[string]string{podLabel: ""},
   488  				PriorityClassName: mediumPriorityClassName,
   489  				Affinity:          nodeAffinity,
   490  				Resources: &v1.ResourceRequirements{
   491  					Requests: v1.ResourceList{fakeRes: resource.MustParse("3")},
   492  					Limits:   v1.ResourceList{fakeRes: resource.MustParse("3")},
   493  				},
   494  				TopologySpreadConstraints: []v1.TopologySpreadConstraint{
   495  					{
   496  						MaxSkew:           1,
   497  						TopologyKey:       topologyKey,
   498  						WhenUnsatisfiable: v1.DoNotSchedule,
   499  						LabelSelector: &metav1.LabelSelector{
   500  							MatchExpressions: []metav1.LabelSelectorRequirement{
   501  								{
   502  									Key:      podLabel,
   503  									Operator: metav1.LabelSelectorOpExists,
   504  								},
   505  							},
   506  						},
   507  					},
   508  				},
   509  			}
   510  			// To fulfil resource.requests, the medium Pod only needs to preempt one low pod.
   511  			// However, in that case, the Pods spread becomes [<high>, <medium, low, low>], which doesn't
   512  			// satisfy the pod topology spread constraints. Hence it needs to preempt another low pod
   513  			// to make the Pods spread like [<high>, <medium, low>].
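         			// For example, assuming the high pod landed on nodeA and therefore the three low pods landed on
         			// nodeB (the only placement that fits within 10 fake resources per node): preempting one low pod
         			// would leave 3 pods (medium + 2 low) on nodeB vs. 1 pod (high) on nodeA, a skew of 2 > maxSkew 1,
         			// while preempting two low pods leaves 2 vs. 1, which satisfies the constraint.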
   514  			runPausePod(ctx, f, mediumPodCfg)
   515  
   516  			ginkgo.By("Verify there are 3 Pods left in this namespace")
   517  			wantPods := sets.New("high", "medium", "low")
   518  
   519  			// Wait until the number of pods stabilizes. Note that `medium` pod can get scheduled once the
   520  			// second low priority pod is marked as terminating.
   521  			pods, err := e2epod.WaitForNumberOfPods(ctx, cs, ns, 3, framework.PollShortTimeout)
   522  			framework.ExpectNoError(err)
   523  
   524  			for _, pod := range pods.Items {
    525  				// Strip the ordinal suffix from the low pods' names (e.g. "low-1" -> "low").
   526  				podName := strings.Split(pod.Name, "-")[0]
   527  				if wantPods.Has(podName) {
   528  					ginkgo.By(fmt.Sprintf("Pod %q is as expected to be running.", pod.Name))
   529  					wantPods.Delete(podName)
   530  				} else {
   531  					framework.Failf("Pod %q conflicted with expected PodSet %v", podName, wantPods)
   532  				}
   533  			}
   534  		})
   535  	})
   536  
   537  	ginkgo.Context("PreemptionExecutionPath", func() {
    538  		// Use a fake resource (fakecpu) and set it in the Node status;
    539  		// if we updated real CPU/memory/etc. values, the kubelet would revert them.
   540  		var fakecpu v1.ResourceName = "example.com/fakecpu"
   541  		var cs clientset.Interface
   542  		var node *v1.Node
   543  		var ns, nodeHostNameLabel string
   544  		f := framework.NewDefaultFramework("sched-preemption-path")
   545  		f.NamespacePodSecurityLevel = admissionapi.LevelBaseline
   546  
   547  		priorityPairs := make([]priorityPair, 0)
   548  
   549  		ginkgo.AfterEach(func(ctx context.Context) {
   550  			// print out additional info if tests failed
   551  			if ginkgo.CurrentSpecReport().Failed() {
   552  				// List existing PriorityClasses.
   553  				priorityList, err := cs.SchedulingV1().PriorityClasses().List(ctx, metav1.ListOptions{})
   554  				if err != nil {
   555  					framework.Logf("Unable to list PriorityClasses: %v", err)
   556  				} else {
   557  					framework.Logf("List existing PriorityClasses:")
   558  					for _, p := range priorityList.Items {
   559  						framework.Logf("%v/%v created at %v", p.Name, p.Value, p.CreationTimestamp)
   560  					}
   561  				}
   562  			}
   563  
   564  			if node != nil {
   565  				nodeCopy := node.DeepCopy()
   566  				delete(nodeCopy.Status.Capacity, fakecpu)
   567  				delete(nodeCopy.Status.Allocatable, fakecpu)
   568  				err := patchNode(ctx, cs, node, nodeCopy)
   569  				framework.ExpectNoError(err)
   570  			}
   571  			for _, pair := range priorityPairs {
   572  				_ = cs.SchedulingV1().PriorityClasses().Delete(ctx, pair.name, *metav1.NewDeleteOptions(0))
   573  			}
   574  		})
   575  
   576  		ginkgo.BeforeEach(func(ctx context.Context) {
   577  			cs = f.ClientSet
   578  			ns = f.Namespace.Name
   579  
   580  			// find an available node
   581  			ginkgo.By("Finding an available node")
   582  			nodeName := GetNodeThatCanRunPod(ctx, f)
   583  			framework.Logf("found a healthy node: %s", nodeName)
   584  
   585  			// get the node API object
   586  			var err error
   587  			node, err = cs.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
   588  			if err != nil {
   589  				framework.Failf("error getting node %q: %v", nodeName, err)
   590  			}
   591  			var ok bool
   592  			nodeHostNameLabel, ok = node.GetObjectMeta().GetLabels()["kubernetes.io/hostname"]
   593  			if !ok {
   594  				framework.Failf("error getting kubernetes.io/hostname label on node %s", nodeName)
   595  			}
   596  
   597  			// update Node API object with a fake resource
   598  			nodeCopy := node.DeepCopy()
   599  			nodeCopy.Status.Capacity[fakecpu] = resource.MustParse("1000")
   600  			nodeCopy.Status.Allocatable[fakecpu] = resource.MustParse("1000")
   601  			err = patchNode(ctx, cs, node, nodeCopy)
   602  			framework.ExpectNoError(err)
   603  
    604  			// create four PriorityClasses: p1, p2, p3, p4
   605  			for i := 1; i <= 4; i++ {
   606  				priorityName := fmt.Sprintf("p%d", i)
   607  				priorityVal := int32(i)
   608  				priorityPairs = append(priorityPairs, priorityPair{name: priorityName, value: priorityVal})
   609  				_, err := cs.SchedulingV1().PriorityClasses().Create(ctx, &schedulingv1.PriorityClass{ObjectMeta: metav1.ObjectMeta{Name: priorityName}, Value: priorityVal}, metav1.CreateOptions{})
   610  				if err != nil {
   611  					framework.Logf("Failed to create priority '%v/%v'. Reason: %v. Msg: %v", priorityName, priorityVal, apierrors.ReasonForError(err), err)
   612  				}
   613  				if err != nil && !apierrors.IsAlreadyExists(err) {
    614  					framework.Failf("expected success or 'AlreadyExists' error when creating PriorityClass %q, got instead: %v", priorityName, err)
   615  				}
   616  			}
   617  		})
   618  
   619  		/*
   620  			Release: v1.19
   621  			Testname: Pod preemption verification
    622  			Description: Four levels of Pods in ReplicaSets with different levels of Priority, restricted by given CPU limits, MUST launch. Priority 1-3 Pods MUST spawn first, followed by the Priority 4 Pod. The ReplicaSets MUST contain the expected number of Replicas.
   623  		*/
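         		// In outline: the chosen node advertises 1000 units of the fake "example.com/fakecpu" resource,
         		// three single-replica ReplicaSets at priorities p1-p3 occupy 950 of them, and a p4 pod requesting
         		// 500 is created to force preemption; an informer counts pod creations per ReplicaSet so the test
         		// can verify that exactly the expected pods were preempted and recreated.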
   624  		framework.ConformanceIt("runs ReplicaSets to verify preemption running path", func(ctx context.Context) {
   625  			podNamesSeen := []int32{0, 0, 0}
   626  
   627  			// create a pod controller to list/watch pod events from the test framework namespace
   628  			_, podController := cache.NewInformer(
   629  				&cache.ListWatch{
   630  					ListFunc: func(options metav1.ListOptions) (runtime.Object, error) {
   631  						obj, err := f.ClientSet.CoreV1().Pods(ns).List(ctx, options)
   632  						return runtime.Object(obj), err
   633  					},
   634  					WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) {
   635  						return f.ClientSet.CoreV1().Pods(ns).Watch(ctx, options)
   636  					},
   637  				},
   638  				&v1.Pod{},
   639  				0,
   640  				cache.ResourceEventHandlerFuncs{
   641  					AddFunc: func(obj interface{}) {
   642  						if pod, ok := obj.(*v1.Pod); ok {
   643  							if strings.HasPrefix(pod.Name, "rs-pod1") {
   644  								atomic.AddInt32(&podNamesSeen[0], 1)
   645  							} else if strings.HasPrefix(pod.Name, "rs-pod2") {
   646  								atomic.AddInt32(&podNamesSeen[1], 1)
   647  							} else if strings.HasPrefix(pod.Name, "rs-pod3") {
   648  								atomic.AddInt32(&podNamesSeen[2], 1)
   649  							}
   650  						}
   651  					},
   652  				},
   653  			)
   654  			go podController.Run(ctx.Done())
   655  
    656  			// prepare three ReplicaSets
   657  			rsConfs := []pauseRSConfig{
   658  				{
   659  					Replicas: int32(1),
   660  					PodConfig: pausePodConfig{
   661  						Name:              "pod1",
   662  						Namespace:         ns,
   663  						Labels:            map[string]string{"name": "pod1"},
   664  						PriorityClassName: "p1",
   665  						NodeSelector:      map[string]string{"kubernetes.io/hostname": nodeHostNameLabel},
   666  						Resources: &v1.ResourceRequirements{
   667  							Requests: v1.ResourceList{fakecpu: resource.MustParse("200")},
   668  							Limits:   v1.ResourceList{fakecpu: resource.MustParse("200")},
   669  						},
   670  					},
   671  				},
   672  				{
   673  					Replicas: int32(1),
   674  					PodConfig: pausePodConfig{
   675  						Name:              "pod2",
   676  						Namespace:         ns,
   677  						Labels:            map[string]string{"name": "pod2"},
   678  						PriorityClassName: "p2",
   679  						NodeSelector:      map[string]string{"kubernetes.io/hostname": nodeHostNameLabel},
   680  						Resources: &v1.ResourceRequirements{
   681  							Requests: v1.ResourceList{fakecpu: resource.MustParse("300")},
   682  							Limits:   v1.ResourceList{fakecpu: resource.MustParse("300")},
   683  						},
   684  					},
   685  				},
   686  				{
   687  					Replicas: int32(1),
   688  					PodConfig: pausePodConfig{
   689  						Name:              "pod3",
   690  						Namespace:         ns,
   691  						Labels:            map[string]string{"name": "pod3"},
   692  						PriorityClassName: "p3",
   693  						NodeSelector:      map[string]string{"kubernetes.io/hostname": nodeHostNameLabel},
   694  						Resources: &v1.ResourceRequirements{
   695  							Requests: v1.ResourceList{fakecpu: resource.MustParse("450")},
   696  							Limits:   v1.ResourceList{fakecpu: resource.MustParse("450")},
   697  						},
   698  					},
   699  				},
   700  			}
    701  			// create ReplicaSets {1,2,3} so as to occupy 950/1000 of the fake resource
   702  			for i := range rsConfs {
   703  				runPauseRS(ctx, f, rsConfs[i])
   704  			}
   705  
   706  			framework.Logf("pods created so far: %v", podNamesSeen)
    707  			framework.Logf("number of ReplicaSets tracked: %v", len(podNamesSeen))
   708  
   709  			// create a Preemptor Pod
   710  			preemptorPodConf := pausePodConfig{
   711  				Name:              "pod4",
   712  				Namespace:         ns,
   713  				Labels:            map[string]string{"name": "pod4"},
   714  				PriorityClassName: "p4",
   715  				NodeSelector:      map[string]string{"kubernetes.io/hostname": nodeHostNameLabel},
   716  				Resources: &v1.ResourceRequirements{
   717  					Requests: v1.ResourceList{fakecpu: resource.MustParse("500")},
   718  					Limits:   v1.ResourceList{fakecpu: resource.MustParse("500")},
   719  				},
   720  			}
   721  			preemptorPod := createPod(ctx, f, preemptorPodConf)
   722  			waitForPreemptingWithTimeout(ctx, f, preemptorPod, framework.PodGetTimeout)
   723  
   724  			framework.Logf("pods created so far: %v", podNamesSeen)
   725  
    726  			// Count the pod creations observed for ReplicaSet{1,2,3}:
    727  			// - more than the expected count means the ReplicaSet's pods were over-preempted
    728  			// - fewer than the expected count means they were under-preempted
    729  			// "1 * 2" means each pod of ReplicaSet{1,2} is expected to be preempted exactly once (original + replacement).
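         			// Why {2, 2, 1}: the node has 1000 fakecpu and 200+300+450=950 are consumed, so scheduling the
         			// 500-fakecpu preemptor requires freeing at least 450. Assuming the scheduler's default victim
         			// selection (lowest priority first, then reprieving where possible), the p1 and p2 pods are evicted
         			// (freeing 200+300 on top of the 50 spare) while the p3 pod survives; the evicted pods are
         			// recreated by their ReplicaSets and stay Pending, hence two creations each for rs1 and rs2.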
   730  			expectedRSPods := []int32{1 * 2, 1 * 2, 1}
   731  			err := wait.PollUntilContextTimeout(ctx, framework.Poll, framework.PollShortTimeout, false, func(ctx context.Context) (bool, error) {
   732  				for i := 0; i < len(podNamesSeen); i++ {
   733  					got := atomic.LoadInt32(&podNamesSeen[i])
   734  					if got < expectedRSPods[i] {
   735  						framework.Logf("waiting for rs%d to observe %d pod creations, got %d", i+1, expectedRSPods[i], got)
   736  						return false, nil
   737  					} else if got > expectedRSPods[i] {
   738  						return false, fmt.Errorf("rs%d had more than %d pods created: %d", i+1, expectedRSPods[i], got)
   739  					}
   740  				}
   741  				return true, nil
   742  			})
   743  			if err != nil {
   744  				framework.Logf("pods created so far: %v", podNamesSeen)
   745  				framework.Failf("failed pod observation expectations: %v", err)
   746  			}
   747  
    748  			// If the logic reaches here, do a final check to ensure the state stays stable for a short period;
    749  			// otherwise, pods may have been over-preempted.
   750  			time.Sleep(5 * time.Second)
   751  			for i := 0; i < len(podNamesSeen); i++ {
   752  				got := atomic.LoadInt32(&podNamesSeen[i])
   753  				if got < expectedRSPods[i] {
    754  					framework.Failf("pods of ReplicaSet%d have been under-preempted: expected %d pod creations, but got %d", i+1, expectedRSPods[i], got)
    755  				} else if got > expectedRSPods[i] {
    756  					framework.Failf("pods of ReplicaSet%d have been over-preempted: expected %d pod creations, but got %d", i+1, expectedRSPods[i], got)
   757  				}
   758  			}
   759  		})
   760  	})
   761  
   762  	ginkgo.Context("PriorityClass endpoints", func() {
   763  		var cs clientset.Interface
   764  		f := framework.NewDefaultFramework("sched-preemption-path")
   765  		f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
   766  		testUUID := uuid.New().String()
   767  		var pcs []*schedulingv1.PriorityClass
   768  
   769  		ginkgo.BeforeEach(func(ctx context.Context) {
   770  			cs = f.ClientSet
    771  			// Create 2 PriorityClasses: p1, p2.
   772  			for i := 1; i <= 2; i++ {
   773  				name, val := fmt.Sprintf("p%d", i), int32(i)
   774  				pc, err := cs.SchedulingV1().PriorityClasses().Create(ctx, &schedulingv1.PriorityClass{ObjectMeta: metav1.ObjectMeta{Name: name, Labels: map[string]string{"e2e": testUUID}}, Value: val}, metav1.CreateOptions{})
   775  				if err != nil {
   776  					framework.Logf("Failed to create priority '%v/%v'. Reason: %v. Msg: %v", name, val, apierrors.ReasonForError(err), err)
   777  				}
   778  				if err != nil && !apierrors.IsAlreadyExists(err) {
    779  					framework.Failf("expected success or 'AlreadyExists' error when creating PriorityClass %q, got instead: %v", name, err)
   780  				}
   781  				pcs = append(pcs, pc)
   782  			}
   783  		})
   784  
   785  		ginkgo.AfterEach(func(ctx context.Context) {
   786  			// Print out additional info if tests failed.
   787  			if ginkgo.CurrentSpecReport().Failed() {
   788  				// List existing PriorityClasses.
   789  				priorityList, err := cs.SchedulingV1().PriorityClasses().List(ctx, metav1.ListOptions{})
   790  				if err != nil {
   791  					framework.Logf("Unable to list PriorityClasses: %v", err)
   792  				} else {
   793  					framework.Logf("List existing PriorityClasses:")
   794  					for _, p := range priorityList.Items {
   795  						framework.Logf("%v/%v created at %v", p.Name, p.Value, p.CreationTimestamp)
   796  					}
   797  				}
   798  			}
   799  
    800  			// Delete the created PriorityClasses via DeleteCollection.
   801  			err := cs.SchedulingV1().PriorityClasses().DeleteCollection(ctx, metav1.DeleteOptions{}, metav1.ListOptions{LabelSelector: fmt.Sprintf("e2e=%v", testUUID)})
   802  			framework.ExpectNoError(err)
   803  		})
   804  
   805  		/*
   806  			Release: v1.20
   807  			Testname: Scheduler, Verify PriorityClass endpoints
   808  			Description: Verify that PriorityClass endpoints can be listed. When any mutable field is
   809  			either patched or updated it MUST succeed. When any immutable field is either patched or
   810  			updated it MUST fail.
   811  		*/
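         		// Note that a PriorityClass's value is immutable after creation while its description is mutable;
         		// the steps below exercise both cases via both Patch and Update.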
   812  		framework.ConformanceIt("verify PriorityClass endpoints can be operated with different HTTP methods", func(ctx context.Context) {
   813  			// 1. Patch/Update on immutable fields will fail.
   814  			pcCopy := pcs[0].DeepCopy()
   815  			pcCopy.Value = pcCopy.Value * 10
   816  			err := patchPriorityClass(ctx, cs, pcs[0], pcCopy)
   817  			gomega.Expect(err).To(gomega.HaveOccurred(), "expect a patch error on an immutable field")
   818  			framework.Logf("%v", err)
   819  
   820  			pcCopy = pcs[1].DeepCopy()
   821  			pcCopy.Value = pcCopy.Value * 10
   822  			_, err = cs.SchedulingV1().PriorityClasses().Update(ctx, pcCopy, metav1.UpdateOptions{})
   823  			gomega.Expect(err).To(gomega.HaveOccurred(), "expect an update error on an immutable field")
   824  			framework.Logf("%v", err)
   825  
   826  			// 2. Patch/Update on mutable fields will succeed.
   827  			newDesc := "updated description"
   828  			pcCopy = pcs[0].DeepCopy()
   829  			pcCopy.Description = newDesc
   830  			err = patchPriorityClass(ctx, cs, pcs[0], pcCopy)
   831  			framework.ExpectNoError(err)
   832  
   833  			pcCopy = pcs[1].DeepCopy()
   834  			pcCopy.Description = newDesc
   835  			_, err = cs.SchedulingV1().PriorityClasses().Update(ctx, pcCopy, metav1.UpdateOptions{})
   836  			framework.ExpectNoError(err)
   837  
   838  			// 3. List existing PriorityClasses.
   839  			_, err = cs.SchedulingV1().PriorityClasses().List(ctx, metav1.ListOptions{})
   840  			framework.ExpectNoError(err)
   841  
   842  			// 4. Verify fields of updated PriorityClasses.
   843  			for _, pc := range pcs {
   844  				livePC, err := cs.SchedulingV1().PriorityClasses().Get(ctx, pc.Name, metav1.GetOptions{})
   845  				framework.ExpectNoError(err)
   846  				gomega.Expect(livePC.Value).To(gomega.Equal(pc.Value))
   847  				gomega.Expect(livePC.Description).To(gomega.Equal(newDesc))
   848  			}
   849  		})
   850  	})
   851  })
   852  
   853  type pauseRSConfig struct {
   854  	Replicas  int32
   855  	PodConfig pausePodConfig
   856  }
   857  
   858  func initPauseRS(f *framework.Framework, conf pauseRSConfig) *appsv1.ReplicaSet {
   859  	pausePod := initPausePod(f, conf.PodConfig)
   860  	pauseRS := &appsv1.ReplicaSet{
   861  		ObjectMeta: metav1.ObjectMeta{
   862  			Name:      "rs-" + pausePod.Name,
   863  			Namespace: pausePod.Namespace,
   864  		},
   865  		Spec: appsv1.ReplicaSetSpec{
   866  			Replicas: &conf.Replicas,
   867  			Selector: &metav1.LabelSelector{
   868  				MatchLabels: pausePod.Labels,
   869  			},
   870  			Template: v1.PodTemplateSpec{
   871  				ObjectMeta: metav1.ObjectMeta{Labels: pausePod.ObjectMeta.Labels},
   872  				Spec:       pausePod.Spec,
   873  			},
   874  		},
   875  	}
   876  	return pauseRS
   877  }
   878  
   879  func createPauseRS(ctx context.Context, f *framework.Framework, conf pauseRSConfig) *appsv1.ReplicaSet {
   880  	namespace := conf.PodConfig.Namespace
   881  	if len(namespace) == 0 {
   882  		namespace = f.Namespace.Name
   883  	}
   884  	rs, err := f.ClientSet.AppsV1().ReplicaSets(namespace).Create(ctx, initPauseRS(f, conf), metav1.CreateOptions{})
   885  	framework.ExpectNoError(err)
   886  	return rs
   887  }
   888  
   889  func runPauseRS(ctx context.Context, f *framework.Framework, conf pauseRSConfig) *appsv1.ReplicaSet {
   890  	rs := createPauseRS(ctx, f, conf)
   891  	framework.ExpectNoError(e2ereplicaset.WaitForReplicaSetTargetAvailableReplicasWithTimeout(ctx, f.ClientSet, rs, conf.Replicas, framework.PodGetTimeout))
   892  	return rs
   893  }
   894  
   895  func createPod(ctx context.Context, f *framework.Framework, conf pausePodConfig) *v1.Pod {
   896  	namespace := conf.Namespace
   897  	if len(namespace) == 0 {
   898  		namespace = f.Namespace.Name
   899  	}
   900  	pod, err := f.ClientSet.CoreV1().Pods(namespace).Create(ctx, initPausePod(f, conf), metav1.CreateOptions{})
   901  	framework.ExpectNoError(err)
   902  	return pod
   903  }
   904  
    905  // waitForPreemptingWithTimeout verifies that the preemptor 'pod' gets scheduled within 'timeout';
    906  // specifically, it checks that the pod's 'spec.nodeName' field has been set.
   907  func waitForPreemptingWithTimeout(ctx context.Context, f *framework.Framework, pod *v1.Pod, timeout time.Duration) {
   908  	err := wait.PollUntilContextTimeout(ctx, 2*time.Second, timeout, false, func(ctx context.Context) (bool, error) {
   909  		pod, err := f.ClientSet.CoreV1().Pods(pod.Namespace).Get(ctx, pod.Name, metav1.GetOptions{})
   910  		if err != nil {
   911  			return false, err
   912  		}
   913  		if len(pod.Spec.NodeName) > 0 {
   914  			return true, nil
   915  		}
   916  		return false, err
   917  	})
   918  	framework.ExpectNoError(err, "pod %v/%v failed to preempt other pods", pod.Namespace, pod.Name)
   919  }
   920  
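         // patchNode applies the difference between old and new to the node's "status" subresource using a
         // strategic merge patch. For illustration (the exact bytes may differ), advertising the test extended
         // resource results in a patch along the lines of:
         //
         //	{"status":{"allocatable":{"scheduling.k8s.io/foo":"5"},"capacity":{"scheduling.k8s.io/foo":"5"}}}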
   921  func patchNode(ctx context.Context, client clientset.Interface, old *v1.Node, new *v1.Node) error {
   922  	oldData, err := json.Marshal(old)
   923  	if err != nil {
   924  		return err
   925  	}
   926  
   927  	newData, err := json.Marshal(new)
   928  	if err != nil {
   929  		return err
   930  	}
   931  	patchBytes, err := strategicpatch.CreateTwoWayMergePatch(oldData, newData, &v1.Node{})
   932  	if err != nil {
   933  		return fmt.Errorf("failed to create merge patch for node %q: %w", old.Name, err)
   934  	}
   935  	_, err = client.CoreV1().Nodes().Patch(ctx, old.Name, types.StrategicMergePatchType, patchBytes, metav1.PatchOptions{}, "status")
   936  	return err
   937  }
   938  
   939  func patchPriorityClass(ctx context.Context, cs clientset.Interface, old, new *schedulingv1.PriorityClass) error {
   940  	oldData, err := json.Marshal(old)
   941  	if err != nil {
   942  		return err
   943  	}
   944  
   945  	newData, err := json.Marshal(new)
   946  	if err != nil {
   947  		return err
   948  	}
   949  	patchBytes, err := strategicpatch.CreateTwoWayMergePatch(oldData, newData, &schedulingv1.PriorityClass{})
   950  	if err != nil {
   951  		return fmt.Errorf("failed to create merge patch for PriorityClass %q: %w", old.Name, err)
   952  	}
   953  	_, err = cs.SchedulingV1().PriorityClasses().Patch(ctx, old.Name, types.StrategicMergePatchType, patchBytes, metav1.PatchOptions{})
   954  	return err
   955  }