k8s.io/kubernetes@v1.29.3/test/integration/scheduler/preemption/preemption_test.go

     1  /*
     2  Copyright 2017 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  // This file tests preemption functionality of the scheduler.
    18  
    19  package preemption
    20  
    21  import (
    22  	"context"
    23  	"fmt"
    24  	"strings"
    25  	"testing"
    26  	"time"
    27  
    28  	v1 "k8s.io/api/core/v1"
    29  	policy "k8s.io/api/policy/v1"
    30  	"k8s.io/apimachinery/pkg/api/resource"
    31  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    32  	"k8s.io/apimachinery/pkg/runtime"
    33  	"k8s.io/apimachinery/pkg/types"
    34  	"k8s.io/apimachinery/pkg/util/intstr"
    35  	"k8s.io/apimachinery/pkg/util/wait"
    36  	"k8s.io/apiserver/pkg/util/feature"
    37  	"k8s.io/client-go/informers"
    38  	clientset "k8s.io/client-go/kubernetes"
    39  	restclient "k8s.io/client-go/rest"
    40  	featuregatetesting "k8s.io/component-base/featuregate/testing"
    41  	"k8s.io/component-helpers/storage/volume"
    42  	"k8s.io/klog/v2"
    43  	configv1 "k8s.io/kube-scheduler/config/v1"
    44  	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
    45  	"k8s.io/kubernetes/pkg/apis/scheduling"
    46  	"k8s.io/kubernetes/pkg/features"
    47  	"k8s.io/kubernetes/pkg/scheduler"
    48  	configtesting "k8s.io/kubernetes/pkg/scheduler/apis/config/testing"
    49  	"k8s.io/kubernetes/pkg/scheduler/framework"
    50  	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumerestrictions"
    51  	frameworkruntime "k8s.io/kubernetes/pkg/scheduler/framework/runtime"
    52  	st "k8s.io/kubernetes/pkg/scheduler/testing"
    53  	"k8s.io/kubernetes/plugin/pkg/admission/priority"
    54  	testutils "k8s.io/kubernetes/test/integration/util"
    55  	"k8s.io/utils/pointer"
    56  )
    57  
    58  // imported from testutils
    59  var (
    60  	initPausePod                    = testutils.InitPausePod
    61  	createNode                      = testutils.CreateNode
    62  	createPausePod                  = testutils.CreatePausePod
    63  	runPausePod                     = testutils.RunPausePod
    64  	deletePod                       = testutils.DeletePod
    65  	initTest                        = testutils.InitTestSchedulerWithNS
    66  	initTestDisablePreemption       = testutils.InitTestDisablePreemption
    67  	initDisruptionController        = testutils.InitDisruptionController
    68  	waitCachedPodsStable            = testutils.WaitCachedPodsStable
    69  	podIsGettingEvicted             = testutils.PodIsGettingEvicted
    70  	podUnschedulable                = testutils.PodUnschedulable
    71  	waitForPDBsStable               = testutils.WaitForPDBsStable
    72  	waitForPodToScheduleWithTimeout = testutils.WaitForPodToScheduleWithTimeout
    73  	waitForPodUnschedulable         = testutils.WaitForPodUnschedulable
    74  )
    75  
    76  const filterPluginName = "filter-plugin"
    77  
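         // Pod priority values used by the test pods below.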
    78  var lowPriority, mediumPriority, highPriority = int32(100), int32(200), int32(300)
    79  
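         // waitForNominatedNodeNameWithTimeout polls the pod until its .status.nominatedNodeName is set,
         // and returns an error if it is not set within the given timeout.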
    80  func waitForNominatedNodeNameWithTimeout(cs clientset.Interface, pod *v1.Pod, timeout time.Duration) error {
    81  	if err := wait.PollUntilContextTimeout(context.TODO(), 100*time.Millisecond, timeout, false, func(ctx context.Context) (bool, error) {
    82  		pod, err := cs.CoreV1().Pods(pod.Namespace).Get(ctx, pod.Name, metav1.GetOptions{})
    83  		if err != nil {
    84  			return false, err
    85  		}
    86  		if len(pod.Status.NominatedNodeName) > 0 {
    87  			return true, nil
    88  		}
    89  		return false, err
    90  	}); err != nil {
    91  		return fmt.Errorf(".status.nominatedNodeName of Pod %v/%v did not get set: %v", pod.Namespace, pod.Name, err)
    92  	}
    93  	return nil
    94  }
    95  
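         // waitForNominatedNodeName waits for .status.nominatedNodeName to be set, using the default test timeout.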
    96  func waitForNominatedNodeName(cs clientset.Interface, pod *v1.Pod) error {
    97  	return waitForNominatedNodeNameWithTimeout(cs, pod, wait.ForeverTestTimeout)
    98  }
    99  
   100  const tokenFilterName = "token-filter"
   101  
   102  // tokenFilter is a fake plugin that implements PreFilter and Filter.
    103  // `Tokens` simulates the number of pods the cluster can still host.
    104  // If `EnablePreFilter` is false or `Tokens` is positive, PreFilter passes; otherwise it returns Unschedulable.
    105  // When `Tokens` is positive, Filter consumes one token and passes; otherwise it returns
    106  // Unschedulable, or UnschedulableAndUnresolvable when `Unresolvable` is true.
    107  // AddPod() consumes one token and RemovePod() returns one, to simulate dry-run preemption.
   108  type tokenFilter struct {
   109  	Tokens          int
   110  	Unresolvable    bool
   111  	EnablePreFilter bool
   112  }
   113  
    114  // Name returns the name of the plugin.
   115  func (fp *tokenFilter) Name() string {
   116  	return tokenFilterName
   117  }
   118  
   119  func (fp *tokenFilter) Filter(ctx context.Context, state *framework.CycleState, pod *v1.Pod,
   120  	nodeInfo *framework.NodeInfo) *framework.Status {
   121  	if fp.Tokens > 0 {
   122  		fp.Tokens--
   123  		return nil
   124  	}
   125  	status := framework.Unschedulable
   126  	if fp.Unresolvable {
   127  		status = framework.UnschedulableAndUnresolvable
   128  	}
   129  	return framework.NewStatus(status, fmt.Sprintf("can't fit %v", pod.Name))
   130  }
   131  
   132  func (fp *tokenFilter) PreFilter(ctx context.Context, state *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
   133  	if !fp.EnablePreFilter || fp.Tokens > 0 {
   134  		return nil, nil
   135  	}
   136  	return nil, framework.NewStatus(framework.Unschedulable)
   137  }
   138  
   139  func (fp *tokenFilter) AddPod(ctx context.Context, state *framework.CycleState, podToSchedule *v1.Pod,
   140  	podInfoToAdd *framework.PodInfo, nodeInfo *framework.NodeInfo) *framework.Status {
   141  	fp.Tokens--
   142  	return nil
   143  }
   144  
   145  func (fp *tokenFilter) RemovePod(ctx context.Context, state *framework.CycleState, podToSchedule *v1.Pod,
   146  	podInfoToRemove *framework.PodInfo, nodeInfo *framework.NodeInfo) *framework.Status {
   147  	fp.Tokens++
   148  	return nil
   149  }
   150  
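         // PreFilterExtensions returns the plugin itself, so its AddPod/RemovePod callbacks are invoked during dry-run preemption.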
   151  func (fp *tokenFilter) PreFilterExtensions() framework.PreFilterExtensions {
   152  	return fp
   153  }
   154  
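         // Compile-time assertion that tokenFilter implements the FilterPlugin interface.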
   155  var _ framework.FilterPlugin = &tokenFilter{}
   156  
   157  // TestPreemption tests a few preemption scenarios.
   158  func TestPreemption(t *testing.T) {
   159  	// Initialize scheduler with a filter plugin.
   160  	var filter tokenFilter
   161  	registry := make(frameworkruntime.Registry)
   162  	err := registry.Register(filterPluginName, func(_ context.Context, _ runtime.Object, fh framework.Handle) (framework.Plugin, error) {
   163  		return &filter, nil
   164  	})
   165  	if err != nil {
   166  		t.Fatalf("Error registering a filter: %v", err)
   167  	}
   168  	cfg := configtesting.V1ToInternalWithDefaults(t, configv1.KubeSchedulerConfiguration{
   169  		Profiles: []configv1.KubeSchedulerProfile{{
   170  			SchedulerName: pointer.String(v1.DefaultSchedulerName),
   171  			Plugins: &configv1.Plugins{
   172  				Filter: configv1.PluginSet{
   173  					Enabled: []configv1.Plugin{
   174  						{Name: filterPluginName},
   175  					},
   176  				},
   177  				PreFilter: configv1.PluginSet{
   178  					Enabled: []configv1.Plugin{
   179  						{Name: filterPluginName},
   180  					},
   181  				},
   182  			},
   183  		}},
   184  	})
   185  
   186  	testCtx := testutils.InitTestSchedulerWithOptions(t,
   187  		testutils.InitTestAPIServer(t, "preemption", nil),
   188  		0,
   189  		scheduler.WithProfiles(cfg.Profiles...),
   190  		scheduler.WithFrameworkOutOfTreeRegistry(registry))
   191  	testutils.SyncSchedulerInformerFactory(testCtx)
   192  	go testCtx.Scheduler.Run(testCtx.Ctx)
   193  
   194  	cs := testCtx.ClientSet
   195  
   196  	defaultPodRes := &v1.ResourceRequirements{Requests: v1.ResourceList{
   197  		v1.ResourceCPU:    *resource.NewMilliQuantity(100, resource.DecimalSI),
   198  		v1.ResourceMemory: *resource.NewQuantity(100, resource.DecimalSI)},
   199  	}
   200  
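         	// maxTokens effectively removes the fake filter's capacity limit; cases that exercise the filter override it via initTokens.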
   201  	maxTokens := 1000
   202  	tests := []struct {
   203  		name                          string
   204  		existingPods                  []*v1.Pod
   205  		pod                           *v1.Pod
   206  		initTokens                    int
   207  		enablePreFilter               bool
   208  		unresolvable                  bool
   209  		preemptedPodIndexes           map[int]struct{}
   210  		enablePodDisruptionConditions bool
   211  	}{
   212  		{
   213  			name:       "basic pod preemption with PodDisruptionConditions enabled",
   214  			initTokens: maxTokens,
   215  			existingPods: []*v1.Pod{
   216  				initPausePod(&testutils.PausePodConfig{
   217  					Name:      "victim-pod",
   218  					Namespace: testCtx.NS.Name,
   219  					Priority:  &lowPriority,
   220  					Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
   221  						v1.ResourceCPU:    *resource.NewMilliQuantity(400, resource.DecimalSI),
   222  						v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
   223  					},
   224  				}),
   225  			},
   226  			pod: initPausePod(&testutils.PausePodConfig{
   227  				Name:      "preemptor-pod",
   228  				Namespace: testCtx.NS.Name,
   229  				Priority:  &highPriority,
   230  				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
   231  					v1.ResourceCPU:    *resource.NewMilliQuantity(300, resource.DecimalSI),
   232  					v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
   233  				},
   234  			}),
   235  			preemptedPodIndexes:           map[int]struct{}{0: {}},
   236  			enablePodDisruptionConditions: true,
   237  		},
   238  		{
   239  			name:       "basic pod preemption",
   240  			initTokens: maxTokens,
   241  			existingPods: []*v1.Pod{
   242  				initPausePod(&testutils.PausePodConfig{
   243  					Name:      "victim-pod",
   244  					Namespace: testCtx.NS.Name,
   245  					Priority:  &lowPriority,
   246  					Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
   247  						v1.ResourceCPU:    *resource.NewMilliQuantity(400, resource.DecimalSI),
   248  						v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
   249  					},
   250  				}),
   251  			},
   252  			pod: initPausePod(&testutils.PausePodConfig{
   253  				Name:      "preemptor-pod",
   254  				Namespace: testCtx.NS.Name,
   255  				Priority:  &highPriority,
   256  				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
   257  					v1.ResourceCPU:    *resource.NewMilliQuantity(300, resource.DecimalSI),
   258  					v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
   259  				},
   260  			}),
   261  			preemptedPodIndexes: map[int]struct{}{0: {}},
   262  		},
   263  		{
   264  			name:       "basic pod preemption with filter",
   265  			initTokens: 1,
   266  			existingPods: []*v1.Pod{
   267  				initPausePod(&testutils.PausePodConfig{
   268  					Name:      "victim-pod",
   269  					Namespace: testCtx.NS.Name,
   270  					Priority:  &lowPriority,
   271  					Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
   272  						v1.ResourceCPU:    *resource.NewMilliQuantity(200, resource.DecimalSI),
   273  						v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
   274  					},
   275  				}),
   276  			},
   277  			pod: initPausePod(&testutils.PausePodConfig{
   278  				Name:      "preemptor-pod",
   279  				Namespace: testCtx.NS.Name,
   280  				Priority:  &highPriority,
   281  				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
   282  					v1.ResourceCPU:    *resource.NewMilliQuantity(200, resource.DecimalSI),
   283  					v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
   284  				},
   285  			}),
   286  			preemptedPodIndexes: map[int]struct{}{0: {}},
   287  		},
    288  		// This is identical to the previous subtest except that enablePreFilter is set to true.
    289  		// With the fake plugin returning Unschedulable in PreFilter, it exercises the path where
    290  		// in-tree plugins return Skip in PreFilter and their AddPod/RemovePod functions are
    291  		// properly skipped upon preemption.
   292  		{
   293  			name:            "basic pod preemption with preFilter",
   294  			initTokens:      1,
   295  			enablePreFilter: true,
   296  			existingPods: []*v1.Pod{
   297  				initPausePod(&testutils.PausePodConfig{
   298  					Name:      "victim-pod",
   299  					Namespace: testCtx.NS.Name,
   300  					Priority:  &lowPriority,
   301  					Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
   302  						v1.ResourceCPU:    *resource.NewMilliQuantity(200, resource.DecimalSI),
   303  						v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
   304  					},
   305  				}),
   306  			},
   307  			pod: initPausePod(&testutils.PausePodConfig{
   308  				Name:      "preemptor-pod",
   309  				Namespace: testCtx.NS.Name,
   310  				Priority:  &highPriority,
   311  				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
   312  					v1.ResourceCPU:    *resource.NewMilliQuantity(200, resource.DecimalSI),
   313  					v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
   314  				},
   315  			}),
   316  			preemptedPodIndexes: map[int]struct{}{0: {}},
   317  		},
   318  		{
   319  			// same as the previous test, but the filter is unresolvable.
   320  			name:         "basic pod preemption with unresolvable filter",
   321  			initTokens:   1,
   322  			unresolvable: true,
   323  			existingPods: []*v1.Pod{
   324  				initPausePod(&testutils.PausePodConfig{
   325  					Name:      "victim-pod",
   326  					Namespace: testCtx.NS.Name,
   327  					Priority:  &lowPriority,
   328  					Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
   329  						v1.ResourceCPU:    *resource.NewMilliQuantity(200, resource.DecimalSI),
   330  						v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
   331  					},
   332  				}),
   333  			},
   334  			pod: initPausePod(&testutils.PausePodConfig{
   335  				Name:      "preemptor-pod",
   336  				Namespace: testCtx.NS.Name,
   337  				Priority:  &highPriority,
   338  				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
   339  					v1.ResourceCPU:    *resource.NewMilliQuantity(200, resource.DecimalSI),
   340  					v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
   341  				},
   342  			}),
   343  			preemptedPodIndexes: map[int]struct{}{},
   344  		},
   345  		{
   346  			name:       "preemption is performed to satisfy anti-affinity",
   347  			initTokens: maxTokens,
   348  			existingPods: []*v1.Pod{
   349  				initPausePod(&testutils.PausePodConfig{
   350  					Name: "pod-0", Namespace: testCtx.NS.Name,
   351  					Priority:  &mediumPriority,
   352  					Labels:    map[string]string{"pod": "p0"},
   353  					Resources: defaultPodRes,
   354  				}),
   355  				initPausePod(&testutils.PausePodConfig{
   356  					Name: "pod-1", Namespace: testCtx.NS.Name,
   357  					Priority:  &lowPriority,
   358  					Labels:    map[string]string{"pod": "p1"},
   359  					Resources: defaultPodRes,
   360  					Affinity: &v1.Affinity{
   361  						PodAntiAffinity: &v1.PodAntiAffinity{
   362  							RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{
   363  								{
   364  									LabelSelector: &metav1.LabelSelector{
   365  										MatchExpressions: []metav1.LabelSelectorRequirement{
   366  											{
   367  												Key:      "pod",
   368  												Operator: metav1.LabelSelectorOpIn,
   369  												Values:   []string{"preemptor"},
   370  											},
   371  										},
   372  									},
   373  									TopologyKey: "node",
   374  								},
   375  							},
   376  						},
   377  					},
   378  				}),
   379  			},
   380  			// A higher priority pod with anti-affinity.
   381  			pod: initPausePod(&testutils.PausePodConfig{
   382  				Name:      "preemptor-pod",
   383  				Namespace: testCtx.NS.Name,
   384  				Priority:  &highPriority,
   385  				Labels:    map[string]string{"pod": "preemptor"},
   386  				Resources: defaultPodRes,
   387  				Affinity: &v1.Affinity{
   388  					PodAntiAffinity: &v1.PodAntiAffinity{
   389  						RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{
   390  							{
   391  								LabelSelector: &metav1.LabelSelector{
   392  									MatchExpressions: []metav1.LabelSelectorRequirement{
   393  										{
   394  											Key:      "pod",
   395  											Operator: metav1.LabelSelectorOpIn,
   396  											Values:   []string{"p0"},
   397  										},
   398  									},
   399  								},
   400  								TopologyKey: "node",
   401  							},
   402  						},
   403  					},
   404  				},
   405  			}),
   406  			preemptedPodIndexes: map[int]struct{}{0: {}, 1: {}},
   407  		},
   408  		{
    409  			// This is similar to the previous case, except that pod-1 has high priority.
   410  			name:       "preemption is not performed when anti-affinity is not satisfied",
   411  			initTokens: maxTokens,
   412  			existingPods: []*v1.Pod{
   413  				initPausePod(&testutils.PausePodConfig{
   414  					Name: "pod-0", Namespace: testCtx.NS.Name,
   415  					Priority:  &mediumPriority,
   416  					Labels:    map[string]string{"pod": "p0"},
   417  					Resources: defaultPodRes,
   418  				}),
   419  				initPausePod(&testutils.PausePodConfig{
   420  					Name: "pod-1", Namespace: testCtx.NS.Name,
   421  					Priority:  &highPriority,
   422  					Labels:    map[string]string{"pod": "p1"},
   423  					Resources: defaultPodRes,
   424  					Affinity: &v1.Affinity{
   425  						PodAntiAffinity: &v1.PodAntiAffinity{
   426  							RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{
   427  								{
   428  									LabelSelector: &metav1.LabelSelector{
   429  										MatchExpressions: []metav1.LabelSelectorRequirement{
   430  											{
   431  												Key:      "pod",
   432  												Operator: metav1.LabelSelectorOpIn,
   433  												Values:   []string{"preemptor"},
   434  											},
   435  										},
   436  									},
   437  									TopologyKey: "node",
   438  								},
   439  							},
   440  						},
   441  					},
   442  				}),
   443  			},
   444  			// A higher priority pod with anti-affinity.
   445  			pod: initPausePod(&testutils.PausePodConfig{
   446  				Name:      "preemptor-pod",
   447  				Namespace: testCtx.NS.Name,
   448  				Priority:  &highPriority,
   449  				Labels:    map[string]string{"pod": "preemptor"},
   450  				Resources: defaultPodRes,
   451  				Affinity: &v1.Affinity{
   452  					PodAntiAffinity: &v1.PodAntiAffinity{
   453  						RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{
   454  							{
   455  								LabelSelector: &metav1.LabelSelector{
   456  									MatchExpressions: []metav1.LabelSelectorRequirement{
   457  										{
   458  											Key:      "pod",
   459  											Operator: metav1.LabelSelectorOpIn,
   460  											Values:   []string{"p0"},
   461  										},
   462  									},
   463  								},
   464  								TopologyKey: "node",
   465  							},
   466  						},
   467  					},
   468  				},
   469  			}),
   470  			preemptedPodIndexes: map[int]struct{}{},
   471  		},
   472  	}
   473  
   474  	// Create a node with some resources and a label.
   475  	nodeRes := map[v1.ResourceName]string{
   476  		v1.ResourcePods:   "32",
   477  		v1.ResourceCPU:    "500m",
   478  		v1.ResourceMemory: "500",
   479  	}
   480  	nodeObject := st.MakeNode().Name("node1").Capacity(nodeRes).Label("node", "node1").Obj()
   481  	if _, err := createNode(testCtx.ClientSet, nodeObject); err != nil {
   482  		t.Fatalf("Error creating node: %v", err)
   483  	}
   484  
   485  	for _, test := range tests {
   486  		t.Run(test.name, func(t *testing.T) {
   487  			defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.PodDisruptionConditions, test.enablePodDisruptionConditions)()
   488  			filter.Tokens = test.initTokens
   489  			filter.EnablePreFilter = test.enablePreFilter
   490  			filter.Unresolvable = test.unresolvable
   491  			pods := make([]*v1.Pod, len(test.existingPods))
   492  			// Create and run existingPods.
   493  			for i, p := range test.existingPods {
   494  				pods[i], err = runPausePod(cs, p)
   495  				if err != nil {
   496  					t.Fatalf("Error running pause pod: %v", err)
   497  				}
   498  			}
    499  			// Create the preemptor pod.
   500  			preemptor, err := createPausePod(cs, test.pod)
   501  			if err != nil {
   502  				t.Errorf("Error while creating high priority pod: %v", err)
   503  			}
   504  			// Wait for preemption of pods and make sure the other ones are not preempted.
   505  			for i, p := range pods {
   506  				if _, found := test.preemptedPodIndexes[i]; found {
   507  					if err = wait.PollUntilContextTimeout(testCtx.Ctx, time.Second, wait.ForeverTestTimeout, false,
   508  						podIsGettingEvicted(cs, p.Namespace, p.Name)); err != nil {
   509  						t.Errorf("Pod %v/%v is not getting evicted.", p.Namespace, p.Name)
   510  					}
   511  					pod, err := cs.CoreV1().Pods(p.Namespace).Get(testCtx.Ctx, p.Name, metav1.GetOptions{})
   512  					if err != nil {
   513  						t.Errorf("Error %v when getting the updated status for pod %v/%v ", err, p.Namespace, p.Name)
   514  					}
   515  					_, cond := podutil.GetPodCondition(&pod.Status, v1.DisruptionTarget)
   516  					if test.enablePodDisruptionConditions && cond == nil {
   517  						t.Errorf("Pod %q does not have the expected condition: %q", klog.KObj(pod), v1.DisruptionTarget)
    518  					} else if !test.enablePodDisruptionConditions && cond != nil {
   519  						t.Errorf("Pod %q has an unexpected condition: %q", klog.KObj(pod), v1.DisruptionTarget)
   520  					}
   521  				} else {
   522  					if p.DeletionTimestamp != nil {
   523  						t.Errorf("Didn't expect pod %v to get preempted.", p.Name)
   524  					}
   525  				}
   526  			}
   527  			// Also check that the preemptor pod gets the NominatedNodeName field set.
   528  			if len(test.preemptedPodIndexes) > 0 {
   529  				if err := waitForNominatedNodeName(cs, preemptor); err != nil {
   530  					t.Errorf("NominatedNodeName field was not set for pod %v: %v", preemptor.Name, err)
   531  				}
   532  			}
   533  
   534  			// Cleanup
   535  			pods = append(pods, preemptor)
   536  			testutils.CleanupPods(testCtx.Ctx, cs, t, pods)
   537  		})
   538  	}
   539  }
   540  
    541  // TestNonPreemption tests that the PreemptNever preemption policy works as expected.
   542  func TestNonPreemption(t *testing.T) {
   543  	var preemptNever = v1.PreemptNever
   544  	// Initialize scheduler.
   545  	testCtx := initTest(t, "non-preemption")
   546  	cs := testCtx.ClientSet
   547  	tests := []struct {
   548  		name             string
   549  		PreemptionPolicy *v1.PreemptionPolicy
   550  	}{
   551  		{
   552  			name:             "pod preemption will happen",
   553  			PreemptionPolicy: nil,
   554  		},
   555  		{
   556  			name:             "pod preemption will not happen",
   557  			PreemptionPolicy: &preemptNever,
   558  		},
   559  	}
   560  	victim := initPausePod(&testutils.PausePodConfig{
   561  		Name:      "victim-pod",
   562  		Namespace: testCtx.NS.Name,
   563  		Priority:  &lowPriority,
   564  		Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
   565  			v1.ResourceCPU:    *resource.NewMilliQuantity(400, resource.DecimalSI),
   566  			v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
   567  		},
   568  	})
   569  
   570  	preemptor := initPausePod(&testutils.PausePodConfig{
   571  		Name:      "preemptor-pod",
   572  		Namespace: testCtx.NS.Name,
   573  		Priority:  &highPriority,
   574  		Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
   575  			v1.ResourceCPU:    *resource.NewMilliQuantity(300, resource.DecimalSI),
   576  			v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
   577  		},
   578  	})
   579  
   580  	// Create a node with some resources
   581  	nodeRes := map[v1.ResourceName]string{
   582  		v1.ResourcePods:   "32",
   583  		v1.ResourceCPU:    "500m",
   584  		v1.ResourceMemory: "500",
   585  	}
   586  	_, err := createNode(testCtx.ClientSet, st.MakeNode().Name("node1").Capacity(nodeRes).Obj())
   587  	if err != nil {
   588  		t.Fatalf("Error creating nodes: %v", err)
   589  	}
   590  	for _, test := range tests {
   591  		t.Run(test.name, func(t *testing.T) {
   592  			defer testutils.CleanupPods(testCtx.Ctx, cs, t, []*v1.Pod{preemptor, victim})
   593  			preemptor.Spec.PreemptionPolicy = test.PreemptionPolicy
   594  			victimPod, err := createPausePod(cs, victim)
   595  			if err != nil {
   596  				t.Fatalf("Error while creating victim: %v", err)
   597  			}
   598  			if err := waitForPodToScheduleWithTimeout(cs, victimPod, 5*time.Second); err != nil {
    599  				t.Fatalf("victim %v should become scheduled", victimPod.Name)
   600  			}
   601  
   602  			preemptorPod, err := createPausePod(cs, preemptor)
   603  			if err != nil {
   604  				t.Fatalf("Error while creating preemptor: %v", err)
   605  			}
   606  
   607  			err = waitForNominatedNodeNameWithTimeout(cs, preemptorPod, 5*time.Second)
   608  			// test.PreemptionPolicy == nil means we expect the preemptor to be nominated.
   609  			expect := test.PreemptionPolicy == nil
   610  			// err == nil indicates the preemptor is indeed nominated.
   611  			got := err == nil
   612  			if got != expect {
   613  				t.Errorf("Expect preemptor to be nominated=%v, but got=%v", expect, got)
   614  			}
   615  		})
   616  	}
   617  }
   618  
    619  // TestDisablePreemption tests that the scheduler's disable-preemption option works as expected.
   620  func TestDisablePreemption(t *testing.T) {
   621  	// Initialize scheduler, and disable preemption.
   622  	testCtx := initTestDisablePreemption(t, "disable-preemption")
   623  	cs := testCtx.ClientSet
   624  
   625  	tests := []struct {
   626  		name         string
   627  		existingPods []*v1.Pod
   628  		pod          *v1.Pod
   629  	}{
   630  		{
   631  			name: "pod preemption will not happen",
   632  			existingPods: []*v1.Pod{
   633  				initPausePod(&testutils.PausePodConfig{
   634  					Name:      "victim-pod",
   635  					Namespace: testCtx.NS.Name,
   636  					Priority:  &lowPriority,
   637  					Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
   638  						v1.ResourceCPU:    *resource.NewMilliQuantity(400, resource.DecimalSI),
   639  						v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
   640  					},
   641  				}),
   642  			},
   643  			pod: initPausePod(&testutils.PausePodConfig{
   644  				Name:      "preemptor-pod",
   645  				Namespace: testCtx.NS.Name,
   646  				Priority:  &highPriority,
   647  				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
   648  					v1.ResourceCPU:    *resource.NewMilliQuantity(300, resource.DecimalSI),
   649  					v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
   650  				},
   651  			}),
   652  		},
   653  	}
   654  
   655  	// Create a node with some resources
   656  	nodeRes := map[v1.ResourceName]string{
   657  		v1.ResourcePods:   "32",
   658  		v1.ResourceCPU:    "500m",
   659  		v1.ResourceMemory: "500",
   660  	}
   661  	_, err := createNode(testCtx.ClientSet, st.MakeNode().Name("node1").Capacity(nodeRes).Obj())
   662  	if err != nil {
   663  		t.Fatalf("Error creating nodes: %v", err)
   664  	}
   665  
   666  	for _, test := range tests {
   667  		t.Run(test.name, func(t *testing.T) {
   668  			pods := make([]*v1.Pod, len(test.existingPods))
   669  			// Create and run existingPods.
   670  			for i, p := range test.existingPods {
   671  				pods[i], err = runPausePod(cs, p)
   672  				if err != nil {
   673  					t.Fatalf("Test [%v]: Error running pause pod: %v", test.name, err)
   674  				}
   675  			}
    676  			// Create the preemptor pod.
   677  			preemptor, err := createPausePod(cs, test.pod)
   678  			if err != nil {
   679  				t.Errorf("Error while creating high priority pod: %v", err)
   680  			}
    681  			// Ensure the preemptor remains unschedulable.
   682  			if err := waitForPodUnschedulable(cs, preemptor); err != nil {
   683  				t.Errorf("Preemptor %v should not become scheduled", preemptor.Name)
   684  			}
   685  
    686  			// Ensure the preemptor does not get nominated.
   687  			if err := waitForNominatedNodeNameWithTimeout(cs, preemptor, 5*time.Second); err == nil {
   688  				t.Errorf("Preemptor %v should not be nominated", preemptor.Name)
   689  			}
   690  
   691  			// Cleanup
   692  			pods = append(pods, preemptor)
   693  			testutils.CleanupPods(testCtx.Ctx, cs, t, pods)
   694  		})
   695  	}
   696  }
   697  
    698  // TestPodPriorityResolution verifies that system critical priority classes are created automatically and that pod priorities are resolved properly.
   699  func TestPodPriorityResolution(t *testing.T) {
   700  	admission := priority.NewPlugin()
   701  	testCtx := testutils.InitTestScheduler(t, testutils.InitTestAPIServer(t, "preemption", admission))
   702  	cs := testCtx.ClientSet
   703  
   704  	// Build clientset and informers for controllers.
   705  	externalClientConfig := restclient.CopyConfig(testCtx.KubeConfig)
   706  	externalClientConfig.QPS = -1
   707  	externalClientset := clientset.NewForConfigOrDie(externalClientConfig)
   708  	externalInformers := informers.NewSharedInformerFactory(externalClientset, time.Second)
   709  	admission.SetExternalKubeClientSet(externalClientset)
   710  	admission.SetExternalKubeInformerFactory(externalInformers)
   711  
    712  	// Wait for all informer caches to sync.
   713  	testutils.SyncSchedulerInformerFactory(testCtx)
   714  	externalInformers.Start(testCtx.Ctx.Done())
   715  	externalInformers.WaitForCacheSync(testCtx.Ctx.Done())
   716  
   717  	// Run all controllers
   718  	go testCtx.Scheduler.Run(testCtx.Ctx)
   719  
   720  	tests := []struct {
   721  		Name             string
   722  		PriorityClass    string
   723  		Pod              *v1.Pod
   724  		ExpectedPriority int32
   725  		ExpectedError    error
   726  	}{
   727  		{
   728  			Name:             "SystemNodeCritical priority class",
   729  			PriorityClass:    scheduling.SystemNodeCritical,
   730  			ExpectedPriority: scheduling.SystemCriticalPriority + 1000,
   731  			Pod: initPausePod(&testutils.PausePodConfig{
   732  				Name:              fmt.Sprintf("pod1-%v", scheduling.SystemNodeCritical),
   733  				Namespace:         metav1.NamespaceSystem,
   734  				PriorityClassName: scheduling.SystemNodeCritical,
   735  			}),
   736  		},
   737  		{
   738  			Name:             "SystemClusterCritical priority class",
   739  			PriorityClass:    scheduling.SystemClusterCritical,
   740  			ExpectedPriority: scheduling.SystemCriticalPriority,
   741  			Pod: initPausePod(&testutils.PausePodConfig{
   742  				Name:              fmt.Sprintf("pod2-%v", scheduling.SystemClusterCritical),
   743  				Namespace:         metav1.NamespaceSystem,
   744  				PriorityClassName: scheduling.SystemClusterCritical,
   745  			}),
   746  		},
   747  		{
   748  			Name:             "Invalid priority class should result in error",
   749  			PriorityClass:    "foo",
   750  			ExpectedPriority: scheduling.SystemCriticalPriority,
   751  			Pod: initPausePod(&testutils.PausePodConfig{
   752  				Name:              fmt.Sprintf("pod3-%v", scheduling.SystemClusterCritical),
   753  				Namespace:         metav1.NamespaceSystem,
   754  				PriorityClassName: "foo",
   755  			}),
   756  			ExpectedError: fmt.Errorf("failed to create pause pod: pods \"pod3-system-cluster-critical\" is forbidden: no PriorityClass with name foo was found"),
   757  		},
   758  	}
   759  
   760  	// Create a node with some resources
   761  	nodeRes := map[v1.ResourceName]string{
   762  		v1.ResourcePods:   "32",
   763  		v1.ResourceCPU:    "500m",
   764  		v1.ResourceMemory: "500",
   765  	}
   766  	_, err := createNode(testCtx.ClientSet, st.MakeNode().Name("node1").Capacity(nodeRes).Obj())
   767  	if err != nil {
   768  		t.Fatalf("Error creating nodes: %v", err)
   769  	}
   770  
   771  	pods := make([]*v1.Pod, 0, len(tests))
   772  	for _, test := range tests {
   773  		t.Run(test.Name, func(t *testing.T) {
    775  			pod, err := runPausePod(cs, test.Pod)
    776  			if err != nil {
    777  				if test.ExpectedError == nil {
    778  					t.Fatalf("Test [PodPriority/%v]: Error running pause pod: %v", test.PriorityClass, err)
    779  				}
    780  				if err.Error() != test.ExpectedError.Error() {
    781  					t.Fatalf("Test [PodPriority/%v]: Expected error %v but got error %v", test.PriorityClass, test.ExpectedError, err)
    782  				}
    783  				return
    784  			}
    785  			pods = append(pods, pod)
    786  			if pod.Spec.Priority != nil {
    787  				if *pod.Spec.Priority != test.ExpectedPriority {
    788  					t.Errorf("Expected pod %v to have priority %v but was %v", pod.Name, test.ExpectedPriority, *pod.Spec.Priority)
    789  				}
    790  			} else {
    791  				t.Errorf("Expected pod %v to have priority %v but was nil", pod.Name, test.PriorityClass)
    792  			}
   794  		})
   795  	}
   796  	testutils.CleanupPods(testCtx.Ctx, cs, t, pods)
   797  	testutils.CleanupNodes(cs, t)
   798  }
   799  
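         // mkPriorityPodWithGrace builds a pause pod with the given name, priority, and termination grace period.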
   800  func mkPriorityPodWithGrace(tc *testutils.TestContext, name string, priority int32, grace int64) *v1.Pod {
   801  	defaultPodRes := &v1.ResourceRequirements{Requests: v1.ResourceList{
   802  		v1.ResourceCPU:    *resource.NewMilliQuantity(100, resource.DecimalSI),
   803  		v1.ResourceMemory: *resource.NewQuantity(100, resource.DecimalSI)},
   804  	}
   805  	pod := initPausePod(&testutils.PausePodConfig{
   806  		Name:      name,
   807  		Namespace: tc.NS.Name,
   808  		Priority:  &priority,
   809  		Labels:    map[string]string{"pod": name},
   810  		Resources: defaultPodRes,
   811  	})
   812  	pod.Spec.TerminationGracePeriodSeconds = &grace
   813  	return pod
   814  }
   815  
    816  // TestPreemptionStarvation ensures that while the preempting pod is waiting for the victims to
    817  // terminate, other pending lower priority pods are not scheduled into the room created by
    818  // preemption while the higher priority pod is not yet scheduled.
   819  func TestPreemptionStarvation(t *testing.T) {
   820  	// Initialize scheduler.
   821  	testCtx := initTest(t, "preemption")
   822  	cs := testCtx.ClientSet
   823  
   824  	tests := []struct {
   825  		name               string
   826  		numExistingPod     int
   827  		numExpectedPending int
   828  		preemptor          *v1.Pod
   829  	}{
   830  		{
    831  			// This test ensures that while the preempting pod is waiting for the victims to
    832  			// terminate, other lower priority pods are not scheduled into the room created by
    833  			// preemption while the higher priority pod is not yet scheduled.
   834  			name:               "starvation test: higher priority pod is scheduled before the lower priority ones",
   835  			numExistingPod:     10,
   836  			numExpectedPending: 5,
   837  			preemptor: initPausePod(&testutils.PausePodConfig{
   838  				Name:      "preemptor-pod",
   839  				Namespace: testCtx.NS.Name,
   840  				Priority:  &highPriority,
   841  				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
   842  					v1.ResourceCPU:    *resource.NewMilliQuantity(300, resource.DecimalSI),
   843  					v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
   844  				},
   845  			}),
   846  		},
   847  	}
   848  
   849  	// Create a node with some resources
   850  	nodeRes := map[v1.ResourceName]string{
   851  		v1.ResourcePods:   "32",
   852  		v1.ResourceCPU:    "500m",
   853  		v1.ResourceMemory: "500",
   854  	}
   855  	_, err := createNode(testCtx.ClientSet, st.MakeNode().Name("node1").Capacity(nodeRes).Obj())
   856  	if err != nil {
   857  		t.Fatalf("Error creating nodes: %v", err)
   858  	}
   859  
   860  	for _, test := range tests {
   861  		t.Run(test.name, func(t *testing.T) {
   862  			pendingPods := make([]*v1.Pod, test.numExpectedPending)
   863  			numRunningPods := test.numExistingPod - test.numExpectedPending
   864  			runningPods := make([]*v1.Pod, numRunningPods)
   865  			// Create and run existingPods.
   866  			for i := 0; i < numRunningPods; i++ {
   867  				runningPods[i], err = createPausePod(cs, mkPriorityPodWithGrace(testCtx, fmt.Sprintf("rpod-%v", i), mediumPriority, 0))
   868  				if err != nil {
   869  					t.Fatalf("Error creating pause pod: %v", err)
   870  				}
   871  			}
   872  			// make sure that runningPods are all scheduled.
   873  			for _, p := range runningPods {
   874  				if err := testutils.WaitForPodToSchedule(cs, p); err != nil {
   875  					t.Fatalf("Pod %v/%v didn't get scheduled: %v", p.Namespace, p.Name, err)
   876  				}
   877  			}
   878  			// Create pending pods.
   879  			for i := 0; i < test.numExpectedPending; i++ {
   880  				pendingPods[i], err = createPausePod(cs, mkPriorityPodWithGrace(testCtx, fmt.Sprintf("ppod-%v", i), mediumPriority, 0))
   881  				if err != nil {
   882  					t.Fatalf("Error creating pending pod: %v", err)
   883  				}
   884  			}
   885  			// Make sure that all pending pods are being marked unschedulable.
   886  			for _, p := range pendingPods {
   887  				if err := wait.PollUntilContextTimeout(testCtx.Ctx, 100*time.Millisecond, wait.ForeverTestTimeout, false,
   888  					podUnschedulable(cs, p.Namespace, p.Name)); err != nil {
   889  					t.Errorf("Pod %v/%v didn't get marked unschedulable: %v", p.Namespace, p.Name, err)
   890  				}
   891  			}
   892  			// Create the preemptor.
   893  			preemptor, err := createPausePod(cs, test.preemptor)
   894  			if err != nil {
   895  				t.Errorf("Error while creating the preempting pod: %v", err)
   896  			}
   897  			// Check if .status.nominatedNodeName of the preemptor pod gets set.
   898  			if err := waitForNominatedNodeName(cs, preemptor); err != nil {
   899  				t.Errorf(".status.nominatedNodeName was not set for pod %v/%v: %v", preemptor.Namespace, preemptor.Name, err)
   900  			}
   901  			// Make sure that preemptor is scheduled after preemptions.
   902  			if err := testutils.WaitForPodToScheduleWithTimeout(cs, preemptor, 60*time.Second); err != nil {
   903  				t.Errorf("Preemptor pod %v didn't get scheduled: %v", preemptor.Name, err)
   904  			}
   905  			// Cleanup
   906  			klog.Info("Cleaning up all pods...")
   907  			allPods := pendingPods
   908  			allPods = append(allPods, runningPods...)
   909  			allPods = append(allPods, preemptor)
   910  			testutils.CleanupPods(testCtx.Ctx, cs, t, allPods)
   911  		})
   912  	}
   913  }
   914  
   915  // TestPreemptionRaces tests that other scheduling events and operations do not
   916  // race with the preemption process.
   917  func TestPreemptionRaces(t *testing.T) {
   918  	// Initialize scheduler.
   919  	testCtx := initTest(t, "preemption-race")
   920  	cs := testCtx.ClientSet
   921  
   922  	tests := []struct {
   923  		name              string
   924  		numInitialPods    int // Pods created and executed before running preemptor
   925  		numAdditionalPods int // Pods created after creating the preemptor
   926  		numRepetitions    int // Repeat the tests to check races
   927  		preemptor         *v1.Pod
   928  	}{
   929  		{
    930  			// This test ensures that while the preempting pod is waiting for its victims to
    931  			// terminate, other lower priority pods are not scheduled into the room created by
    932  			// preemption while the higher priority pod is not yet scheduled.
   933  			name:              "ensures that other pods are not scheduled while preemptor is being marked as nominated (issue #72124)",
   934  			numInitialPods:    2,
   935  			numAdditionalPods: 20,
   936  			numRepetitions:    5,
   937  			preemptor: initPausePod(&testutils.PausePodConfig{
   938  				Name:      "preemptor-pod",
   939  				Namespace: testCtx.NS.Name,
   940  				Priority:  &highPriority,
   941  				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
   942  					v1.ResourceCPU:    *resource.NewMilliQuantity(4900, resource.DecimalSI),
   943  					v1.ResourceMemory: *resource.NewQuantity(4900, resource.DecimalSI)},
   944  				},
   945  			}),
   946  		},
   947  	}
   948  
   949  	// Create a node with some resources
   950  	nodeRes := map[v1.ResourceName]string{
   951  		v1.ResourcePods:   "100",
   952  		v1.ResourceCPU:    "5000m",
   953  		v1.ResourceMemory: "5000",
   954  	}
   955  	_, err := createNode(testCtx.ClientSet, st.MakeNode().Name("node1").Capacity(nodeRes).Obj())
   956  	if err != nil {
   957  		t.Fatalf("Error creating nodes: %v", err)
   958  	}
   959  
   960  	for _, test := range tests {
   961  		t.Run(test.name, func(t *testing.T) {
   962  			if test.numRepetitions <= 0 {
   963  				test.numRepetitions = 1
   964  			}
   965  			for n := 0; n < test.numRepetitions; n++ {
   966  				initialPods := make([]*v1.Pod, test.numInitialPods)
   967  				additionalPods := make([]*v1.Pod, test.numAdditionalPods)
   968  				// Create and run existingPods.
   969  				for i := 0; i < test.numInitialPods; i++ {
   970  					initialPods[i], err = createPausePod(cs, mkPriorityPodWithGrace(testCtx, fmt.Sprintf("rpod-%v", i), mediumPriority, 0))
   971  					if err != nil {
   972  						t.Fatalf("Error creating pause pod: %v", err)
   973  					}
   974  				}
   975  				// make sure that initial Pods are all scheduled.
   976  				for _, p := range initialPods {
   977  					if err := testutils.WaitForPodToSchedule(cs, p); err != nil {
   978  						t.Fatalf("Pod %v/%v didn't get scheduled: %v", p.Namespace, p.Name, err)
   979  					}
   980  				}
   981  				// Create the preemptor.
   982  				klog.Info("Creating the preemptor pod...")
   983  				preemptor, err := createPausePod(cs, test.preemptor)
   984  				if err != nil {
   985  					t.Errorf("Error while creating the preempting pod: %v", err)
   986  				}
   987  
   988  				klog.Info("Creating additional pods...")
   989  				for i := 0; i < test.numAdditionalPods; i++ {
   990  					additionalPods[i], err = createPausePod(cs, mkPriorityPodWithGrace(testCtx, fmt.Sprintf("ppod-%v", i), mediumPriority, 0))
   991  					if err != nil {
   992  						t.Fatalf("Error creating pending pod: %v", err)
   993  					}
   994  				}
   995  				// Check that the preemptor pod gets nominated node name.
   996  				if err := waitForNominatedNodeName(cs, preemptor); err != nil {
   997  					t.Errorf(".status.nominatedNodeName was not set for pod %v/%v: %v", preemptor.Namespace, preemptor.Name, err)
   998  				}
   999  				// Make sure that preemptor is scheduled after preemptions.
  1000  				if err := testutils.WaitForPodToScheduleWithTimeout(cs, preemptor, 60*time.Second); err != nil {
  1001  					t.Errorf("Preemptor pod %v didn't get scheduled: %v", preemptor.Name, err)
  1002  				}
  1003  
   1004  				klog.Info("Checking that unschedulable pods still exist and were never scheduled...")
  1005  				for _, p := range additionalPods {
  1006  					pod, err := cs.CoreV1().Pods(p.Namespace).Get(testCtx.Ctx, p.Name, metav1.GetOptions{})
  1007  					if err != nil {
  1008  						t.Errorf("Error in getting Pod %v/%v info: %v", p.Namespace, p.Name, err)
  1009  					}
  1010  					if len(pod.Spec.NodeName) > 0 {
  1011  						t.Errorf("Pod %v/%v is already scheduled", p.Namespace, p.Name)
  1012  					}
  1013  					_, cond := podutil.GetPodCondition(&pod.Status, v1.PodScheduled)
  1014  					if cond != nil && cond.Status != v1.ConditionFalse {
  1015  						t.Errorf("Pod %v/%v is no longer unschedulable: %v", p.Namespace, p.Name, err)
  1016  					}
  1017  				}
  1018  				// Cleanup
  1019  				klog.Info("Cleaning up all pods...")
  1020  				allPods := additionalPods
  1021  				allPods = append(allPods, initialPods...)
  1022  				allPods = append(allPods, preemptor)
  1023  				testutils.CleanupPods(testCtx.Ctx, cs, t, allPods)
  1024  			}
  1025  		})
  1026  	}
  1027  }
  1028  
  1029  const (
  1030  	alwaysFailPlugin = "alwaysFailPlugin"
  1031  	doNotFailMe      = "do-not-fail-me"
  1032  )
  1033  
   1034  // alwaysFail is a fake plugin that implements the PreBind extension point.
   1035  // It always fails with an Unschedulable status, unless the pod name contains the `doNotFailMe` string.
  1036  type alwaysFail struct{}
  1037  
  1038  func (af *alwaysFail) Name() string {
  1039  	return alwaysFailPlugin
  1040  }
  1041  
  1042  func (af *alwaysFail) PreBind(_ context.Context, _ *framework.CycleState, p *v1.Pod, _ string) *framework.Status {
  1043  	if strings.Contains(p.Name, doNotFailMe) {
  1044  		return nil
  1045  	}
  1046  	return framework.NewStatus(framework.Unschedulable)
  1047  }
  1048  
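         // newAlwaysFail is the plugin factory for the alwaysFail plugin.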
  1049  func newAlwaysFail(_ context.Context, _ runtime.Object, _ framework.Handle) (framework.Plugin, error) {
  1050  	return &alwaysFail{}, nil
  1051  }
  1052  
   1053  // TestNominatedNodeCleanUp verifies that a pod's nominatedNodeName is set and cleared
   1054  // properly in different scenarios.
  1055  func TestNominatedNodeCleanUp(t *testing.T) {
  1056  	tests := []struct {
  1057  		name         string
  1058  		nodeCapacity map[v1.ResourceName]string
   1059  		// Batches of pods to be created; each inner slice is created as one batch.
  1060  		podsToCreate [][]*v1.Pod
   1061  		// Each postCheck function is run after each batch of pods is created.
  1062  		postChecks []func(cs clientset.Interface, pod *v1.Pod) error
  1063  		// Delete the fake node or not. Optional.
  1064  		deleteNode bool
  1065  		// Pods to be deleted. Optional.
  1066  		podNamesToDelete []string
  1067  
  1068  		// Register dummy plugin to simulate particular scheduling failures. Optional.
  1069  		customPlugins     *configv1.Plugins
  1070  		outOfTreeRegistry frameworkruntime.Registry
  1071  	}{
  1072  		{
  1073  			name:         "mid-priority pod preempts low-priority pod, followed by a high-priority pod with another preemption",
  1074  			nodeCapacity: map[v1.ResourceName]string{v1.ResourceCPU: "5"},
  1075  			podsToCreate: [][]*v1.Pod{
  1076  				{
  1077  					st.MakePod().Name("low-1").Priority(lowPriority).Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Obj(),
  1078  					st.MakePod().Name("low-2").Priority(lowPriority).Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Obj(),
  1079  					st.MakePod().Name("low-3").Priority(lowPriority).Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Obj(),
  1080  					st.MakePod().Name("low-4").Priority(lowPriority).Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Obj(),
  1081  				},
  1082  				{
  1083  					st.MakePod().Name("medium").Priority(mediumPriority).Req(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).Obj(),
  1084  				},
  1085  				{
  1086  					st.MakePod().Name("high").Priority(highPriority).Req(map[v1.ResourceName]string{v1.ResourceCPU: "3"}).Obj(),
  1087  				},
  1088  			},
  1089  			postChecks: []func(cs clientset.Interface, pod *v1.Pod) error{
  1090  				testutils.WaitForPodToSchedule,
  1091  				waitForNominatedNodeName,
  1092  				waitForNominatedNodeName,
  1093  			},
  1094  		},
  1095  		{
  1096  			name:         "mid-priority pod preempts low-priority pod, followed by a high-priority pod without additional preemption",
  1097  			nodeCapacity: map[v1.ResourceName]string{v1.ResourceCPU: "2"},
  1098  			podsToCreate: [][]*v1.Pod{
  1099  				{
  1100  					st.MakePod().Name("low").Priority(lowPriority).Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Obj(),
  1101  				},
  1102  				{
  1103  					st.MakePod().Name("medium").Priority(mediumPriority).Req(map[v1.ResourceName]string{v1.ResourceCPU: "2"}).Obj(),
  1104  				},
  1105  				{
  1106  					st.MakePod().Name("high").Priority(highPriority).Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Obj(),
  1107  				},
  1108  			},
  1109  			postChecks: []func(cs clientset.Interface, pod *v1.Pod) error{
  1110  				testutils.WaitForPodToSchedule,
  1111  				waitForNominatedNodeName,
  1112  				testutils.WaitForPodToSchedule,
  1113  			},
  1114  			podNamesToDelete: []string{"low"},
  1115  		},
  1116  		{
  1117  			name:         "mid-priority pod preempts low-priority pod, followed by a node deletion",
  1118  			nodeCapacity: map[v1.ResourceName]string{v1.ResourceCPU: "1"},
  1119  			podsToCreate: [][]*v1.Pod{
  1120  				{
  1121  					st.MakePod().Name("low").Priority(lowPriority).Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Obj(),
  1122  				},
  1123  				{
  1124  					st.MakePod().Name("medium").Priority(mediumPriority).Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Obj(),
  1125  				},
  1126  			},
  1127  			postChecks: []func(cs clientset.Interface, pod *v1.Pod) error{
  1128  				testutils.WaitForPodToSchedule,
  1129  				waitForNominatedNodeName,
  1130  			},
  1131  			// Delete the node to simulate an ErrNoNodesAvailable error.
  1132  			deleteNode:       true,
  1133  			podNamesToDelete: []string{"low"},
  1134  		},
  1135  		{
  1136  			name:         "mid-priority pod preempts low-priority pod, but failed the scheduling unexpectedly",
  1137  			nodeCapacity: map[v1.ResourceName]string{v1.ResourceCPU: "1"},
  1138  			podsToCreate: [][]*v1.Pod{
  1139  				{
  1140  					st.MakePod().Name(fmt.Sprintf("low-%v", doNotFailMe)).Priority(lowPriority).Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Obj(),
  1141  				},
  1142  				{
  1143  					st.MakePod().Name("medium").Priority(mediumPriority).Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Obj(),
  1144  				},
  1145  			},
  1146  			postChecks: []func(cs clientset.Interface, pod *v1.Pod) error{
  1147  				testutils.WaitForPodToSchedule,
  1148  				waitForNominatedNodeName,
  1149  			},
  1150  			podNamesToDelete: []string{fmt.Sprintf("low-%v", doNotFailMe)},
  1151  			customPlugins: &configv1.Plugins{
  1152  				PreBind: configv1.PluginSet{
  1153  					Enabled: []configv1.Plugin{
  1154  						{Name: alwaysFailPlugin},
  1155  					},
  1156  				},
  1157  			},
  1158  			outOfTreeRegistry: frameworkruntime.Registry{alwaysFailPlugin: newAlwaysFail},
  1159  		},
  1160  	}
  1161  
  1162  	for _, tt := range tests {
  1163  		t.Run(tt.name, func(t *testing.T) {
  1164  			cfg := configtesting.V1ToInternalWithDefaults(t, configv1.KubeSchedulerConfiguration{
  1165  				Profiles: []configv1.KubeSchedulerProfile{{
  1166  					SchedulerName: pointer.String(v1.DefaultSchedulerName),
  1167  					Plugins:       tt.customPlugins,
  1168  				}},
  1169  			})
  1170  			testCtx := initTest(
  1171  				t,
  1172  				"preemption",
  1173  				scheduler.WithProfiles(cfg.Profiles...),
  1174  				scheduler.WithFrameworkOutOfTreeRegistry(tt.outOfTreeRegistry),
  1175  			)
  1176  
  1177  			cs, ns := testCtx.ClientSet, testCtx.NS.Name
  1178  			// Create a node with the specified capacity.
  1179  			nodeName := "fake-node"
  1180  			if _, err := createNode(cs, st.MakeNode().Name(nodeName).Capacity(tt.nodeCapacity).Obj()); err != nil {
  1181  				t.Fatalf("Error creating node %v: %v", nodeName, err)
  1182  			}
  1183  
  1184  			// Create pods and run post check if necessary.
  1185  			for i, pods := range tt.podsToCreate {
  1186  				for _, p := range pods {
  1187  					p.Namespace = ns
  1188  					if _, err := createPausePod(cs, p); err != nil {
  1189  						t.Fatalf("Error creating pod %v: %v", p.Name, err)
  1190  					}
  1191  				}
  1192  				// If necessary, run the post check function.
  1193  				if len(tt.postChecks) > i && tt.postChecks[i] != nil {
  1194  					for _, p := range pods {
  1195  						if err := tt.postChecks[i](cs, p); err != nil {
  1196  							t.Fatalf("Pod %v didn't pass the postChecks[%v]: %v", p.Name, i, err)
  1197  						}
  1198  					}
  1199  				}
  1200  			}
  1201  
  1202  			// Delete the node if necessary.
  1203  			if tt.deleteNode {
  1204  				if err := cs.CoreV1().Nodes().Delete(context.TODO(), nodeName, *metav1.NewDeleteOptions(0)); err != nil {
  1205  					t.Fatalf("Node %v cannot be deleted: %v", nodeName, err)
  1206  				}
  1207  			}
  1208  
   1209  			// Force delete the terminating pods if necessary.
   1210  			// This is required when the terminating Pods must actually be removed rather than just marked for deletion.
  1211  			for _, podName := range tt.podNamesToDelete {
  1212  				if err := deletePod(cs, podName, ns); err != nil {
  1213  					t.Fatalf("Pod %v cannot be deleted: %v", podName, err)
  1214  				}
  1215  			}
  1216  
   1217  			// Verify that .status.nominatedNodeName is cleared.
  1218  			if err := wait.PollUntilContextTimeout(testCtx.Ctx, 100*time.Millisecond, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) {
  1219  				pod, err := cs.CoreV1().Pods(ns).Get(ctx, "medium", metav1.GetOptions{})
  1220  				if err != nil {
  1221  					t.Errorf("Error getting the medium pod: %v", err)
  1222  				}
  1223  				if len(pod.Status.NominatedNodeName) == 0 {
  1224  					return true, nil
  1225  				}
  1226  				return false, err
  1227  			}); err != nil {
  1228  				t.Errorf(".status.nominatedNodeName of the medium pod was not cleared: %v", err)
  1229  			}
  1230  		})
  1231  	}
  1232  }
  1233  
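         // mkMinAvailablePDB builds a PodDisruptionBudget with the given minAvailable count and a selector
         // matching matchLabels; the uid argument is currently unused.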
  1234  func mkMinAvailablePDB(name, namespace string, uid types.UID, minAvailable int, matchLabels map[string]string) *policy.PodDisruptionBudget {
  1235  	intMinAvailable := intstr.FromInt32(int32(minAvailable))
  1236  	return &policy.PodDisruptionBudget{
  1237  		ObjectMeta: metav1.ObjectMeta{
  1238  			Name:      name,
  1239  			Namespace: namespace,
  1240  		},
  1241  		Spec: policy.PodDisruptionBudgetSpec{
  1242  			MinAvailable: &intMinAvailable,
  1243  			Selector:     &metav1.LabelSelector{MatchLabels: matchLabels},
  1244  		},
  1245  	}
  1246  }
  1247  
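         // addPodConditionReady marks the pod status as Running with a PodReady condition.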
  1248  func addPodConditionReady(pod *v1.Pod) {
  1249  	pod.Status = v1.PodStatus{
  1250  		Phase: v1.PodRunning,
  1251  		Conditions: []v1.PodCondition{
  1252  			{
  1253  				Type:   v1.PodReady,
  1254  				Status: v1.ConditionTrue,
  1255  			},
  1256  		},
  1257  	}
  1258  }
  1259  
  1260  // TestPDBInPreemption tests PodDisruptionBudget support in preemption.
  1261  func TestPDBInPreemption(t *testing.T) {
  1262  	// Initialize scheduler.
  1263  	testCtx := initTest(t, "preemption-pdb")
  1264  	cs := testCtx.ClientSet
  1265  
  1266  	initDisruptionController(t, testCtx)
  1267  
  1268  	defaultPodRes := &v1.ResourceRequirements{Requests: v1.ResourceList{
  1269  		v1.ResourceCPU:    *resource.NewMilliQuantity(100, resource.DecimalSI),
  1270  		v1.ResourceMemory: *resource.NewQuantity(100, resource.DecimalSI)},
  1271  	}
  1272  	defaultNodeRes := map[v1.ResourceName]string{
  1273  		v1.ResourcePods:   "32",
  1274  		v1.ResourceCPU:    "500m",
  1275  		v1.ResourceMemory: "500",
  1276  	}
  1277  
  1278  	tests := []struct {
  1279  		name                string
  1280  		nodeCnt             int
  1281  		pdbs                []*policy.PodDisruptionBudget
  1282  		pdbPodNum           []int32
  1283  		existingPods        []*v1.Pod
  1284  		pod                 *v1.Pod
  1285  		preemptedPodIndexes map[int]struct{}
  1286  	}{
  1287  		{
  1288  			name:    "A non-PDB violating pod is preempted despite its higher priority",
  1289  			nodeCnt: 1,
  1290  			pdbs: []*policy.PodDisruptionBudget{
  1291  				mkMinAvailablePDB("pdb-1", testCtx.NS.Name, types.UID("pdb-1-uid"), 2, map[string]string{"foo": "bar"}),
  1292  			},
  1293  			pdbPodNum: []int32{2},
  1294  			existingPods: []*v1.Pod{
  1295  				initPausePod(&testutils.PausePodConfig{
  1296  					Name:      "low-pod1",
  1297  					Namespace: testCtx.NS.Name,
  1298  					Priority:  &lowPriority,
  1299  					Resources: defaultPodRes,
  1300  					Labels:    map[string]string{"foo": "bar"},
  1301  				}),
  1302  				initPausePod(&testutils.PausePodConfig{
  1303  					Name:      "low-pod2",
  1304  					Namespace: testCtx.NS.Name,
  1305  					Priority:  &lowPriority,
  1306  					Resources: defaultPodRes,
  1307  					Labels:    map[string]string{"foo": "bar"},
  1308  				}),
  1309  				initPausePod(&testutils.PausePodConfig{
  1310  					Name:      "mid-pod3",
  1311  					Namespace: testCtx.NS.Name,
  1312  					Priority:  &mediumPriority,
  1313  					Resources: defaultPodRes,
  1314  				}),
  1315  			},
  1316  			pod: initPausePod(&testutils.PausePodConfig{
  1317  				Name:      "preemptor-pod",
  1318  				Namespace: testCtx.NS.Name,
  1319  				Priority:  &highPriority,
  1320  				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
  1321  					v1.ResourceCPU:    *resource.NewMilliQuantity(300, resource.DecimalSI),
  1322  					v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
  1323  				},
  1324  			}),
  1325  			preemptedPodIndexes: map[int]struct{}{2: {}},
  1326  		},
  1327  		{
  1328  			name:    "A node without any PDB violating pods is preferred for preemption",
  1329  			nodeCnt: 2,
  1330  			pdbs: []*policy.PodDisruptionBudget{
  1331  				mkMinAvailablePDB("pdb-1", testCtx.NS.Name, types.UID("pdb-1-uid"), 2, map[string]string{"foo": "bar"}),
  1332  			},
  1333  			pdbPodNum: []int32{1},
  1334  			existingPods: []*v1.Pod{
  1335  				initPausePod(&testutils.PausePodConfig{
  1336  					Name:      "low-pod1",
  1337  					Namespace: testCtx.NS.Name,
  1338  					Priority:  &lowPriority,
  1339  					Resources: defaultPodRes,
  1340  					NodeName:  "node-1",
  1341  					Labels:    map[string]string{"foo": "bar"},
  1342  				}),
  1343  				initPausePod(&testutils.PausePodConfig{
  1344  					Name:      "mid-pod2",
  1345  					Namespace: testCtx.NS.Name,
  1346  					Priority:  &mediumPriority,
  1347  					NodeName:  "node-2",
  1348  					Resources: defaultPodRes,
  1349  				}),
  1350  			},
  1351  			pod: initPausePod(&testutils.PausePodConfig{
  1352  				Name:      "preemptor-pod",
  1353  				Namespace: testCtx.NS.Name,
  1354  				Priority:  &highPriority,
  1355  				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
  1356  					v1.ResourceCPU:    *resource.NewMilliQuantity(500, resource.DecimalSI),
  1357  					v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
  1358  				},
  1359  			}),
  1360  			preemptedPodIndexes: map[int]struct{}{1: {}},
  1361  		},
  1362  		{
  1363  			name:    "A node with fewer PDB violating pods is preferred for preemption",
  1364  			nodeCnt: 3,
  1365  			pdbs: []*policy.PodDisruptionBudget{
  1366  				mkMinAvailablePDB("pdb-1", testCtx.NS.Name, types.UID("pdb-1-uid"), 2, map[string]string{"foo1": "bar"}),
  1367  				mkMinAvailablePDB("pdb-2", testCtx.NS.Name, types.UID("pdb-2-uid"), 2, map[string]string{"foo2": "bar"}),
  1368  			},
  1369  			pdbPodNum: []int32{1, 5},
  1370  			existingPods: []*v1.Pod{
  1371  				initPausePod(&testutils.PausePodConfig{
  1372  					Name:      "low-pod1",
  1373  					Namespace: testCtx.NS.Name,
  1374  					Priority:  &lowPriority,
  1375  					Resources: defaultPodRes,
  1376  					NodeName:  "node-1",
  1377  					Labels:    map[string]string{"foo1": "bar"},
  1378  				}),
  1379  				initPausePod(&testutils.PausePodConfig{
  1380  					Name:      "mid-pod1",
  1381  					Namespace: testCtx.NS.Name,
  1382  					Priority:  &mediumPriority,
  1383  					Resources: defaultPodRes,
  1384  					NodeName:  "node-1",
  1385  				}),
  1386  				initPausePod(&testutils.PausePodConfig{
  1387  					Name:      "low-pod2",
  1388  					Namespace: testCtx.NS.Name,
  1389  					Priority:  &lowPriority,
  1390  					Resources: defaultPodRes,
  1391  					NodeName:  "node-2",
  1392  					Labels:    map[string]string{"foo2": "bar"},
  1393  				}),
  1394  				initPausePod(&testutils.PausePodConfig{
  1395  					Name:      "mid-pod2",
  1396  					Namespace: testCtx.NS.Name,
  1397  					Priority:  &mediumPriority,
  1398  					Resources: defaultPodRes,
  1399  					NodeName:  "node-2",
  1400  					Labels:    map[string]string{"foo2": "bar"},
  1401  				}),
  1402  				initPausePod(&testutils.PausePodConfig{
  1403  					Name:      "low-pod4",
  1404  					Namespace: testCtx.NS.Name,
  1405  					Priority:  &lowPriority,
  1406  					Resources: defaultPodRes,
  1407  					NodeName:  "node-3",
  1408  					Labels:    map[string]string{"foo2": "bar"},
  1409  				}),
  1410  				initPausePod(&testutils.PausePodConfig{
  1411  					Name:      "low-pod5",
  1412  					Namespace: testCtx.NS.Name,
  1413  					Priority:  &lowPriority,
  1414  					Resources: defaultPodRes,
  1415  					NodeName:  "node-3",
  1416  					Labels:    map[string]string{"foo2": "bar"},
  1417  				}),
  1418  				initPausePod(&testutils.PausePodConfig{
  1419  					Name:      "low-pod6",
  1420  					Namespace: testCtx.NS.Name,
  1421  					Priority:  &lowPriority,
  1422  					Resources: defaultPodRes,
  1423  					NodeName:  "node-3",
  1424  					Labels:    map[string]string{"foo2": "bar"},
  1425  				}),
  1426  			},
  1427  			pod: initPausePod(&testutils.PausePodConfig{
  1428  				Name:      "preemptor-pod",
  1429  				Namespace: testCtx.NS.Name,
  1430  				Priority:  &highPriority,
  1431  				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
  1432  					v1.ResourceCPU:    *resource.NewMilliQuantity(500, resource.DecimalSI),
  1433  					v1.ResourceMemory: *resource.NewQuantity(400, resource.DecimalSI)},
  1434  				},
  1435  			}),
  1436  			// node-3 is chosen because evicting its victims violates no PDB and they have lower priority than the pods on node-2.
  1437  			preemptedPodIndexes: map[int]struct{}{4: {}, 5: {}, 6: {}},
  1438  		},
  1439  	}
  1440  
  1441  	for _, test := range tests {
  1442  		t.Run(test.name, func(t *testing.T) {
  1443  			for i := 1; i <= test.nodeCnt; i++ {
  1444  				nodeName := fmt.Sprintf("node-%v", i)
  1445  				_, err := createNode(cs, st.MakeNode().Name(nodeName).Capacity(defaultNodeRes).Obj())
  1446  				if err != nil {
  1447  					t.Fatalf("Error creating node %v: %v", nodeName, err)
  1448  				}
  1449  			}
  1450  
  1451  			pods := make([]*v1.Pod, len(test.existingPods))
  1452  			var err error
  1453  			// Create and run existingPods.
  1454  			for i, p := range test.existingPods {
  1455  				if pods[i], err = runPausePod(cs, p); err != nil {
  1456  					t.Fatalf("Test [%v]: Error running pause pod: %v", test.name, err)
  1457  				}
  1458  				// Add pod condition ready so that PDB is updated.
  1459  				addPodConditionReady(p)
  1460  				if _, err := testCtx.ClientSet.CoreV1().Pods(testCtx.NS.Name).UpdateStatus(context.TODO(), p, metav1.UpdateOptions{}); err != nil {
  1461  					t.Fatal(err)
  1462  				}
  1463  			}
  1464  			// Wait for Pods to be stable in scheduler cache.
  1465  			if err := waitCachedPodsStable(testCtx, test.existingPods); err != nil {
  1466  				t.Fatalf("Not all pods are stable in the cache: %v", err)
  1467  			}
  1468  
  1469  			// Create PDBs.
  1470  			for _, pdb := range test.pdbs {
  1471  				_, err := testCtx.ClientSet.PolicyV1().PodDisruptionBudgets(testCtx.NS.Name).Create(context.TODO(), pdb, metav1.CreateOptions{})
  1472  				if err != nil {
  1473  					t.Fatalf("Failed to create PDB: %v", err)
  1474  				}
  1475  			}
  1476  			// Wait for PDBs to become stable.
  1477  			if err := waitForPDBsStable(testCtx, test.pdbs, test.pdbPodNum); err != nil {
  1478  				t.Fatalf("Not all pdbs are stable in the cache: %v", err)
  1479  			}
  1480  
  1481  			// Create the "pod".
  1482  			preemptor, err := createPausePod(cs, test.pod)
  1483  			if err != nil {
  1484  				t.Errorf("Error while creating high priority pod: %v", err)
  1485  			}
  1486  			// Wait for preemption of pods and make sure the other ones are not preempted.
  1487  			for i, p := range pods {
  1488  				if _, found := test.preemptedPodIndexes[i]; found {
  1489  					if err = wait.PollUntilContextTimeout(testCtx.Ctx, time.Second, wait.ForeverTestTimeout, false,
  1490  						podIsGettingEvicted(cs, p.Namespace, p.Name)); err != nil {
  1491  						t.Errorf("Test [%v]: Pod %v/%v is not getting evicted.", test.name, p.Namespace, p.Name)
  1492  					}
  1493  				} else {
  1494  					if p.DeletionTimestamp != nil {
  1495  						t.Errorf("Test [%v]: Didn't expect pod %v/%v to get preempted.", test.name, p.Namespace, p.Name)
  1496  					}
  1497  				}
  1498  			}
  1499  			// Also check if .status.nominatedNodeName of the preemptor pod gets set.
  1500  			if len(test.preemptedPodIndexes) > 0 {
  1501  				if err := waitForNominatedNodeName(cs, preemptor); err != nil {
  1502  					t.Errorf("Test [%v]: .status.nominatedNodeName was not set for pod %v/%v: %v", test.name, preemptor.Namespace, preemptor.Name, err)
  1503  				}
  1504  			}
  1505  
  1506  			// Cleanup
  1507  			pods = append(pods, preemptor)
  1508  			testutils.CleanupPods(testCtx.Ctx, cs, t, pods)
  1509  			if err := cs.PolicyV1().PodDisruptionBudgets(testCtx.NS.Name).DeleteCollection(testCtx.Ctx, metav1.DeleteOptions{}, metav1.ListOptions{}); err != nil {
  1510  				t.Errorf("error while deleting PDBs, error: %v", err)
  1511  			}
  1512  			if err := cs.CoreV1().Nodes().DeleteCollection(testCtx.Ctx, metav1.DeleteOptions{}, metav1.ListOptions{}); err != nil {
  1513  				t.Errorf("error whiling deleting nodes, error: %v", err)
  1514  			}
  1515  		})
  1516  	}
  1517  }
  1518  
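        // initTestPreferNominatedNode starts a scheduler whose NextPod is wrapped so
        // that every dequeued pod appears to already have node-1 nominated, as if a
        // preemption had just completed.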
  1519  func initTestPreferNominatedNode(t *testing.T, nsPrefix string, opts ...scheduler.Option) *testutils.TestContext {
  1520  	testCtx := testutils.InitTestSchedulerWithOptions(t, testutils.InitTestAPIServer(t, nsPrefix, nil), 0, opts...)
  1521  	testutils.SyncSchedulerInformerFactory(testCtx)
  1522  	// Wrap the NextPod() method to make it appear that preemption has already been done and the nominated node has been set.
  1523  	f := testCtx.Scheduler.NextPod
  1524  	testCtx.Scheduler.NextPod = func(logger klog.Logger) (*framework.QueuedPodInfo, error) {
  1525  		podInfo, _ := f(klog.FromContext(testCtx.Ctx))
  1526  		// Scheduler.NextPod() may return a nil podInfo when the scheduler is shutting down.
  1527  		if podInfo != nil {
  1528  			podInfo.Pod.Status.NominatedNodeName = "node-1"
  1529  		}
  1530  		return podInfo, nil
  1531  	}
  1532  	go testCtx.Scheduler.Run(testCtx.Ctx)
  1533  	return testCtx
  1534  }
  1535  
  1536  // TestPreferNominatedNode tests that if the nominated node passes all the filters, the preemptor pod runs on the nominated node;
  1537  // otherwise, it is scheduled to another node in the cluster that passes all the filters.
  1538  func TestPreferNominatedNode(t *testing.T) {
  1539  	defaultNodeRes := map[v1.ResourceName]string{
  1540  		v1.ResourcePods:   "32",
  1541  		v1.ResourceCPU:    "500m",
  1542  		v1.ResourceMemory: "500",
  1543  	}
  1544  	defaultPodRes := &v1.ResourceRequirements{Requests: v1.ResourceList{
  1545  		v1.ResourceCPU:    *resource.NewMilliQuantity(100, resource.DecimalSI),
  1546  		v1.ResourceMemory: *resource.NewQuantity(100, resource.DecimalSI)},
  1547  	}
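        	// Each case pins existingPods to specific nodes, creates pod (whose nominated
        	// node is forced to node-1 by the wrapped NextPod above), and verifies that it
        	// ends up bound to runningNode.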
  1548  	tests := []struct {
  1549  		name         string
  1550  		nodeNames    []string
  1551  		existingPods []*v1.Pod
  1552  		pod          *v1.Pod
  1553  		runningNode  string
  1554  	}{
  1555  		{
  1556  			name:      "nominated node released all resource, preemptor is scheduled to the nominated node",
  1557  			nodeNames: []string{"node-1", "node-2"},
  1558  			existingPods: []*v1.Pod{
  1559  				initPausePod(&testutils.PausePodConfig{
  1560  					Name:      "low-pod1",
  1561  					Priority:  &lowPriority,
  1562  					NodeName:  "node-2",
  1563  					Resources: defaultPodRes,
  1564  				}),
  1565  			},
  1566  			pod: initPausePod(&testutils.PausePodConfig{
  1567  				Name:     "preemptor-pod",
  1568  				Priority: &highPriority,
  1569  				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
  1570  					v1.ResourceCPU:    *resource.NewMilliQuantity(500, resource.DecimalSI),
  1571  					v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
  1572  				},
  1573  			}),
  1574  			runningNode: "node-1",
  1575  		},
  1576  		{
  1577  			name:      "nominated node cannot pass all the filters, preemptor should find a different node",
  1578  			nodeNames: []string{"node-1", "node-2"},
  1579  			existingPods: []*v1.Pod{
  1580  				initPausePod(&testutils.PausePodConfig{
  1581  					Name:      "low-pod",
  1582  					Priority:  &lowPriority,
  1583  					Resources: defaultPodRes,
  1584  					NodeName:  "node-1",
  1585  				}),
  1586  			},
  1587  			pod: initPausePod(&testutils.PausePodConfig{
  1588  				Name:     "preemptor-pod1",
  1589  				Priority: &highPriority,
  1590  				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
  1591  					v1.ResourceCPU:    *resource.NewMilliQuantity(500, resource.DecimalSI),
  1592  					v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
  1593  				},
  1594  			}),
  1595  			runningNode: "node-2",
  1596  		},
  1597  	}
  1598  
  1599  	for _, test := range tests {
  1600  		t.Run(test.name, func(t *testing.T) {
  1601  			testCtx := initTestPreferNominatedNode(t, "prefer-nominated-node")
  1602  			cs := testCtx.ClientSet
  1603  			nsName := testCtx.NS.Name
  1604  			var err error
  1605  			var preemptor *v1.Pod
  1606  			for _, nodeName := range test.nodeNames {
  1607  				_, err := createNode(cs, st.MakeNode().Name(nodeName).Capacity(defaultNodeRes).Obj())
  1608  				if err != nil {
  1609  					t.Fatalf("Error creating node %v: %v", nodeName, err)
  1610  				}
  1611  			}
  1612  
  1613  			pods := make([]*v1.Pod, len(test.existingPods))
  1614  			// Create and run existingPods.
  1615  			for i, p := range test.existingPods {
  1616  				p.Namespace = nsName
  1617  				pods[i], err = runPausePod(cs, p)
  1618  				if err != nil {
  1619  					t.Fatalf("Error running pause pod: %v", err)
  1620  				}
  1621  			}
  1622  			test.pod.Namespace = nsName
  1623  			preemptor, err = createPausePod(cs, test.pod)
  1624  			if err != nil {
  1625  				t.Errorf("Error while creating high priority pod: %v", err)
  1626  			}
  1627  			err = wait.PollUntilContextTimeout(testCtx.Ctx, 100*time.Millisecond, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) {
  1628  				preemptor, err = cs.CoreV1().Pods(test.pod.Namespace).Get(ctx, test.pod.Name, metav1.GetOptions{})
  1629  				if err != nil {
  1630  					t.Errorf("Error getting the preemptor pod info: %v", err)
  1631  				}
  1632  				if len(preemptor.Spec.NodeName) == 0 {
  1633  					return false, err
  1634  				}
  1635  				return true, nil
  1636  			})
  1637  			if err != nil {
  1638  				t.Errorf("Cannot schedule Pod %v/%v, error: %v", test.pod.Namespace, test.pod.Name, err)
  1639  			}
  1640  			// Make sure the pod has been scheduled to the right node.
  1641  			if preemptor.Spec.NodeName != test.runningNode {
  1642  				t.Errorf("Expect pod running on %v, got %v.", test.runningNode, preemptor.Spec.NodeName)
  1643  			}
  1644  		})
  1645  	}
  1646  }
  1647  
  1648  // TestReadWriteOncePodPreemption tests preemption scenarios for pods with
  1649  // ReadWriteOncePod PVCs.
  1650  func TestReadWriteOncePodPreemption(t *testing.T) {
  1651  	cfg := configtesting.V1ToInternalWithDefaults(t, configv1.KubeSchedulerConfiguration{
  1652  		Profiles: []configv1.KubeSchedulerProfile{{
  1653  			SchedulerName: pointer.StringPtr(v1.DefaultSchedulerName),
  1654  			Plugins: &configv1.Plugins{
  1655  				Filter: configv1.PluginSet{
  1656  					Enabled: []configv1.Plugin{
  1657  						{Name: volumerestrictions.Name},
  1658  					},
  1659  				},
  1660  				PreFilter: configv1.PluginSet{
  1661  					Enabled: []configv1.Plugin{
  1662  						{Name: volumerestrictions.Name},
  1663  					},
  1664  				},
  1665  			},
  1666  		}},
  1667  	})
  1668  
  1669  	testCtx := testutils.InitTestSchedulerWithOptions(t,
  1670  		testutils.InitTestAPIServer(t, "preemption", nil),
  1671  		0,
  1672  		scheduler.WithProfiles(cfg.Profiles...))
  1673  	testutils.SyncSchedulerInformerFactory(testCtx)
  1674  	go testCtx.Scheduler.Run(testCtx.Ctx)
  1675  
  1676  	cs := testCtx.ClientSet
  1677  
  1678  	storage := v1.VolumeResourceRequirements{Requests: v1.ResourceList{v1.ResourceStorage: resource.MustParse("1Mi")}}
  1679  	volType := v1.HostPathDirectoryOrCreate
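        	// Two ReadWriteOncePod PV/PVC pairs. A claim with this access mode can be used
        	// by only one pod at a time, so a higher-priority pod that needs the same claim
        	// must preempt the pod currently using it.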
  1680  	pv1 := st.MakePersistentVolume().
  1681  		Name("pv-with-read-write-once-pod-1").
  1682  		AccessModes([]v1.PersistentVolumeAccessMode{v1.ReadWriteOncePod}).
  1683  		Capacity(storage.Requests).
  1684  		HostPathVolumeSource(&v1.HostPathVolumeSource{Path: "/mnt1", Type: &volType}).
  1685  		Obj()
  1686  	pvc1 := st.MakePersistentVolumeClaim().
  1687  		Name("pvc-with-read-write-once-pod-1").
  1688  		Namespace(testCtx.NS.Name).
  1689  		// Annotation and volume name required for PVC to be considered bound.
  1690  		Annotation(volume.AnnBindCompleted, "true").
  1691  		VolumeName(pv1.Name).
  1692  		AccessModes([]v1.PersistentVolumeAccessMode{v1.ReadWriteOncePod}).
  1693  		Resources(storage).
  1694  		Obj()
  1695  	pv2 := st.MakePersistentVolume().
  1696  		Name("pv-with-read-write-once-pod-2").
  1697  		AccessModes([]v1.PersistentVolumeAccessMode{v1.ReadWriteOncePod}).
  1698  		Capacity(storage.Requests).
  1699  		HostPathVolumeSource(&v1.HostPathVolumeSource{Path: "/mnt2", Type: &volType}).
  1700  		Obj()
  1701  	pvc2 := st.MakePersistentVolumeClaim().
  1702  		Name("pvc-with-read-write-once-pod-2").
  1703  		Namespace(testCtx.NS.Name).
  1704  		// Annotation and volume name required for PVC to be considered bound.
  1705  		Annotation(volume.AnnBindCompleted, "true").
  1706  		VolumeName(pv2.Name).
  1707  		AccessModes([]v1.PersistentVolumeAccessMode{v1.ReadWriteOncePod}).
  1708  		Resources(storage).
  1709  		Obj()
  1710  
  1711  	tests := []struct {
  1712  		name                string
  1713  		init                func() error
  1714  		existingPods        []*v1.Pod
  1715  		pod                 *v1.Pod
  1716  		unresolvable        bool
  1717  		preemptedPodIndexes map[int]struct{}
  1718  		cleanup             func() error
  1719  	}{
  1720  		{
  1721  			name: "preempt single pod",
  1722  			init: func() error {
  1723  				_, err := testutils.CreatePV(cs, pv1)
  1724  				if err != nil {
  1725  					return fmt.Errorf("cannot create pv: %v", err)
  1726  				}
  1727  				_, err = testutils.CreatePVC(cs, pvc1)
  1728  				if err != nil {
  1729  					return fmt.Errorf("cannot create pvc: %v", err)
  1730  				}
  1731  				return nil
  1732  			},
  1733  			existingPods: []*v1.Pod{
  1734  				initPausePod(&testutils.PausePodConfig{
  1735  					Name:      "victim-pod",
  1736  					Namespace: testCtx.NS.Name,
  1737  					Priority:  &lowPriority,
  1738  					Volumes: []v1.Volume{{
  1739  						Name: "volume",
  1740  						VolumeSource: v1.VolumeSource{
  1741  							PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{
  1742  								ClaimName: pvc1.Name,
  1743  							},
  1744  						},
  1745  					}},
  1746  				}),
  1747  			},
  1748  			pod: initPausePod(&testutils.PausePodConfig{
  1749  				Name:      "preemptor-pod",
  1750  				Namespace: testCtx.NS.Name,
  1751  				Priority:  &highPriority,
  1752  				Volumes: []v1.Volume{{
  1753  					Name: "volume",
  1754  					VolumeSource: v1.VolumeSource{
  1755  						PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{
  1756  							ClaimName: pvc1.Name,
  1757  						},
  1758  					},
  1759  				}},
  1760  			}),
  1761  			preemptedPodIndexes: map[int]struct{}{0: {}},
  1762  			cleanup: func() error {
  1763  				if err := testutils.DeletePVC(cs, pvc1.Name, pvc1.Namespace); err != nil {
  1764  					return fmt.Errorf("cannot delete pvc: %v", err)
  1765  				}
  1766  				if err := testutils.DeletePV(cs, pv1.Name); err != nil {
  1767  					return fmt.Errorf("cannot delete pv: %v", err)
  1768  				}
  1769  				return nil
  1770  			},
  1771  		},
  1772  		{
  1773  			name: "preempt two pods",
  1774  			init: func() error {
  1775  				for _, pv := range []*v1.PersistentVolume{pv1, pv2} {
  1776  					_, err := testutils.CreatePV(cs, pv)
  1777  					if err != nil {
  1778  						return fmt.Errorf("cannot create pv: %v", err)
  1779  					}
  1780  				}
  1781  				for _, pvc := range []*v1.PersistentVolumeClaim{pvc1, pvc2} {
  1782  					_, err := testutils.CreatePVC(cs, pvc)
  1783  					if err != nil {
  1784  						return fmt.Errorf("cannot create pvc: %v", err)
  1785  					}
  1786  				}
  1787  				return nil
  1788  			},
  1789  			existingPods: []*v1.Pod{
  1790  				initPausePod(&testutils.PausePodConfig{
  1791  					Name:      "victim-pod-1",
  1792  					Namespace: testCtx.NS.Name,
  1793  					Priority:  &lowPriority,
  1794  					Volumes: []v1.Volume{{
  1795  						Name: "volume",
  1796  						VolumeSource: v1.VolumeSource{
  1797  							PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{
  1798  								ClaimName: pvc1.Name,
  1799  							},
  1800  						},
  1801  					}},
  1802  				}),
  1803  				initPausePod(&testutils.PausePodConfig{
  1804  					Name:      "victim-pod-2",
  1805  					Namespace: testCtx.NS.Name,
  1806  					Priority:  &lowPriority,
  1807  					Volumes: []v1.Volume{{
  1808  						Name: "volume",
  1809  						VolumeSource: v1.VolumeSource{
  1810  							PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{
  1811  								ClaimName: pvc2.Name,
  1812  							},
  1813  						},
  1814  					}},
  1815  				}),
  1816  			},
  1817  			pod: initPausePod(&testutils.PausePodConfig{
  1818  				Name:      "preemptor-pod",
  1819  				Namespace: testCtx.NS.Name,
  1820  				Priority:  &highPriority,
  1821  				Volumes: []v1.Volume{
  1822  					{
  1823  						Name: "volume-1",
  1824  						VolumeSource: v1.VolumeSource{
  1825  							PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{
  1826  								ClaimName: pvc1.Name,
  1827  							},
  1828  						},
  1829  					},
  1830  					{
  1831  						Name: "volume-2",
  1832  						VolumeSource: v1.VolumeSource{
  1833  							PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{
  1834  								ClaimName: pvc2.Name,
  1835  							},
  1836  						},
  1837  					},
  1838  				},
  1839  			}),
  1840  			preemptedPodIndexes: map[int]struct{}{0: {}, 1: {}},
  1841  			cleanup: func() error {
  1842  				for _, pvc := range []*v1.PersistentVolumeClaim{pvc1, pvc2} {
  1843  					if err := testutils.DeletePVC(cs, pvc.Name, pvc.Namespace); err != nil {
  1844  						return fmt.Errorf("cannot delete pvc: %v", err)
  1845  					}
  1846  				}
  1847  				for _, pv := range []*v1.PersistentVolume{pv1, pv2} {
  1848  					if err := testutils.DeletePV(cs, pv.Name); err != nil {
  1849  						return fmt.Errorf("cannot delete pv: %v", err)
  1850  					}
  1851  				}
  1852  				return nil
  1853  			},
  1854  		},
  1855  		{
  1856  			name: "preempt single pod with two volumes",
  1857  			init: func() error {
  1858  				for _, pv := range []*v1.PersistentVolume{pv1, pv2} {
  1859  					_, err := testutils.CreatePV(cs, pv)
  1860  					if err != nil {
  1861  						return fmt.Errorf("cannot create pv: %v", err)
  1862  					}
  1863  				}
  1864  				for _, pvc := range []*v1.PersistentVolumeClaim{pvc1, pvc2} {
  1865  					_, err := testutils.CreatePVC(cs, pvc)
  1866  					if err != nil {
  1867  						return fmt.Errorf("cannot create pvc: %v", err)
  1868  					}
  1869  				}
  1870  				return nil
  1871  			},
  1872  			existingPods: []*v1.Pod{
  1873  				initPausePod(&testutils.PausePodConfig{
  1874  					Name:      "victim-pod",
  1875  					Namespace: testCtx.NS.Name,
  1876  					Priority:  &lowPriority,
  1877  					Volumes: []v1.Volume{
  1878  						{
  1879  							Name: "volume-1",
  1880  							VolumeSource: v1.VolumeSource{
  1881  								PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{
  1882  									ClaimName: pvc1.Name,
  1883  								},
  1884  							},
  1885  						},
  1886  						{
  1887  							Name: "volume-2",
  1888  							VolumeSource: v1.VolumeSource{
  1889  								PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{
  1890  									ClaimName: pvc2.Name,
  1891  								},
  1892  							},
  1893  						},
  1894  					},
  1895  				}),
  1896  			},
  1897  			pod: initPausePod(&testutils.PausePodConfig{
  1898  				Name:      "preemptor-pod",
  1899  				Namespace: testCtx.NS.Name,
  1900  				Priority:  &highPriority,
  1901  				Volumes: []v1.Volume{
  1902  					{
  1903  						Name: "volume-1",
  1904  						VolumeSource: v1.VolumeSource{
  1905  							PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{
  1906  								ClaimName: pvc1.Name,
  1907  							},
  1908  						},
  1909  					},
  1910  					{
  1911  						Name: "volume-2",
  1912  						VolumeSource: v1.VolumeSource{
  1913  							PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{
  1914  								ClaimName: pvc2.Name,
  1915  							},
  1916  						},
  1917  					},
  1918  				},
  1919  			}),
  1920  			preemptedPodIndexes: map[int]struct{}{0: {}},
  1921  			cleanup: func() error {
  1922  				for _, pvc := range []*v1.PersistentVolumeClaim{pvc1, pvc2} {
  1923  					if err := testutils.DeletePVC(cs, pvc.Name, pvc.Namespace); err != nil {
  1924  						return fmt.Errorf("cannot delete pvc: %v", err)
  1925  					}
  1926  				}
  1927  				for _, pv := range []*v1.PersistentVolume{pv1, pv2} {
  1928  					if err := testutils.DeletePV(cs, pv.Name); err != nil {
  1929  						return fmt.Errorf("cannot delete pv: %v", err)
  1930  					}
  1931  				}
  1932  				return nil
  1933  			},
  1934  		},
  1935  	}
  1936  
  1937  	// Create a node with some resources and a label.
  1938  	nodeRes := map[v1.ResourceName]string{
  1939  		v1.ResourcePods:   "32",
  1940  		v1.ResourceCPU:    "500m",
  1941  		v1.ResourceMemory: "500",
  1942  	}
  1943  	nodeObject := st.MakeNode().Name("node1").Capacity(nodeRes).Label("node", "node1").Obj()
  1944  	if _, err := createNode(cs, nodeObject); err != nil {
  1945  		t.Fatalf("Error creating node: %v", err)
  1946  	}
  1947  
  1948  	for _, test := range tests {
  1949  		t.Run(test.name, func(t *testing.T) {
  1950  			if err := test.init(); err != nil {
  1951  				t.Fatalf("Error while initializing test: %v", err)
  1952  			}
  1953  
  1954  			pods := make([]*v1.Pod, len(test.existingPods))
  1955  			t.Cleanup(func() {
  1956  				testutils.CleanupPods(testCtx.Ctx, cs, t, pods)
  1957  				if err := test.cleanup(); err != nil {
  1958  					t.Errorf("Error cleaning up test: %v", err)
  1959  				}
  1960  			})
  1961  			// Create and run existingPods.
  1962  			for i, p := range test.existingPods {
  1963  				var err error
  1964  				pods[i], err = runPausePod(cs, p)
  1965  				if err != nil {
  1966  					t.Fatalf("Error running pause pod: %v", err)
  1967  				}
  1968  			}
  1969  			// Create the "pod".
  1970  			preemptor, err := createPausePod(cs, test.pod)
  1971  			if err != nil {
  1972  				t.Errorf("Error while creating high priority pod: %v", err)
  1973  			}
  1974  			pods = append(pods, preemptor)
  1975  			// Wait for preemption of pods and make sure the other ones are not preempted.
  1976  			for i, p := range pods {
  1977  				if _, found := test.preemptedPodIndexes[i]; found {
  1978  					if err = wait.PollUntilContextTimeout(testCtx.Ctx, time.Second, wait.ForeverTestTimeout, false,
  1979  						podIsGettingEvicted(cs, p.Namespace, p.Name)); err != nil {
  1980  						t.Errorf("Pod %v/%v is not getting evicted.", p.Namespace, p.Name)
  1981  					}
  1982  				} else {
  1983  					if p.DeletionTimestamp != nil {
  1984  						t.Errorf("Didn't expect pod %v to get preempted.", p.Name)
  1985  					}
  1986  				}
  1987  			}
  1988  			// Also check that the preemptor pod gets the NominatedNodeName field set.
  1989  			if len(test.preemptedPodIndexes) > 0 {
  1990  				if err := waitForNominatedNodeName(cs, preemptor); err != nil {
  1991  					t.Errorf("NominatedNodeName field was not set for pod %v: %v", preemptor.Name, err)
  1992  				}
  1993  			}
  1994  		})
  1995  	}
  1996  }