k8s.io/kubernetes@v1.29.3/test/integration/scheduler/preemption/preemption_test.go

/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// This file tests preemption functionality of the scheduler.

package preemption

import (
	"context"
	"fmt"
	"strings"
	"testing"
	"time"

	v1 "k8s.io/api/core/v1"
	policy "k8s.io/api/policy/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/intstr"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/apiserver/pkg/util/feature"
	"k8s.io/client-go/informers"
	clientset "k8s.io/client-go/kubernetes"
	restclient "k8s.io/client-go/rest"
	featuregatetesting "k8s.io/component-base/featuregate/testing"
	"k8s.io/component-helpers/storage/volume"
	"k8s.io/klog/v2"
	configv1 "k8s.io/kube-scheduler/config/v1"
	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
	"k8s.io/kubernetes/pkg/apis/scheduling"
	"k8s.io/kubernetes/pkg/features"
	"k8s.io/kubernetes/pkg/scheduler"
	configtesting "k8s.io/kubernetes/pkg/scheduler/apis/config/testing"
	"k8s.io/kubernetes/pkg/scheduler/framework"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumerestrictions"
	frameworkruntime "k8s.io/kubernetes/pkg/scheduler/framework/runtime"
	st "k8s.io/kubernetes/pkg/scheduler/testing"
	"k8s.io/kubernetes/plugin/pkg/admission/priority"
	testutils "k8s.io/kubernetes/test/integration/util"
	"k8s.io/utils/pointer"
)

// imported from testutils
var (
	initPausePod                    = testutils.InitPausePod
	createNode                      = testutils.CreateNode
	createPausePod                  = testutils.CreatePausePod
	runPausePod                     = testutils.RunPausePod
	deletePod                       = testutils.DeletePod
	initTest                        = testutils.InitTestSchedulerWithNS
	initTestDisablePreemption       = testutils.InitTestDisablePreemption
	initDisruptionController        = testutils.InitDisruptionController
	waitCachedPodsStable            = testutils.WaitCachedPodsStable
	podIsGettingEvicted             = testutils.PodIsGettingEvicted
	podUnschedulable                = testutils.PodUnschedulable
	waitForPDBsStable               = testutils.WaitForPDBsStable
	waitForPodToScheduleWithTimeout = testutils.WaitForPodToScheduleWithTimeout
	waitForPodUnschedulable         = testutils.WaitForPodUnschedulable
)

const filterPluginName = "filter-plugin"

var lowPriority, mediumPriority, highPriority = int32(100), int32(200), int32(300)

func waitForNominatedNodeNameWithTimeout(cs clientset.Interface, pod *v1.Pod, timeout time.Duration) error {
	if err := wait.PollUntilContextTimeout(context.TODO(), 100*time.Millisecond, timeout, false, func(ctx context.Context) (bool, error) {
		pod, err := cs.CoreV1().Pods(pod.Namespace).Get(ctx, pod.Name, metav1.GetOptions{})
		if err != nil {
			return false, err
		}
		if len(pod.Status.NominatedNodeName) > 0 {
			return true, nil
		}
		return false, err
	}); err != nil {
		return fmt.Errorf(".status.nominatedNodeName of Pod %v/%v did not get set: %v", pod.Namespace, pod.Name, err)
	}
	return nil
}

func waitForNominatedNodeName(cs clientset.Interface, pod *v1.Pod) error {
	return waitForNominatedNodeNameWithTimeout(cs, pod, wait.ForeverTestTimeout)
}

const tokenFilterName = "token-filter"

// tokenFilter is a fake plugin that implements PreFilter and Filter.
// `Tokens` simulates the number of pods the cluster can host.
// If `EnablePreFilter` is set to false or `Tokens` is positive, PreFilter passes; otherwise it returns Unschedulable.
// Each passing Filter() call consumes one token. While `Tokens` is positive, Filter passes; otherwise it returns
// Unschedulable, or UnschedulableAndUnresolvable when `Unresolvable` is set to true.
// AddPod()/RemovePod() consumes/returns one token, simulating the dry-run addition/removal of pods during preemption.
type tokenFilter struct {
	Tokens          int
	Unresolvable    bool
	EnablePreFilter bool
}

// Name returns name of the plugin.
func (fp *tokenFilter) Name() string {
	return tokenFilterName
}

func (fp *tokenFilter) Filter(ctx context.Context, state *framework.CycleState, pod *v1.Pod,
	nodeInfo *framework.NodeInfo) *framework.Status {
	if fp.Tokens > 0 {
		fp.Tokens--
		return nil
	}
	status := framework.Unschedulable
	if fp.Unresolvable {
		status = framework.UnschedulableAndUnresolvable
	}
	return framework.NewStatus(status, fmt.Sprintf("can't fit %v", pod.Name))
}

func (fp *tokenFilter) PreFilter(ctx context.Context, state *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
	if !fp.EnablePreFilter || fp.Tokens > 0 {
		return nil, nil
	}
	return nil, framework.NewStatus(framework.Unschedulable)
}

func (fp *tokenFilter) AddPod(ctx context.Context, state *framework.CycleState, podToSchedule *v1.Pod,
	podInfoToAdd *framework.PodInfo, nodeInfo *framework.NodeInfo) *framework.Status {
	fp.Tokens--
	return nil
}

func (fp *tokenFilter) RemovePod(ctx context.Context, state *framework.CycleState, podToSchedule *v1.Pod,
	podInfoToRemove *framework.PodInfo, nodeInfo *framework.NodeInfo) *framework.Status {
	fp.Tokens++
	return nil
}

func (fp *tokenFilter) PreFilterExtensions() framework.PreFilterExtensions {
	return fp
}

var _ framework.FilterPlugin = &tokenFilter{}

// TestPreemption tests a few preemption scenarios.
func TestPreemption(t *testing.T) {
	// Initialize scheduler with a filter plugin.
	var filter tokenFilter
	registry := make(frameworkruntime.Registry)
	err := registry.Register(filterPluginName, func(_ context.Context, _ runtime.Object, fh framework.Handle) (framework.Plugin, error) {
		return &filter, nil
	})
	if err != nil {
		t.Fatalf("Error registering a filter: %v", err)
	}
	cfg := configtesting.V1ToInternalWithDefaults(t, configv1.KubeSchedulerConfiguration{
		Profiles: []configv1.KubeSchedulerProfile{{
			SchedulerName: pointer.String(v1.DefaultSchedulerName),
			Plugins: &configv1.Plugins{
				Filter: configv1.PluginSet{
					Enabled: []configv1.Plugin{
						{Name: filterPluginName},
					},
				},
				PreFilter: configv1.PluginSet{
					Enabled: []configv1.Plugin{
						{Name: filterPluginName},
					},
				},
			},
		}},
	})

	testCtx := testutils.InitTestSchedulerWithOptions(t,
		testutils.InitTestAPIServer(t, "preemption", nil),
		0,
		scheduler.WithProfiles(cfg.Profiles...),
		scheduler.WithFrameworkOutOfTreeRegistry(registry))
	testutils.SyncSchedulerInformerFactory(testCtx)
	go testCtx.Scheduler.Run(testCtx.Ctx)

	cs := testCtx.ClientSet

	defaultPodRes := &v1.ResourceRequirements{Requests: v1.ResourceList{
		v1.ResourceCPU:    *resource.NewMilliQuantity(100, resource.DecimalSI),
		v1.ResourceMemory: *resource.NewQuantity(100, resource.DecimalSI)},
	}

	maxTokens := 1000
	tests := []struct {
		name                          string
		existingPods                  []*v1.Pod
		pod                           *v1.Pod
		initTokens                    int
		enablePreFilter               bool
		unresolvable                  bool
		preemptedPodIndexes           map[int]struct{}
		enablePodDisruptionConditions bool
	}{
		{
			name:       "basic pod preemption with PodDisruptionConditions enabled",
			initTokens: maxTokens,
			existingPods: []*v1.Pod{
				initPausePod(&testutils.PausePodConfig{
					Name:      "victim-pod",
					Namespace: testCtx.NS.Name,
					Priority:  &lowPriority,
					Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
						v1.ResourceCPU:    *resource.NewMilliQuantity(400, resource.DecimalSI),
						v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
					},
				}),
			},
			pod: initPausePod(&testutils.PausePodConfig{
				Name:      "preemptor-pod",
				Namespace: testCtx.NS.Name,
				Priority:  &highPriority,
				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
					v1.ResourceCPU:    *resource.NewMilliQuantity(300, resource.DecimalSI),
					v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
				},
			}),
			preemptedPodIndexes:           map[int]struct{}{0: {}},
			enablePodDisruptionConditions: true,
		},
		{
			name:       "basic pod preemption",
			initTokens: maxTokens,
			existingPods: []*v1.Pod{
				initPausePod(&testutils.PausePodConfig{
					Name:      "victim-pod",
					Namespace: testCtx.NS.Name,
					Priority:  &lowPriority,
					Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
						v1.ResourceCPU:    *resource.NewMilliQuantity(400, resource.DecimalSI),
						v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
					},
				}),
			},
			pod: initPausePod(&testutils.PausePodConfig{
				Name:      "preemptor-pod",
				Namespace: testCtx.NS.Name,
				Priority:  &highPriority,
				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
					v1.ResourceCPU:    *resource.NewMilliQuantity(300, resource.DecimalSI),
					v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
				},
			}),
			preemptedPodIndexes: map[int]struct{}{0: {}},
		},
		{
			name:       "basic pod preemption with filter",
			initTokens: 1,
			existingPods: []*v1.Pod{
				initPausePod(&testutils.PausePodConfig{
					Name:      "victim-pod",
					Namespace: testCtx.NS.Name,
					Priority:  &lowPriority,
					Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
						v1.ResourceCPU:    *resource.NewMilliQuantity(200, resource.DecimalSI),
						v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
					},
				}),
			},
			pod: initPausePod(&testutils.PausePodConfig{
				Name:      "preemptor-pod",
				Namespace: testCtx.NS.Name,
				Priority:  &highPriority,
				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
					v1.ResourceCPU:    *resource.NewMilliQuantity(200, resource.DecimalSI),
					v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
				},
			}),
			preemptedPodIndexes: map[int]struct{}{0: {}},
		},
		// This is identical to the previous subtest except that enablePreFilter is set to true.
		// With the fake plugin returning Unschedulable in PreFilter, it exercises the path where
		// in-tree plugins return Skip in PreFilter and their AddPod/RemovePod functions are
		// skipped properly upon preemption.
		{
			name:            "basic pod preemption with preFilter",
			initTokens:      1,
			enablePreFilter: true,
			existingPods: []*v1.Pod{
				initPausePod(&testutils.PausePodConfig{
					Name:      "victim-pod",
					Namespace: testCtx.NS.Name,
					Priority:  &lowPriority,
					Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
						v1.ResourceCPU:    *resource.NewMilliQuantity(200, resource.DecimalSI),
						v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
					},
				}),
			},
			pod: initPausePod(&testutils.PausePodConfig{
				Name:      "preemptor-pod",
				Namespace: testCtx.NS.Name,
				Priority:  &highPriority,
				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
					v1.ResourceCPU:    *resource.NewMilliQuantity(200, resource.DecimalSI),
					v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
				},
			}),
			preemptedPodIndexes: map[int]struct{}{0: {}},
		},
		{
			// Same as the previous test, but the filter is unresolvable.
			name:         "basic pod preemption with unresolvable filter",
			initTokens:   1,
			unresolvable: true,
			existingPods: []*v1.Pod{
				initPausePod(&testutils.PausePodConfig{
					Name:      "victim-pod",
					Namespace: testCtx.NS.Name,
					Priority:  &lowPriority,
					Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
						v1.ResourceCPU:    *resource.NewMilliQuantity(200, resource.DecimalSI),
						v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
					},
				}),
			},
			pod: initPausePod(&testutils.PausePodConfig{
				Name:      "preemptor-pod",
				Namespace: testCtx.NS.Name,
				Priority:  &highPriority,
				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
					v1.ResourceCPU:    *resource.NewMilliQuantity(200, resource.DecimalSI),
					v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
				},
			}),
			preemptedPodIndexes: map[int]struct{}{},
		},
		{
			name:       "preemption is performed to satisfy anti-affinity",
			initTokens: maxTokens,
			existingPods: []*v1.Pod{
				initPausePod(&testutils.PausePodConfig{
					Name: "pod-0", Namespace: testCtx.NS.Name,
					Priority:  &mediumPriority,
					Labels:    map[string]string{"pod": "p0"},
					Resources: defaultPodRes,
				}),
				initPausePod(&testutils.PausePodConfig{
					Name: "pod-1", Namespace: testCtx.NS.Name,
					Priority:  &lowPriority,
					Labels:    map[string]string{"pod": "p1"},
					Resources: defaultPodRes,
					Affinity: &v1.Affinity{
						PodAntiAffinity: &v1.PodAntiAffinity{
							RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{
								{
									LabelSelector: &metav1.LabelSelector{
										MatchExpressions: []metav1.LabelSelectorRequirement{
											{
												Key:      "pod",
												Operator: metav1.LabelSelectorOpIn,
												Values:   []string{"preemptor"},
											},
										},
									},
									TopologyKey: "node",
								},
							},
						},
					},
				}),
			},
			// A higher priority pod with anti-affinity.
			pod: initPausePod(&testutils.PausePodConfig{
				Name:      "preemptor-pod",
				Namespace: testCtx.NS.Name,
				Priority:  &highPriority,
				Labels:    map[string]string{"pod": "preemptor"},
				Resources: defaultPodRes,
				Affinity: &v1.Affinity{
					PodAntiAffinity: &v1.PodAntiAffinity{
						RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{
							{
								LabelSelector: &metav1.LabelSelector{
									MatchExpressions: []metav1.LabelSelectorRequirement{
										{
											Key:      "pod",
											Operator: metav1.LabelSelectorOpIn,
											Values:   []string{"p0"},
										},
									},
								},
								TopologyKey: "node",
							},
						},
					},
				},
			}),
			preemptedPodIndexes: map[int]struct{}{0: {}, 1: {}},
		},
		{
			// This is similar to the previous case, except that pod-1 is high priority.
			name:       "preemption is not performed when anti-affinity is not satisfied",
			initTokens: maxTokens,
			existingPods: []*v1.Pod{
				initPausePod(&testutils.PausePodConfig{
					Name: "pod-0", Namespace: testCtx.NS.Name,
					Priority:  &mediumPriority,
					Labels:    map[string]string{"pod": "p0"},
					Resources: defaultPodRes,
				}),
				initPausePod(&testutils.PausePodConfig{
					Name: "pod-1", Namespace: testCtx.NS.Name,
					Priority:  &highPriority,
					Labels:    map[string]string{"pod": "p1"},
					Resources: defaultPodRes,
					Affinity: &v1.Affinity{
						PodAntiAffinity: &v1.PodAntiAffinity{
							RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{
								{
									LabelSelector: &metav1.LabelSelector{
										MatchExpressions: []metav1.LabelSelectorRequirement{
											{
												Key:      "pod",
												Operator: metav1.LabelSelectorOpIn,
												Values:   []string{"preemptor"},
											},
										},
									},
									TopologyKey: "node",
								},
							},
						},
					},
				}),
			},
			// A higher priority pod with anti-affinity.
			pod: initPausePod(&testutils.PausePodConfig{
				Name:      "preemptor-pod",
				Namespace: testCtx.NS.Name,
				Priority:  &highPriority,
				Labels:    map[string]string{"pod": "preemptor"},
				Resources: defaultPodRes,
				Affinity: &v1.Affinity{
					PodAntiAffinity: &v1.PodAntiAffinity{
						RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{
							{
								LabelSelector: &metav1.LabelSelector{
									MatchExpressions: []metav1.LabelSelectorRequirement{
										{
											Key:      "pod",
											Operator: metav1.LabelSelectorOpIn,
											Values:   []string{"p0"},
										},
									},
								},
								TopologyKey: "node",
							},
						},
					},
				},
			}),
			preemptedPodIndexes: map[int]struct{}{},
		},
	}

	// Create a node with some resources and a label.
	nodeRes := map[v1.ResourceName]string{
		v1.ResourcePods:   "32",
		v1.ResourceCPU:    "500m",
		v1.ResourceMemory: "500",
	}
	nodeObject := st.MakeNode().Name("node1").Capacity(nodeRes).Label("node", "node1").Obj()
	if _, err := createNode(testCtx.ClientSet, nodeObject); err != nil {
		t.Fatalf("Error creating node: %v", err)
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.PodDisruptionConditions, test.enablePodDisruptionConditions)()
			filter.Tokens = test.initTokens
			filter.EnablePreFilter = test.enablePreFilter
			filter.Unresolvable = test.unresolvable
			pods := make([]*v1.Pod, len(test.existingPods))
			// Create and run existingPods.
			for i, p := range test.existingPods {
				pods[i], err = runPausePod(cs, p)
				if err != nil {
					t.Fatalf("Error running pause pod: %v", err)
				}
			}
			// Create the "pod".
			preemptor, err := createPausePod(cs, test.pod)
			if err != nil {
				t.Errorf("Error while creating high priority pod: %v", err)
			}
			// Wait for preemption of pods and make sure the other ones are not preempted.
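			// When the PodDisruptionConditions feature gate is enabled, deleting a victim during
			// preemption is expected to stamp a DisruptionTarget condition on that pod, which the
			// loop below asserts; with the gate disabled, the condition must be absent.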
			for i, p := range pods {
				if _, found := test.preemptedPodIndexes[i]; found {
					if err = wait.PollUntilContextTimeout(testCtx.Ctx, time.Second, wait.ForeverTestTimeout, false,
						podIsGettingEvicted(cs, p.Namespace, p.Name)); err != nil {
						t.Errorf("Pod %v/%v is not getting evicted.", p.Namespace, p.Name)
					}
					pod, err := cs.CoreV1().Pods(p.Namespace).Get(testCtx.Ctx, p.Name, metav1.GetOptions{})
					if err != nil {
						t.Errorf("Error %v when getting the updated status for pod %v/%v ", err, p.Namespace, p.Name)
					}
					_, cond := podutil.GetPodCondition(&pod.Status, v1.DisruptionTarget)
					if test.enablePodDisruptionConditions && cond == nil {
						t.Errorf("Pod %q does not have the expected condition: %q", klog.KObj(pod), v1.DisruptionTarget)
					} else if !test.enablePodDisruptionConditions && cond != nil {
						t.Errorf("Pod %q has an unexpected condition: %q", klog.KObj(pod), v1.DisruptionTarget)
					}
				} else {
					if p.DeletionTimestamp != nil {
						t.Errorf("Didn't expect pod %v to get preempted.", p.Name)
					}
				}
			}
			// Also check that the preemptor pod gets the NominatedNodeName field set.
			if len(test.preemptedPodIndexes) > 0 {
				if err := waitForNominatedNodeName(cs, preemptor); err != nil {
					t.Errorf("NominatedNodeName field was not set for pod %v: %v", preemptor.Name, err)
				}
			}

			// Cleanup
			pods = append(pods, preemptor)
			testutils.CleanupPods(testCtx.Ctx, cs, t, pods)
		})
	}
}

// TestNonPreemption tests that the PreemptNever preemption policy of PriorityClass works as expected.
func TestNonPreemption(t *testing.T) {
	var preemptNever = v1.PreemptNever
	// Initialize scheduler.
	testCtx := initTest(t, "non-preemption")
	cs := testCtx.ClientSet
	tests := []struct {
		name             string
		PreemptionPolicy *v1.PreemptionPolicy
	}{
		{
			name:             "pod preemption will happen",
			PreemptionPolicy: nil,
		},
		{
			name:             "pod preemption will not happen",
			PreemptionPolicy: &preemptNever,
		},
	}
	victim := initPausePod(&testutils.PausePodConfig{
		Name:      "victim-pod",
		Namespace: testCtx.NS.Name,
		Priority:  &lowPriority,
		Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
			v1.ResourceCPU:    *resource.NewMilliQuantity(400, resource.DecimalSI),
			v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
		},
	})

	preemptor := initPausePod(&testutils.PausePodConfig{
		Name:      "preemptor-pod",
		Namespace: testCtx.NS.Name,
		Priority:  &highPriority,
		Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
			v1.ResourceCPU:    *resource.NewMilliQuantity(300, resource.DecimalSI),
			v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
		},
	})

	// Create a node with some resources
	nodeRes := map[v1.ResourceName]string{
		v1.ResourcePods:   "32",
		v1.ResourceCPU:    "500m",
		v1.ResourceMemory: "500",
	}
	_, err := createNode(testCtx.ClientSet, st.MakeNode().Name("node1").Capacity(nodeRes).Obj())
	if err != nil {
		t.Fatalf("Error creating nodes: %v", err)
	}
	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			defer testutils.CleanupPods(testCtx.Ctx, cs, t, []*v1.Pod{preemptor, victim})
			preemptor.Spec.PreemptionPolicy = test.PreemptionPolicy
			victimPod, err := createPausePod(cs, victim)
			if err != nil {
				t.Fatalf("Error while creating victim: %v", err)
			}
			if err := waitForPodToScheduleWithTimeout(cs, victimPod, 5*time.Second); err != nil {
				t.Fatalf("victim %v should become scheduled", victimPod.Name)
			}

			preemptorPod, err := createPausePod(cs, preemptor)
			if err != nil {
				t.Fatalf("Error while creating preemptor: %v", err)
			}

			err = waitForNominatedNodeNameWithTimeout(cs, preemptorPod, 5*time.Second)
			// test.PreemptionPolicy == nil means we expect the preemptor to be nominated.
			expect := test.PreemptionPolicy == nil
			// err == nil indicates the preemptor is indeed nominated.
			got := err == nil
			if got != expect {
				t.Errorf("Expect preemptor to be nominated=%v, but got=%v", expect, got)
			}
		})
	}
}

// TestDisablePreemption tests that the scheduler's disable-preemption option works as expected.
func TestDisablePreemption(t *testing.T) {
	// Initialize scheduler, and disable preemption.
	testCtx := initTestDisablePreemption(t, "disable-preemption")
	cs := testCtx.ClientSet

	tests := []struct {
		name         string
		existingPods []*v1.Pod
		pod          *v1.Pod
	}{
		{
			name: "pod preemption will not happen",
			existingPods: []*v1.Pod{
				initPausePod(&testutils.PausePodConfig{
					Name:      "victim-pod",
					Namespace: testCtx.NS.Name,
					Priority:  &lowPriority,
					Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
						v1.ResourceCPU:    *resource.NewMilliQuantity(400, resource.DecimalSI),
						v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
					},
				}),
			},
			pod: initPausePod(&testutils.PausePodConfig{
				Name:      "preemptor-pod",
				Namespace: testCtx.NS.Name,
				Priority:  &highPriority,
				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
					v1.ResourceCPU:    *resource.NewMilliQuantity(300, resource.DecimalSI),
					v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
				},
			}),
		},
	}

	// Create a node with some resources
	nodeRes := map[v1.ResourceName]string{
		v1.ResourcePods:   "32",
		v1.ResourceCPU:    "500m",
		v1.ResourceMemory: "500",
	}
	_, err := createNode(testCtx.ClientSet, st.MakeNode().Name("node1").Capacity(nodeRes).Obj())
	if err != nil {
		t.Fatalf("Error creating nodes: %v", err)
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			pods := make([]*v1.Pod, len(test.existingPods))
			// Create and run existingPods.
			for i, p := range test.existingPods {
				pods[i], err = runPausePod(cs, p)
				if err != nil {
					t.Fatalf("Test [%v]: Error running pause pod: %v", test.name, err)
				}
			}
			// Create the "pod".
			preemptor, err := createPausePod(cs, test.pod)
			if err != nil {
				t.Errorf("Error while creating high priority pod: %v", err)
			}
			// Ensure the preemptor stays unschedulable.
			if err := waitForPodUnschedulable(cs, preemptor); err != nil {
				t.Errorf("Preemptor %v should not become scheduled", preemptor.Name)
			}

			// Ensure the preemptor is not nominated.
			if err := waitForNominatedNodeNameWithTimeout(cs, preemptor, 5*time.Second); err == nil {
				t.Errorf("Preemptor %v should not be nominated", preemptor.Name)
			}

			// Cleanup
			pods = append(pods, preemptor)
			testutils.CleanupPods(testCtx.Ctx, cs, t, pods)
		})
	}
}

// This test verifies that system critical priorities are created automatically and resolved properly.
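// The priority admission plugin resolves the built-in `system-node-critical` and
// `system-cluster-critical` PriorityClass names to their well-known priority values
// (scheduling.SystemCriticalPriority+1000 and scheduling.SystemCriticalPriority respectively,
// per the expectations below), and rejects pods that reference a PriorityClass that does not exist.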
func TestPodPriorityResolution(t *testing.T) {
	admission := priority.NewPlugin()
	testCtx := testutils.InitTestScheduler(t, testutils.InitTestAPIServer(t, "preemption", admission))
	cs := testCtx.ClientSet

	// Build clientset and informers for controllers.
	externalClientConfig := restclient.CopyConfig(testCtx.KubeConfig)
	externalClientConfig.QPS = -1
	externalClientset := clientset.NewForConfigOrDie(externalClientConfig)
	externalInformers := informers.NewSharedInformerFactory(externalClientset, time.Second)
	admission.SetExternalKubeClientSet(externalClientset)
	admission.SetExternalKubeInformerFactory(externalInformers)

	// Waiting for all controllers to sync
	testutils.SyncSchedulerInformerFactory(testCtx)
	externalInformers.Start(testCtx.Ctx.Done())
	externalInformers.WaitForCacheSync(testCtx.Ctx.Done())

	// Run all controllers
	go testCtx.Scheduler.Run(testCtx.Ctx)

	tests := []struct {
		Name             string
		PriorityClass    string
		Pod              *v1.Pod
		ExpectedPriority int32
		ExpectedError    error
	}{
		{
			Name:             "SystemNodeCritical priority class",
			PriorityClass:    scheduling.SystemNodeCritical,
			ExpectedPriority: scheduling.SystemCriticalPriority + 1000,
			Pod: initPausePod(&testutils.PausePodConfig{
				Name:              fmt.Sprintf("pod1-%v", scheduling.SystemNodeCritical),
				Namespace:         metav1.NamespaceSystem,
				PriorityClassName: scheduling.SystemNodeCritical,
			}),
		},
		{
			Name:             "SystemClusterCritical priority class",
			PriorityClass:    scheduling.SystemClusterCritical,
			ExpectedPriority: scheduling.SystemCriticalPriority,
			Pod: initPausePod(&testutils.PausePodConfig{
				Name:              fmt.Sprintf("pod2-%v", scheduling.SystemClusterCritical),
				Namespace:         metav1.NamespaceSystem,
				PriorityClassName: scheduling.SystemClusterCritical,
			}),
		},
		{
			Name:             "Invalid priority class should result in error",
			PriorityClass:    "foo",
			ExpectedPriority: scheduling.SystemCriticalPriority,
			Pod: initPausePod(&testutils.PausePodConfig{
				Name:              fmt.Sprintf("pod3-%v", scheduling.SystemClusterCritical),
				Namespace:         metav1.NamespaceSystem,
				PriorityClassName: "foo",
			}),
			ExpectedError: fmt.Errorf("failed to create pause pod: pods \"pod3-system-cluster-critical\" is forbidden: no PriorityClass with name foo was found"),
		},
	}

	// Create a node with some resources
	nodeRes := map[v1.ResourceName]string{
		v1.ResourcePods:   "32",
		v1.ResourceCPU:    "500m",
		v1.ResourceMemory: "500",
	}
	_, err := createNode(testCtx.ClientSet, st.MakeNode().Name("node1").Capacity(nodeRes).Obj())
	if err != nil {
		t.Fatalf("Error creating nodes: %v", err)
	}

	pods := make([]*v1.Pod, 0, len(tests))
	for _, test := range tests {
		t.Run(test.Name, func(t *testing.T) {
			t.Run(test.Name, func(t *testing.T) {
				pod, err := runPausePod(cs, test.Pod)
				if err != nil {
					if test.ExpectedError == nil {
						t.Fatalf("Test [PodPriority/%v]: Error running pause pod: %v", test.PriorityClass, err)
					}
					if err.Error() != test.ExpectedError.Error() {
						t.Fatalf("Test [PodPriority/%v]: Expected error %v but got error %v", test.PriorityClass, test.ExpectedError, err)
					}
					return
				}
				pods = append(pods, pod)
				if pod.Spec.Priority != nil {
					if *pod.Spec.Priority != test.ExpectedPriority {
						t.Errorf("Expected pod %v to have priority %v but was %v", pod.Name, test.ExpectedPriority, *pod.Spec.Priority)
					}
				} else {
					t.Errorf("Expected pod %v to have priority %v but was nil", pod.Name, test.PriorityClass)
				}
			})
		})
	}
	testutils.CleanupPods(testCtx.Ctx, cs, t, pods)
	testutils.CleanupNodes(cs, t)
}

func mkPriorityPodWithGrace(tc *testutils.TestContext, name string, priority int32, grace int64) *v1.Pod {
	defaultPodRes := &v1.ResourceRequirements{Requests: v1.ResourceList{
		v1.ResourceCPU:    *resource.NewMilliQuantity(100, resource.DecimalSI),
		v1.ResourceMemory: *resource.NewQuantity(100, resource.DecimalSI)},
	}
	pod := initPausePod(&testutils.PausePodConfig{
		Name:      name,
		Namespace: tc.NS.Name,
		Priority:  &priority,
		Labels:    map[string]string{"pod": name},
		Resources: defaultPodRes,
	})
	pod.Spec.TerminationGracePeriodSeconds = &grace
	return pod
}

// This test ensures that while the preempting pod is waiting for the victims to
// terminate, other pending lower priority pods are not scheduled in the room created
// after preemption and while the higher priority pod is not scheduled yet.
func TestPreemptionStarvation(t *testing.T) {
	// Initialize scheduler.
	testCtx := initTest(t, "preemption")
	cs := testCtx.ClientSet

	tests := []struct {
		name               string
		numExistingPod     int
		numExpectedPending int
		preemptor          *v1.Pod
	}{
		{
			// This test ensures that while the preempting pod is waiting for the victims
			// to terminate, other lower priority pods are not scheduled in the room created
			// after preemption and while the higher priority pod is not scheduled yet.
			name:               "starvation test: higher priority pod is scheduled before the lower priority ones",
			numExistingPod:     10,
			numExpectedPending: 5,
			preemptor: initPausePod(&testutils.PausePodConfig{
				Name:      "preemptor-pod",
				Namespace: testCtx.NS.Name,
				Priority:  &highPriority,
				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
					v1.ResourceCPU:    *resource.NewMilliQuantity(300, resource.DecimalSI),
					v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
				},
			}),
		},
	}

	// Create a node with some resources
	nodeRes := map[v1.ResourceName]string{
		v1.ResourcePods:   "32",
		v1.ResourceCPU:    "500m",
		v1.ResourceMemory: "500",
	}
	_, err := createNode(testCtx.ClientSet, st.MakeNode().Name("node1").Capacity(nodeRes).Obj())
	if err != nil {
		t.Fatalf("Error creating nodes: %v", err)
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			pendingPods := make([]*v1.Pod, test.numExpectedPending)
			numRunningPods := test.numExistingPod - test.numExpectedPending
			runningPods := make([]*v1.Pod, numRunningPods)
			// Create and run existingPods.
			for i := 0; i < numRunningPods; i++ {
				runningPods[i], err = createPausePod(cs, mkPriorityPodWithGrace(testCtx, fmt.Sprintf("rpod-%v", i), mediumPriority, 0))
				if err != nil {
					t.Fatalf("Error creating pause pod: %v", err)
				}
			}
			// make sure that runningPods are all scheduled.
			for _, p := range runningPods {
				if err := testutils.WaitForPodToSchedule(cs, p); err != nil {
					t.Fatalf("Pod %v/%v didn't get scheduled: %v", p.Namespace, p.Name, err)
				}
			}
			// Create pending pods.
			for i := 0; i < test.numExpectedPending; i++ {
				pendingPods[i], err = createPausePod(cs, mkPriorityPodWithGrace(testCtx, fmt.Sprintf("ppod-%v", i), mediumPriority, 0))
				if err != nil {
					t.Fatalf("Error creating pending pod: %v", err)
				}
			}
			// Make sure that all pending pods are being marked unschedulable.
			for _, p := range pendingPods {
				if err := wait.PollUntilContextTimeout(testCtx.Ctx, 100*time.Millisecond, wait.ForeverTestTimeout, false,
					podUnschedulable(cs, p.Namespace, p.Name)); err != nil {
					t.Errorf("Pod %v/%v didn't get marked unschedulable: %v", p.Namespace, p.Name, err)
				}
			}
			// Create the preemptor.
			preemptor, err := createPausePod(cs, test.preemptor)
			if err != nil {
				t.Errorf("Error while creating the preempting pod: %v", err)
			}
			// Check if .status.nominatedNodeName of the preemptor pod gets set.
			if err := waitForNominatedNodeName(cs, preemptor); err != nil {
				t.Errorf(".status.nominatedNodeName was not set for pod %v/%v: %v", preemptor.Namespace, preemptor.Name, err)
			}
			// Make sure that preemptor is scheduled after preemptions.
			if err := testutils.WaitForPodToScheduleWithTimeout(cs, preemptor, 60*time.Second); err != nil {
				t.Errorf("Preemptor pod %v didn't get scheduled: %v", preemptor.Name, err)
			}
			// Cleanup
			klog.Info("Cleaning up all pods...")
			allPods := pendingPods
			allPods = append(allPods, runningPods...)
			allPods = append(allPods, preemptor)
			testutils.CleanupPods(testCtx.Ctx, cs, t, allPods)
		})
	}
}

// TestPreemptionRaces tests that other scheduling events and operations do not
// race with the preemption process.
func TestPreemptionRaces(t *testing.T) {
	// Initialize scheduler.
	testCtx := initTest(t, "preemption-race")
	cs := testCtx.ClientSet

	tests := []struct {
		name              string
		numInitialPods    int // Pods created and executed before running preemptor
		numAdditionalPods int // Pods created after creating the preemptor
		numRepetitions    int // Repeat the tests to check races
		preemptor         *v1.Pod
	}{
		{
			// This test ensures that while the preempting pod is waiting for the victims
			// to terminate, other lower priority pods are not scheduled in the room created
			// after preemption and while the higher priority pod is not scheduled yet.
			name:              "ensures that other pods are not scheduled while preemptor is being marked as nominated (issue #72124)",
			numInitialPods:    2,
			numAdditionalPods: 20,
			numRepetitions:    5,
			preemptor: initPausePod(&testutils.PausePodConfig{
				Name:      "preemptor-pod",
				Namespace: testCtx.NS.Name,
				Priority:  &highPriority,
				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
					v1.ResourceCPU:    *resource.NewMilliQuantity(4900, resource.DecimalSI),
					v1.ResourceMemory: *resource.NewQuantity(4900, resource.DecimalSI)},
				},
			}),
		},
	}

	// Create a node with some resources
	nodeRes := map[v1.ResourceName]string{
		v1.ResourcePods:   "100",
		v1.ResourceCPU:    "5000m",
		v1.ResourceMemory: "5000",
	}
	_, err := createNode(testCtx.ClientSet, st.MakeNode().Name("node1").Capacity(nodeRes).Obj())
	if err != nil {
		t.Fatalf("Error creating nodes: %v", err)
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			if test.numRepetitions <= 0 {
				test.numRepetitions = 1
			}
			for n := 0; n < test.numRepetitions; n++ {
				initialPods := make([]*v1.Pod, test.numInitialPods)
				additionalPods := make([]*v1.Pod, test.numAdditionalPods)
				// Create and run existingPods.
				for i := 0; i < test.numInitialPods; i++ {
					initialPods[i], err = createPausePod(cs, mkPriorityPodWithGrace(testCtx, fmt.Sprintf("rpod-%v", i), mediumPriority, 0))
					if err != nil {
						t.Fatalf("Error creating pause pod: %v", err)
					}
				}
				// make sure that initial Pods are all scheduled.
				for _, p := range initialPods {
					if err := testutils.WaitForPodToSchedule(cs, p); err != nil {
						t.Fatalf("Pod %v/%v didn't get scheduled: %v", p.Namespace, p.Name, err)
					}
				}
				// Create the preemptor.
				klog.Info("Creating the preemptor pod...")
				preemptor, err := createPausePod(cs, test.preemptor)
				if err != nil {
					t.Errorf("Error while creating the preempting pod: %v", err)
				}

				klog.Info("Creating additional pods...")
				for i := 0; i < test.numAdditionalPods; i++ {
					additionalPods[i], err = createPausePod(cs, mkPriorityPodWithGrace(testCtx, fmt.Sprintf("ppod-%v", i), mediumPriority, 0))
					if err != nil {
						t.Fatalf("Error creating pending pod: %v", err)
					}
				}
				// Check that the preemptor pod gets nominated node name.
				if err := waitForNominatedNodeName(cs, preemptor); err != nil {
					t.Errorf(".status.nominatedNodeName was not set for pod %v/%v: %v", preemptor.Namespace, preemptor.Name, err)
				}
				// Make sure that preemptor is scheduled after preemptions.
				if err := testutils.WaitForPodToScheduleWithTimeout(cs, preemptor, 60*time.Second); err != nil {
					t.Errorf("Preemptor pod %v didn't get scheduled: %v", preemptor.Name, err)
				}

				klog.Info("Check that unschedulable pods still exist and were never scheduled...")
				for _, p := range additionalPods {
					pod, err := cs.CoreV1().Pods(p.Namespace).Get(testCtx.Ctx, p.Name, metav1.GetOptions{})
					if err != nil {
						t.Errorf("Error in getting Pod %v/%v info: %v", p.Namespace, p.Name, err)
					}
					if len(pod.Spec.NodeName) > 0 {
						t.Errorf("Pod %v/%v is already scheduled", p.Namespace, p.Name)
					}
					_, cond := podutil.GetPodCondition(&pod.Status, v1.PodScheduled)
					if cond != nil && cond.Status != v1.ConditionFalse {
						t.Errorf("Pod %v/%v is no longer unschedulable: %v", p.Namespace, p.Name, err)
					}
				}
				// Cleanup
				klog.Info("Cleaning up all pods...")
				allPods := additionalPods
				allPods = append(allPods, initialPods...)
				allPods = append(allPods, preemptor)
				testutils.CleanupPods(testCtx.Ctx, cs, t, allPods)
			}
		})
	}
}

const (
	alwaysFailPlugin = "alwaysFailPlugin"
	doNotFailMe      = "do-not-fail-me"
)

// alwaysFail is a fake plugin that implements the PreBind extension point.
// It always fails with an Unschedulable status unless the pod name contains the `doNotFailMe` string.
type alwaysFail struct{}

func (af *alwaysFail) Name() string {
	return alwaysFailPlugin
}

func (af *alwaysFail) PreBind(_ context.Context, _ *framework.CycleState, p *v1.Pod, _ string) *framework.Status {
	if strings.Contains(p.Name, doNotFailMe) {
		return nil
	}
	return framework.NewStatus(framework.Unschedulable)
}

func newAlwaysFail(_ context.Context, _ runtime.Object, _ framework.Handle) (framework.Plugin, error) {
	return &alwaysFail{}, nil
}

// TestNominatedNodeCleanUp verifies that a pod's nominatedNodeName is set and unset
// properly in different scenarios.
func TestNominatedNodeCleanUp(t *testing.T) {
	tests := []struct {
		name         string
		nodeCapacity map[v1.ResourceName]string
		// A slice of pods to be created in batch.
		podsToCreate [][]*v1.Pod
		// Each postCheck function is run after each batch of pods' creation.
		postChecks []func(cs clientset.Interface, pod *v1.Pod) error
		// Delete the fake node or not. Optional.
		deleteNode bool
		// Pods to be deleted. Optional.
		podNamesToDelete []string

		// Register dummy plugin to simulate particular scheduling failures. Optional.
		customPlugins     *configv1.Plugins
		outOfTreeRegistry frameworkruntime.Registry
	}{
		{
			name:         "mid-priority pod preempts low-priority pod, followed by a high-priority pod with another preemption",
			nodeCapacity: map[v1.ResourceName]string{v1.ResourceCPU: "5"},
			podsToCreate: [][]*v1.Pod{
				{
					st.MakePod().Name("low-1").Priority(lowPriority).Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Obj(),
					st.MakePod().Name("low-2").Priority(lowPriority).Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Obj(),
					st.MakePod().Name("low-3").Priority(lowPriority).Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Obj(),
					st.MakePod().Name("low-4").Priority(lowPriority).Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Obj(),
				},
				{
					st.MakePod().Name("medium").Priority(mediumPriority).Req(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).Obj(),
				},
				{
					st.MakePod().Name("high").Priority(highPriority).Req(map[v1.ResourceName]string{v1.ResourceCPU: "3"}).Obj(),
				},
			},
			postChecks: []func(cs clientset.Interface, pod *v1.Pod) error{
				testutils.WaitForPodToSchedule,
				waitForNominatedNodeName,
				waitForNominatedNodeName,
			},
		},
		{
			name:         "mid-priority pod preempts low-priority pod, followed by a high-priority pod without additional preemption",
			nodeCapacity: map[v1.ResourceName]string{v1.ResourceCPU: "2"},
			podsToCreate: [][]*v1.Pod{
				{
					st.MakePod().Name("low").Priority(lowPriority).Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Obj(),
				},
				{
					st.MakePod().Name("medium").Priority(mediumPriority).Req(map[v1.ResourceName]string{v1.ResourceCPU: "2"}).Obj(),
				},
				{
					st.MakePod().Name("high").Priority(highPriority).Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Obj(),
				},
			},
			postChecks: []func(cs clientset.Interface, pod *v1.Pod) error{
				testutils.WaitForPodToSchedule,
				waitForNominatedNodeName,
				testutils.WaitForPodToSchedule,
			},
			podNamesToDelete: []string{"low"},
		},
		{
			name:         "mid-priority pod preempts low-priority pod, followed by a node deletion",
			nodeCapacity: map[v1.ResourceName]string{v1.ResourceCPU: "1"},
			podsToCreate: [][]*v1.Pod{
				{
					st.MakePod().Name("low").Priority(lowPriority).Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Obj(),
				},
				{
					st.MakePod().Name("medium").Priority(mediumPriority).Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Obj(),
				},
			},
			postChecks: []func(cs clientset.Interface, pod *v1.Pod) error{
				testutils.WaitForPodToSchedule,
				waitForNominatedNodeName,
			},
			// Delete the node to simulate an ErrNoNodesAvailable error.
			deleteNode:       true,
			podNamesToDelete: []string{"low"},
		},
		{
			name:         "mid-priority pod preempts low-priority pod, but failed the scheduling unexpectedly",
			nodeCapacity: map[v1.ResourceName]string{v1.ResourceCPU: "1"},
			podsToCreate: [][]*v1.Pod{
				{
					st.MakePod().Name(fmt.Sprintf("low-%v", doNotFailMe)).Priority(lowPriority).Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Obj(),
				},
				{
					st.MakePod().Name("medium").Priority(mediumPriority).Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Obj(),
				},
			},
			postChecks: []func(cs clientset.Interface, pod *v1.Pod) error{
				testutils.WaitForPodToSchedule,
				waitForNominatedNodeName,
			},
			podNamesToDelete: []string{fmt.Sprintf("low-%v", doNotFailMe)},
			customPlugins: &configv1.Plugins{
				PreBind: configv1.PluginSet{
					Enabled: []configv1.Plugin{
						{Name: alwaysFailPlugin},
					},
				},
			},
			outOfTreeRegistry: frameworkruntime.Registry{alwaysFailPlugin: newAlwaysFail},
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			cfg := configtesting.V1ToInternalWithDefaults(t, configv1.KubeSchedulerConfiguration{
				Profiles: []configv1.KubeSchedulerProfile{{
					SchedulerName: pointer.String(v1.DefaultSchedulerName),
					Plugins:       tt.customPlugins,
				}},
			})
			testCtx := initTest(
				t,
				"preemption",
				scheduler.WithProfiles(cfg.Profiles...),
				scheduler.WithFrameworkOutOfTreeRegistry(tt.outOfTreeRegistry),
			)

			cs, ns := testCtx.ClientSet, testCtx.NS.Name
			// Create a node with the specified capacity.
			nodeName := "fake-node"
			if _, err := createNode(cs, st.MakeNode().Name(nodeName).Capacity(tt.nodeCapacity).Obj()); err != nil {
				t.Fatalf("Error creating node %v: %v", nodeName, err)
			}

			// Create pods and run post check if necessary.
			for i, pods := range tt.podsToCreate {
				for _, p := range pods {
					p.Namespace = ns
					if _, err := createPausePod(cs, p); err != nil {
						t.Fatalf("Error creating pod %v: %v", p.Name, err)
					}
				}
				// If necessary, run the post check function.
				if len(tt.postChecks) > i && tt.postChecks[i] != nil {
					for _, p := range pods {
						if err := tt.postChecks[i](cs, p); err != nil {
							t.Fatalf("Pod %v didn't pass the postChecks[%v]: %v", p.Name, i, err)
						}
					}
				}
			}

			// Delete the node if necessary.
			if tt.deleteNode {
				if err := cs.CoreV1().Nodes().Delete(context.TODO(), nodeName, *metav1.NewDeleteOptions(0)); err != nil {
					t.Fatalf("Node %v cannot be deleted: %v", nodeName, err)
				}
			}

			// Force deleting the terminating pods if necessary.
			// This is required when we want the terminating Pods to actually be removed.
			for _, podName := range tt.podNamesToDelete {
				if err := deletePod(cs, podName, ns); err != nil {
					t.Fatalf("Pod %v cannot be deleted: %v", podName, err)
				}
			}

			// Verify that .status.nominatedNodeName is cleared.
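			// Once the victim is force-deleted (and, in one case, the node itself is removed),
			// the nomination on the "medium" pod no longer holds, so the scheduler is expected
			// to clear its .status.nominatedNodeName; poll until that happens.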
			if err := wait.PollUntilContextTimeout(testCtx.Ctx, 100*time.Millisecond, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) {
				pod, err := cs.CoreV1().Pods(ns).Get(ctx, "medium", metav1.GetOptions{})
				if err != nil {
					t.Errorf("Error getting the medium pod: %v", err)
				}
				if len(pod.Status.NominatedNodeName) == 0 {
					return true, nil
				}
				return false, err
			}); err != nil {
				t.Errorf(".status.nominatedNodeName of the medium pod was not cleared: %v", err)
			}
		})
	}
}

func mkMinAvailablePDB(name, namespace string, uid types.UID, minAvailable int, matchLabels map[string]string) *policy.PodDisruptionBudget {
	intMinAvailable := intstr.FromInt32(int32(minAvailable))
	return &policy.PodDisruptionBudget{
		ObjectMeta: metav1.ObjectMeta{
			Name:      name,
			Namespace: namespace,
		},
		Spec: policy.PodDisruptionBudgetSpec{
			MinAvailable: &intMinAvailable,
			Selector:     &metav1.LabelSelector{MatchLabels: matchLabels},
		},
	}
}

func addPodConditionReady(pod *v1.Pod) {
	pod.Status = v1.PodStatus{
		Phase: v1.PodRunning,
		Conditions: []v1.PodCondition{
			{
				Type:   v1.PodReady,
				Status: v1.ConditionTrue,
			},
		},
	}
}

// TestPDBInPreemption tests PodDisruptionBudget support in preemption.
func TestPDBInPreemption(t *testing.T) {
	// Initialize scheduler.
	testCtx := initTest(t, "preemption-pdb")
	cs := testCtx.ClientSet

	initDisruptionController(t, testCtx)

	defaultPodRes := &v1.ResourceRequirements{Requests: v1.ResourceList{
		v1.ResourceCPU:    *resource.NewMilliQuantity(100, resource.DecimalSI),
		v1.ResourceMemory: *resource.NewQuantity(100, resource.DecimalSI)},
	}
	defaultNodeRes := map[v1.ResourceName]string{
		v1.ResourcePods:   "32",
		v1.ResourceCPU:    "500m",
		v1.ResourceMemory: "500",
	}

	tests := []struct {
		name                string
		nodeCnt             int
		pdbs                []*policy.PodDisruptionBudget
		pdbPodNum           []int32
		existingPods        []*v1.Pod
		pod                 *v1.Pod
		preemptedPodIndexes map[int]struct{}
	}{
		{
			name:    "A non-PDB violating pod is preempted despite its higher priority",
			nodeCnt: 1,
			pdbs: []*policy.PodDisruptionBudget{
				mkMinAvailablePDB("pdb-1", testCtx.NS.Name, types.UID("pdb-1-uid"), 2, map[string]string{"foo": "bar"}),
			},
			pdbPodNum: []int32{2},
			existingPods: []*v1.Pod{
				initPausePod(&testutils.PausePodConfig{
					Name:      "low-pod1",
					Namespace: testCtx.NS.Name,
					Priority:  &lowPriority,
					Resources: defaultPodRes,
					Labels:    map[string]string{"foo": "bar"},
				}),
				initPausePod(&testutils.PausePodConfig{
					Name:      "low-pod2",
					Namespace: testCtx.NS.Name,
					Priority:  &lowPriority,
					Resources: defaultPodRes,
					Labels:    map[string]string{"foo": "bar"},
				}),
				initPausePod(&testutils.PausePodConfig{
					Name:      "mid-pod3",
					Namespace: testCtx.NS.Name,
					Priority:  &mediumPriority,
					Resources: defaultPodRes,
				}),
			},
			pod: initPausePod(&testutils.PausePodConfig{
				Name:      "preemptor-pod",
				Namespace: testCtx.NS.Name,
				Priority:  &highPriority,
				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
					v1.ResourceCPU:    *resource.NewMilliQuantity(300, resource.DecimalSI),
					v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
				},
			}),
			preemptedPodIndexes: map[int]struct{}{2: {}},
		},
		{
			name:    "A node without any PDB violating pods is preferred for preemption",
			nodeCnt: 2,
			pdbs: []*policy.PodDisruptionBudget{
				mkMinAvailablePDB("pdb-1", testCtx.NS.Name, types.UID("pdb-1-uid"), 2, map[string]string{"foo": "bar"}),
			},
			pdbPodNum: []int32{1},
			existingPods: []*v1.Pod{
				initPausePod(&testutils.PausePodConfig{
					Name:      "low-pod1",
					Namespace: testCtx.NS.Name,
					Priority:  &lowPriority,
					Resources: defaultPodRes,
					NodeName:  "node-1",
					Labels:    map[string]string{"foo": "bar"},
				}),
				initPausePod(&testutils.PausePodConfig{
					Name:      "mid-pod2",
					Namespace: testCtx.NS.Name,
					Priority:  &mediumPriority,
					NodeName:  "node-2",
					Resources: defaultPodRes,
				}),
			},
			pod: initPausePod(&testutils.PausePodConfig{
				Name:      "preemptor-pod",
				Namespace: testCtx.NS.Name,
				Priority:  &highPriority,
				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
					v1.ResourceCPU:    *resource.NewMilliQuantity(500, resource.DecimalSI),
					v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
				},
			}),
			preemptedPodIndexes: map[int]struct{}{1: {}},
		},
		{
			name:    "A node with fewer PDB violating pods is preferred for preemption",
			nodeCnt: 3,
			pdbs: []*policy.PodDisruptionBudget{
				mkMinAvailablePDB("pdb-1", testCtx.NS.Name, types.UID("pdb-1-uid"), 2, map[string]string{"foo1": "bar"}),
				mkMinAvailablePDB("pdb-2", testCtx.NS.Name, types.UID("pdb-2-uid"), 2, map[string]string{"foo2": "bar"}),
			},
			pdbPodNum: []int32{1, 5},
			existingPods: []*v1.Pod{
				initPausePod(&testutils.PausePodConfig{
					Name:      "low-pod1",
					Namespace: testCtx.NS.Name,
					Priority:  &lowPriority,
					Resources: defaultPodRes,
					NodeName:  "node-1",
					Labels:    map[string]string{"foo1": "bar"},
				}),
				initPausePod(&testutils.PausePodConfig{
					Name:      "mid-pod1",
					Namespace: testCtx.NS.Name,
					Priority:  &mediumPriority,
					Resources: defaultPodRes,
					NodeName:  "node-1",
				}),
				initPausePod(&testutils.PausePodConfig{
					Name:      "low-pod2",
					Namespace: testCtx.NS.Name,
					Priority:  &lowPriority,
					Resources: defaultPodRes,
					NodeName:  "node-2",
					Labels:    map[string]string{"foo2": "bar"},
				}),
				initPausePod(&testutils.PausePodConfig{
					Name:      "mid-pod2",
					Namespace: testCtx.NS.Name,
					Priority:  &mediumPriority,
					Resources: defaultPodRes,
					NodeName:  "node-2",
					Labels:    map[string]string{"foo2": "bar"},
				}),
				initPausePod(&testutils.PausePodConfig{
					Name:      "low-pod4",
					Namespace: testCtx.NS.Name,
					Priority:  &lowPriority,
					Resources: defaultPodRes,
					NodeName:  "node-3",
					Labels:    map[string]string{"foo2": "bar"},
				}),
				initPausePod(&testutils.PausePodConfig{
					Name:      "low-pod5",
					Namespace: testCtx.NS.Name,
					Priority:  &lowPriority,
					Resources: defaultPodRes,
					NodeName:  "node-3",
					Labels:    map[string]string{"foo2": "bar"},
				}),
				initPausePod(&testutils.PausePodConfig{
					Name:      "low-pod6",
					Namespace: testCtx.NS.Name,
					Priority:  &lowPriority,
					Resources: defaultPodRes,
					NodeName:  "node-3",
					Labels:    map[string]string{"foo2": "bar"},
				}),
			},
			pod: initPausePod(&testutils.PausePodConfig{
				Name:      "preemptor-pod",
				Namespace: testCtx.NS.Name,
				Priority:  &highPriority,
				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
					v1.ResourceCPU:    *resource.NewMilliQuantity(500, resource.DecimalSI),
					v1.ResourceMemory: *resource.NewQuantity(400, resource.DecimalSI)},
				},
			}),
			// node-3 is chosen because its PDB is not violated and its victims have lower priority than those on node-2.
			preemptedPodIndexes: map[int]struct{}{4: {}, 5: {}, 6: {}},
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			for i := 1; i <= test.nodeCnt; i++ {
				nodeName := fmt.Sprintf("node-%v", i)
				_, err := createNode(cs, st.MakeNode().Name(nodeName).Capacity(defaultNodeRes).Obj())
				if err != nil {
					t.Fatalf("Error creating node %v: %v", nodeName, err)
				}
			}

			pods := make([]*v1.Pod, len(test.existingPods))
			var err error
			// Create and run existingPods.
			for i, p := range test.existingPods {
				if pods[i], err = runPausePod(cs, p); err != nil {
					t.Fatalf("Test [%v]: Error running pause pod: %v", test.name, err)
				}
				// Add pod condition ready so that PDB is updated.
				addPodConditionReady(p)
				if _, err := testCtx.ClientSet.CoreV1().Pods(testCtx.NS.Name).UpdateStatus(context.TODO(), p, metav1.UpdateOptions{}); err != nil {
					t.Fatal(err)
				}
			}
			// Wait for Pods to be stable in scheduler cache.
			if err := waitCachedPodsStable(testCtx, test.existingPods); err != nil {
				t.Fatalf("Not all pods are stable in the cache: %v", err)
			}

			// Create PDBs.
			for _, pdb := range test.pdbs {
				_, err := testCtx.ClientSet.PolicyV1().PodDisruptionBudgets(testCtx.NS.Name).Create(context.TODO(), pdb, metav1.CreateOptions{})
				if err != nil {
					t.Fatalf("Failed to create PDB: %v", err)
				}
			}
			// Wait for PDBs to become stable.
			if err := waitForPDBsStable(testCtx, test.pdbs, test.pdbPodNum); err != nil {
				t.Fatalf("Not all pdbs are stable in the cache: %v", err)
			}

			// Create the "pod".
			preemptor, err := createPausePod(cs, test.pod)
			if err != nil {
				t.Errorf("Error while creating high priority pod: %v", err)
			}
			// Wait for preemption of pods and make sure the other ones are not preempted.
			for i, p := range pods {
				if _, found := test.preemptedPodIndexes[i]; found {
					if err = wait.PollUntilContextTimeout(testCtx.Ctx, time.Second, wait.ForeverTestTimeout, false,
						podIsGettingEvicted(cs, p.Namespace, p.Name)); err != nil {
						t.Errorf("Test [%v]: Pod %v/%v is not getting evicted.", test.name, p.Namespace, p.Name)
					}
				} else {
					if p.DeletionTimestamp != nil {
						t.Errorf("Test [%v]: Didn't expect pod %v/%v to get preempted.", test.name, p.Namespace, p.Name)
					}
				}
			}
			// Also check if .status.nominatedNodeName of the preemptor pod gets set.
			if len(test.preemptedPodIndexes) > 0 {
				if err := waitForNominatedNodeName(cs, preemptor); err != nil {
					t.Errorf("Test [%v]: .status.nominatedNodeName was not set for pod %v/%v: %v", test.name, preemptor.Namespace, preemptor.Name, err)
				}
			}

			// Cleanup
			pods = append(pods, preemptor)
			testutils.CleanupPods(testCtx.Ctx, cs, t, pods)
			if err := cs.PolicyV1().PodDisruptionBudgets(testCtx.NS.Name).DeleteCollection(testCtx.Ctx, metav1.DeleteOptions{}, metav1.ListOptions{}); err != nil {
				t.Errorf("error while deleting PDBs: %v", err)
			}
			if err := cs.CoreV1().Nodes().DeleteCollection(testCtx.Ctx, metav1.DeleteOptions{}, metav1.ListOptions{}); err != nil {
				t.Errorf("error while deleting nodes: %v", err)
			}
		})
	}
}

func initTestPreferNominatedNode(t *testing.T, nsPrefix string, opts ...scheduler.Option) *testutils.TestContext {
	testCtx := testutils.InitTestSchedulerWithOptions(t, testutils.InitTestAPIServer(t, nsPrefix, nil), 0, opts...)
	testutils.SyncSchedulerInformerFactory(testCtx)
	// Wrap the NextPod() method to make it appear that preemption has already been done and the nominated node has been set.
	f := testCtx.Scheduler.NextPod
	testCtx.Scheduler.NextPod = func(logger klog.Logger) (*framework.QueuedPodInfo, error) {
		podInfo, _ := f(klog.FromContext(testCtx.Ctx))
		// NextPod() may return a nil podInfo when the scheduler is shutting down.
		if podInfo != nil {
			podInfo.Pod.Status.NominatedNodeName = "node-1"
		}
		return podInfo, nil
	}
	go testCtx.Scheduler.Run(testCtx.Ctx)
	return testCtx
}

// TestPreferNominatedNode tests that if the nominated node passes all the filters, the preemptor pod runs on
// the nominated node; otherwise, it is scheduled to another node in the cluster that passes all the filters.
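// Because initTestPreferNominatedNode above pre-sets .status.nominatedNodeName to "node-1" on
// every pod handed to the scheduler, these cases exercise only the "prefer nominated node" path:
// no real preemption has to happen for the preemptor to be placed.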
func TestPreferNominatedNode(t *testing.T) {
	defaultNodeRes := map[v1.ResourceName]string{
		v1.ResourcePods:   "32",
		v1.ResourceCPU:    "500m",
		v1.ResourceMemory: "500",
	}
	defaultPodRes := &v1.ResourceRequirements{Requests: v1.ResourceList{
		v1.ResourceCPU:    *resource.NewMilliQuantity(100, resource.DecimalSI),
		v1.ResourceMemory: *resource.NewQuantity(100, resource.DecimalSI)},
	}
	tests := []struct {
		name         string
		nodeNames    []string
		existingPods []*v1.Pod
		pod          *v1.Pod
		runningNode  string
	}{
		{
			name:      "nominated node released all resource, preemptor is scheduled to the nominated node",
			nodeNames: []string{"node-1", "node-2"},
			existingPods: []*v1.Pod{
				initPausePod(&testutils.PausePodConfig{
					Name:      "low-pod1",
					Priority:  &lowPriority,
					NodeName:  "node-2",
					Resources: defaultPodRes,
				}),
			},
			pod: initPausePod(&testutils.PausePodConfig{
				Name:     "preemptor-pod",
				Priority: &highPriority,
				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
					v1.ResourceCPU:    *resource.NewMilliQuantity(500, resource.DecimalSI),
					v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
				},
			}),
			runningNode: "node-1",
		},
		{
			name:      "nominated node cannot pass all the filters, preemptor should find a different node",
			nodeNames: []string{"node-1", "node-2"},
			existingPods: []*v1.Pod{
				initPausePod(&testutils.PausePodConfig{
					Name:      "low-pod",
					Priority:  &lowPriority,
					Resources: defaultPodRes,
					NodeName:  "node-1",
				}),
			},
			pod: initPausePod(&testutils.PausePodConfig{
				Name:     "preemptor-pod1",
				Priority: &highPriority,
				Resources: &v1.ResourceRequirements{Requests: v1.ResourceList{
					v1.ResourceCPU:    *resource.NewMilliQuantity(500, resource.DecimalSI),
					v1.ResourceMemory: *resource.NewQuantity(200, resource.DecimalSI)},
				},
			}),
			runningNode: "node-2",
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			testCtx := initTestPreferNominatedNode(t, "prefer-nominated-node")
			cs := testCtx.ClientSet
			nsName := testCtx.NS.Name
			var err error
			var preemptor *v1.Pod
			for _, nodeName := range test.nodeNames {
				_, err := createNode(cs, st.MakeNode().Name(nodeName).Capacity(defaultNodeRes).Obj())
				if err != nil {
					t.Fatalf("Error creating node %v: %v", nodeName, err)
				}
			}

			pods := make([]*v1.Pod, len(test.existingPods))
			// Create and run existingPods.
			for i, p := range test.existingPods {
				p.Namespace = nsName
				pods[i], err = runPausePod(cs, p)
				if err != nil {
					t.Fatalf("Error running pause pod: %v", err)
				}
			}
			test.pod.Namespace = nsName
			preemptor, err = createPausePod(cs, test.pod)
			if err != nil {
				t.Errorf("Error while creating high priority pod: %v", err)
			}
			err = wait.PollUntilContextTimeout(testCtx.Ctx, 100*time.Millisecond, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) {
				preemptor, err = cs.CoreV1().Pods(test.pod.Namespace).Get(ctx, test.pod.Name, metav1.GetOptions{})
				if err != nil {
					t.Errorf("Error getting the preemptor pod info: %v", err)
				}
				if len(preemptor.Spec.NodeName) == 0 {
					return false, err
				}
				return true, nil
			})
			if err != nil {
				t.Errorf("Cannot schedule Pod %v/%v, error: %v", test.pod.Namespace, test.pod.Name, err)
			}
			// Make sure the pod has been scheduled to the right node.
			if preemptor.Spec.NodeName != test.runningNode {
				t.Errorf("Expect pod running on %v, got %v.", test.runningNode, preemptor.Spec.NodeName)
			}
		})
	}
}

// TestReadWriteOncePodPreemption tests preemption scenarios for pods with
// ReadWriteOncePod PVCs.
func TestReadWriteOncePodPreemption(t *testing.T) {
	cfg := configtesting.V1ToInternalWithDefaults(t, configv1.KubeSchedulerConfiguration{
		Profiles: []configv1.KubeSchedulerProfile{{
			SchedulerName: pointer.String(v1.DefaultSchedulerName),
			Plugins: &configv1.Plugins{
				Filter: configv1.PluginSet{
					Enabled: []configv1.Plugin{
						{Name: volumerestrictions.Name},
					},
				},
				PreFilter: configv1.PluginSet{
					Enabled: []configv1.Plugin{
						{Name: volumerestrictions.Name},
					},
				},
			},
		}},
	})

	testCtx := testutils.InitTestSchedulerWithOptions(t,
		testutils.InitTestAPIServer(t, "preemption", nil),
		0,
		scheduler.WithProfiles(cfg.Profiles...))
	testutils.SyncSchedulerInformerFactory(testCtx)
	go testCtx.Scheduler.Run(testCtx.Ctx)

	cs := testCtx.ClientSet

	storage := v1.VolumeResourceRequirements{Requests: v1.ResourceList{v1.ResourceStorage: resource.MustParse("1Mi")}}
	volType := v1.HostPathDirectoryOrCreate
	pv1 := st.MakePersistentVolume().
		Name("pv-with-read-write-once-pod-1").
		AccessModes([]v1.PersistentVolumeAccessMode{v1.ReadWriteOncePod}).
		Capacity(storage.Requests).
		HostPathVolumeSource(&v1.HostPathVolumeSource{Path: "/mnt1", Type: &volType}).
		Obj()
	pvc1 := st.MakePersistentVolumeClaim().
		Name("pvc-with-read-write-once-pod-1").
		Namespace(testCtx.NS.Name).
		// Annotation and volume name required for PVC to be considered bound.
		Annotation(volume.AnnBindCompleted, "true").
		VolumeName(pv1.Name).
		AccessModes([]v1.PersistentVolumeAccessMode{v1.ReadWriteOncePod}).
		Resources(storage).
		Obj()
	pv2 := st.MakePersistentVolume().
		Name("pv-with-read-write-once-pod-2").
		AccessModes([]v1.PersistentVolumeAccessMode{v1.ReadWriteOncePod}).
		Capacity(storage.Requests).
		HostPathVolumeSource(&v1.HostPathVolumeSource{Path: "/mnt2", Type: &volType}).
		Obj()
	pvc2 := st.MakePersistentVolumeClaim().
		Name("pvc-with-read-write-once-pod-2").
		Namespace(testCtx.NS.Name).
		// Annotation and volume name required for PVC to be considered bound.
		Annotation(volume.AnnBindCompleted, "true").
		VolumeName(pv2.Name).
		AccessModes([]v1.PersistentVolumeAccessMode{v1.ReadWriteOncePod}).
		Resources(storage).
		Obj()

	tests := []struct {
		name                string
		init                func() error
		existingPods        []*v1.Pod
		pod                 *v1.Pod
		unresolvable        bool
		preemptedPodIndexes map[int]struct{}
		cleanup             func() error
	}{
		{
			name: "preempt single pod",
			init: func() error {
				_, err := testutils.CreatePV(cs, pv1)
				if err != nil {
					return fmt.Errorf("cannot create pv: %v", err)
				}
				_, err = testutils.CreatePVC(cs, pvc1)
				if err != nil {
					return fmt.Errorf("cannot create pvc: %v", err)
				}
				return nil
			},
			existingPods: []*v1.Pod{
				initPausePod(&testutils.PausePodConfig{
					Name:      "victim-pod",
					Namespace: testCtx.NS.Name,
					Priority:  &lowPriority,
					Volumes: []v1.Volume{{
						Name: "volume",
						VolumeSource: v1.VolumeSource{
							PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{
								ClaimName: pvc1.Name,
							},
						},
					}},
				}),
			},
			pod: initPausePod(&testutils.PausePodConfig{
				Name:      "preemptor-pod",
				Namespace: testCtx.NS.Name,
				Priority:  &highPriority,
				Volumes: []v1.Volume{{
					Name: "volume",
					VolumeSource: v1.VolumeSource{
						PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{
							ClaimName: pvc1.Name,
						},
					},
				}},
			}),
			preemptedPodIndexes: map[int]struct{}{0: {}},
			cleanup: func() error {
				if err := testutils.DeletePVC(cs, pvc1.Name, pvc1.Namespace); err != nil {
					return fmt.Errorf("cannot delete pvc: %v", err)
				}
				if err := testutils.DeletePV(cs, pv1.Name); err != nil {
					return fmt.Errorf("cannot delete pv: %v", err)
				}
				return nil
			},
		},
		{
			name: "preempt two pods",
			init: func() error {
				for _, pv := range []*v1.PersistentVolume{pv1, pv2} {
					_, err := testutils.CreatePV(cs, pv)
					if err != nil {
						return fmt.Errorf("cannot create pv: %v", err)
					}
				}
				for _, pvc := range []*v1.PersistentVolumeClaim{pvc1, pvc2} {
					_, err := testutils.CreatePVC(cs, pvc)
					if err != nil {
						return fmt.Errorf("cannot create pvc: %v", err)
					}
				}
				return nil
			},
			existingPods: []*v1.Pod{
				initPausePod(&testutils.PausePodConfig{
					Name:      "victim-pod-1",
					Namespace: testCtx.NS.Name,
					Priority:  &lowPriority,
					Volumes: []v1.Volume{{
						Name: "volume",
						VolumeSource: v1.VolumeSource{
							PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{
								ClaimName: pvc1.Name,
							},
						},
					}},
				}),
				initPausePod(&testutils.PausePodConfig{
					Name:      "victim-pod-2",
					Namespace: testCtx.NS.Name,
					Priority:  &lowPriority,
					Volumes: []v1.Volume{{
						Name: "volume",
						VolumeSource: v1.VolumeSource{
							PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{
								ClaimName: pvc2.Name,
							},
						},
					}},
				}),
			},
			pod: initPausePod(&testutils.PausePodConfig{
				Name:      "preemptor-pod",
				Namespace: testCtx.NS.Name,
				Priority:  &highPriority,
				Volumes: []v1.Volume{
					{
						Name: "volume-1",
						VolumeSource: v1.VolumeSource{
							PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{
								ClaimName: pvc1.Name,
							},
						},
					},
					{
						Name: "volume-2",
						VolumeSource: v1.VolumeSource{
							PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{
								ClaimName: pvc2.Name,
							},
						},
					},
				},
}), 1840 preemptedPodIndexes: map[int]struct{}{0: {}, 1: {}}, 1841 cleanup: func() error { 1842 for _, pvc := range []*v1.PersistentVolumeClaim{pvc1, pvc2} { 1843 if err := testutils.DeletePVC(cs, pvc.Name, pvc.Namespace); err != nil { 1844 return fmt.Errorf("cannot delete pvc: %v", err) 1845 } 1846 } 1847 for _, pv := range []*v1.PersistentVolume{pv1, pv2} { 1848 if err := testutils.DeletePV(cs, pv.Name); err != nil { 1849 return fmt.Errorf("cannot delete pv: %v", err) 1850 } 1851 } 1852 return nil 1853 }, 1854 }, 1855 { 1856 name: "preempt single pod with two volumes", 1857 init: func() error { 1858 for _, pv := range []*v1.PersistentVolume{pv1, pv2} { 1859 _, err := testutils.CreatePV(cs, pv) 1860 if err != nil { 1861 return fmt.Errorf("cannot create pv: %v", err) 1862 } 1863 } 1864 for _, pvc := range []*v1.PersistentVolumeClaim{pvc1, pvc2} { 1865 _, err := testutils.CreatePVC(cs, pvc) 1866 if err != nil { 1867 return fmt.Errorf("cannot create pvc: %v", err) 1868 } 1869 } 1870 return nil 1871 }, 1872 existingPods: []*v1.Pod{ 1873 initPausePod(&testutils.PausePodConfig{ 1874 Name: "victim-pod", 1875 Namespace: testCtx.NS.Name, 1876 Priority: &lowPriority, 1877 Volumes: []v1.Volume{ 1878 { 1879 Name: "volume-1", 1880 VolumeSource: v1.VolumeSource{ 1881 PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ 1882 ClaimName: pvc1.Name, 1883 }, 1884 }, 1885 }, 1886 { 1887 Name: "volume-2", 1888 VolumeSource: v1.VolumeSource{ 1889 PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ 1890 ClaimName: pvc2.Name, 1891 }, 1892 }, 1893 }, 1894 }, 1895 }), 1896 }, 1897 pod: initPausePod(&testutils.PausePodConfig{ 1898 Name: "preemptor-pod", 1899 Namespace: testCtx.NS.Name, 1900 Priority: &highPriority, 1901 Volumes: []v1.Volume{ 1902 { 1903 Name: "volume-1", 1904 VolumeSource: v1.VolumeSource{ 1905 PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ 1906 ClaimName: pvc1.Name, 1907 }, 1908 }, 1909 }, 1910 { 1911 Name: "volume-2", 1912 VolumeSource: v1.VolumeSource{ 1913 PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ 1914 ClaimName: pvc2.Name, 1915 }, 1916 }, 1917 }, 1918 }, 1919 }), 1920 preemptedPodIndexes: map[int]struct{}{0: {}}, 1921 cleanup: func() error { 1922 for _, pvc := range []*v1.PersistentVolumeClaim{pvc1, pvc2} { 1923 if err := testutils.DeletePVC(cs, pvc.Name, pvc.Namespace); err != nil { 1924 return fmt.Errorf("cannot delete pvc: %v", err) 1925 } 1926 } 1927 for _, pv := range []*v1.PersistentVolume{pv1, pv2} { 1928 if err := testutils.DeletePV(cs, pv.Name); err != nil { 1929 return fmt.Errorf("cannot delete pv: %v", err) 1930 } 1931 } 1932 return nil 1933 }, 1934 }, 1935 } 1936 1937 // Create a node with some resources and a label. 1938 nodeRes := map[v1.ResourceName]string{ 1939 v1.ResourcePods: "32", 1940 v1.ResourceCPU: "500m", 1941 v1.ResourceMemory: "500", 1942 } 1943 nodeObject := st.MakeNode().Name("node1").Capacity(nodeRes).Label("node", "node1").Obj() 1944 if _, err := createNode(cs, nodeObject); err != nil { 1945 t.Fatalf("Error creating node: %v", err) 1946 } 1947 1948 for _, test := range tests { 1949 t.Run(test.name, func(t *testing.T) { 1950 if err := test.init(); err != nil { 1951 t.Fatalf("Error while initializing test: %v", err) 1952 } 1953 1954 pods := make([]*v1.Pod, len(test.existingPods)) 1955 t.Cleanup(func() { 1956 testutils.CleanupPods(testCtx.Ctx, cs, t, pods) 1957 if err := test.cleanup(); err != nil { 1958 t.Errorf("Error cleaning up test: %v", err) 1959 } 1960 }) 1961 // Create and run existingPods. 
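// The victims are started first so that they are the current users of the ReadWriteOncePod PVCs; the preemptor created below references the same claims, so the volumerestrictions plugin rejects it until the victims are preempted.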
1962 for i, p := range test.existingPods { 1963 var err error 1964 pods[i], err = runPausePod(cs, p) 1965 if err != nil { 1966 t.Fatalf("Error running pause pod: %v", err) 1967 } 1968 } 1969 // Create the "pod". 1970 preemptor, err := createPausePod(cs, test.pod) 1971 if err != nil { 1972 t.Errorf("Error while creating high priority pod: %v", err) 1973 } 1974 pods = append(pods, preemptor) 1975 // Wait for preemption of pods and make sure the other ones are not preempted. 1976 for i, p := range pods { 1977 if _, found := test.preemptedPodIndexes[i]; found { 1978 if err = wait.PollUntilContextTimeout(testCtx.Ctx, time.Second, wait.ForeverTestTimeout, false, 1979 podIsGettingEvicted(cs, p.Namespace, p.Name)); err != nil { 1980 t.Errorf("Pod %v/%v is not getting evicted.", p.Namespace, p.Name) 1981 } 1982 } else { 1983 if p.DeletionTimestamp != nil { 1984 t.Errorf("Didn't expect pod %v to get preempted.", p.Name) 1985 } 1986 } 1987 } 1988 // Also check that the preemptor pod gets the NominatedNodeName field set. 1989 if len(test.preemptedPodIndexes) > 0 { 1990 if err := waitForNominatedNodeName(cs, preemptor); err != nil { 1991 t.Errorf("NominatedNodeName field was not set for pod %v: %v", preemptor.Name, err) 1992 } 1993 } 1994 }) 1995 } 1996 }
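// The inline polling in TestPreferNominatedNode (create the preemptor, poll until
// .Spec.NodeName is set, then compare it with the expected node) could be factored
// into a small helper. The sketch below is illustrative only and is not part of the
// upstream test file; the name waitForPodScheduledToNode is hypothetical, and it uses
// only packages already imported above (wait, clientset, metav1, fmt, time, context).
func waitForPodScheduledToNode(cs clientset.Interface, ns, name, nodeName string) error {
	var scheduledTo string
	if err := wait.PollUntilContextTimeout(context.TODO(), 100*time.Millisecond, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) {
		pod, err := cs.CoreV1().Pods(ns).Get(ctx, name, metav1.GetOptions{})
		if err != nil {
			return false, err
		}
		if len(pod.Spec.NodeName) == 0 {
			// Not bound to a node yet; keep polling.
			return false, nil
		}
		scheduledTo = pod.Spec.NodeName
		return true, nil
	}); err != nil {
		return fmt.Errorf("pod %v/%v was not scheduled: %v", ns, name, err)
	}
	if scheduledTo != nodeName {
		return fmt.Errorf("pod %v/%v is running on %q, expected %q", ns, name, scheduledTo, nodeName)
	}
	return nil
}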