sigs.k8s.io/kueue@v0.6.2/test/integration/scheduler/podsready/scheduler_test.go

/*
Copyright 2022 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package podsready

import (
	"context"
	"path/filepath"
	"time"

	"github.com/google/go-cmp/cmp/cmpopts"
	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
	corev1 "k8s.io/api/core/v1"
	apimeta "k8s.io/apimachinery/pkg/api/meta"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/utils/ptr"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/manager"

	config "sigs.k8s.io/kueue/apis/config/v1beta1"
	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
	"sigs.k8s.io/kueue/pkg/util/testing"
	"sigs.k8s.io/kueue/pkg/workload"
	"sigs.k8s.io/kueue/test/integration/framework"
	"sigs.k8s.io/kueue/test/util"
)

var (
	ignoreCQConditions                       = cmpopts.IgnoreFields(kueue.ClusterQueueStatus{}, "Conditions")
	ignorePendingWorkloadsStatus             = cmpopts.IgnoreFields(kueue.ClusterQueueStatus{}, "PendingWorkloadsStatus")
	defaultRequeuingBackoffLimitCount *int32 = nil
)

const (
	defaultPodsReadyTimeout   = 3 * time.Second
	defaultRequeuingTimestamp = config.EvictionTimestamp
)

// +kubebuilder:docs-gen:collapse=Imports

var _ = ginkgo.Describe("SchedulerWithWaitForPodsReady", func() {

	var (
		// Values changed by tests (and reset after each):
		podsReadyTimeout           = defaultPodsReadyTimeout
		requeuingTimestamp         = defaultRequeuingTimestamp
		requeuingBackoffLimitCount = defaultRequeuingBackoffLimitCount
	)

	var (
		// Values referenced by tests:
		defaultFlavor *kueue.ResourceFlavor
		ns            *corev1.Namespace
		prodClusterQ  *kueue.ClusterQueue
		devClusterQ   *kueue.ClusterQueue
		prodQueue     *kueue.LocalQueue
		devQueue      *kueue.LocalQueue
	)

	ginkgo.JustBeforeEach(func() {
		fwk = &framework.Framework{
			CRDPath:     filepath.Join("..", "..", "..", "..", "config", "components", "crd", "bases"),
			WebhookPath: filepath.Join("..", "..", "..", "..", "config", "components", "webhook"),
		}
		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, func(mgr manager.Manager, ctx context.Context) {
			managerAndSchedulerSetupWithTimeoutAdmission(mgr, ctx, podsReadyTimeout, true, requeuingTimestamp, requeuingBackoffLimitCount)
		})

		defaultFlavor = testing.MakeResourceFlavor("default").Obj()
		gomega.Expect(k8sClient.Create(ctx, defaultFlavor)).To(gomega.Succeed())

		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "podsready-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())

		prodClusterQ = testing.MakeClusterQueue("prod-cq").
			Cohort("all").
			ResourceGroup(*testing.MakeFlavorQuotas("default").Resource(corev1.ResourceCPU, "5").Obj()).
			Obj()
		gomega.Expect(k8sClient.Create(ctx, prodClusterQ)).Should(gomega.Succeed())

		devClusterQ = testing.MakeClusterQueue("dev-cq").
			Cohort("all").
			ResourceGroup(*testing.MakeFlavorQuotas("default").Resource(corev1.ResourceCPU, "5").Obj()).
			Obj()
		gomega.Expect(k8sClient.Create(ctx, devClusterQ)).Should(gomega.Succeed())

		prodQueue = testing.MakeLocalQueue("prod-queue", ns.Name).ClusterQueue(prodClusterQ.Name).Obj()
		gomega.Expect(k8sClient.Create(ctx, prodQueue)).Should(gomega.Succeed())

		devQueue = testing.MakeLocalQueue("dev-queue", ns.Name).ClusterQueue(devClusterQ.Name).Obj()
		gomega.Expect(k8sClient.Create(ctx, devQueue)).Should(gomega.Succeed())
	})

	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
		util.ExpectClusterQueueToBeDeleted(ctx, k8sClient, prodClusterQ, true)
		util.ExpectClusterQueueToBeDeleted(ctx, k8sClient, devClusterQ, true)
		fwk.Teardown()

		// Reset values that are changed by tests.
		podsReadyTimeout = defaultPodsReadyTimeout
		requeuingTimestamp = defaultRequeuingTimestamp
		requeuingBackoffLimitCount = defaultRequeuingBackoffLimitCount
	})

	ginkgo.Context("Long PodsReady timeout", func() {

		ginkgo.BeforeEach(func() {
			podsReadyTimeout = time.Minute
		})

		ginkgo.It("Should unblock admission of new workloads in other ClusterQueues once the admitted workload exceeds timeout", func() {
			ginkgo.By("checking the first prod workload gets admitted while the second is waiting")
			prodWl := testing.MakeWorkload("prod-wl", ns.Name).Queue(prodQueue.Name).Request(corev1.ResourceCPU, "2").Obj()
			gomega.Expect(k8sClient.Create(ctx, prodWl)).Should(gomega.Succeed())
			devWl := testing.MakeWorkload("dev-wl", ns.Name).Queue(devQueue.Name).Request(corev1.ResourceCPU, "2").Obj()
			gomega.Expect(k8sClient.Create(ctx, devWl)).Should(gomega.Succeed())
			util.ExpectWorkloadsToHaveQuotaReservation(ctx, k8sClient, prodClusterQ.Name, prodWl)
			util.ExpectWorkloadsToBeWaiting(ctx, k8sClient, devWl)

			ginkgo.By("update the first workload to be in the PodsReady condition and verify the second workload is admitted")
			gomega.Eventually(func() error {
				gomega.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(prodWl), prodWl)).Should(gomega.Succeed())
				apimeta.SetStatusCondition(&prodWl.Status.Conditions, metav1.Condition{
					Type:   kueue.WorkloadPodsReady,
					Status: metav1.ConditionTrue,
					Reason: "PodsReady",
				})
				return k8sClient.Status().Update(ctx, prodWl)
			}, util.Timeout, util.Interval).Should(gomega.Succeed())
			util.ExpectWorkloadsToHaveQuotaReservation(ctx, k8sClient, devClusterQ.Name, devWl)
		})

		ginkgo.It("Should unblock admission of new workloads once the admitted workload is deleted", func() {
			ginkgo.By("checking the first prod workload gets admitted while the second is waiting")
			prodWl := testing.MakeWorkload("prod-wl", ns.Name).Queue(prodQueue.Name).Request(corev1.ResourceCPU, "2").Obj()
			gomega.Expect(k8sClient.Create(ctx, prodWl)).Should(gomega.Succeed())
			devWl := testing.MakeWorkload("dev-wl", ns.Name).Queue(devQueue.Name).Request(corev1.ResourceCPU, "2").Obj()
			gomega.Expect(k8sClient.Create(ctx, devWl)).Should(gomega.Succeed())
			util.ExpectWorkloadsToHaveQuotaReservation(ctx, k8sClient, prodClusterQ.Name, prodWl)
			util.ExpectWorkloadsToBeWaiting(ctx, k8sClient, devWl)

			ginkgo.By("delete the first workload and verify the second workload is admitted")
			gomega.Expect(k8sClient.Delete(ctx, prodWl)).Should(gomega.Succeed())
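			// With waitForPodsReady blocking admission, deleting the still-not-ready admitted
			// workload unblocks the queue, so the waiting dev workload should get quota reserved.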
			util.ExpectWorkloadsToHaveQuotaReservation(ctx, k8sClient, devClusterQ.Name, devWl)
		})

		ginkgo.It("Should block admission of one new workload if two are considered in the same scheduling cycle", func() {
			ginkgo.By("creating two workloads but delaying creation of the cluster queue which has enough capacity")
			prodWl := testing.MakeWorkload("prod-wl", ns.Name).Queue(prodQueue.Name).Request(corev1.ResourceCPU, "11").Obj()
			gomega.Expect(k8sClient.Create(ctx, prodWl)).Should(gomega.Succeed())
			// wait a second to ensure the CreationTimestamps differ and the scheduler picks the first created workload to be admitted
			time.Sleep(time.Second)
			devWl := testing.MakeWorkload("dev-wl", ns.Name).Queue(devQueue.Name).Request(corev1.ResourceCPU, "11").Obj()
			gomega.Expect(k8sClient.Create(ctx, devWl)).Should(gomega.Succeed())
			util.ExpectWorkloadsToBePending(ctx, k8sClient, prodWl, devWl)

			ginkgo.By("creating the cluster queue")
			// Delay cluster queue creation to make sure workloads are in the same
			// scheduling cycle.
			testCQ := testing.MakeClusterQueue("test-cq").
				Cohort("all").
				ResourceGroup(*testing.MakeFlavorQuotas("default").Resource(corev1.ResourceCPU, "25", "0").Obj()).
				Obj()
			gomega.Expect(k8sClient.Create(ctx, testCQ)).Should(gomega.Succeed())
			defer func() {
				gomega.Expect(util.DeleteClusterQueue(ctx, k8sClient, testCQ)).Should(gomega.Succeed())
			}()

			ginkgo.By("verifying that the first created workload is admitted and the second workload is waiting as the first one has PodsReady=False")
			util.ExpectWorkloadsToHaveQuotaReservation(ctx, k8sClient, prodClusterQ.Name, prodWl)
			util.ExpectWorkloadsToBeWaiting(ctx, k8sClient, devWl)
		})

	})

	var _ = ginkgo.Context("Short PodsReady timeout", func() {

		ginkgo.BeforeEach(func() {
			podsReadyTimeout = 3 * time.Second
			requeuingBackoffLimitCount = ptr.To[int32](2)
		})

		ginkgo.It("Should requeue a workload which exceeded the timeout to reach PodsReady=True", func() {
			const lowPrio, highPrio = 0, 100

			ginkgo.By("create the 'prod1' workload")
			prodWl1 := testing.MakeWorkload("prod1", ns.Name).Queue(prodQueue.Name).Priority(highPrio).Request(corev1.ResourceCPU, "2").Obj()
			gomega.Expect(k8sClient.Create(ctx, prodWl1)).Should(gomega.Succeed())

			ginkgo.By("create the 'prod2' workload")
			prodWl2 := testing.MakeWorkload("prod2", ns.Name).Queue(prodQueue.Name).Priority(lowPrio).Request(corev1.ResourceCPU, "2").Obj()
			gomega.Expect(k8sClient.Create(ctx, prodWl2)).Should(gomega.Succeed())

			ginkgo.By("checking the 'prod1' workload is admitted and the 'prod2' workload is waiting")
			util.ExpectWorkloadsToHaveQuotaReservation(ctx, k8sClient, prodClusterQ.Name, prodWl1)
			util.ExpectWorkloadsToBeWaiting(ctx, k8sClient, prodWl2)

			ginkgo.By("awaiting the Admitted=True condition to be added to 'prod1'")
			// We assume that the test will get to this check before the timeout expires and
			// kueue cancels the admission. Mentioning this in case this test flakes in the future.
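			// The PodsReady condition is never set to True for 'prod1', so the short
			// podsReadyTimeout configured in this Context will expire and trigger eviction.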
			gomega.Eventually(func() bool {
				gomega.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(prodWl1), prodWl1)).Should(gomega.Succeed())
				return workload.HasQuotaReservation(prodWl1)
			}, util.Timeout, util.Interval).Should(gomega.BeTrue())

			ginkgo.By("determining the time of admission as LastTransitionTime for the QuotaReserved condition")
			admittedAt := apimeta.FindStatusCondition(prodWl1.Status.Conditions, kueue.WorkloadQuotaReserved).LastTransitionTime.Time

			ginkgo.By("wait for the 'prod1' workload to be evicted")
			gomega.Eventually(func(g gomega.Gomega) {
				g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(prodWl1), prodWl1)).Should(gomega.Succeed())
				isEvicting := apimeta.IsStatusConditionTrue(prodWl1.Status.Conditions, kueue.WorkloadEvicted)
				if time.Since(admittedAt) < podsReadyTimeout {
					g.Expect(isEvicting).Should(gomega.BeFalse(), "the workload should not be evicted until the timeout expires")
				}
				g.Expect(isEvicting).Should(gomega.BeTrue())
				g.Expect(ptr.Deref(prodWl1.Status.RequeueState, kueue.RequeueState{})).Should(gomega.BeComparableTo(kueue.RequeueState{
					Count: ptr.To[int32](1),
				}, cmpopts.IgnoreFields(kueue.RequeueState{}, "RequeueAt")))
				g.Expect(prodWl1.Status.RequeueState.RequeueAt).ShouldNot(gomega.BeNil())
			}, util.Timeout, util.Interval).Should(gomega.Succeed(), "the workload should be evicted after the timeout expires")

			util.FinishEvictionForWorkloads(ctx, k8sClient, prodWl1)

			ginkgo.By("verify the 'prod2' workload gets admitted and the 'prod1' workload is pending due to backoff")
			util.ExpectWorkloadsToHaveQuotaReservation(ctx, k8sClient, prodClusterQ.Name, prodWl2)
			// To avoid flakiness, we don't verify if the workload has QuotaReserved=false with a pending reason here.
		})

		ginkgo.It("Should re-admit a timed-out workload and deactivate a workload that exceeds the requeue count limit, then reset the requeue state once it is re-activated", func() {
			ginkgo.By("create the 'prod' workload")
			prodWl := testing.MakeWorkload("prod", ns.Name).Queue(prodQueue.Name).Request(corev1.ResourceCPU, "2").Obj()
			gomega.Expect(k8sClient.Create(ctx, prodWl)).Should(gomega.Succeed())
			ginkgo.By("checking the 'prod' workload is admitted")
			util.ExpectWorkloadsToHaveQuotaReservation(ctx, k8sClient, prodClusterQ.Name, prodWl)
			util.ExpectAdmittedWorkloadsTotalMetric(prodClusterQ, 1)
			ginkgo.By("exceed the timeout for the 'prod' workload")
			time.Sleep(podsReadyTimeout)
			ginkgo.By("finish the eviction; the workload is then pending due to backoff")
			util.FinishEvictionForWorkloads(ctx, k8sClient, prodWl)
			// To avoid flakiness, we don't verify if the workload has QuotaReserved=false with a pending reason here.

			ginkgo.By("verify the 'prod' workload gets re-admitted twice")
			util.ExpectWorkloadsToHaveQuotaReservation(ctx, k8sClient, prodClusterQ.Name, prodWl)
			util.ExpectAdmittedWorkloadsTotalMetric(prodClusterQ, 2)
			time.Sleep(podsReadyTimeout)
			util.FinishEvictionForWorkloads(ctx, k8sClient, prodWl)
			util.ExpectWorkloadsToHaveQuotaReservation(ctx, k8sClient, prodClusterQ.Name, prodWl)
			util.ExpectAdmittedWorkloadsTotalMetric(prodClusterQ, 3)
			time.Sleep(podsReadyTimeout)
			ginkgo.By("the evicted and re-admitted workload should have a requeue count of 2")
			util.ExpectWorkloadToHaveRequeueCount(ctx, k8sClient, client.ObjectKeyFromObject(prodWl), ptr.To[int32](2))
			ginkgo.By("the workload that exceeded the requeue backoff limit should be deactivated")
			gomega.Eventually(func(g gomega.Gomega) {
				g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(prodWl), prodWl)).Should(gomega.Succeed())
				g.Expect(ptr.Deref(prodWl.Spec.Active, true)).Should(gomega.BeFalse())
			}, util.Timeout, util.Interval).Should(gomega.Succeed())

			ginkgo.By("verify the re-activated inactive 'prod' workload requeue state is reset")
			// TODO: Once we move the logic to issue the Eviction with the InactiveWorkload reason, we need to remove the updates below.
			// REF: https://github.com/kubernetes-sigs/kueue/issues/1841
			gomega.Eventually(func(g gomega.Gomega) {
				g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(prodWl), prodWl)).Should(gomega.Succeed())
				apimeta.SetStatusCondition(&prodWl.Status.Conditions, metav1.Condition{
					Type:    kueue.WorkloadEvicted,
					Status:  metav1.ConditionTrue,
					Reason:  kueue.WorkloadEvictedByDeactivation,
					Message: "evicted by Test",
				})
				g.Expect(k8sClient.Status().Update(ctx, prodWl)).Should(gomega.Succeed())
			}, util.Timeout, util.Interval).Should(gomega.Succeed(), "Job reconciler should add an Evicted condition with InactiveWorkload to the Workload")
			gomega.Eventually(func(g gomega.Gomega) {
				g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(prodWl), prodWl)).Should(gomega.Succeed())
				prodWl.Spec.Active = ptr.To(true)
				g.Expect(k8sClient.Update(ctx, prodWl)).Should(gomega.Succeed())
			}, util.Timeout, util.Interval).Should(gomega.Succeed(), "Reactivate the inactive Workload")
			gomega.Eventually(func(g gomega.Gomega) {
				g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(prodWl), prodWl)).Should(gomega.Succeed())
				g.Expect(prodWl.Status.RequeueState).Should(gomega.BeNil())
			}, util.Timeout, util.Interval).Should(gomega.Succeed())
		})

		ginkgo.It("Should unblock admission of new workloads in other ClusterQueues once the admitted workload exceeds timeout", func() {
			ginkgo.By("create the 'prod' workload")
			prodWl := testing.MakeWorkload("prod", ns.Name).Queue(prodQueue.Name).Request(corev1.ResourceCPU, "2").Obj()
			gomega.Expect(k8sClient.Create(ctx, prodWl)).Should(gomega.Succeed())

			ginkgo.By("create the 'dev' workload after a second")
			time.Sleep(time.Second)
			devWl := testing.MakeWorkload("dev", ns.Name).Queue(devQueue.Name).Request(corev1.ResourceCPU, "2").Obj()
			gomega.Expect(k8sClient.Create(ctx, devWl)).Should(gomega.Succeed())

			ginkgo.By("wait for the 'prod' workload to be admitted and the 'dev' to be waiting")
			util.ExpectWorkloadsToHaveQuotaReservation(ctx, k8sClient, prodClusterQ.Name, prodWl)
			util.ExpectWorkloadsToBeWaiting(ctx, k8sClient, devWl)

			ginkgo.By("verify the 'prod' queue resources are used")
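			// The ClusterQueue status below is compared with ignoreCQConditions and
			// ignorePendingWorkloadsStatus (defined at the top of this file), so only the
			// workload counts and flavor usage are asserted.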
			gomega.Eventually(func() kueue.ClusterQueueStatus {
				var updatedCQ kueue.ClusterQueue
				gomega.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(prodClusterQ), &updatedCQ)).To(gomega.Succeed())
				return updatedCQ.Status
			}, util.Timeout, util.Interval).Should(gomega.BeComparableTo(kueue.ClusterQueueStatus{
				PendingWorkloads:   0,
				ReservingWorkloads: 1,
				AdmittedWorkloads:  1,
				FlavorsReservation: []kueue.FlavorUsage{{
					Name: "default",
					Resources: []kueue.ResourceUsage{{
						Name:  corev1.ResourceCPU,
						Total: resource.MustParse("2"),
					}},
				}},
				FlavorsUsage: []kueue.FlavorUsage{{
					Name: "default",
					Resources: []kueue.ResourceUsage{{
						Name:  corev1.ResourceCPU,
						Total: resource.MustParse("2"),
					}},
				}},
			}, ignoreCQConditions, ignorePendingWorkloadsStatus))

			ginkgo.By("wait for the timeout to be exceeded")
			time.Sleep(podsReadyTimeout)

			ginkgo.By("finish the eviction")
			util.FinishEvictionForWorkloads(ctx, k8sClient, prodWl)

			ginkgo.By("wait for the first workload to be unadmitted")
			gomega.Eventually(func() *kueue.Admission {
				gomega.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(prodWl), prodWl)).Should(gomega.Succeed())
				return prodWl.Status.Admission
			}, util.Timeout, util.Interval).Should(gomega.BeNil())

			ginkgo.By("verify the queue resources are freed")
			gomega.Eventually(func() kueue.ClusterQueueStatus {
				var updatedCQ kueue.ClusterQueue
				gomega.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(prodClusterQ), &updatedCQ)).To(gomega.Succeed())
				return updatedCQ.Status
			}, util.Timeout, util.Interval).Should(gomega.BeComparableTo(kueue.ClusterQueueStatus{
				PendingWorkloads:   1,
				ReservingWorkloads: 0,
				AdmittedWorkloads:  0,
				FlavorsReservation: []kueue.FlavorUsage{{
					Name: "default",
					Resources: []kueue.ResourceUsage{{
						Name:  corev1.ResourceCPU,
						Total: resource.MustParse("0"),
					}},
				}},
				FlavorsUsage: []kueue.FlavorUsage{{
					Name: "default",
					Resources: []kueue.ResourceUsage{{
						Name:  corev1.ResourceCPU,
						Total: resource.MustParse("0"),
					}},
				}},
			}, ignoreCQConditions, ignorePendingWorkloadsStatus))

			ginkgo.By("verify the active workload metric is decreased for the cluster queue")
			util.ExpectReservingActiveWorkloadsMetric(prodClusterQ, 0)

			ginkgo.By("wait for the 'dev' workload to get admitted")
			util.ExpectWorkloadsToHaveQuotaReservation(ctx, k8sClient, devClusterQ.Name, devWl)
			ginkgo.By("wait for the 'prod' workload to be waiting")
			util.ExpectWorkloadsToBeWaiting(ctx, k8sClient, prodWl)

			ginkgo.By("delete the waiting 'prod' workload so that it does not get admitted during teardown")
			gomega.Expect(k8sClient.Delete(ctx, prodWl)).Should(gomega.Succeed())
		})

		ginkgo.It("Should move the evicted workload to the end of the queue", func() {
			localQueueName := "eviction-lq"

			// The workloads are created with a 5 CPU resource requirement to ensure only one can fit at a given time;
			// letting them all time out, we should see a circular-buffer admission pattern.
			wl1 := testing.MakeWorkload("prod1", ns.Name).Queue(localQueueName).Request(corev1.ResourceCPU, "5").Obj()
			wl2 := testing.MakeWorkload("prod2", ns.Name).Queue(localQueueName).Request(corev1.ResourceCPU, "5").Obj()
			wl3 := testing.MakeWorkload("prod3", ns.Name).Queue(localQueueName).Request(corev1.ResourceCPU, "5").Obj()

			ginkgo.By("create the workloads", func() {
				// Since metav1.Time has only second resolution, wait one second between
				// create calls to avoid any potential creation timestamp collision.
				gomega.Expect(k8sClient.Create(ctx, wl1)).Should(gomega.Succeed())
				time.Sleep(time.Second)
				gomega.Expect(k8sClient.Create(ctx, wl2)).Should(gomega.Succeed())
				time.Sleep(time.Second)
				gomega.Expect(k8sClient.Create(ctx, wl3)).Should(gomega.Succeed())
			})

			ginkgo.By("create the local queue to start admission", func() {
				lq := testing.MakeLocalQueue(localQueueName, ns.Name).ClusterQueue(prodClusterQ.Name).Obj()
				gomega.Expect(k8sClient.Create(ctx, lq)).Should(gomega.Succeed())
			})

			ginkgo.By("waiting for the first workload to be admitted", func() {
				util.ExpectWorkloadsToHaveQuotaReservation(ctx, k8sClient, prodClusterQ.Name, wl1)
			})

			ginkgo.By("waiting for the timeout; the first workload should be evicted and the second one admitted", func() {
				time.Sleep(podsReadyTimeout)
				util.FinishEvictionForWorkloads(ctx, k8sClient, wl1)
				util.ExpectWorkloadsToHaveQuotaReservation(ctx, k8sClient, prodClusterQ.Name, wl2)
			})

			ginkgo.By("finishing the second workload, the third one should be admitted", func() {
				time.Sleep(podsReadyTimeout)
				util.FinishWorkloads(ctx, k8sClient, wl2)
				util.ExpectWorkloadsToHaveQuotaReservation(ctx, k8sClient, prodClusterQ.Name, wl3)
			})

			ginkgo.By("finishing the third workload, the first one should be admitted", func() {
				time.Sleep(podsReadyTimeout)
				util.FinishWorkloads(ctx, k8sClient, wl3)
				util.ExpectWorkloadsToHaveQuotaReservation(ctx, k8sClient, prodClusterQ.Name, wl1)
			})

			ginkgo.By("verifying that all workloads have the proper requeue count", func() {
				util.ExpectWorkloadToHaveRequeueCount(ctx, k8sClient, client.ObjectKeyFromObject(wl1), ptr.To[int32](2))
				util.ExpectWorkloadToHaveRequeueCount(ctx, k8sClient, client.ObjectKeyFromObject(wl2), ptr.To[int32](1))
				util.ExpectWorkloadToHaveRequeueCount(ctx, k8sClient, client.ObjectKeyFromObject(wl3), ptr.To[int32](1))
			})
		})
	})

	var _ = ginkgo.Context("Requeuing timestamp set to Creation", func() {

		var (
			standaloneClusterQ *kueue.ClusterQueue
			standaloneQueue    *kueue.LocalQueue
		)

		ginkgo.BeforeEach(func() {
			requeuingTimestamp = config.CreationTimestamp
		})

		ginkgo.JustBeforeEach(func() {
			// Build a standalone cluster queue with just enough capacity for a single workload.
			// (Avoid using the prod/dev queues so that there is no borrowing.)
			standaloneClusterQ = testing.MakeClusterQueue("standalone-cq").
				ResourceGroup(*testing.MakeFlavorQuotas("default").Resource(corev1.ResourceCPU, "1").Obj()).
				Obj()
			gomega.Expect(k8sClient.Create(ctx, standaloneClusterQ)).Should(gomega.Succeed())

			standaloneQueue = testing.MakeLocalQueue("standalone-queue", ns.Name).ClusterQueue(standaloneClusterQ.Name).Obj()
			gomega.Expect(k8sClient.Create(ctx, standaloneQueue)).Should(gomega.Succeed())
		})

		ginkgo.AfterEach(func() {
			gomega.Expect(util.DeleteClusterQueue(ctx, k8sClient, standaloneClusterQ)).Should(gomega.Succeed())
			gomega.Expect(util.DeleteLocalQueue(ctx, k8sClient, standaloneQueue)).Should(gomega.Succeed())
		})

		ginkgo.It("Should prioritize workloads submitted earlier", func() {
			// The workloads are created with a 1 CPU resource requirement to ensure only one can fit at a given time.
			wl1 := testing.MakeWorkload("wl-1", ns.Name).Queue(standaloneQueue.Name).Request(corev1.ResourceCPU, "1").Obj()
			wl2 := testing.MakeWorkload("wl-2", ns.Name).Queue(standaloneQueue.Name).Request(corev1.ResourceCPU, "1").Obj()
			wl3 := testing.MakeWorkload("wl-3", ns.Name).Queue(standaloneQueue.Name).Request(corev1.ResourceCPU, "1").Obj()

			ginkgo.By("create the workloads", func() {
				// Since metav1.Time has only second resolution, wait one second between
				// create calls to avoid any potential creation timestamp collision.
				gomega.Expect(k8sClient.Create(ctx, wl1)).Should(gomega.Succeed())
				time.Sleep(time.Second)
				gomega.Expect(k8sClient.Create(ctx, wl2)).Should(gomega.Succeed())
				time.Sleep(time.Second)
				gomega.Expect(k8sClient.Create(ctx, wl3)).Should(gomega.Succeed())
			})

			ginkgo.By("waiting for the first workload to be admitted", func() {
				util.ExpectWorkloadsToHaveQuotaReservation(ctx, k8sClient, standaloneClusterQ.Name, wl1)
			})
			ginkgo.By("checking that the second and third workloads are still pending", func() {
				util.ExpectWorkloadsToBePending(ctx, k8sClient, wl2, wl3)
			})
			ginkgo.By("finishing the eviction of the first workload", func() {
				util.FinishEvictionForWorkloads(ctx, k8sClient, wl1)
			})
			ginkgo.By("waiting for the second workload to be admitted", func() {
				util.ExpectWorkloadsToHaveQuotaReservation(ctx, k8sClient, standaloneClusterQ.Name, wl2)
			})
			// The first workload is still pending due to backoff, and the third workload is still pending due to insufficient quota.
			// To avoid flakiness, we don't verify if the workload has QuotaReserved=false with a pending reason here.
			ginkgo.By("finishing the eviction of the second workload", func() {
				util.FinishEvictionForWorkloads(ctx, k8sClient, wl2)
			})
			ginkgo.By("waiting for the first workload to be admitted since its backoff is completed, while the second and third workloads are still pending", func() {
				util.ExpectWorkloadsToHaveQuotaReservation(ctx, k8sClient, standaloneClusterQ.Name, wl1)
				// To avoid flakiness, we don't verify if the workload has QuotaReserved=false with a pending reason here.
			})
		})
	})

})

var _ = ginkgo.Describe("SchedulerWithWaitForPodsReadyNonblockingMode", func() {
	var (
		// Values changed by tests (and reset after each):
		podsReadyTimeout            = defaultPodsReadyTimeout
		requeuingTimestamp          = defaultRequeuingTimestamp
		requeueingBackoffLimitCount = defaultRequeuingBackoffLimitCount
	)

	var (
		// Values referenced by tests:
		defaultFlavor *kueue.ResourceFlavor
		ns            *corev1.Namespace
		prodClusterQ  *kueue.ClusterQueue
		devClusterQ   *kueue.ClusterQueue
		prodQueue     *kueue.LocalQueue
		devQueue      *kueue.LocalQueue
	)

	ginkgo.JustBeforeEach(func() {
		fwk = &framework.Framework{
			CRDPath:     filepath.Join("..", "..", "..", "..", "config", "components", "crd", "bases"),
			WebhookPath: filepath.Join("..", "..", "..", "..", "config", "components", "webhook"),
		}
		cfg = fwk.Init()
		ctx, k8sClient = fwk.RunManager(cfg, func(mgr manager.Manager, ctx context.Context) {
			managerAndSchedulerSetupWithTimeoutAdmission(mgr, ctx, podsReadyTimeout, false, requeuingTimestamp, requeueingBackoffLimitCount)
		})

		defaultFlavor = testing.MakeResourceFlavor("default").Obj()
		gomega.Expect(k8sClient.Create(ctx, defaultFlavor)).To(gomega.Succeed())

		ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "podsready-nonblocking-",
			},
		}
		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())

		prodClusterQ = testing.MakeClusterQueue("prod-cq").
			Cohort("all").
			ResourceGroup(*testing.MakeFlavorQuotas("default").Resource(corev1.ResourceCPU, "5").Obj()).
			Obj()
		gomega.Expect(k8sClient.Create(ctx, prodClusterQ)).Should(gomega.Succeed())

		devClusterQ = testing.MakeClusterQueue("dev-cq").
			Cohort("all").
			ResourceGroup(*testing.MakeFlavorQuotas("default").Resource(corev1.ResourceCPU, "5").Obj()).
			Obj()
		gomega.Expect(k8sClient.Create(ctx, devClusterQ)).Should(gomega.Succeed())

		prodQueue = testing.MakeLocalQueue("prod-queue", ns.Name).ClusterQueue(prodClusterQ.Name).Obj()
		gomega.Expect(k8sClient.Create(ctx, prodQueue)).Should(gomega.Succeed())

		devQueue = testing.MakeLocalQueue("dev-queue", ns.Name).ClusterQueue(devClusterQ.Name).Obj()
		gomega.Expect(k8sClient.Create(ctx, devQueue)).Should(gomega.Succeed())
	})

	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sClient, ns)).To(gomega.Succeed())
		util.ExpectClusterQueueToBeDeleted(ctx, k8sClient, prodClusterQ, true)
		util.ExpectClusterQueueToBeDeleted(ctx, k8sClient, devClusterQ, true)
		fwk.Teardown()

		// Reset values that are changed by tests.
		podsReadyTimeout = defaultPodsReadyTimeout
		requeuingTimestamp = defaultRequeuingTimestamp
		requeueingBackoffLimitCount = defaultRequeuingBackoffLimitCount
	})

	ginkgo.Context("Long PodsReady timeout", func() {

		ginkgo.BeforeEach(func() {
			podsReadyTimeout = time.Minute
		})

		ginkgo.It("Should not block admission of one new workload if two are considered in the same scheduling cycle", func() {
			ginkgo.By("creating two workloads but delaying creation of the cluster queue which has enough capacity")
			prodWl := testing.MakeWorkload("prod-wl", ns.Name).Queue(prodQueue.Name).Request(corev1.ResourceCPU, "11").Obj()
			gomega.Expect(k8sClient.Create(ctx, prodWl)).Should(gomega.Succeed())
			devWl := testing.MakeWorkload("dev-wl", ns.Name).Queue(devQueue.Name).Request(corev1.ResourceCPU, "11").Obj()
			gomega.Expect(k8sClient.Create(ctx, devWl)).Should(gomega.Succeed())
			util.ExpectWorkloadsToBePending(ctx, k8sClient, prodWl, devWl)

			ginkgo.By("creating the cluster queue")
			// Delay cluster queue creation to make sure workloads are in the same
			// scheduling cycle.
			testCQ := testing.MakeClusterQueue("test-cq").
				Cohort("all").
				ResourceGroup(*testing.MakeFlavorQuotas("default").Resource(corev1.ResourceCPU, "25", "0").Obj()).
				Obj()
			gomega.Expect(k8sClient.Create(ctx, testCQ)).Should(gomega.Succeed())
			defer func() {
				gomega.Expect(util.DeleteClusterQueue(ctx, k8sClient, testCQ)).Should(gomega.Succeed())
			}()

			ginkgo.By("verifying that the first created workload is admitted and the second workload is also admitted since blockAdmission is false")
			util.ExpectWorkloadsToHaveQuotaReservation(ctx, k8sClient, prodClusterQ.Name, prodWl)
			util.ExpectWorkloadsToHaveQuotaReservation(ctx, k8sClient, devClusterQ.Name, devWl)
		})

	})

	var _ = ginkgo.Context("Short PodsReady timeout", func() {
		ginkgo.BeforeEach(func() {
			podsReadyTimeout = 3 * time.Second
		})

		ginkgo.It("Should re-admit a timed-out workload", func() {
			ginkgo.By("create the 'prod' workload")
			prodWl := testing.MakeWorkload("prod", ns.Name).Queue(prodQueue.Name).Request(corev1.ResourceCPU, "2").Obj()
			gomega.Expect(k8sClient.Create(ctx, prodWl)).Should(gomega.Succeed())
			ginkgo.By("checking the 'prod' workload is admitted")
			util.ExpectWorkloadsToHaveQuotaReservation(ctx, k8sClient, prodClusterQ.Name, prodWl)
			util.ExpectAdmittedWorkloadsTotalMetric(prodClusterQ, 1)
			ginkgo.By("exceed the timeout for the 'prod' workload")
			time.Sleep(podsReadyTimeout)
			ginkgo.By("finish the eviction")
			util.FinishEvictionForWorkloads(ctx, k8sClient, prodWl)

			ginkgo.By("verify the 'prod' workload gets re-admitted once")
			util.ExpectWorkloadsToHaveQuotaReservation(ctx, k8sClient, prodClusterQ.Name, prodWl)
			util.ExpectAdmittedWorkloadsTotalMetric(prodClusterQ, 2)
			time.Sleep(podsReadyTimeout)
			util.ExpectWorkloadToHaveRequeueCount(ctx, k8sClient, client.ObjectKeyFromObject(prodWl), ptr.To[int32](2))
			// Refresh the workload and verify it is still active; no requeuing backoff
			// limit is configured in this suite, so it should not be deactivated.
			gomega.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(prodWl), prodWl)).Should(gomega.Succeed())
			gomega.Expect(ptr.Deref(prodWl.Spec.Active, true)).Should(gomega.BeTrue())
		})
	})

	var _ = ginkgo.Context("Requeuing timestamp set to Creation", func() {

		var (
			standaloneClusterQ *kueue.ClusterQueue
			standaloneQueue    *kueue.LocalQueue
		)

		ginkgo.BeforeEach(func() {
			requeuingTimestamp = config.CreationTimestamp
		})

		ginkgo.JustBeforeEach(func() {
			// Build a standalone cluster queue with just enough capacity for a single workload.
			// (Avoid using the prod/dev queues so that there is no borrowing.)
			standaloneClusterQ = testing.MakeClusterQueue("standalone-cq").
				ResourceGroup(*testing.MakeFlavorQuotas("default").Resource(corev1.ResourceCPU, "1").Obj()).
				Obj()
			gomega.Expect(k8sClient.Create(ctx, standaloneClusterQ)).Should(gomega.Succeed())

			standaloneQueue = testing.MakeLocalQueue("standalone-queue", ns.Name).ClusterQueue(standaloneClusterQ.Name).Obj()
			gomega.Expect(k8sClient.Create(ctx, standaloneQueue)).Should(gomega.Succeed())
		})

		ginkgo.AfterEach(func() {
			gomega.Expect(util.DeleteClusterQueue(ctx, k8sClient, standaloneClusterQ)).Should(gomega.Succeed())
			gomega.Expect(util.DeleteLocalQueue(ctx, k8sClient, standaloneQueue)).Should(gomega.Succeed())
		})

		ginkgo.It("Should keep the evicted workload at the front of the queue", func() {
			// The workloads are created with a 1 CPU resource requirement to ensure only one can fit at a given time.
			wl1 := testing.MakeWorkload("wl-1", ns.Name).Queue(standaloneQueue.Name).Request(corev1.ResourceCPU, "1").Obj()
			wl2 := testing.MakeWorkload("wl-2", ns.Name).Queue(standaloneQueue.Name).Request(corev1.ResourceCPU, "1").Obj()
			wl3 := testing.MakeWorkload("wl-3", ns.Name).Queue(standaloneQueue.Name).Request(corev1.ResourceCPU, "1").Obj()

			ginkgo.By("create the workloads", func() {
				// Since metav1.Time has only second resolution, wait one second between
				// create calls to avoid any potential creation timestamp collision.
				gomega.Expect(k8sClient.Create(ctx, wl1)).Should(gomega.Succeed())
				time.Sleep(time.Second)
				gomega.Expect(k8sClient.Create(ctx, wl2)).Should(gomega.Succeed())
				time.Sleep(time.Second)
				gomega.Expect(k8sClient.Create(ctx, wl3)).Should(gomega.Succeed())
			})

			ginkgo.By("waiting for the first workload to be admitted", func() {
				util.ExpectWorkloadsToHaveQuotaReservation(ctx, k8sClient, standaloneClusterQ.Name, wl1)
			})
			ginkgo.By("checking that the second and third workloads are still pending", func() {
				util.ExpectWorkloadsToBePending(ctx, k8sClient, wl2, wl3)
			})
			ginkgo.By("finishing the eviction of the first workload", func() {
				util.FinishEvictionForWorkloads(ctx, k8sClient, wl1)
			})
			ginkgo.By("waiting for the second workload to be admitted", func() {
				util.ExpectWorkloadsToHaveQuotaReservation(ctx, k8sClient, standaloneClusterQ.Name, wl2)
			})
			// The first workload is still pending due to backoff, and the third workload is still pending due to insufficient quota.
			// To avoid flakiness, we don't verify if the workload has QuotaReserved=false with a pending reason here.
			ginkgo.By("finishing the eviction of the second workload", func() {
				util.FinishEvictionForWorkloads(ctx, k8sClient, wl2)
			})
			ginkgo.By("waiting for the first workload to be admitted since its backoff is completed, while the second and third workloads are still pending", func() {
				util.ExpectWorkloadsToHaveQuotaReservation(ctx, k8sClient, standaloneClusterQ.Name, wl1)
				// To avoid flakiness, we don't verify if the workload has QuotaReserved=false with a pending reason here.
			})
			ginkgo.By("verifying that all workloads have the proper requeue count", func() {
				util.ExpectWorkloadToHaveRequeueCount(ctx, k8sClient, client.ObjectKeyFromObject(wl1), ptr.To[int32](2))
				util.ExpectWorkloadToHaveRequeueCount(ctx, k8sClient, client.ObjectKeyFromObject(wl2), ptr.To[int32](1))
				ginkgo.By("wl3 has never been admitted", func() {
					util.ExpectWorkloadToHaveRequeueCount(ctx, k8sClient, client.ObjectKeyFromObject(wl3), nil)
				})
			})
		})
	})

})