// sigs.k8s.io/kueue@v0.6.2/test/e2e/multikueue/e2e_test.go

/*
Copyright 2023 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package mke2e

import (
	"os/exec"

	"github.com/google/go-cmp/cmp/cmpopts"
	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
	batchv1 "k8s.io/api/batch/v1"
	corev1 "k8s.io/api/core/v1"
	apimeta "k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/utils/ptr"
	"sigs.k8s.io/controller-runtime/pkg/client"
	jobset "sigs.k8s.io/jobset/api/jobset/v1alpha2"

	kueuealpha "sigs.k8s.io/kueue/apis/kueue/v1alpha1"
	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
	"sigs.k8s.io/kueue/pkg/controller/admissionchecks/multikueue"
	workloadjob "sigs.k8s.io/kueue/pkg/controller/jobs/job"
	workloadjobset "sigs.k8s.io/kueue/pkg/controller/jobs/jobset"
	utiltesting "sigs.k8s.io/kueue/pkg/util/testing"
	testingjob "sigs.k8s.io/kueue/pkg/util/testingjobs/job"
	testingjobset "sigs.k8s.io/kueue/pkg/util/testingjobs/jobset"
	"sigs.k8s.io/kueue/pkg/workload"
	"sigs.k8s.io/kueue/test/util"
)

// +kubebuilder:docs-gen:collapse=Imports

var _ = ginkgo.Describe("MultiKueue", func() {
	var (
		managerNs *corev1.Namespace
		worker1Ns *corev1.Namespace
		worker2Ns *corev1.Namespace

		workerCluster1   *kueuealpha.MultiKueueCluster
		workerCluster2   *kueuealpha.MultiKueueCluster
		multiKueueConfig *kueuealpha.MultiKueueConfig
		multiKueueAc     *kueue.AdmissionCheck
		managerFlavor    *kueue.ResourceFlavor
		managerCq        *kueue.ClusterQueue
		managerLq        *kueue.LocalQueue

		worker1Flavor *kueue.ResourceFlavor
		worker1Cq     *kueue.ClusterQueue
		worker1Lq     *kueue.LocalQueue

		worker2Flavor *kueue.ResourceFlavor
		worker2Cq     *kueue.ClusterQueue
		worker2Lq     *kueue.LocalQueue
	)

	ginkgo.BeforeEach(func() {
		managerNs = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				GenerateName: "multikueue-",
			},
		}
		gomega.Expect(k8sManagerClient.Create(ctx, managerNs)).To(gomega.Succeed())

		worker1Ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				Name: managerNs.Name,
			},
		}
		gomega.Expect(k8sWorker1Client.Create(ctx, worker1Ns)).To(gomega.Succeed())

		worker2Ns = &corev1.Namespace{
			ObjectMeta: metav1.ObjectMeta{
				Name: managerNs.Name,
			},
		}
		gomega.Expect(k8sWorker2Client.Create(ctx, worker2Ns)).To(gomega.Succeed())

		workerCluster1 = utiltesting.MakeMultiKueueCluster("worker1").KubeConfig(kueuealpha.SecretLocationType, "multikueue1").Obj()
		gomega.Expect(k8sManagerClient.Create(ctx, workerCluster1)).To(gomega.Succeed())

		workerCluster2 = utiltesting.MakeMultiKueueCluster("worker2").KubeConfig(kueuealpha.SecretLocationType, "multikueue2").Obj()
		gomega.Expect(k8sManagerClient.Create(ctx, workerCluster2)).To(gomega.Succeed())
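
		// Each MultiKueueCluster references a kubeconfig held in a Secret
		// ("multikueue1" / "multikueue2") on the manager cluster; the e2e
		// environment is assumed to have created these Secrets beforehand.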
		multiKueueConfig = utiltesting.MakeMultiKueueConfig("multikueueconfig").Clusters("worker1", "worker2").Obj()
		gomega.Expect(k8sManagerClient.Create(ctx, multiKueueConfig)).Should(gomega.Succeed())

		multiKueueAc = utiltesting.MakeAdmissionCheck("ac1").
			ControllerName(multikueue.ControllerName).
			Parameters(kueuealpha.GroupVersion.Group, "MultiKueueConfig", multiKueueConfig.Name).
			Obj()
		gomega.Expect(k8sManagerClient.Create(ctx, multiKueueAc)).Should(gomega.Succeed())

		ginkgo.By("wait for check active", func() {
			updatedAc := kueue.AdmissionCheck{}
			acKey := client.ObjectKeyFromObject(multiKueueAc)
			gomega.Eventually(func(g gomega.Gomega) {
				g.Expect(k8sManagerClient.Get(ctx, acKey, &updatedAc)).To(gomega.Succeed())
				g.Expect(apimeta.IsStatusConditionTrue(updatedAc.Status.Conditions, kueue.AdmissionCheckActive)).To(gomega.BeTrue())
			}, util.Timeout, util.Interval).Should(gomega.Succeed())
		})

		managerFlavor = utiltesting.MakeResourceFlavor("default").Obj()
		gomega.Expect(k8sManagerClient.Create(ctx, managerFlavor)).Should(gomega.Succeed())

		managerCq = utiltesting.MakeClusterQueue("q1").
			ResourceGroup(
				*utiltesting.MakeFlavorQuotas(managerFlavor.Name).
					Resource(corev1.ResourceCPU, "2").
					Resource(corev1.ResourceMemory, "2G").
					Obj(),
			).
			AdmissionChecks(multiKueueAc.Name).
			Obj()
		gomega.Expect(k8sManagerClient.Create(ctx, managerCq)).Should(gomega.Succeed())

		managerLq = utiltesting.MakeLocalQueue(managerCq.Name, managerNs.Name).ClusterQueue(managerCq.Name).Obj()
		gomega.Expect(k8sManagerClient.Create(ctx, managerLq)).Should(gomega.Succeed())

		worker1Flavor = utiltesting.MakeResourceFlavor("default").Obj()
		gomega.Expect(k8sWorker1Client.Create(ctx, worker1Flavor)).Should(gomega.Succeed())

		worker1Cq = utiltesting.MakeClusterQueue("q1").
			ResourceGroup(
				*utiltesting.MakeFlavorQuotas(worker1Flavor.Name).
					Resource(corev1.ResourceCPU, "2").
					Resource(corev1.ResourceMemory, "1G").
					Obj(),
			).
			Obj()
		gomega.Expect(k8sWorker1Client.Create(ctx, worker1Cq)).Should(gomega.Succeed())

		worker1Lq = utiltesting.MakeLocalQueue(worker1Cq.Name, worker1Ns.Name).ClusterQueue(worker1Cq.Name).Obj()
		gomega.Expect(k8sWorker1Client.Create(ctx, worker1Lq)).Should(gomega.Succeed())
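
		// The worker quotas are deliberately asymmetric: worker1 grants
		// 2 CPU / 1G while worker2 (below) grants 1 CPU / 2G. The specs request
		// 2 CPU in total, so only worker1 can admit them and the placement
		// assertions are deterministic.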
		worker2Flavor = utiltesting.MakeResourceFlavor("default").Obj()
		gomega.Expect(k8sWorker2Client.Create(ctx, worker2Flavor)).Should(gomega.Succeed())

		worker2Cq = utiltesting.MakeClusterQueue("q1").
			ResourceGroup(
				*utiltesting.MakeFlavorQuotas(worker2Flavor.Name).
					Resource(corev1.ResourceCPU, "1").
					Resource(corev1.ResourceMemory, "2G").
					Obj(),
			).
			Obj()
		gomega.Expect(k8sWorker2Client.Create(ctx, worker2Cq)).Should(gomega.Succeed())

		worker2Lq = utiltesting.MakeLocalQueue(worker2Cq.Name, worker2Ns.Name).ClusterQueue(worker2Cq.Name).Obj()
		gomega.Expect(k8sWorker2Client.Create(ctx, worker2Lq)).Should(gomega.Succeed())
	})

	ginkgo.AfterEach(func() {
		gomega.Expect(util.DeleteNamespace(ctx, k8sManagerClient, managerNs)).To(gomega.Succeed())
		gomega.Expect(util.DeleteNamespace(ctx, k8sWorker1Client, worker1Ns)).To(gomega.Succeed())
		gomega.Expect(util.DeleteNamespace(ctx, k8sWorker2Client, worker2Ns)).To(gomega.Succeed())

		util.ExpectClusterQueueToBeDeleted(ctx, k8sWorker1Client, worker1Cq, true)
		util.ExpectResourceFlavorToBeDeleted(ctx, k8sWorker1Client, worker1Flavor, true)

		util.ExpectClusterQueueToBeDeleted(ctx, k8sWorker2Client, worker2Cq, true)
		util.ExpectResourceFlavorToBeDeleted(ctx, k8sWorker2Client, worker2Flavor, true)

		util.ExpectClusterQueueToBeDeleted(ctx, k8sManagerClient, managerCq, true)
		util.ExpectResourceFlavorToBeDeleted(ctx, k8sManagerClient, managerFlavor, true)
		util.ExpectAdmissionCheckToBeDeleted(ctx, k8sManagerClient, multiKueueAc, true)
		gomega.Expect(k8sManagerClient.Delete(ctx, multiKueueConfig)).To(gomega.Succeed())
		gomega.Expect(k8sManagerClient.Delete(ctx, workerCluster1)).To(gomega.Succeed())
		gomega.Expect(k8sManagerClient.Delete(ctx, workerCluster2)).To(gomega.Succeed())
	})
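
	// The specs below cover the main MultiKueue flows: delegating a batch/v1
	// Job, delegating a JobSet, and reacting to a lost worker connection.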
	ginkgo.When("Creating a multikueue admission check", func() {
		ginkgo.It("Should run a job on worker if admitted", func() {
			// Since it requires 2 CPU, this job can only be admitted in worker 1.
			job := testingjob.MakeJob("job", managerNs.Name).
				Queue(managerLq.Name).
				Request("cpu", "2").
				Request("memory", "1G").
				Image("gcr.io/k8s-staging-perf-tests/sleep:v0.1.0", []string{"1ms"}).
				Obj()

			ginkgo.By("Creating the job", func() {
				gomega.Expect(k8sManagerClient.Create(ctx, job)).Should(gomega.Succeed())
			})

			createdLeaderWorkload := &kueue.Workload{}
			wlLookupKey := types.NamespacedName{Name: workloadjob.GetWorkloadNameForJob(job.Name), Namespace: managerNs.Name}

			// The execution should be given to the worker.
			ginkgo.By("Waiting to be admitted in worker1", func() {
				gomega.Eventually(func(g gomega.Gomega) {
					g.Expect(k8sManagerClient.Get(ctx, wlLookupKey, createdLeaderWorkload)).To(gomega.Succeed())
					g.Expect(workload.FindAdmissionCheck(createdLeaderWorkload.Status.AdmissionChecks, multiKueueAc.Name)).To(gomega.BeComparableTo(&kueue.AdmissionCheckState{
						Name:    multiKueueAc.Name,
						State:   kueue.CheckStatePending,
						Message: `The workload got reservation on "worker1"`,
					}, cmpopts.IgnoreFields(kueue.AdmissionCheckState{}, "LastTransitionTime")))
				}, util.Timeout, util.Interval).Should(gomega.Succeed())
			})

			ginkgo.By("Waiting for the job to finish", func() {
				gomega.Eventually(func(g gomega.Gomega) {
					g.Expect(k8sManagerClient.Get(ctx, wlLookupKey, createdLeaderWorkload)).To(gomega.Succeed())

					g.Expect(apimeta.FindStatusCondition(createdLeaderWorkload.Status.Conditions, kueue.WorkloadFinished)).To(gomega.BeComparableTo(&metav1.Condition{
						Type:    kueue.WorkloadFinished,
						Status:  metav1.ConditionTrue,
						Reason:  "JobFinished",
						Message: `Job finished successfully`,
					}, cmpopts.IgnoreFields(metav1.Condition{}, "LastTransitionTime")))
				}, util.LongTimeout, util.Interval).Should(gomega.Succeed())
			})
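
			// Once the remote job finishes, MultiKueue is expected to garbage
			// collect the objects it created on the workers and to copy the final
			// status onto the manager's (still suspended) Job, checked below.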
			ginkgo.By("Checking no objects are left in the worker clusters and the job is completed", func() {
				gomega.Eventually(func(g gomega.Gomega) {
					workerWl := &kueue.Workload{}
					g.Expect(k8sWorker1Client.Get(ctx, wlLookupKey, workerWl)).To(utiltesting.BeNotFoundError())
					g.Expect(k8sWorker2Client.Get(ctx, wlLookupKey, workerWl)).To(utiltesting.BeNotFoundError())
					workerJob := &batchv1.Job{}
					g.Expect(k8sWorker1Client.Get(ctx, client.ObjectKeyFromObject(job), workerJob)).To(utiltesting.BeNotFoundError())
					g.Expect(k8sWorker2Client.Get(ctx, client.ObjectKeyFromObject(job), workerJob)).To(utiltesting.BeNotFoundError())
				}, util.Timeout, util.Interval).Should(gomega.Succeed())

				createdJob := &batchv1.Job{}
				gomega.Expect(k8sManagerClient.Get(ctx, client.ObjectKeyFromObject(job), createdJob)).To(gomega.Succeed())
				gomega.Expect(ptr.Deref(createdJob.Spec.Suspend, false)).To(gomega.BeTrue())
				gomega.Expect(createdJob.Status.Conditions).To(gomega.ContainElement(gomega.BeComparableTo(
					batchv1.JobCondition{
						Type:   batchv1.JobComplete,
						Status: corev1.ConditionTrue,
					},
					cmpopts.IgnoreFields(batchv1.JobCondition{}, "LastTransitionTime", "LastProbeTime"))))
			})
		})

		ginkgo.It("Should run a jobSet on worker if admitted", func() {
			// Since it requires 2 CPU in total, this jobset can only be admitted in worker 1.
			jobSet := testingjobset.MakeJobSet("job-set", managerNs.Name).
				Queue(managerLq.Name).
				ReplicatedJobs(
					testingjobset.ReplicatedJobRequirements{
						Name:        "replicated-job-1",
						Replicas:    2,
						Parallelism: 2,
						Completions: 2,
						Image:       "gcr.io/k8s-staging-perf-tests/sleep:v0.1.0",
						// Give it time to be observed Active in the live status update step.
						Args: []string{"5s"},
					},
				).
				Request("replicated-job-1", "cpu", "500m").
				Request("replicated-job-1", "memory", "200M").
				Obj()

			ginkgo.By("Creating the jobSet", func() {
				gomega.Expect(k8sManagerClient.Create(ctx, jobSet)).Should(gomega.Succeed())
			})

			createdLeaderWorkload := &kueue.Workload{}
			wlLookupKey := types.NamespacedName{Name: workloadjobset.GetWorkloadNameForJobSet(jobSet.Name), Namespace: managerNs.Name}

			// The execution should be given to the worker.
			ginkgo.By("Waiting to be admitted in worker1 and manager", func() {
				gomega.Eventually(func(g gomega.Gomega) {
					g.Expect(k8sManagerClient.Get(ctx, wlLookupKey, createdLeaderWorkload)).To(gomega.Succeed())
					g.Expect(workload.FindAdmissionCheck(createdLeaderWorkload.Status.AdmissionChecks, multiKueueAc.Name)).To(gomega.BeComparableTo(&kueue.AdmissionCheckState{
						Name:    multiKueueAc.Name,
						State:   kueue.CheckStateReady,
						Message: `The workload got reservation on "worker1"`,
					}, cmpopts.IgnoreFields(kueue.AdmissionCheckState{}, "LastTransitionTime")))
					g.Expect(apimeta.FindStatusCondition(createdLeaderWorkload.Status.Conditions, kueue.WorkloadAdmitted)).To(gomega.BeComparableTo(&metav1.Condition{
						Type:    kueue.WorkloadAdmitted,
						Status:  metav1.ConditionTrue,
						Reason:  "Admitted",
						Message: "The workload is admitted",
					}, cmpopts.IgnoreFields(metav1.Condition{}, "LastTransitionTime")))
				}, util.Timeout, util.Interval).Should(gomega.Succeed())
			})

			ginkgo.By("Waiting for the jobSet to get status updates", func() {
				gomega.Eventually(func(g gomega.Gomega) {
					createdJobset := &jobset.JobSet{}
					g.Expect(k8sManagerClient.Get(ctx, client.ObjectKeyFromObject(jobSet), createdJobset)).To(gomega.Succeed())

					g.Expect(createdJobset.Status.ReplicatedJobsStatus).To(gomega.BeComparableTo([]jobset.ReplicatedJobStatus{
						{
							Name:   "replicated-job-1",
							Ready:  2,
							Active: 2,
						},
					}, cmpopts.IgnoreFields(jobset.ReplicatedJobStatus{}, "Succeeded", "Failed")))
				}, util.LongTimeout, util.Interval).Should(gomega.Succeed())
			})
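
			// Unlike the batch Job above, the manager's JobSet copy is expected
			// to be unsuspended once admitted, with its live status mirrored from
			// the worker; the finish and cleanup checks below build on that.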
			ginkgo.By("Waiting for the jobSet to finish", func() {
				gomega.Eventually(func(g gomega.Gomega) {
					g.Expect(k8sManagerClient.Get(ctx, wlLookupKey, createdLeaderWorkload)).To(gomega.Succeed())

					g.Expect(apimeta.FindStatusCondition(createdLeaderWorkload.Status.Conditions, kueue.WorkloadFinished)).To(gomega.BeComparableTo(&metav1.Condition{
						Type:    kueue.WorkloadFinished,
						Status:  metav1.ConditionTrue,
						Reason:  "JobSetFinished",
						Message: "JobSet finished successfully",
					}, cmpopts.IgnoreFields(metav1.Condition{}, "LastTransitionTime")))
				}, util.LongTimeout, util.Interval).Should(gomega.Succeed())
			})

			ginkgo.By("Checking no objects are left in the worker clusters and the jobSet is completed", func() {
				gomega.Eventually(func(g gomega.Gomega) {
					workerWl := &kueue.Workload{}
					g.Expect(k8sWorker1Client.Get(ctx, wlLookupKey, workerWl)).To(utiltesting.BeNotFoundError())
					g.Expect(k8sWorker2Client.Get(ctx, wlLookupKey, workerWl)).To(utiltesting.BeNotFoundError())
					workerJobSet := &jobset.JobSet{}
					g.Expect(k8sWorker1Client.Get(ctx, client.ObjectKeyFromObject(jobSet), workerJobSet)).To(utiltesting.BeNotFoundError())
					g.Expect(k8sWorker2Client.Get(ctx, client.ObjectKeyFromObject(jobSet), workerJobSet)).To(utiltesting.BeNotFoundError())
				}, util.Timeout, util.Interval).Should(gomega.Succeed())

				createdJobSet := &jobset.JobSet{}
				gomega.Expect(k8sManagerClient.Get(ctx, client.ObjectKeyFromObject(jobSet), createdJobSet)).To(gomega.Succeed())
				gomega.Expect(ptr.Deref(createdJobSet.Spec.Suspend, true)).To(gomega.BeFalse())
				gomega.Expect(createdJobSet.Status.Conditions).To(gomega.ContainElement(gomega.BeComparableTo(
					metav1.Condition{
						Type:    string(jobset.JobSetCompleted),
						Status:  metav1.ConditionTrue,
						Reason:  "AllJobsCompleted",
						Message: "jobset completed successfully",
					},
					cmpopts.IgnoreFields(metav1.Condition{}, "LastTransitionTime"))))
			})
		})
	})

	ginkgo.When("The connection to a worker cluster is unreliable", func() {
		ginkgo.It("Should update the cluster status to reflect the connection state", func() {
			ginkgo.By("Disconnecting worker1 container from the kind network", func() {
				cmd := exec.Command("docker", "network", "disconnect", "kind", "kind-worker1-control-plane")
				output, err := cmd.CombinedOutput()
				gomega.Expect(err).NotTo(gomega.HaveOccurred(), "%s: %s", err, output)
			})

			worker1ClusterKey := client.ObjectKeyFromObject(workerCluster1)

			ginkgo.By("Waiting for the cluster to become inactive", func() {
				readCluster := &kueuealpha.MultiKueueCluster{}
				gomega.Eventually(func(g gomega.Gomega) {
					g.Expect(k8sManagerClient.Get(ctx, worker1ClusterKey, readCluster)).To(gomega.Succeed())
					g.Expect(readCluster.Status.Conditions).To(gomega.ContainElement(gomega.BeComparableTo(
						metav1.Condition{
							Type:   kueuealpha.MultiKueueClusterActive,
							Status: metav1.ConditionFalse,
							Reason: "ClientConnectionFailed",
						},
						cmpopts.IgnoreFields(metav1.Condition{}, "LastTransitionTime", "Message"))))
				}, util.Timeout, util.Interval).Should(gomega.Succeed())
			})

			ginkgo.By("Reconnecting worker1 container to the kind network", func() {
				cmd := exec.Command("docker", "network", "connect", "kind", "kind-worker1-control-plane")
				output, err := cmd.CombinedOutput()
				gomega.Expect(err).NotTo(gomega.HaveOccurred(), "%s: %s", err, output)
			})

			ginkgo.By("Waiting for the cluster to become active", func() {
				readCluster := &kueuealpha.MultiKueueCluster{}
				gomega.Eventually(func(g gomega.Gomega) {
					g.Expect(k8sManagerClient.Get(ctx, worker1ClusterKey, readCluster)).To(gomega.Succeed())
					g.Expect(readCluster.Status.Conditions).To(gomega.ContainElement(gomega.BeComparableTo(
						metav1.Condition{
							Type:    kueuealpha.MultiKueueClusterActive,
							Status:  metav1.ConditionTrue,
							Reason:  "Active",
							Message: "Connected",
						},
						cmpopts.IgnoreFields(metav1.Condition{}, "LastTransitionTime"))))
				}, util.Timeout, util.Interval).Should(gomega.Succeed())
			})
		})
	})
})