sigs.k8s.io/kueue@v0.6.2/test/e2e/multikueue/e2e_test.go (about)

     1  /*
     2  Copyright 2023 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package mke2e
    18  
    19  import (
    20  	"os/exec"
    21  
    22  	"github.com/google/go-cmp/cmp/cmpopts"
    23  	"github.com/onsi/ginkgo/v2"
    24  	"github.com/onsi/gomega"
    25  	batchv1 "k8s.io/api/batch/v1"
    26  	corev1 "k8s.io/api/core/v1"
    27  	apimeta "k8s.io/apimachinery/pkg/api/meta"
    28  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    29  	"k8s.io/apimachinery/pkg/types"
    30  	"k8s.io/utils/ptr"
    31  	"sigs.k8s.io/controller-runtime/pkg/client"
    32  	jobset "sigs.k8s.io/jobset/api/jobset/v1alpha2"
    33  
    34  	kueuealpha "sigs.k8s.io/kueue/apis/kueue/v1alpha1"
    35  	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
    36  	"sigs.k8s.io/kueue/pkg/controller/admissionchecks/multikueue"
    37  	workloadjob "sigs.k8s.io/kueue/pkg/controller/jobs/job"
    38  	workloadjobset "sigs.k8s.io/kueue/pkg/controller/jobs/jobset"
    39  	utiltesting "sigs.k8s.io/kueue/pkg/util/testing"
    40  	testingjob "sigs.k8s.io/kueue/pkg/util/testingjobs/job"
    41  	testingjobset "sigs.k8s.io/kueue/pkg/util/testingjobs/jobset"
    42  	"sigs.k8s.io/kueue/pkg/workload"
    43  	"sigs.k8s.io/kueue/test/util"
    44  )
    45  
    46  // +kubebuilder:docs-gen:collapse=Imports
    47  
    48  var _ = ginkgo.Describe("MultiKueue", func() {
    49  	var (
    50  		// Shared fixtures, populated by BeforeEach and cleaned up in AfterEach.
    51  
    52  		// Namespaces, one per cluster; the workers reuse the manager's name.
    53  		managerNs, worker1Ns, worker2Ns *corev1.Namespace
    54  
    55  		// MultiKueue objects created in the manager cluster.
    56  		workerCluster1, workerCluster2 *kueuealpha.MultiKueueCluster
    57  		multiKueueConfig               *kueuealpha.MultiKueueConfig
    58  		multiKueueAc                   *kueue.AdmissionCheck
    59  
    60  		// Flavor and queues of the manager cluster.
    61  		managerFlavor *kueue.ResourceFlavor
    62  		managerCq     *kueue.ClusterQueue
    63  		managerLq     *kueue.LocalQueue
    64  
    65  		// Flavors and queues of the two worker clusters.
    66  		worker1Flavor, worker2Flavor *kueue.ResourceFlavor
    67  		worker1Cq, worker2Cq         *kueue.ClusterQueue
    68  		worker1Lq, worker2Lq         *kueue.LocalQueue
    69  	)
    70  
    71  	ginkgo.BeforeEach(func() {
    72  		// The manager namespace gets a generated name; both workers mirror it so
    73  		// that objects can be looked up with the same key in every cluster.
    74  		managerNs = &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{GenerateName: "multikueue-"}}
    75  		gomega.Expect(k8sManagerClient.Create(ctx, managerNs)).To(gomega.Succeed())
    76  
    77  		worker1Ns = &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: managerNs.Name}}
    78  		gomega.Expect(k8sWorker1Client.Create(ctx, worker1Ns)).To(gomega.Succeed())
    79  
    80  		worker2Ns = &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: managerNs.Name}}
    81  		gomega.Expect(k8sWorker2Client.Create(ctx, worker2Ns)).To(gomega.Succeed())
    82  
    83  		// Register both workers in the manager, then group them in one config.
    84  		workerCluster1 = utiltesting.MakeMultiKueueCluster("worker1").
    85  			KubeConfig(kueuealpha.SecretLocationType, "multikueue1").
    86  			Obj()
    87  		gomega.Expect(k8sManagerClient.Create(ctx, workerCluster1)).To(gomega.Succeed())
    88  
    89  		workerCluster2 = utiltesting.MakeMultiKueueCluster("worker2").
    90  			KubeConfig(kueuealpha.SecretLocationType, "multikueue2").
    91  			Obj()
    92  		gomega.Expect(k8sManagerClient.Create(ctx, workerCluster2)).To(gomega.Succeed())
    93  
    94  		multiKueueConfig = utiltesting.MakeMultiKueueConfig("multikueueconfig").
    95  			Clusters("worker1", "worker2").
    96  			Obj()
    97  		gomega.Expect(k8sManagerClient.Create(ctx, multiKueueConfig)).To(gomega.Succeed())
    98  
    99  		// The admission check names the MultiKueue controller and points at the config above.
   100  		acParamsGroup := kueuealpha.GroupVersion.Group
   101  		multiKueueAc = utiltesting.MakeAdmissionCheck("ac1").
   102  			ControllerName(multikueue.ControllerName).
   103  			Parameters(acParamsGroup, "MultiKueueConfig", multiKueueConfig.Name).
   104  			Obj()
   105  		gomega.Expect(k8sManagerClient.Create(ctx, multiKueueAc)).To(gomega.Succeed())
   106  
   107  		ginkgo.By("wait for check active", func() {
   108  			observedAc := kueue.AdmissionCheck{}
   109  			key := client.ObjectKeyFromObject(multiKueueAc)
   110  			gomega.Eventually(func(g gomega.Gomega) {
   111  				g.Expect(k8sManagerClient.Get(ctx, key, &observedAc)).To(gomega.Succeed())
   112  				active := apimeta.IsStatusConditionTrue(observedAc.Status.Conditions, kueue.AdmissionCheckActive)
   113  				g.Expect(active).To(gomega.BeTrue())
   114  			}, util.Timeout, util.Interval).Should(gomega.Succeed())
   115  		})
   116  
   117  		// Manager: 2 CPU / 2G, with the cluster queue guarded by the admission check.
   118  		managerFlavor = utiltesting.MakeResourceFlavor("default").Obj()
   119  		gomega.Expect(k8sManagerClient.Create(ctx, managerFlavor)).To(gomega.Succeed())
   120  
   121  		managerQuotas := utiltesting.MakeFlavorQuotas(managerFlavor.Name).
   122  			Resource(corev1.ResourceCPU, "2").
   123  			Resource(corev1.ResourceMemory, "2G").
   124  			Obj()
   125  		managerCq = utiltesting.MakeClusterQueue("q1").
   126  			ResourceGroup(*managerQuotas).
   127  			AdmissionChecks(multiKueueAc.Name).
   128  			Obj()
   129  		gomega.Expect(k8sManagerClient.Create(ctx, managerCq)).To(gomega.Succeed())
   130  
   131  		managerLq = utiltesting.MakeLocalQueue(managerCq.Name, managerNs.Name).ClusterQueue(managerCq.Name).Obj()
   132  		gomega.Expect(k8sManagerClient.Create(ctx, managerLq)).To(gomega.Succeed())
   133  
   134  		// Worker 1: 2 CPU / 1G.
   135  		worker1Flavor = utiltesting.MakeResourceFlavor("default").Obj()
   136  		gomega.Expect(k8sWorker1Client.Create(ctx, worker1Flavor)).To(gomega.Succeed())
   137  
   138  		worker1Quotas := utiltesting.MakeFlavorQuotas(worker1Flavor.Name).
   139  			Resource(corev1.ResourceCPU, "2").
   140  			Resource(corev1.ResourceMemory, "1G").
   141  			Obj()
   142  		worker1Cq = utiltesting.MakeClusterQueue("q1").
   143  			ResourceGroup(*worker1Quotas).
   144  			Obj()
   145  		gomega.Expect(k8sWorker1Client.Create(ctx, worker1Cq)).To(gomega.Succeed())
   146  
   147  		worker1Lq = utiltesting.MakeLocalQueue(worker1Cq.Name, worker1Ns.Name).ClusterQueue(worker1Cq.Name).Obj()
   148  		gomega.Expect(k8sWorker1Client.Create(ctx, worker1Lq)).To(gomega.Succeed())
   149  
   150  		// Worker 2: 1 CPU / 2G.
   151  		worker2Flavor = utiltesting.MakeResourceFlavor("default").Obj()
   152  		gomega.Expect(k8sWorker2Client.Create(ctx, worker2Flavor)).To(gomega.Succeed())
   153  
   154  		worker2Quotas := utiltesting.MakeFlavorQuotas(worker2Flavor.Name).
   155  			Resource(corev1.ResourceCPU, "1").
   156  			Resource(corev1.ResourceMemory, "2G").
   157  			Obj()
   158  		worker2Cq = utiltesting.MakeClusterQueue("q1").
   159  			ResourceGroup(*worker2Quotas).
   160  			Obj()
   161  		gomega.Expect(k8sWorker2Client.Create(ctx, worker2Cq)).To(gomega.Succeed())
   162  
   163  		worker2Lq = utiltesting.MakeLocalQueue(worker2Cq.Name, worker2Ns.Name).ClusterQueue(worker2Cq.Name).Obj()
   164  		gomega.Expect(k8sWorker2Client.Create(ctx, worker2Lq)).To(gomega.Succeed())
   165  	})
   166  
   167  	ginkgo.AfterEach(func() {
   168  		// Namespaces go first: manager, then worker 1, then worker 2.
   169  		gomega.Expect(util.DeleteNamespace(ctx, k8sManagerClient, managerNs)).Should(gomega.Succeed())
   170  		gomega.Expect(util.DeleteNamespace(ctx, k8sWorker1Client, worker1Ns)).Should(gomega.Succeed())
   171  		gomega.Expect(util.DeleteNamespace(ctx, k8sWorker2Client, worker2Ns)).Should(gomega.Succeed())
   172  		// Cluster queue and flavor of each worker.
   173  		util.ExpectClusterQueueToBeDeleted(ctx, k8sWorker1Client, worker1Cq, true)
   174  		util.ExpectResourceFlavorToBeDeleted(ctx, k8sWorker1Client, worker1Flavor, true)
   175  		util.ExpectClusterQueueToBeDeleted(ctx, k8sWorker2Client, worker2Cq, true)
   176  		util.ExpectResourceFlavorToBeDeleted(ctx, k8sWorker2Client, worker2Flavor, true)
   177  		// Manager objects: queue, flavor, admission check, then the MultiKueue config and clusters.
   178  		util.ExpectClusterQueueToBeDeleted(ctx, k8sManagerClient, managerCq, true)
   179  		util.ExpectResourceFlavorToBeDeleted(ctx, k8sManagerClient, managerFlavor, true)
   180  		util.ExpectAdmissionCheckToBeDeleted(ctx, k8sManagerClient, multiKueueAc, true)
   181  		for _, obj := range []client.Object{multiKueueConfig, workerCluster1, workerCluster2} {
   182  			gomega.Expect(k8sManagerClient.Delete(ctx, obj)).Should(gomega.Succeed())
   183  		}
   184  	})
   185  
   186  	ginkgo.When("Creating a multikueue admission check", func() {
   187  		ginkgo.It("Should run a job on worker if admitted", func() {
   188  			// The job asks for 2 CPU, which only worker 1 can provide.
   189  			job := testingjob.MakeJob("job", managerNs.Name).
   190  				Queue(managerLq.Name).
   191  				Request("cpu", "2").
   192  				Request("memory", "1G").
   193  				Image("gcr.io/k8s-staging-perf-tests/sleep:v0.1.0", []string{"1ms"}).
   194  				Obj()
   195  
   196  			ginkgo.By("Creating the job", func() {
   197  				gomega.Expect(k8sManagerClient.Create(ctx, job)).To(gomega.Succeed())
   198  			})
   199  
   200  			jobKey := client.ObjectKeyFromObject(job)
   201  			managerWl := &kueue.Workload{}
   202  			wlKey := types.NamespacedName{Namespace: managerNs.Name, Name: workloadjob.GetWorkloadNameForJob(job.Name)}
   203  
   204  			// The execution is expected to be handed over to worker1.
   205  			ginkgo.By("Waiting to be admitted in worker1", func() {
   206  				wantCheck := &kueue.AdmissionCheckState{
   207  					Name:    multiKueueAc.Name,
   208  					State:   kueue.CheckStatePending,
   209  					Message: `The workload got reservation on "worker1"`,
   210  				}
   211  				gomega.Eventually(func(g gomega.Gomega) {
   212  					g.Expect(k8sManagerClient.Get(ctx, wlKey, managerWl)).To(gomega.Succeed())
   213  					gotCheck := workload.FindAdmissionCheck(managerWl.Status.AdmissionChecks, multiKueueAc.Name)
   214  					g.Expect(gotCheck).To(gomega.BeComparableTo(wantCheck, cmpopts.IgnoreFields(kueue.AdmissionCheckState{}, "LastTransitionTime")))
   215  				}, util.Timeout, util.Interval).Should(gomega.Succeed())
   216  			})
   217  
   218  			ginkgo.By("Waiting for the job to finish", func() {
   219  				wantFinished := &metav1.Condition{
   220  					Type:    kueue.WorkloadFinished,
   221  					Status:  metav1.ConditionTrue,
   222  					Reason:  "JobFinished",
   223  					Message: `Job finished successfully`,
   224  				}
   225  				gomega.Eventually(func(g gomega.Gomega) {
   226  					g.Expect(k8sManagerClient.Get(ctx, wlKey, managerWl)).To(gomega.Succeed())
   227  					gotFinished := apimeta.FindStatusCondition(managerWl.Status.Conditions, kueue.WorkloadFinished)
   228  					g.Expect(gotFinished).To(gomega.BeComparableTo(wantFinished, cmpopts.IgnoreFields(metav1.Condition{}, "LastTransitionTime")))
   229  				}, util.LongTimeout, util.Interval).Should(gomega.Succeed())
   230  			})
   231  
   232  			ginkgo.By("Checking no objects are left in the worker clusters and the job is completed", func() {
   233  				gomega.Eventually(func(g gomega.Gomega) {
   234  					remoteWl := &kueue.Workload{}
   235  					g.Expect(k8sWorker1Client.Get(ctx, wlKey, remoteWl)).To(utiltesting.BeNotFoundError())
   236  					g.Expect(k8sWorker2Client.Get(ctx, wlKey, remoteWl)).To(utiltesting.BeNotFoundError())
   237  					remoteJob := &batchv1.Job{}
   238  					g.Expect(k8sWorker1Client.Get(ctx, jobKey, remoteJob)).To(utiltesting.BeNotFoundError())
   239  					g.Expect(k8sWorker2Client.Get(ctx, jobKey, remoteJob)).To(utiltesting.BeNotFoundError())
   240  				}, util.Timeout, util.Interval).Should(gomega.Succeed())
   241  
   242  				managerJob := &batchv1.Job{}
   243  				gomega.Expect(k8sManagerClient.Get(ctx, jobKey, managerJob)).To(gomega.Succeed())
   244  				gomega.Expect(ptr.Deref(managerJob.Spec.Suspend, false)).To(gomega.BeTrue())
   245  				completeMatcher := gomega.BeComparableTo(batchv1.JobCondition{Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, cmpopts.IgnoreFields(batchv1.JobCondition{}, "LastTransitionTime", "LastProbeTime"))
   246  				gomega.Expect(managerJob.Status.Conditions).To(gomega.ContainElement(completeMatcher))
   247  			})
   248  		})
   249  		ginkgo.It("Should run a jobSet on worker if admitted", func() {
   250  			// The jobset needs 2 CPU in total, which only worker 1 can provide.
   251  			replicatedJob := testingjobset.ReplicatedJobRequirements{
   252  				Name:        "replicated-job-1",
   253  				Replicas:    2,
   254  				Parallelism: 2,
   255  				Completions: 2,
   256  				Image:       "gcr.io/k8s-staging-perf-tests/sleep:v0.1.0",
   257  				// Run long enough to be observed Active in the live status update step.
   258  				Args: []string{"5s"},
   259  			}
   260  			jobSet := testingjobset.MakeJobSet("job-set", managerNs.Name).
   261  				Queue(managerLq.Name).
   262  				ReplicatedJobs(replicatedJob).
   263  				Request("replicated-job-1", "cpu", "500m").
   264  				Request("replicated-job-1", "memory", "200M").
   265  				Obj()
   266  
   267  			ginkgo.By("Creating the jobSet", func() {
   268  				gomega.Expect(k8sManagerClient.Create(ctx, jobSet)).To(gomega.Succeed())
   269  			})
   270  
   271  			jobSetKey := client.ObjectKeyFromObject(jobSet)
   272  			managerWl := &kueue.Workload{}
   273  			wlKey := types.NamespacedName{Namespace: managerNs.Name, Name: workloadjobset.GetWorkloadNameForJobSet(jobSet.Name)}
   274  			condIgnoreTime := cmpopts.IgnoreFields(metav1.Condition{}, "LastTransitionTime")
   275  
   276  			// The execution is expected to be handed over to worker1.
   277  			ginkgo.By("Waiting to be admitted in worker1 and manager", func() {
   278  				wantCheck := &kueue.AdmissionCheckState{
   279  					Name:    multiKueueAc.Name,
   280  					State:   kueue.CheckStateReady,
   281  					Message: `The workload got reservation on "worker1"`,
   282  				}
   283  				wantAdmitted := &metav1.Condition{
   284  					Type:    kueue.WorkloadAdmitted,
   285  					Status:  metav1.ConditionTrue,
   286  					Reason:  "Admitted",
   287  					Message: "The workload is admitted",
   288  				}
   289  				gomega.Eventually(func(g gomega.Gomega) {
   290  					g.Expect(k8sManagerClient.Get(ctx, wlKey, managerWl)).To(gomega.Succeed())
   291  					gotCheck := workload.FindAdmissionCheck(managerWl.Status.AdmissionChecks, multiKueueAc.Name)
   292  					g.Expect(gotCheck).To(gomega.BeComparableTo(wantCheck, cmpopts.IgnoreFields(kueue.AdmissionCheckState{}, "LastTransitionTime")))
   293  					gotAdmitted := apimeta.FindStatusCondition(managerWl.Status.Conditions, kueue.WorkloadAdmitted)
   294  					g.Expect(gotAdmitted).To(gomega.BeComparableTo(wantAdmitted, condIgnoreTime))
   295  				}, util.Timeout, util.Interval).Should(gomega.Succeed())
   296  			})
   297  
   298  			ginkgo.By("Waiting for the jobSet to get status updates", func() {
   299  				wantStatus := []jobset.ReplicatedJobStatus{{Name: "replicated-job-1", Ready: 2, Active: 2}}
   300  				ignoreOutcome := cmpopts.IgnoreFields(jobset.ReplicatedJobStatus{}, "Succeeded", "Failed")
   301  				gomega.Eventually(func(g gomega.Gomega) {
   302  					managerJobSet := &jobset.JobSet{}
   303  					g.Expect(k8sManagerClient.Get(ctx, jobSetKey, managerJobSet)).To(gomega.Succeed())
   304  					g.Expect(managerJobSet.Status.ReplicatedJobsStatus).To(gomega.BeComparableTo(wantStatus, ignoreOutcome))
   305  				}, util.LongTimeout, util.Interval).Should(gomega.Succeed())
   306  			})
   307  
   308  			ginkgo.By("Waiting for the jobSet to finish", func() {
   309  				wantFinished := &metav1.Condition{
   310  					Type:    kueue.WorkloadFinished,
   311  					Status:  metav1.ConditionTrue,
   312  					Reason:  "JobSetFinished",
   313  					Message: "JobSet finished successfully",
   314  				}
   315  				gomega.Eventually(func(g gomega.Gomega) {
   316  					g.Expect(k8sManagerClient.Get(ctx, wlKey, managerWl)).To(gomega.Succeed())
   317  					gotFinished := apimeta.FindStatusCondition(managerWl.Status.Conditions, kueue.WorkloadFinished)
   318  					g.Expect(gotFinished).To(gomega.BeComparableTo(wantFinished, condIgnoreTime))
   319  				}, util.LongTimeout, util.Interval).Should(gomega.Succeed())
   320  			})
   321  
   322  			ginkgo.By("Checking no objects are left in the worker clusters and the jobSet is completed", func() {
   323  				gomega.Eventually(func(g gomega.Gomega) {
   324  					remoteWl := &kueue.Workload{}
   325  					g.Expect(k8sWorker1Client.Get(ctx, wlKey, remoteWl)).To(utiltesting.BeNotFoundError())
   326  					g.Expect(k8sWorker2Client.Get(ctx, wlKey, remoteWl)).To(utiltesting.BeNotFoundError())
   327  					remoteJobSet := &jobset.JobSet{}
   328  					g.Expect(k8sWorker1Client.Get(ctx, jobSetKey, remoteJobSet)).To(utiltesting.BeNotFoundError())
   329  					g.Expect(k8sWorker2Client.Get(ctx, jobSetKey, remoteJobSet)).To(utiltesting.BeNotFoundError())
   330  				}, util.Timeout, util.Interval).Should(gomega.Succeed())
   331  
   332  				managerJobSet := &jobset.JobSet{}
   333  				gomega.Expect(k8sManagerClient.Get(ctx, jobSetKey, managerJobSet)).To(gomega.Succeed())
   334  				gomega.Expect(ptr.Deref(managerJobSet.Spec.Suspend, true)).To(gomega.BeFalse())
   335  				wantCompleted := metav1.Condition{
   336  					Type:    string(jobset.JobSetCompleted),
   337  					Status:  metav1.ConditionTrue,
   338  					Reason:  "AllJobsCompleted",
   339  					Message: "jobset completed successfully",
   340  				}
   341  				gomega.Expect(managerJobSet.Status.Conditions).To(gomega.ContainElement(gomega.BeComparableTo(wantCompleted, condIgnoreTime)))
   342  			})
   343  		})
   344  	})
   345  	ginkgo.When("The connection to a worker cluster is unreliable", func() {
   346  		ginkgo.It("Should update the cluster status to reflect the connection state", func() {
   347  			worker1ClusterKey := client.ObjectKeyFromObject(workerCluster1)
   348  			// Safety net: a failed assertion unwinds this function, so re-attach worker1
   349  			// here if it is still detached; otherwise later specs inherit a cut-off cluster.
   350  			worker1Detached := false
   351  			defer func() {
   352  				if !worker1Detached {
   353  					return
   354  				}
   355  				if out, err := exec.Command("docker", "network", "connect", "kind", "kind-worker1-control-plane").CombinedOutput(); err != nil {
   356  					ginkgo.GinkgoWriter.Printf("unable to reconnect worker1 to the kind network: %v: %s\n", err, out)
   357  				}
   358  			}()
   359  
   360  			ginkgo.By("Disconnecting worker1 container from the kind network", func() {
   361  				output, err := exec.Command("docker", "network", "disconnect", "kind", "kind-worker1-control-plane").CombinedOutput()
   362  				gomega.Expect(err).NotTo(gomega.HaveOccurred(), "%s: %s", err, output)
   363  				worker1Detached = true
   364  			})
   365  
   366  			ginkgo.By("Waiting for the cluster do become inactive", func() {
   367  				readClient := &kueuealpha.MultiKueueCluster{}
   368  				wantInactive := metav1.Condition{Type: kueuealpha.MultiKueueClusterActive, Status: metav1.ConditionFalse, Reason: "ClientConnectionFailed"}
   369  				gomega.Eventually(func(g gomega.Gomega) {
   370  					g.Expect(k8sManagerClient.Get(ctx, worker1ClusterKey, readClient)).To(gomega.Succeed())
   371  					g.Expect(readClient.Status.Conditions).To(gomega.ContainElement(gomega.BeComparableTo(wantInactive, cmpopts.IgnoreFields(metav1.Condition{}, "LastTransitionTime", "Message"))))
   372  				}, util.Timeout, util.Interval).Should(gomega.Succeed())
   373  			})
   374  
   375  			ginkgo.By("Reconnecting worker1 container to the kind network", func() {
   376  				output, err := exec.Command("docker", "network", "connect", "kind", "kind-worker1-control-plane").CombinedOutput()
   377  				gomega.Expect(err).NotTo(gomega.HaveOccurred(), "%s: %s", err, output)
   378  				worker1Detached = false
   379  			})
   380  
   381  			ginkgo.By("Waiting for the cluster do become active", func() {
   382  				readClient := &kueuealpha.MultiKueueCluster{}
   383  				wantActive := metav1.Condition{Type: kueuealpha.MultiKueueClusterActive, Status: metav1.ConditionTrue, Reason: "Active", Message: "Connected"}
   384  				gomega.Eventually(func(g gomega.Gomega) {
   385  					g.Expect(k8sManagerClient.Get(ctx, worker1ClusterKey, readClient)).To(gomega.Succeed())
   386  					g.Expect(readClient.Status.Conditions).To(gomega.ContainElement(gomega.BeComparableTo(wantActive, cmpopts.IgnoreFields(metav1.Condition{}, "LastTransitionTime"))))
   387  				}, util.Timeout, util.Interval).Should(gomega.Succeed())
   388  			})
   389  		})
   390  	})
   391  })