sigs.k8s.io/kueue@v0.6.2/pkg/controller/jobs/raycluster/raycluster_controller_test.go (about)

     1  /*
     2  Copyright 2024 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package raycluster
    18  
    19  import (
    20  	"testing"
    21  
    22  	"github.com/google/go-cmp/cmp"
    23  	"github.com/google/go-cmp/cmp/cmpopts"
    24  	rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
    25  	corev1 "k8s.io/api/core/v1"
    26  	"k8s.io/apimachinery/pkg/api/resource"
    27  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    28  	"k8s.io/client-go/tools/record"
    29  	ctrl "sigs.k8s.io/controller-runtime"
    30  	"sigs.k8s.io/controller-runtime/pkg/client"
    31  	"sigs.k8s.io/controller-runtime/pkg/reconcile"
    32  
    33  	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
    34  	"sigs.k8s.io/kueue/pkg/controller/jobframework"
    35  	"sigs.k8s.io/kueue/pkg/podset"
    36  	utiltesting "sigs.k8s.io/kueue/pkg/util/testing"
    37  	testingrayutil "sigs.k8s.io/kueue/pkg/util/testingjobs/raycluster"
    38  )
    39  
    40  var (
    41  	jobCmpOpts = cmp.Options{
    42  		cmpopts.EquateEmpty(),
    43  		cmpopts.IgnoreFields(rayv1.RayCluster{}, "TypeMeta", "ObjectMeta"),
    44  	}
    45  	workloadCmpOpts = cmp.Options{
    46  		cmpopts.EquateEmpty(),
    47  		cmpopts.IgnoreFields(kueue.Workload{}, "TypeMeta", "ObjectMeta"),
    48  		cmpopts.IgnoreFields(kueue.WorkloadSpec{}, "Priority"),
    49  		cmpopts.IgnoreFields(metav1.Condition{}, "LastTransitionTime"),
    50  		cmpopts.IgnoreFields(kueue.PodSet{}, "Template"),
    51  	}
    52  )
    53  
    54  func TestReconciler(t *testing.T) {
    55  	baseJobWrapper := testingrayutil.MakeCluster("job", "ns").
    56  		Suspend(true).
    57  		Queue("foo").
    58  		RequestHead(corev1.ResourceCPU, "10").
    59  		RequestWorkerGroup(corev1.ResourceCPU, "10")
    60  
    61  	cases := map[string]struct {
    62  		reconcilerOptions []jobframework.Option
    63  		job               rayv1.RayCluster
    64  		initObjects       []client.Object
    65  		workloads         []kueue.Workload
    66  		priorityClasses   []client.Object
    67  		wantJob           rayv1.RayCluster
    68  		wantWorkloads     []kueue.Workload
    69  		runInfo           []podset.PodSetInfo
    70  		wantErr           error
    71  	}{
    72  		"when workload is admitted, cluster is unsuspended": {
    73  			initObjects: []client.Object{
    74  				utiltesting.MakeResourceFlavor("unit-test-flavor").Label("kubernetes.io/arch", "arm64").Obj(),
    75  			},
    76  			job: *baseJobWrapper.Clone().
    77  				Obj(),
    78  			wantJob: *baseJobWrapper.Clone().
    79  				Suspend(false).
    80  				NodeSelectorHeadGroup("kubernetes.io/arch", "arm64").
    81  				Obj(),
    82  			workloads: []kueue.Workload{
    83  				*utiltesting.MakeWorkload("test", "ns").
    84  					Finalizers(kueue.ResourceInUseFinalizerName).
    85  					PodSets(
    86  						kueue.PodSet{
    87  							Name:  "head",
    88  							Count: int32(1),
    89  							Template: corev1.PodTemplateSpec{
    90  								Spec: corev1.PodSpec{
    91  
    92  									RestartPolicy: corev1.RestartPolicyNever,
    93  									Containers: []corev1.Container{
    94  										{
    95  											Name: "head-container",
    96  											Resources: corev1.ResourceRequirements{
    97  												Requests: make(corev1.ResourceList),
    98  											},
    99  										},
   100  									},
   101  								},
   102  							},
   103  						},
   104  						kueue.PodSet{
   105  							Name:  "workers-group-0",
   106  							Count: int32(1),
   107  							Template: corev1.PodTemplateSpec{
   108  								Spec: corev1.PodSpec{
   109  									RestartPolicy: corev1.RestartPolicyNever,
   110  
   111  									Containers: []corev1.Container{
   112  										{
   113  											Name: "worker-container",
   114  											Resources: corev1.ResourceRequirements{
   115  												Requests: corev1.ResourceList{
   116  													corev1.ResourceCPU: resource.MustParse("10"),
   117  												},
   118  											},
   119  										},
   120  									},
   121  								},
   122  							},
   123  						}).
   124  					Request(corev1.ResourceCPU, "10").
   125  					ReserveQuota(
   126  						utiltesting.MakeAdmission("cq", "head", "workers-group-0").
   127  							Assignment(corev1.ResourceCPU, "unit-test-flavor", "1").
   128  							AssignmentPodCount(1).
   129  							Obj(),
   130  					).
   131  					Admitted(true).
   132  					AdmissionCheck(kueue.AdmissionCheckState{
   133  						Name:  "check",
   134  						State: kueue.CheckStateReady,
   135  						PodSetUpdates: []kueue.PodSetUpdate{
   136  							{
   137  								Name: "head",
   138  							},
   139  							{
   140  								Name: "workers-group-0",
   141  							},
   142  						},
   143  					}).
   144  					Obj(),
   145  			},
   146  			wantWorkloads: []kueue.Workload{
   147  				*utiltesting.MakeWorkload("a", "ns").Finalizers(kueue.ResourceInUseFinalizerName).
   148  					Finalizers(kueue.ResourceInUseFinalizerName).
   149  					PodSets(kueue.PodSet{
   150  						Name:  "head",
   151  						Count: int32(1),
   152  						Template: corev1.PodTemplateSpec{
   153  							Spec: corev1.PodSpec{
   154  								RestartPolicy: corev1.RestartPolicyNever,
   155  								Containers: []corev1.Container{
   156  									{
   157  										Name: "head-container",
   158  										Resources: corev1.ResourceRequirements{
   159  											Requests: make(corev1.ResourceList),
   160  										},
   161  									},
   162  								},
   163  							},
   164  						},
   165  					},
   166  						kueue.PodSet{
   167  							Name:  "workers-group-0",
   168  							Count: int32(1),
   169  							Template: corev1.PodTemplateSpec{
   170  								Spec: corev1.PodSpec{
   171  									RestartPolicy: corev1.RestartPolicyNever,
   172  									Containers: []corev1.Container{
   173  										{
   174  											Name: "worker-container",
   175  											Resources: corev1.ResourceRequirements{
   176  												Requests: make(corev1.ResourceList),
   177  											},
   178  										},
   179  									},
   180  								},
   181  							},
   182  						}).
   183  					ReserveQuota(
   184  						utiltesting.MakeAdmission("cq", "head", "workers-group-0").
   185  							Assignment(corev1.ResourceCPU, "unit-test-flavor", "1").
   186  							AssignmentPodCount(1).
   187  							Obj(),
   188  					).
   189  					Admitted(true).
   190  					AdmissionCheck(kueue.AdmissionCheckState{
   191  						Name:  "check",
   192  						State: kueue.CheckStateReady,
   193  						PodSetUpdates: []kueue.PodSetUpdate{
   194  							{
   195  								Name: "head",
   196  							},
   197  							{
   198  								Name: "workers-group-0",
   199  							},
   200  						},
   201  					}).
   202  					Obj(),
   203  			},
   204  		},
   205  		"when workload is admitted but workload's conditions is Evicted, suspend it and restore node selector": {
   206  			initObjects: []client.Object{
   207  				utiltesting.MakeResourceFlavor("unit-test-flavor").Label("kubernetes.io/arch", "arm64").Obj(),
   208  			},
   209  			job: *baseJobWrapper.Clone().
   210  				Suspend(false).
   211  				NodeSelectorHeadGroup("kubernetes.io/arch", "arm64").
   212  				Obj(),
   213  			wantJob: *baseJobWrapper.Clone().
   214  				Suspend(true).
   215  				Obj(),
   216  			workloads: []kueue.Workload{
   217  				*utiltesting.MakeWorkload("test", "ns").
   218  					Finalizers(kueue.ResourceInUseFinalizerName).
   219  					PodSets(
   220  						kueue.PodSet{
   221  							Name:  "head",
   222  							Count: int32(1),
   223  							Template: corev1.PodTemplateSpec{
   224  								Spec: corev1.PodSpec{
   225  									RestartPolicy: corev1.RestartPolicyNever,
   226  									Containers: []corev1.Container{
   227  										{
   228  											Name: "head-container",
   229  											Resources: corev1.ResourceRequirements{
   230  												Requests: make(corev1.ResourceList),
   231  											},
   232  										},
   233  									},
   234  								},
   235  							},
   236  						},
   237  						kueue.PodSet{
   238  							Name:  "workers-group-0",
   239  							Count: int32(1),
   240  							Template: corev1.PodTemplateSpec{
   241  								Spec: corev1.PodSpec{
   242  									RestartPolicy: corev1.RestartPolicyNever,
   243  									Containers: []corev1.Container{
   244  										{
   245  											Name: "worker-container",
   246  											Resources: corev1.ResourceRequirements{
   247  												Requests: corev1.ResourceList{
   248  													corev1.ResourceCPU: resource.MustParse("10"),
   249  												},
   250  											},
   251  										},
   252  									},
   253  								},
   254  							},
   255  						},
   256  					).
   257  					Request(corev1.ResourceCPU, "10").
   258  					ReserveQuota(utiltesting.MakeAdmission("cq", "head", "workers-group-0").AssignmentPodCount(1).Obj()).
   259  					Condition(metav1.Condition{
   260  						Type:   kueue.WorkloadEvicted,
   261  						Status: metav1.ConditionTrue,
   262  					}).
   263  					Admitted(true).
   264  					Obj(),
   265  			},
   266  			wantWorkloads: []kueue.Workload{
   267  				*utiltesting.MakeWorkload("a", "ns").Finalizers(kueue.ResourceInUseFinalizerName).
   268  					PodSets(
   269  						kueue.PodSet{
   270  							Name:  "head",
   271  							Count: int32(1),
   272  							Template: corev1.PodTemplateSpec{
   273  								Spec: corev1.PodSpec{
   274  									RestartPolicy: corev1.RestartPolicyNever,
   275  									Containers: []corev1.Container{
   276  										{
   277  											Name: "head-container",
   278  											Resources: corev1.ResourceRequirements{
   279  												Requests: make(corev1.ResourceList),
   280  											},
   281  										},
   282  									},
   283  								},
   284  							},
   285  						},
   286  						kueue.PodSet{
   287  							Name:  "workers-group-0",
   288  							Count: int32(1),
   289  							Template: corev1.PodTemplateSpec{
   290  								Spec: corev1.PodSpec{
   291  									RestartPolicy: corev1.RestartPolicyNever,
   292  									Containers: []corev1.Container{
   293  										{
   294  											Name: "worker-container",
   295  											Resources: corev1.ResourceRequirements{
   296  												Requests: make(corev1.ResourceList),
   297  											},
   298  										},
   299  									},
   300  								},
   301  							},
   302  						}).
   303  					ReserveQuota(utiltesting.MakeAdmission("cq", "head", "workers-group-0").AssignmentPodCount(1).Obj()).
   304  					Condition(metav1.Condition{
   305  						Type:   kueue.WorkloadEvicted,
   306  						Status: metav1.ConditionTrue,
   307  					}).
   308  					Condition(metav1.Condition{
   309  						Type:   kueue.WorkloadQuotaReserved,
   310  						Status: metav1.ConditionFalse,
   311  						Reason: "Pending",
   312  					}).
   313  					Admitted(true).
   314  					Condition(metav1.Condition{
   315  						Type:    kueue.WorkloadAdmitted,
   316  						Status:  metav1.ConditionFalse,
   317  						Reason:  "NoReservation",
   318  						Message: "The workload has no reservation",
   319  					}).
   320  					Obj(),
   321  			},
   322  		},
   323  	}
   324  	for name, tc := range cases {
   325  		t.Run(name, func(t *testing.T) {
   326  
   327  			ctx, _ := utiltesting.ContextWithLog(t)
   328  			clientBuilder := utiltesting.NewClientBuilder(rayv1.AddToScheme)
   329  
   330  			if err := SetupIndexes(ctx, utiltesting.AsIndexer(clientBuilder)); err != nil {
   331  				t.Fatalf("Could not setup indexes: %v", err)
   332  			}
   333  			objs := append(tc.priorityClasses, &tc.job)
   334  			kcBuilder := clientBuilder.WithObjects(objs...)
   335  
   336  			for i := range tc.workloads {
   337  				kcBuilder = kcBuilder.WithStatusSubresource(&tc.workloads[i])
   338  			}
   339  
   340  			kcBuilder = clientBuilder.WithObjects(tc.initObjects...)
   341  
   342  			kClient := kcBuilder.Build()
   343  			for i := range tc.workloads {
   344  				if err := ctrl.SetControllerReference(&tc.job, &tc.workloads[i], kClient.Scheme()); err != nil {
   345  					t.Fatalf("Could not setup owner reference in Workloads: %v", err)
   346  				}
   347  				if err := kClient.Create(ctx, &tc.workloads[i]); err != nil {
   348  					t.Fatalf("Could not create workload: %v", err)
   349  				}
   350  			}
   351  			recorder := record.NewBroadcaster().NewRecorder(kClient.Scheme(), corev1.EventSource{Component: "test"})
   352  			reconciler := NewReconciler(kClient, recorder, tc.reconcilerOptions...)
   353  
   354  			jobKey := client.ObjectKeyFromObject(&tc.job)
   355  			_, err := reconciler.Reconcile(ctx, reconcile.Request{
   356  				NamespacedName: jobKey,
   357  			})
   358  			if diff := cmp.Diff(tc.wantErr, err, cmpopts.EquateErrors()); diff != "" {
   359  				t.Errorf("Reconcile returned error (-want,+got):\n%s", diff)
   360  			}
   361  
   362  			var gotJob rayv1.RayCluster
   363  			if err := kClient.Get(ctx, jobKey, &gotJob); err != nil {
   364  				t.Fatalf("Could not get Job after reconcile: %v", err)
   365  			}
   366  			if diff := cmp.Diff(tc.wantJob, gotJob, jobCmpOpts...); diff != "" {
   367  				t.Errorf("Job after reconcile (-want,+got):\n%s", diff)
   368  			}
   369  			var gotWorkloads kueue.WorkloadList
   370  			if err := kClient.List(ctx, &gotWorkloads); err != nil {
   371  				t.Fatalf("Could not get Workloads after reconcile: %v", err)
   372  			}
   373  			if diff := cmp.Diff(tc.wantWorkloads, gotWorkloads.Items, workloadCmpOpts...); diff != "" {
   374  				t.Errorf("Workloads after reconcile (-want,+got):\n%s", diff)
   375  			}
   376  		})
   377  	}
   378  }