sigs.k8s.io/kueue@v0.6.2/pkg/controller/admissionchecks/multikueue/multikueuecluster_test.go (about)

     1  /*
     2  Copyright 2024 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package multikueue
    18  
    19  import (
    20  	"context"
    21  	"errors"
    22  	"testing"
    23  	"time"
    24  
    25  	"github.com/google/go-cmp/cmp"
    26  	"github.com/google/go-cmp/cmp/cmpopts"
    27  	batchv1 "k8s.io/api/batch/v1"
    28  	corev1 "k8s.io/api/core/v1"
    29  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    30  	"k8s.io/apimachinery/pkg/types"
    31  	"k8s.io/apimachinery/pkg/watch"
    32  	"sigs.k8s.io/controller-runtime/pkg/client"
    33  	"sigs.k8s.io/controller-runtime/pkg/client/interceptor"
    34  	"sigs.k8s.io/controller-runtime/pkg/reconcile"
    35  
    36  	kueuealpha "sigs.k8s.io/kueue/apis/kueue/v1alpha1"
    37  	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
    38  	"sigs.k8s.io/kueue/pkg/util/slices"
    39  	utiltesting "sigs.k8s.io/kueue/pkg/util/testing"
    40  	testingjob "sigs.k8s.io/kueue/pkg/util/testingjobs/job"
    41  )
    42  
// Sentinel errors produced by fakeClientBuilder. errInvalidConfig is returned
// when the kubeconfig content is exactly "invalid"; errCannotWatch is returned
// by the intercepted Watch call when the kubeconfig content is "nowatch".
var (
	errInvalidConfig = errors.New("invalid kubeconfig")
	errCannotWatch   = errors.New("client cannot watch")
)
    47  
    48  func fakeClientBuilder(kubeconfig []byte, options client.Options) (client.WithWatch, error) {
    49  	if string(kubeconfig) == "invalid" {
    50  		return nil, errInvalidConfig
    51  	}
    52  	b, _ := getClientBuilder()
    53  	b = b.WithInterceptorFuncs(interceptor.Funcs{
    54  		Watch: func(ctx context.Context, client client.WithWatch, obj client.ObjectList, opts ...client.ListOption) (watch.Interface, error) {
    55  			if string(kubeconfig) == "nowatch" {
    56  				return nil, errCannotWatch
    57  			}
    58  			return client.Watch(ctx, obj, opts...)
    59  		},
    60  	})
    61  	return b.Build(), nil
    62  }
    63  
    64  func newTestClient(config string, watchCancel func()) *remoteClient {
    65  	b, _ := getClientBuilder()
    66  	localClient := b.Build()
    67  	ret := &remoteClient{
    68  		kubeconfig:  []byte(config),
    69  		localClient: localClient,
    70  		watchCancel: watchCancel,
    71  
    72  		builderOverride: fakeClientBuilder,
    73  	}
    74  	return ret
    75  }
    76  
    77  func setReconnectState(rc *remoteClient, a uint) *remoteClient {
    78  	rc.failedConnAttempts = a
    79  	rc.forceReconnect.Store(true)
    80  	return rc
    81  }
    82  
    83  func makeTestSecret(name string, kubeconfig string) corev1.Secret {
    84  	return corev1.Secret{
    85  		ObjectMeta: metav1.ObjectMeta{
    86  			Name:      name,
    87  			Namespace: TestNamespace,
    88  		},
    89  		Data: map[string][]byte{
    90  			kueuealpha.MultiKueueConfigSecretKey: []byte(kubeconfig),
    91  		},
    92  	}
    93  }
    94  
    95  func TestUpdateConfig(t *testing.T) {
    96  	cancelCalledCount := 0
    97  	cancelCalled := func() { cancelCalledCount++ }
    98  
    99  	cases := map[string]struct {
   100  		reconcileFor  string
   101  		remoteClients map[string]*remoteClient
   102  		clusters      []kueuealpha.MultiKueueCluster
   103  		secrets       []corev1.Secret
   104  
   105  		wantRemoteClients map[string]*remoteClient
   106  		wantClusters      []kueuealpha.MultiKueueCluster
   107  		wantRequeueAfter  time.Duration
   108  		wantCancelCalled  int
   109  	}{
   110  		"new valid client is added": {
   111  			reconcileFor: "worker1",
   112  			clusters: []kueuealpha.MultiKueueCluster{
   113  				*utiltesting.MakeMultiKueueCluster("worker1").KubeConfig(kueuealpha.SecretLocationType, "worker1").Obj(),
   114  			},
   115  			secrets: []corev1.Secret{
   116  				makeTestSecret("worker1", "worker1 kubeconfig"),
   117  			},
   118  			wantClusters: []kueuealpha.MultiKueueCluster{
   119  				*utiltesting.MakeMultiKueueCluster("worker1").KubeConfig(kueuealpha.SecretLocationType, "worker1").Active(metav1.ConditionTrue, "Active", "Connected").Obj(),
   120  			},
   121  			wantRemoteClients: map[string]*remoteClient{
   122  				"worker1": {
   123  					kubeconfig: []byte("worker1 kubeconfig"),
   124  				},
   125  			},
   126  		},
   127  		"update client with valid secret config": {
   128  			reconcileFor: "worker1",
   129  			clusters: []kueuealpha.MultiKueueCluster{
   130  				*utiltesting.MakeMultiKueueCluster("worker1").KubeConfig(kueuealpha.SecretLocationType, "worker1").Obj(),
   131  			},
   132  			secrets: []corev1.Secret{
   133  				makeTestSecret("worker1", "worker1 kubeconfig"),
   134  			},
   135  			remoteClients: map[string]*remoteClient{
   136  				"worker1": newTestClient("worker1 old kubeconfig", cancelCalled),
   137  			},
   138  			wantClusters: []kueuealpha.MultiKueueCluster{
   139  				*utiltesting.MakeMultiKueueCluster("worker1").KubeConfig(kueuealpha.SecretLocationType, "worker1").Active(metav1.ConditionTrue, "Active", "Connected").Obj(),
   140  			},
   141  			wantRemoteClients: map[string]*remoteClient{
   142  				"worker1": {
   143  					kubeconfig: []byte("worker1 kubeconfig"),
   144  				},
   145  			},
   146  			wantCancelCalled: 1,
   147  		},
   148  		"update client with valid path config": {
   149  			reconcileFor: "worker1",
   150  			clusters: []kueuealpha.MultiKueueCluster{
   151  				*utiltesting.MakeMultiKueueCluster("worker1").KubeConfig(kueuealpha.PathLocationType, "testdata/worker1KubeConfig").Obj(),
   152  			},
   153  			remoteClients: map[string]*remoteClient{
   154  				"worker1": newTestClient("worker1 old kubeconfig", cancelCalled),
   155  			},
   156  			wantClusters: []kueuealpha.MultiKueueCluster{
   157  				*utiltesting.MakeMultiKueueCluster("worker1").KubeConfig(kueuealpha.PathLocationType, "testdata/worker1KubeConfig").Active(metav1.ConditionTrue, "Active", "Connected").Obj(),
   158  			},
   159  			wantRemoteClients: map[string]*remoteClient{
   160  				"worker1": {
   161  					kubeconfig: []byte("worker1 kubeconfig"),
   162  				},
   163  			},
   164  			wantCancelCalled: 1,
   165  		},
   166  		"update client with invalid secret config": {
   167  			reconcileFor: "worker1",
   168  			clusters: []kueuealpha.MultiKueueCluster{
   169  				*utiltesting.MakeMultiKueueCluster("worker1").KubeConfig(kueuealpha.SecretLocationType, "worker1").Obj(),
   170  			},
   171  			secrets: []corev1.Secret{
   172  				makeTestSecret("worker1", "invalid"),
   173  			},
   174  			remoteClients: map[string]*remoteClient{
   175  				"worker1": newTestClient("worker1 old kubeconfig", cancelCalled),
   176  			},
   177  			wantRemoteClients: map[string]*remoteClient{
   178  				"worker1": newTestClient("invalid", nil),
   179  			},
   180  			wantClusters: []kueuealpha.MultiKueueCluster{
   181  				*utiltesting.MakeMultiKueueCluster("worker1").KubeConfig(kueuealpha.SecretLocationType, "worker1").Active(metav1.ConditionFalse, "ClientConnectionFailed", "invalid kubeconfig").Obj(),
   182  			},
   183  			wantCancelCalled: 1,
   184  		},
   185  		"update client with invalid path config": {
   186  			reconcileFor: "worker1",
   187  			clusters: []kueuealpha.MultiKueueCluster{
   188  				*utiltesting.MakeMultiKueueCluster("worker1").KubeConfig(kueuealpha.PathLocationType, "").Obj(),
   189  			},
   190  			remoteClients: map[string]*remoteClient{
   191  				"worker1": newTestClient("worker1 old kubeconfig", cancelCalled),
   192  			},
   193  			wantClusters: []kueuealpha.MultiKueueCluster{
   194  				*utiltesting.MakeMultiKueueCluster("worker1").KubeConfig(kueuealpha.PathLocationType, "").Active(metav1.ConditionFalse, "BadConfig", "open : no such file or directory").Obj(),
   195  			},
   196  			wantCancelCalled: 1,
   197  		},
   198  		"missing cluster is removed": {
   199  			reconcileFor: "worker2",
   200  			clusters: []kueuealpha.MultiKueueCluster{
   201  				*utiltesting.MakeMultiKueueCluster("worker1").KubeConfig(kueuealpha.SecretLocationType, "worker1").Obj(),
   202  			},
   203  			remoteClients: map[string]*remoteClient{
   204  				"worker1": newTestClient("worker1 kubeconfig", cancelCalled),
   205  				"worker2": newTestClient("worker2 kubeconfig", cancelCalled),
   206  			},
   207  			wantClusters: []kueuealpha.MultiKueueCluster{
   208  				*utiltesting.MakeMultiKueueCluster("worker1").KubeConfig(kueuealpha.SecretLocationType, "worker1").Obj(),
   209  			},
   210  			wantRemoteClients: map[string]*remoteClient{
   211  				"worker1": {
   212  					kubeconfig: []byte("worker1 kubeconfig"),
   213  				},
   214  			},
   215  			wantCancelCalled: 1,
   216  		},
   217  		"update client config, nowatch": {
   218  			reconcileFor: "worker1",
   219  			clusters: []kueuealpha.MultiKueueCluster{
   220  				*utiltesting.MakeMultiKueueCluster("worker1").KubeConfig(kueuealpha.SecretLocationType, "worker1").Obj(),
   221  			},
   222  			secrets: []corev1.Secret{
   223  				makeTestSecret("worker1", "nowatch"),
   224  			},
   225  			remoteClients: map[string]*remoteClient{
   226  				"worker1": newTestClient("worker1 old kubeconfig", cancelCalled),
   227  			},
   228  			wantRemoteClients: map[string]*remoteClient{
   229  				"worker1": setReconnectState(newTestClient("nowatch", nil), 1),
   230  			},
   231  			wantClusters: []kueuealpha.MultiKueueCluster{
   232  				*utiltesting.MakeMultiKueueCluster("worker1").KubeConfig(kueuealpha.SecretLocationType, "worker1").Active(metav1.ConditionFalse, "ClientConnectionFailed", "client cannot watch").Obj(),
   233  			},
   234  			wantRequeueAfter: 5 * time.Second,
   235  			wantCancelCalled: 1,
   236  		},
   237  		"update client config, nowatch 3rd try": {
   238  			reconcileFor: "worker1",
   239  			clusters: []kueuealpha.MultiKueueCluster{
   240  				*utiltesting.MakeMultiKueueCluster("worker1").KubeConfig(kueuealpha.SecretLocationType, "worker1").Active(metav1.ConditionFalse, "ClientConnectionFailed", "client cannot watch").Obj(),
   241  			},
   242  			secrets: []corev1.Secret{
   243  				makeTestSecret("worker1", "nowatch"),
   244  			},
   245  			remoteClients: map[string]*remoteClient{
   246  				"worker1": setReconnectState(newTestClient("nowatch", cancelCalled), 2),
   247  			},
   248  			wantRemoteClients: map[string]*remoteClient{
   249  				"worker1": setReconnectState(newTestClient("nowatch", nil), 3),
   250  			},
   251  			wantClusters: []kueuealpha.MultiKueueCluster{
   252  				*utiltesting.MakeMultiKueueCluster("worker1").KubeConfig(kueuealpha.SecretLocationType, "worker1").Active(metav1.ConditionFalse, "ClientConnectionFailed", "client cannot watch").Obj(),
   253  			},
   254  			wantRequeueAfter: 20 * time.Second,
   255  			wantCancelCalled: 1,
   256  		},
   257  		"failed attempts are set to 0 on successful connection": {
   258  			reconcileFor: "worker1",
   259  			clusters: []kueuealpha.MultiKueueCluster{
   260  				*utiltesting.MakeMultiKueueCluster("worker1").
   261  					KubeConfig(kueuealpha.SecretLocationType, "worker1").
   262  					Active(metav1.ConditionFalse, "ClientConnectionFailed", "client cannot watch").
   263  					Obj(),
   264  			},
   265  			secrets: []corev1.Secret{
   266  				makeTestSecret("worker1", "good config"),
   267  			},
   268  			remoteClients: map[string]*remoteClient{
   269  				"worker1": setReconnectState(newTestClient("nowatch", cancelCalled), 5),
   270  			},
   271  			wantRemoteClients: map[string]*remoteClient{
   272  				"worker1": newTestClient("good config", nil),
   273  			},
   274  			wantClusters: []kueuealpha.MultiKueueCluster{
   275  				*utiltesting.MakeMultiKueueCluster("worker1").
   276  					KubeConfig(kueuealpha.SecretLocationType, "worker1").
   277  					Active(metav1.ConditionTrue, "Active", "Connected").
   278  					Obj(),
   279  			},
   280  			wantCancelCalled: 1,
   281  		},
   282  		"failed attempts are set to 0 on config change": {
   283  			reconcileFor: "worker1",
   284  			clusters: []kueuealpha.MultiKueueCluster{
   285  				*utiltesting.MakeMultiKueueCluster("worker1").
   286  					KubeConfig(kueuealpha.SecretLocationType, "worker1").
   287  					Active(metav1.ConditionFalse, "ClientConnectionFailed", "client cannot watch").
   288  					Obj(),
   289  			},
   290  			secrets: []corev1.Secret{
   291  				makeTestSecret("worker1", "invalid"),
   292  			},
   293  			remoteClients: map[string]*remoteClient{
   294  				"worker1": setReconnectState(newTestClient("nowatch", cancelCalled), 5),
   295  			},
   296  			wantRemoteClients: map[string]*remoteClient{
   297  				"worker1": newTestClient("invalid", nil),
   298  			},
   299  			wantClusters: []kueuealpha.MultiKueueCluster{
   300  				*utiltesting.MakeMultiKueueCluster("worker1").
   301  					KubeConfig(kueuealpha.SecretLocationType, "worker1").
   302  					Active(metav1.ConditionFalse, "ClientConnectionFailed", "invalid kubeconfig").
   303  					Obj(),
   304  			},
   305  			wantCancelCalled: 1,
   306  		},
   307  	}
   308  
   309  	for name, tc := range cases {
   310  		t.Run(name, func(t *testing.T) {
   311  			builder, ctx := getClientBuilder()
   312  			builder = builder.WithLists(&kueuealpha.MultiKueueClusterList{Items: tc.clusters})
   313  			builder = builder.WithLists(&corev1.SecretList{Items: tc.secrets})
   314  			builder = builder.WithStatusSubresource(slices.Map(tc.clusters, func(c *kueuealpha.MultiKueueCluster) client.Object { return c })...)
   315  			c := builder.Build()
   316  
   317  			reconciler := newClustersReconciler(c, TestNamespace, 0, defaultOrigin)
   318  			reconciler.rootContext = ctx
   319  
   320  			if len(tc.remoteClients) > 0 {
   321  				reconciler.remoteClients = tc.remoteClients
   322  			}
   323  			reconciler.builderOverride = fakeClientBuilder
   324  
   325  			cancelCalledCount = 0
   326  			res, gotErr := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: types.NamespacedName{Name: tc.reconcileFor}})
   327  			if gotErr != nil {
   328  				t.Errorf("unexpected reconcile error: %s", gotErr)
   329  			}
   330  
   331  			if diff := cmp.Diff(tc.wantRequeueAfter, res.RequeueAfter); diff != "" {
   332  				t.Errorf("unexpected requeue after (-want/+got):\n%s", diff)
   333  			}
   334  
   335  			if tc.wantCancelCalled != cancelCalledCount {
   336  				t.Errorf("unexpected watch cancel call count want: %d,  got: %d", tc.wantCancelCalled, cancelCalledCount)
   337  			}
   338  
   339  			lst := &kueuealpha.MultiKueueClusterList{}
   340  			gotErr = c.List(ctx, lst)
   341  			if gotErr != nil {
   342  				t.Errorf("unexpected list clusters error: %s", gotErr)
   343  			}
   344  
   345  			if diff := cmp.Diff(tc.wantClusters, lst.Items, cmpopts.EquateEmpty(),
   346  				cmpopts.IgnoreFields(metav1.ObjectMeta{}, "ResourceVersion"),
   347  				cmpopts.IgnoreFields(metav1.Condition{}, "LastTransitionTime")); diff != "" {
   348  				t.Errorf("unexpected clusters (-want/+got):\n%s", diff)
   349  			}
   350  
   351  			if diff := cmp.Diff(tc.wantRemoteClients, reconciler.remoteClients, cmpopts.EquateEmpty(),
   352  				cmp.Comparer(func(a, b *remoteClient) bool {
   353  					if a.failedConnAttempts != b.failedConnAttempts {
   354  						return false
   355  					}
   356  					return string(a.kubeconfig) == string(b.kubeconfig)
   357  				})); diff != "" {
   358  				t.Errorf("unexpected controllers (-want/+got):\n%s", diff)
   359  			}
   360  		})
   361  	}
   362  }
   363  
   364  func TestRemoteClientGC(t *testing.T) {
   365  	baseJobBuilder := testingjob.MakeJob("job1", TestNamespace)
   366  	baseWlBuilder := utiltesting.MakeWorkload("wl1", TestNamespace).ControllerReference(batchv1.SchemeGroupVersion.WithKind("Job"), "job1", "test-uuid")
   367  
   368  	cases := map[string]struct {
   369  		managersWorkloads []kueue.Workload
   370  		workersWorkloads  []kueue.Workload
   371  		managersJobs      []batchv1.Job
   372  		workersJobs       []batchv1.Job
   373  
   374  		wantWorkersWorkloads []kueue.Workload
   375  		wantWorkersJobs      []batchv1.Job
   376  	}{
   377  		"existing workers and jobs are not deleted": {
   378  			managersWorkloads: []kueue.Workload{
   379  				*baseWlBuilder.Clone().
   380  					Obj(),
   381  			},
   382  			workersWorkloads: []kueue.Workload{
   383  				*baseWlBuilder.Clone().
   384  					Label(kueuealpha.MultiKueueOriginLabel, defaultOrigin).
   385  					Obj(),
   386  			},
   387  			managersJobs: []batchv1.Job{
   388  				*baseJobBuilder.Clone().
   389  					Obj(),
   390  			},
   391  			workersJobs: []batchv1.Job{
   392  				*baseJobBuilder.Clone().
   393  					Obj(),
   394  			},
   395  			wantWorkersWorkloads: []kueue.Workload{
   396  				*baseWlBuilder.Clone().
   397  					Label(kueuealpha.MultiKueueOriginLabel, defaultOrigin).
   398  					Obj(),
   399  			},
   400  			wantWorkersJobs: []batchv1.Job{
   401  				*baseJobBuilder.Clone().
   402  					Obj(),
   403  			},
   404  		},
   405  		"missing worker workloads are deleted": {
   406  			workersWorkloads: []kueue.Workload{
   407  				*baseWlBuilder.Clone().
   408  					Label(kueuealpha.MultiKueueOriginLabel, defaultOrigin).
   409  					Obj(),
   410  			},
   411  			managersJobs: []batchv1.Job{
   412  				*baseJobBuilder.Clone().
   413  					Obj(),
   414  			},
   415  		},
   416  		"missing worker workloads are deleted (no job adapter)": {
   417  			workersWorkloads: []kueue.Workload{
   418  				*baseWlBuilder.Clone().
   419  					ControllerReference(batchv1.SchemeGroupVersion.WithKind("NptAJob"), "job1", "test-uuid").
   420  					Label(kueuealpha.MultiKueueOriginLabel, defaultOrigin).
   421  					Obj(),
   422  			},
   423  		},
   424  		"missing worker workloads and their owner jobs are deleted": {
   425  			workersWorkloads: []kueue.Workload{
   426  				*baseWlBuilder.Clone().
   427  					Label(kueuealpha.MultiKueueOriginLabel, defaultOrigin).
   428  					Obj(),
   429  			},
   430  			managersJobs: []batchv1.Job{
   431  				*baseJobBuilder.Clone().
   432  					Obj(),
   433  			},
   434  			workersJobs: []batchv1.Job{
   435  				*baseJobBuilder.Clone().
   436  					Obj(),
   437  			},
   438  		},
   439  		"unrelated workers and jobs are not deleted": {
   440  			workersWorkloads: []kueue.Workload{
   441  				*baseWlBuilder.Clone().
   442  					Label(kueuealpha.MultiKueueOriginLabel, "other-gc-key").
   443  					Obj(),
   444  			},
   445  			workersJobs: []batchv1.Job{
   446  				*baseJobBuilder.Clone().
   447  					Obj(),
   448  			},
   449  			wantWorkersWorkloads: []kueue.Workload{
   450  				*baseWlBuilder.Clone().
   451  					Label(kueuealpha.MultiKueueOriginLabel, "other-gc-key").
   452  					Obj(),
   453  			},
   454  			wantWorkersJobs: []batchv1.Job{
   455  				*baseJobBuilder.Clone().
   456  					Obj(),
   457  			},
   458  		},
   459  	}
   460  
   461  	objCheckOpts := []cmp.Option{
   462  		cmpopts.IgnoreFields(metav1.ObjectMeta{}, "ResourceVersion"),
   463  		cmpopts.EquateEmpty(),
   464  	}
   465  
   466  	for name, tc := range cases {
   467  		t.Run(name, func(t *testing.T) {
   468  			manageBuilder, ctx := getClientBuilder()
   469  			manageBuilder = manageBuilder.WithLists(&kueue.WorkloadList{Items: tc.managersWorkloads}, &batchv1.JobList{Items: tc.managersJobs})
   470  			managerClient := manageBuilder.Build()
   471  
   472  			worker1Builder, _ := getClientBuilder()
   473  			worker1Builder = worker1Builder.WithLists(&kueue.WorkloadList{Items: tc.workersWorkloads}, &batchv1.JobList{Items: tc.workersJobs})
   474  			worker1Client := worker1Builder.Build()
   475  
   476  			w1remoteClient := newRemoteClient(managerClient, nil, nil, defaultOrigin, "")
   477  			w1remoteClient.client = worker1Client
   478  
   479  			w1remoteClient.runGC(ctx)
   480  
   481  			gotWorker1Wokloads := &kueue.WorkloadList{}
   482  			err := worker1Client.List(ctx, gotWorker1Wokloads)
   483  			if err != nil {
   484  				t.Error("unexpected list worker's workloads error")
   485  			}
   486  
   487  			if diff := cmp.Diff(tc.wantWorkersWorkloads, gotWorker1Wokloads.Items, objCheckOpts...); diff != "" {
   488  				t.Errorf("unexpected worker's workloads (-want/+got):\n%s", diff)
   489  			}
   490  
   491  			gotWorker1Job := &batchv1.JobList{}
   492  			err = worker1Client.List(ctx, gotWorker1Job)
   493  			if err != nil {
   494  				t.Error("unexpected list worker's jobs error")
   495  			}
   496  
   497  			if diff := cmp.Diff(tc.wantWorkersJobs, gotWorker1Job.Items, objCheckOpts...); diff != "" {
   498  				t.Errorf("unexpected worker's jobs (-want/+got):\n%s", diff)
   499  			}
   500  		})
   501  	}
   502  }