sigs.k8s.io/kueue@v0.6.2/pkg/controller/admissionchecks/multikueue/multikueuecluster.go

/*
Copyright 2024 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package multikueue

import (
	"context"
	"errors"
	"fmt"
	"os"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/equality"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	apimeta "k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/tools/clientcmd"
	"k8s.io/client-go/util/workqueue"
	"k8s.io/klog/v2"
	"k8s.io/utils/ptr"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/event"
	"sigs.k8s.io/controller-runtime/pkg/handler"
	"sigs.k8s.io/controller-runtime/pkg/manager"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"
	"sigs.k8s.io/controller-runtime/pkg/source"

	kueuealpha "sigs.k8s.io/kueue/apis/kueue/v1alpha1"
	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
)

const (
	eventChBufferSize = 10

	// this set will provide waiting times between 0 and 5m20s
	retryIncrement = 5 * time.Second
	retryMaxSteps  = 7
)

// retryAfter returns an exponentially increasing interval between
// 0 and 2^(retryMaxSteps-1) * retryIncrement
func retryAfter(failedAttempts uint) time.Duration {
	if failedAttempts == 0 {
		return 0
	}
	return (1 << (min(failedAttempts, retryMaxSteps) - 1)) * retryIncrement
}
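
// Illustrative note (not in the original source): with retryIncrement = 5s and
// retryMaxSteps = 7, retryAfter yields the following backoff per number of
// failed connection attempts:
//
//	attempts:   0   1   2    3    4    5      6      >=7
//	retryAfter: 0s  5s  10s  20s  40s  1m20s  2m40s  5m20s
//
// i.e. the wait doubles with every failed attempt and is capped at
// 2^(retryMaxSteps-1) * retryIncrement = 64 * 5s = 5m20s.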

type clientWithWatchBuilder func(config []byte, options client.Options) (client.WithWatch, error)

type remoteClient struct {
	clusterName  string
	localClient  client.Client
	client       client.WithWatch
	wlUpdateCh   chan<- event.GenericEvent
	watchEndedCh chan<- event.GenericEvent
	watchCancel  func()
	kubeconfig   []byte
	origin       string

	forceReconnect     atomic.Bool
	failedConnAttempts uint

	// For unit testing only. There is no need to create fully functional remote clients in the unit tests,
	// and creating valid kubeconfig content is not trivial.
	// The full client creation and usage is validated in the integration and e2e tests.
	builderOverride clientWithWatchBuilder
}

func newRemoteClient(localClient client.Client, wlUpdateCh, watchEndedCh chan<- event.GenericEvent, origin, clusterName string) *remoteClient {
	rc := &remoteClient{
		clusterName:  clusterName,
		wlUpdateCh:   wlUpdateCh,
		watchEndedCh: watchEndedCh,
		localClient:  localClient,
		origin:       origin,
	}
	return rc
}

func newClientWithWatch(kubeconfig []byte, options client.Options) (client.WithWatch, error) {
	restConfig, err := clientcmd.RESTConfigFromKubeConfig(kubeconfig)
	if err != nil {
		return nil, err
	}
	return client.NewWithWatch(restConfig, options)
}

type multiKueueWatcher interface {
	// returns an empty list of objects
	GetEmptyList() client.ObjectList
	// returns the key of the workload of interest
	// - the object name for workloads
	// - the prebuilt workload for job types
	GetWorkloadKey(runtime.Object) (types.NamespacedName, error)
}

type workloadKueueWatcher struct{}

var _ multiKueueWatcher = (*workloadKueueWatcher)(nil)

func (*workloadKueueWatcher) GetEmptyList() client.ObjectList {
	return &kueue.WorkloadList{}
}

func (*workloadKueueWatcher) GetWorkloadKey(o runtime.Object) (types.NamespacedName, error) {
	wl, isWl := o.(*kueue.Workload)
	if !isWl {
		return types.NamespacedName{}, errors.New("not a workload")
	}
	return client.ObjectKeyFromObject(wl), nil
}
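
// Illustrative sketch (not part of the original file): a job adapter opts into
// remote watching by implementing multiKueueWatcher as well. For a hypothetical
// myJobAdapter watching batch/Jobs, assuming the workload name is carried in a
// prebuilt-workload label (the names below are assumptions, not the in-tree adapter):
//
//	func (*myJobAdapter) GetEmptyList() client.ObjectList {
//		return &batchv1.JobList{}
//	}
//
//	func (*myJobAdapter) GetWorkloadKey(o runtime.Object) (types.NamespacedName, error) {
//		job, isJob := o.(*batchv1.Job)
//		if !isJob {
//			return types.NamespacedName{}, errors.New("not a batch/Job")
//		}
//		return types.NamespacedName{
//			Name:      job.Labels["kueue.x-k8s.io/prebuilt-workload-name"],
//			Namespace: job.Namespace,
//		}, nil
//	}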

// setConfig - will try to recreate the k8s client and restart watching if the new config is different from
// the one currently used, or a reconnect was requested.
// If the encountered error is not permanent, the duration after which a retry should be attempted is returned.
func (rc *remoteClient) setConfig(watchCtx context.Context, kubeconfig []byte) (*time.Duration, error) {
	configChanged := !equality.Semantic.DeepEqual(kubeconfig, rc.kubeconfig)
	if !configChanged && !rc.forceReconnect.Load() {
		return nil, nil
	}

	rc.StopWatchers()
	if configChanged {
		rc.kubeconfig = kubeconfig
		rc.failedConnAttempts = 0
	}

	builder := newClientWithWatch
	if rc.builderOverride != nil {
		builder = rc.builderOverride
	}
	remoteClient, err := builder(kubeconfig, client.Options{Scheme: rc.localClient.Scheme()})
	if err != nil {
		return nil, err
	}

	rc.client = remoteClient

	watchCtx, rc.watchCancel = context.WithCancel(watchCtx)
	err = rc.startWatcher(watchCtx, kueue.GroupVersion.WithKind("Workload").GroupKind().String(), &workloadKueueWatcher{})
	if err != nil {
		rc.failedConnAttempts++
		return ptr.To(retryAfter(rc.failedConnAttempts)), err
	}

	// add a watch for all the adapters implementing multiKueueWatcher
	for kind, adapter := range adapters {
		watcher, implementsWatcher := adapter.(multiKueueWatcher)
		if !implementsWatcher {
			continue
		}
		err := rc.startWatcher(watchCtx, kind, watcher)
		if err != nil {
			// not being able to set up a watcher is not ideal, but we can function with only the wl watcher.
			ctrl.LoggerFrom(watchCtx).V(2).Error(err, "Unable to start the watcher", "kind", kind)
			// however, let's not accept this for now.
			rc.failedConnAttempts++
			return ptr.To(retryAfter(rc.failedConnAttempts)), err
		}
	}

	rc.forceReconnect.Store(false)
	rc.failedConnAttempts = 0
	return nil, nil
}

func (rc *remoteClient) startWatcher(ctx context.Context, kind string, w multiKueueWatcher) error {
	log := ctrl.LoggerFrom(ctx).WithValues("watchKind", kind)
	newWatcher, err := rc.client.Watch(ctx, w.GetEmptyList(), client.MatchingLabels{kueuealpha.MultiKueueOriginLabel: rc.origin})
	if err != nil {
		return err
	}

	go func() {
		log.V(2).Info("Starting watch")
		for r := range newWatcher.ResultChan() {
			wlKey, err := w.GetWorkloadKey(r.Object)
			if err != nil {
				log.V(2).Error(err, "Cannot get workload key", "jobKind", r.Object.GetObjectKind().GroupVersionKind())
			} else {
				rc.queueWorkloadEvent(ctx, wlKey)
			}
		}
		log.V(2).Info("Watch ended", "ctxErr", ctx.Err())
		// If the context is not yet Done, queue a reconcile to attempt reconnection
		if ctx.Err() == nil {
			oldReconnect := rc.forceReconnect.Swap(true)
			// reconnect if this is the first watch failing.
			if !oldReconnect {
				log.V(2).Info("Queue reconcile for reconnect", "cluster", rc.clusterName)
				rc.queueWatchEndedEvent(ctx)
			}
		}
	}()
	return nil
}

func (rc *remoteClient) StopWatchers() {
	if rc.watchCancel != nil {
		rc.watchCancel()
	}
}

func (rc *remoteClient) queueWorkloadEvent(ctx context.Context, wlKey types.NamespacedName) {
	localWl := &kueue.Workload{}
	if err := rc.localClient.Get(ctx, wlKey, localWl); err == nil {
		rc.wlUpdateCh <- event.GenericEvent{Object: localWl}
	} else {
		if !apierrors.IsNotFound(err) {
			ctrl.LoggerFrom(ctx).Error(err, "reading local workload")
		}
	}
}

func (rc *remoteClient) queueWatchEndedEvent(ctx context.Context) {
	cluster := &kueuealpha.MultiKueueCluster{}
	if err := rc.localClient.Get(ctx, types.NamespacedName{Name: rc.clusterName}, cluster); err == nil {
		rc.watchEndedCh <- event.GenericEvent{Object: cluster}
	} else {
		ctrl.LoggerFrom(ctx).Error(err, "sending watch ended event")
	}
}
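
// Note: events sent on watchEndedCh end up in the source.Channel registered in
// setupWithManager below, so a lost watch triggers a Reconcile of the owning
// MultiKueueCluster, which in turn calls setConfig to reconnect.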

// runGC - lists all the remote workloads having the same multikueue-origin and removes those that
// no longer have a local counterpart (missing or awaiting deletion). If the remote workload
// is owned by a job, the job is also deleted.
func (rc *remoteClient) runGC(ctx context.Context) {
	log := ctrl.LoggerFrom(ctx)
	lst := &kueue.WorkloadList{}
	err := rc.client.List(ctx, lst, client.MatchingLabels{kueuealpha.MultiKueueOriginLabel: rc.origin})
	if err != nil {
		log.V(2).Error(err, "Listing remote workloads")
		return
	}

	for _, remoteWl := range lst.Items {
		localWl := &kueue.Workload{}
		wlLog := log.WithValues("remoteWl", klog.KObj(&remoteWl))
		err := rc.localClient.Get(ctx, client.ObjectKeyFromObject(&remoteWl), localWl)
		if client.IgnoreNotFound(err) != nil {
			wlLog.V(2).Error(err, "Reading local workload")
			continue
		}

		if err == nil && localWl.DeletionTimestamp.IsZero() {
			// The local workload exists and isn't being deleted, so the remote workload is still relevant.
			continue
		}

		// if the remote wl has a controller (owning Job), delete the job
		if controller := metav1.GetControllerOf(&remoteWl); controller != nil {
			ownerKey := klog.KRef(remoteWl.Namespace, controller.Name)
			adapterKey := schema.FromAPIVersionAndKind(controller.APIVersion, controller.Kind).String()
			if adapter, found := adapters[adapterKey]; !found {
				wlLog.V(2).Info("No adapter found", "adapterKey", adapterKey, "ownerKey", ownerKey)
			} else {
				wlLog.V(5).Info("MultiKueueGC deleting workload owner", "ownerKey", ownerKey, "ownerKind", controller)
				err := adapter.DeleteRemoteObject(ctx, rc.client, types.NamespacedName{Name: controller.Name, Namespace: remoteWl.Namespace})
				if client.IgnoreNotFound(err) != nil {
					wlLog.V(2).Error(err, "Deleting remote workload's owner", "ownerKey", ownerKey)
				}
			}
		}
		wlLog.V(5).Info("MultiKueueGC deleting remote workload")
		if err := rc.client.Delete(ctx, &remoteWl); client.IgnoreNotFound(err) != nil {
			wlLog.V(2).Error(err, "Deleting remote workload")
		}
	}
}

// clustersReconciler implements the reconciler for all MultiKueueClusters.
// Its main task is to maintain the list of remote clients associated with each MultiKueueCluster.
type clustersReconciler struct {
	localClient     client.Client
	configNamespace string

	lock sync.RWMutex
	// The list of remote clients, indexed by the cluster name.
	remoteClients map[string]*remoteClient
	wlUpdateCh    chan event.GenericEvent

	// gcInterval - the time to wait between two GC runs.
	gcInterval time.Duration

	// the multikueue-origin value used
	origin string

	// rootContext - holds the context passed by the controller-runtime on Start.
	// It's used to create child contexts for the MultiKueueClusters' client watch routines
	// that will gracefully end when the controller-manager stops.
	rootContext context.Context

	// For unit testing only. There is no need to create fully functional remote clients in the unit tests,
	// and creating valid kubeconfig content is not trivial.
	// The full client creation and usage is validated in the integration and e2e tests.
	builderOverride clientWithWatchBuilder

	// watchEndedCh - an event chan used to request the reconciliation of the clusters for which the watch loop
	// has ended (connection lost).
	watchEndedCh chan event.GenericEvent
}

var _ manager.Runnable = (*clustersReconciler)(nil)
var _ reconcile.Reconciler = (*clustersReconciler)(nil)

func (c *clustersReconciler) Start(ctx context.Context) error {
	c.rootContext = ctx
	go c.runGC(ctx)
	return nil
}

func (c *clustersReconciler) stopAndRemoveCluster(clusterName string) {
	c.lock.Lock()
	defer c.lock.Unlock()
	if rc, found := c.remoteClients[clusterName]; found {
		rc.StopWatchers()
		delete(c.remoteClients, clusterName)
	}
}

func (c *clustersReconciler) setRemoteClientConfig(ctx context.Context, clusterName string, kubeconfig []byte, origin string) (*time.Duration, error) {
	c.lock.Lock()
	defer c.lock.Unlock()

	client, found := c.remoteClients[clusterName]
	if !found {
		client = newRemoteClient(c.localClient, c.wlUpdateCh, c.watchEndedCh, origin, clusterName)
		if c.builderOverride != nil {
			client.builderOverride = c.builderOverride
		}
		c.remoteClients[clusterName] = client
	}

	clientLog := ctrl.LoggerFrom(c.rootContext).WithValues("clusterName", clusterName)
	clientCtx := ctrl.LoggerInto(c.rootContext, clientLog)

	if retryAfter, err := client.setConfig(clientCtx, kubeconfig); err != nil {
		ctrl.LoggerFrom(ctx).Error(err, "failed to set kubeConfig in the remote client")
		return retryAfter, err
	}
	return nil, nil
}

func (a *clustersReconciler) controllerFor(acName string) (*remoteClient, bool) {
	a.lock.RLock()
	defer a.lock.RUnlock()

	c, f := a.remoteClients[acName]
	return c, f
}

func (c *clustersReconciler) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) {
	cluster := &kueuealpha.MultiKueueCluster{}
	log := ctrl.LoggerFrom(ctx)

	err := c.localClient.Get(ctx, req.NamespacedName, cluster)
	if client.IgnoreNotFound(err) != nil {
		return reconcile.Result{}, err
	}

	log.V(2).Info("Reconcile MultiKueueCluster")

	if err != nil || !cluster.DeletionTimestamp.IsZero() {
		c.stopAndRemoveCluster(req.Name)
		return reconcile.Result{}, nil
	}

	// get the kubeconfig
	kubeConfig, retry, err := c.getKubeConfig(ctx, &cluster.Spec.KubeConfig)
	if retry {
		return reconcile.Result{}, err
	}
	if err != nil {
		log.Error(err, "reading kubeconfig")
		c.stopAndRemoveCluster(req.Name)
		return reconcile.Result{}, c.updateStatus(ctx, cluster, false, "BadConfig", err.Error())
	}

	if retryAfter, err := c.setRemoteClientConfig(ctx, cluster.Name, kubeConfig, c.origin); err != nil {
		log.Error(err, "setting kubeconfig", "retryAfter", retryAfter)
		if err := c.updateStatus(ctx, cluster, false, "ClientConnectionFailed", err.Error()); err != nil {
			return reconcile.Result{}, err
		} else {
			return reconcile.Result{RequeueAfter: ptr.Deref(retryAfter, 0)}, nil
		}
	}
	return reconcile.Result{}, c.updateStatus(ctx, cluster, true, "Active", "Connected")
}

func (c *clustersReconciler) getKubeConfig(ctx context.Context, ref *kueuealpha.KubeConfig) ([]byte, bool, error) {
	if ref.LocationType == kueuealpha.SecretLocationType {
		return c.getKubeConfigFromSecret(ctx, ref.Location)
	}
	// Otherwise it's a path
	return c.getKubeConfigFromPath(ref.Location)
}
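
// Note (descriptive, not in the original source): for the Secret location type,
// the kubeconfig bytes are read from the Secret named by ref.Location in the
// controller's configNamespace, under the key kueuealpha.MultiKueueConfigSecretKey.
// A missing Secret or missing key is treated as a permanent configuration error
// (BadConfig), while other read errors are reported as retryable.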

func (c *clustersReconciler) getKubeConfigFromSecret(ctx context.Context, secretName string) ([]byte, bool, error) {
	sec := corev1.Secret{}
	secretObjKey := types.NamespacedName{
		Namespace: c.configNamespace,
		Name:      secretName,
	}
	err := c.localClient.Get(ctx, secretObjKey, &sec)
	if err != nil {
		return nil, !apierrors.IsNotFound(err), err
	}

	kconfigBytes, found := sec.Data[kueuealpha.MultiKueueConfigSecretKey]
	if !found {
		return nil, false, fmt.Errorf("key %q not found in secret %q", kueuealpha.MultiKueueConfigSecretKey, secretName)
	}

	return kconfigBytes, false, nil
}

func (c *clustersReconciler) getKubeConfigFromPath(path string) ([]byte, bool, error) {
	content, err := os.ReadFile(path)
	return content, false, err
}

func (c *clustersReconciler) updateStatus(ctx context.Context, cluster *kueuealpha.MultiKueueCluster, active bool, reason, message string) error {
	newCondition := metav1.Condition{
		Type:    kueuealpha.MultiKueueClusterActive,
		Status:  metav1.ConditionFalse,
		Reason:  reason,
		Message: message,
	}
	if active {
		newCondition.Status = metav1.ConditionTrue
	}

	// if the condition is up to date
	oldCondition := apimeta.FindStatusCondition(cluster.Status.Conditions, kueuealpha.MultiKueueClusterActive)
	if cmpConditionState(oldCondition, &newCondition) {
		return nil
	}

	apimeta.SetStatusCondition(&cluster.Status.Conditions, newCondition)
	return c.localClient.Status().Update(ctx, cluster)
}

func (c *clustersReconciler) runGC(ctx context.Context) {
	log := ctrl.LoggerFrom(ctx).WithName("MultiKueueGC")
	if c.gcInterval == 0 {
		log.V(2).Info("Garbage Collection is disabled")
		return
	}
	log.V(2).Info("Starting Garbage Collector")
	for {
		select {
		case <-ctx.Done():
			log.V(2).Info("Garbage Collector Stopped")
			return
		case <-time.After(c.gcInterval):
			log.V(4).Info("Run Garbage Collection for Lost Remote Workloads")
			for clusterName, rc := range c.remoteClients {
				rc.runGC(ctrl.LoggerInto(ctx, log.WithValues("multiKueueCluster", clusterName)))
			}
		}
	}
}

// +kubebuilder:rbac:groups="",resources=events,verbs=create;watch;update
// +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=multikueueclusters,verbs=get;list;watch
// +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=multikueueclusters/status,verbs=get;update;patch

func newClustersReconciler(c client.Client, namespace string, gcInterval time.Duration, origin string) *clustersReconciler {
	return &clustersReconciler{
		localClient:     c,
		configNamespace: namespace,
		remoteClients:   make(map[string]*remoteClient),
		wlUpdateCh:      make(chan event.GenericEvent, eventChBufferSize),
		gcInterval:      gcInterval,
		origin:          origin,
		watchEndedCh:    make(chan event.GenericEvent, eventChBufferSize),
	}
}

func (c *clustersReconciler) setupWithManager(mgr ctrl.Manager) error {
	err := mgr.Add(c)
	if err != nil {
		return err
	}

	syncHndl := handler.Funcs{
		GenericFunc: func(_ context.Context, e event.GenericEvent, q workqueue.RateLimitingInterface) {
			q.Add(reconcile.Request{NamespacedName: types.NamespacedName{
				Name: e.Object.GetName(),
			}})
		},
	}

	return ctrl.NewControllerManagedBy(mgr).
		For(&kueuealpha.MultiKueueCluster{}).
		Watches(&corev1.Secret{}, &secretHandler{client: c.localClient}).
		WatchesRawSource(&source.Channel{Source: c.watchEndedCh}, syncHndl).
		Complete(c)
}
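
// secretHandler requeues every MultiKueueCluster whose kubeconfig references a
// given Secret whenever that Secret is created, updated or deleted, using the
// UsingKubeConfigs field index (see queue below).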
type secretHandler struct {
	client client.Client
}

var _ handler.EventHandler = (*secretHandler)(nil)

func (s *secretHandler) Create(ctx context.Context, event event.CreateEvent, q workqueue.RateLimitingInterface) {
	secret, isSecret := event.Object.(*corev1.Secret)
	if !isSecret {
		ctrl.LoggerFrom(ctx).V(5).Error(errors.New("not a secret"), "Failure on create event")
		return
	}
	if err := s.queue(ctx, secret, q); err != nil {
		ctrl.LoggerFrom(ctx).V(5).Error(err, "Failure on create event", "secret", klog.KObj(event.Object))
	}
}

func (s *secretHandler) Update(ctx context.Context, event event.UpdateEvent, q workqueue.RateLimitingInterface) {
	secret, isSecret := event.ObjectNew.(*corev1.Secret)
	if !isSecret {
		ctrl.LoggerFrom(ctx).V(5).Error(errors.New("not a secret"), "Failure on update event")
		return
	}
	if err := s.queue(ctx, secret, q); err != nil {
		ctrl.LoggerFrom(ctx).V(5).Error(err, "Failure on update event", "secret", klog.KObj(event.ObjectOld))
	}
}

func (s *secretHandler) Delete(ctx context.Context, event event.DeleteEvent, q workqueue.RateLimitingInterface) {
	secret, isSecret := event.Object.(*corev1.Secret)
	if !isSecret {
		ctrl.LoggerFrom(ctx).V(5).Error(errors.New("not a secret"), "Failure on delete event")
		return
	}
	if err := s.queue(ctx, secret, q); err != nil {
		ctrl.LoggerFrom(ctx).V(5).Error(err, "Failure on delete event", "secret", klog.KObj(event.Object))
	}
}

func (s *secretHandler) Generic(ctx context.Context, event event.GenericEvent, q workqueue.RateLimitingInterface) {
	secret, isSecret := event.Object.(*corev1.Secret)
	if !isSecret {
		ctrl.LoggerFrom(ctx).V(5).Error(errors.New("not a secret"), "Failure on generic event")
		return
	}
	if err := s.queue(ctx, secret, q); err != nil {
		ctrl.LoggerFrom(ctx).V(5).Error(err, "Failure on generic event", "secret", klog.KObj(event.Object))
	}
}

func (s *secretHandler) queue(ctx context.Context, secret *corev1.Secret, q workqueue.RateLimitingInterface) error {
	users := &kueuealpha.MultiKueueClusterList{}
	if err := s.client.List(ctx, users, client.MatchingFields{UsingKubeConfigs: strings.Join([]string{secret.Namespace, secret.Name}, "/")}); err != nil {
		return err
	}

	for _, user := range users.Items {
		req := reconcile.Request{
			NamespacedName: types.NamespacedName{
				Name: user.Name,
			},
		}
		q.Add(req)
	}
	return nil
}
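
// Illustrative wiring sketch (not part of the original file; the namespace, GC
// interval and origin values below are assumptions): the reconciler is built
// with the namespace that holds the kubeconfig Secrets and then registered
// with the manager, roughly:
//
//	rec := newClustersReconciler(mgr.GetClient(), "kueue-system", time.Minute, "multikueue")
//	if err := rec.setupWithManager(mgr); err != nil {
//		return err
//	}
//
// Events pushed on rec.wlUpdateCh are consumed elsewhere in this package to
// requeue local Workloads when their remote copies change.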