sigs.k8s.io/kueue@v0.6.2/pkg/controller/admissionchecks/multikueue/workload.go

/*
Copyright 2024 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package multikueue

import (
	"context"
	"errors"
	"fmt"
	"time"

	batchv1 "k8s.io/api/batch/v1"
	"k8s.io/apimachinery/pkg/api/equality"
	apimeta "k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/util/workqueue"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
	"sigs.k8s.io/controller-runtime/pkg/event"
	"sigs.k8s.io/controller-runtime/pkg/handler"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"
	"sigs.k8s.io/controller-runtime/pkg/source"
	jobset "sigs.k8s.io/jobset/api/jobset/v1alpha2"

	kueuealpha "sigs.k8s.io/kueue/apis/kueue/v1alpha1"
	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
	"sigs.k8s.io/kueue/pkg/util/admissioncheck"
	"sigs.k8s.io/kueue/pkg/workload"
)

var (
	// adapters maps the owner job's GroupVersionKind string to the adapter able to sync it.
	adapters = map[string]jobAdapter{
		batchv1.SchemeGroupVersion.WithKind("Job").String():   &batchJobAdapter{},
		jobset.SchemeGroupVersion.WithKind("JobSet").String(): &jobsetAdapter{},
	}

	errNoActiveClusters = errors.New("no active clusters")
)

// wlReconciler reconciles the local workloads that use a MultiKueue admission check,
// keeping their copies on the worker clusters in sync.
type wlReconciler struct {
	client   client.Client
	helper   *multiKueueStoreHelper
	clusters *clustersReconciler
	origin   string
}

var _ reconcile.Reconciler = (*wlReconciler)(nil)
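
// jobAdapter abstracts the per-kind operations MultiKueue needs to run against a
// workload's owner job (for example a batch/v1 Job or a JobSet) on a worker cluster.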
type jobAdapter interface {
	// Creates the Job object in the worker cluster using the remote client, if not already created.
	// Copies the status from the remote job if it already exists.
	SyncJob(ctx context.Context, localClient client.Client, remoteClient client.Client, key types.NamespacedName, workloadName, origin string) error
	// Deletes the Job in the worker cluster.
	DeleteRemoteObject(ctx context.Context, remoteClient client.Client, key types.NamespacedName) error
	// KeepAdmissionCheckPending returns true if the state of the multikueue admission check should be
	// kept Pending while the job runs in a worker. This might be needed to keep the manager's job
	// suspended and not start the execution locally.
	KeepAdmissionCheckPending() bool
}

// wlGroup holds the local workload, its copies on the worker clusters, and the
// clients and adapter needed to manage them.
type wlGroup struct {
	local         *kueue.Workload
	remotes       map[string]*kueue.Workload
	remoteClients map[string]*remoteClient
	acName        string
	jobAdapter    jobAdapter
	controllerKey types.NamespacedName
}

// IsFinished returns true if the local workload is finished.
func (g *wlGroup) IsFinished() bool {
	return apimeta.IsStatusConditionTrue(g.local.Status.Conditions, kueue.WorkloadFinished)
}

// FirstReserving returns true if there is a remote workload reserving quota;
// the string identifies the remote with the oldest reservation ("" if none).
func (g *wlGroup) FirstReserving() (bool, string) {
	found := false
	bestMatch := ""
	bestTime := time.Now()
	for remote, wl := range g.remotes {
		if wl == nil {
			continue
		}
		c := apimeta.FindStatusCondition(wl.Status.Conditions, kueue.WorkloadQuotaReserved)
		if c != nil && c.Status == metav1.ConditionTrue && (!found || c.LastTransitionTime.Time.Before(bestTime)) {
			found = true
			bestMatch = remote
			bestTime = c.LastTransitionTime.Time
		}
	}
	return found, bestMatch
}

// RemoteFinishedCondition returns the Finished condition of the remote workload that
// finished first, if any, along with the name of its worker cluster.
func (g *wlGroup) RemoteFinishedCondition() (*metav1.Condition, string) {
	var bestMatch *metav1.Condition
	bestMatchRemote := ""
	for remote, wl := range g.remotes {
		if wl == nil {
			continue
		}
		if c := apimeta.FindStatusCondition(wl.Status.Conditions, kueue.WorkloadFinished); c != nil && c.Status == metav1.ConditionTrue && (bestMatch == nil || c.LastTransitionTime.Before(&bestMatch.LastTransitionTime)) {
			bestMatch = c
			bestMatchRemote = remote
		}
	}
	return bestMatch, bestMatchRemote
}
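
// RemoveRemoteObjects deletes the owner job and the workload copy from the given worker
// cluster, dropping the kueue finalizer from the remote workload before deleting it.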
func (g *wlGroup) RemoveRemoteObjects(ctx context.Context, cluster string) error {
	remWl := g.remotes[cluster]
	if remWl == nil {
		return nil
	}
	if err := g.jobAdapter.DeleteRemoteObject(ctx, g.remoteClients[cluster].client, g.controllerKey); err != nil {
		return fmt.Errorf("deleting remote controller object: %w", err)
	}

	if controllerutil.RemoveFinalizer(remWl, kueue.ResourceInUseFinalizerName) {
		if err := g.remoteClients[cluster].client.Update(ctx, remWl); err != nil {
			return fmt.Errorf("removing remote workload's finalizer: %w", err)
		}
	}

	err := g.remoteClients[cluster].client.Delete(ctx, remWl)
	if client.IgnoreNotFound(err) != nil {
		return fmt.Errorf("deleting remote workload: %w", err)
	}
	g.remotes[cluster] = nil
	return nil
}

func (a *wlReconciler) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) {
	log := ctrl.LoggerFrom(ctx)
	log.V(2).Info("Reconcile Workload")
	wl := &kueue.Workload{}
	if err := a.client.Get(ctx, req.NamespacedName, wl); err != nil {
		return reconcile.Result{}, client.IgnoreNotFound(err)
	}
	// NOTE: a not-found error needs to be handled and should result in the deletion of all the remote workloads.
	// Since the list of remotes can only be taken from the workload's list of admission check states, we need to either:
	// 1. use a finalizer, or
	// 2. try to trigger the remote deletion from an event filter.

	grp, err := a.readGroup(ctx, wl)
	if err != nil {
		return reconcile.Result{}, err
	}

	if grp == nil {
		log.V(2).Info("Skip Workload")
		return reconcile.Result{}, nil
	}

	return reconcile.Result{}, a.reconcileGroup(ctx, grp)
}

// remoteClientsForAC returns the remote clients of the active clusters configured for the
// given admission check; it returns errNoActiveClusters if none of them is active.
func (w *wlReconciler) remoteClientsForAC(ctx context.Context, acName string) (map[string]*remoteClient, error) {
	cfg, err := w.helper.ConfigForAdmissionCheck(ctx, acName)
	if err != nil {
		return nil, err
	}
	clients := make(map[string]*remoteClient, len(cfg.Spec.Clusters))
	for _, clusterName := range cfg.Spec.Clusters {
		if client, found := w.clusters.controllerFor(clusterName); found {
			clients[clusterName] = client
		}
	}
	if len(clients) == 0 {
		return nil, errNoActiveClusters
	}
	return clients, nil
}

// readGroup builds the wlGroup for the given local workload, fetching its copies from the
// worker clusters. It returns nil if the workload is not relevant for this controller.
func (a *wlReconciler) readGroup(ctx context.Context, local *kueue.Workload) (*wlGroup, error) {
	relevantChecks, err := admissioncheck.FilterForController(ctx, a.client, local.Status.AdmissionChecks, ControllerName)
	if err != nil {
		return nil, err
	}

	// If the workload has no multikueue admission check (len(relevantChecks) == 0), skip it.
	if len(relevantChecks) == 0 {
		return nil, nil
	}

	rClients, err := a.remoteClientsForAC(ctx, relevantChecks[0])
	if err != nil {
		return nil, fmt.Errorf("admission check %q: %w", relevantChecks[0], err)
	}

	// Lookup the adapter.
	var adapter jobAdapter
	controllerKey := types.NamespacedName{}
	if controller := metav1.GetControllerOf(local); controller != nil {
		adapterKey := schema.FromAPIVersionAndKind(controller.APIVersion, controller.Kind).String()
		adapter = adapters[adapterKey]
		controllerKey.Namespace = local.Namespace
		controllerKey.Name = controller.Name
	}

	if adapter == nil {
		return nil, nil
	}

	grp := wlGroup{
		local:         local,
		remotes:       make(map[string]*kueue.Workload, len(rClients)),
		remoteClients: rClients,
		acName:        relevantChecks[0],
		jobAdapter:    adapter,
		controllerKey: controllerKey,
	}

	for remote, rClient := range rClients {
		wl := &kueue.Workload{}
		err := rClient.client.Get(ctx, client.ObjectKeyFromObject(local), wl)
		if client.IgnoreNotFound(err) != nil {
			return nil, err
		}
		if err != nil {
			wl = nil
		}
		grp.remotes[remote] = wl
	}
	return &grp, nil
}
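
// reconcileGroup drives one pass over a workload group: it removes remote objects once the
// local workload is finished or loses its quota reservation, propagates a remote Finished
// condition back to the local workload, keeps only the reserving remote's objects in sync,
// and otherwise creates the missing remote workloads.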
func (a *wlReconciler) reconcileGroup(ctx context.Context, group *wlGroup) error {
	log := ctrl.LoggerFrom(ctx).WithValues("op", "reconcileGroup")
	log.V(3).Info("Reconcile Workload Group")

	// 1. delete all remote workloads when the local one is finished or has no quota reservation
	if group.IsFinished() || !workload.HasQuotaReservation(group.local) {
		errs := []error{}
		for rem := range group.remotes {
			if err := group.RemoveRemoteObjects(ctx, rem); err != nil {
				errs = append(errs, err)
				log.V(2).Error(err, "Deleting remote workload", "workerCluster", rem)
			}
		}
		return errors.Join(errs...)
	}

	if remoteFinishedCond, remote := group.RemoteFinishedCondition(); remoteFinishedCond != nil {
		// NOTE: setting the workload status here can race with the job controller updating it;
		// that should not be problematic, but the "From remote xxxx:" prefix could be lost.

		if group.jobAdapter != nil {
			if err := group.jobAdapter.SyncJob(ctx, a.client, group.remoteClients[remote].client, group.controllerKey, group.local.Name, a.origin); err != nil {
				log.V(2).Error(err, "copying remote controller status", "workerCluster", remote)
				// we should retry this
				return err
			}
		} else {
			log.V(3).Info("Group with no adapter, skip owner status copy", "workerCluster", remote)
		}

		// copy the Finished condition to the local workload
		wlPatch := workload.BaseSSAWorkload(group.local)
		apimeta.SetStatusCondition(&wlPatch.Status.Conditions, metav1.Condition{
			Type:    kueue.WorkloadFinished,
			Status:  metav1.ConditionTrue,
			Reason:  remoteFinishedCond.Reason,
			Message: remoteFinishedCond.Message,
		})
		return a.client.Status().Patch(ctx, wlPatch, client.Apply, client.FieldOwner(ControllerName+"-finish"), client.ForceOwnership)
	}

	hasReserving, reservingRemote := group.FirstReserving()

	// 2. delete all remote workloads that are out of sync or are not in the chosen worker
	for rem, remWl := range group.remotes {
		if remWl == nil {
			continue
		}
		outOfSync := group.local == nil || !equality.Semantic.DeepEqual(group.local.Spec, remWl.Spec)
		notReservingRemote := hasReserving && reservingRemote != rem
		if outOfSync || notReservingRemote {
			if err := client.IgnoreNotFound(group.RemoveRemoteObjects(ctx, rem)); err != nil {
				log.V(2).Error(err, "Deleting out of sync remote objects", "remote", rem)
				return err
			}
		}
	}

	// 3. sync the owner job to the first reserving remote and update the admission check state
	if hasReserving {
		acs := workload.FindAdmissionCheck(group.local.Status.AdmissionChecks, group.acName)
		if err := group.jobAdapter.SyncJob(ctx, a.client, group.remoteClients[reservingRemote].client, group.controllerKey, group.local.Name, a.origin); err != nil {
			log.V(2).Error(err, "creating remote controller object", "remote", reservingRemote)
			// We'll retry this in the next reconcile.
			return err
		}

		if acs.State != kueue.CheckStateRetry && acs.State != kueue.CheckStateRejected {
			if group.jobAdapter.KeepAdmissionCheckPending() {
				acs.State = kueue.CheckStatePending
			} else {
				acs.State = kueue.CheckStateReady
			}
			// update the message
			acs.Message = fmt.Sprintf("The workload got reservation on %q", reservingRemote)
			wlPatch := workload.BaseSSAWorkload(group.local)
			workload.SetAdmissionCheckState(&wlPatch.Status.AdmissionChecks, *acs)
			err := a.client.Status().Patch(ctx, wlPatch, client.Apply, client.FieldOwner(ControllerName), client.ForceOwnership)
			if err != nil {
				return err
			}
		}
		// drop this return if we want to create new remote workloads while holding a reservation
		return nil
	}

	// 4. finally, create the missing remote workloads
	var errs []error
	for rem, remWl := range group.remotes {
		if remWl == nil {
			clone := cloneForCreate(group.local, group.remoteClients[rem].origin)
			err := group.remoteClients[rem].client.Create(ctx, clone)
			if err != nil {
				// just log the error for a single remote
				log.V(2).Error(err, "creating remote object", "remote", rem)
				errs = append(errs, err)
			}
		}
	}
	return errors.Join(errs...)
}
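
// newWlReconciler creates a workload reconciler using the given client, MultiKueue store
// helper, clusters reconciler and origin.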
func newWlReconciler(c client.Client, helper *multiKueueStoreHelper, cRec *clustersReconciler, origin string) *wlReconciler {
	return &wlReconciler{
		client:   c,
		helper:   helper,
		clusters: cRec,
		origin:   origin,
	}
}

func (w *wlReconciler) setupWithManager(mgr ctrl.Manager) error {
	syncHndl := handler.Funcs{
		GenericFunc: func(_ context.Context, e event.GenericEvent, q workqueue.RateLimitingInterface) {
			q.Add(reconcile.Request{NamespacedName: types.NamespacedName{
				Namespace: e.Object.GetNamespace(),
				Name:      e.Object.GetName(),
			}})
		},
	}

	return ctrl.NewControllerManagedBy(mgr).
		For(&kueue.Workload{}).
		WatchesRawSource(&source.Channel{Source: w.clusters.wlUpdateCh}, syncHndl).
		Complete(w)
}

func cleanObjectMeta(orig *metav1.ObjectMeta) metav1.ObjectMeta {
	// to clone the labels and annotations
	clone := orig.DeepCopy()
	return metav1.ObjectMeta{
		Name:        clone.Name,
		Namespace:   clone.Namespace,
		Labels:      clone.Labels,
		Annotations: clone.Annotations,
	}
}

func cloneForCreate(orig *kueue.Workload, origin string) *kueue.Workload {
	remoteWl := &kueue.Workload{}
	remoteWl.ObjectMeta = cleanObjectMeta(&orig.ObjectMeta)
	if remoteWl.Labels == nil {
		remoteWl.Labels = make(map[string]string)
	}
	remoteWl.Labels[kueuealpha.MultiKueueOriginLabel] = origin
	orig.Spec.DeepCopyInto(&remoteWl.Spec)
	return remoteWl
}