sigs.k8s.io/kueue@v0.6.2/pkg/controller/admissionchecks/multikueue/workload.go

/*
Copyright 2024 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package multikueue

import (
	"context"
	"errors"
	"fmt"
	"time"

	batchv1 "k8s.io/api/batch/v1"
	"k8s.io/apimachinery/pkg/api/equality"
	apimeta "k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/util/workqueue"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
	"sigs.k8s.io/controller-runtime/pkg/event"
	"sigs.k8s.io/controller-runtime/pkg/handler"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"
	"sigs.k8s.io/controller-runtime/pkg/source"
	jobset "sigs.k8s.io/jobset/api/jobset/v1alpha2"

	kueuealpha "sigs.k8s.io/kueue/apis/kueue/v1alpha1"
	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
	"sigs.k8s.io/kueue/pkg/util/admissioncheck"
	"sigs.k8s.io/kueue/pkg/workload"
)

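// adapters registers the jobAdapter for each supported owner job kind,
// keyed by the string form of its GroupVersionKind.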
var (
	adapters = map[string]jobAdapter{
		batchv1.SchemeGroupVersion.WithKind("Job").String():   &batchJobAdapter{},
		jobset.SchemeGroupVersion.WithKind("JobSet").String(): &jobsetAdapter{},
	}

	errNoActiveClusters = errors.New("no active clusters")
)

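// wlReconciler reconciles a local Workload against its copies in the
// worker clusters.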
type wlReconciler struct {
	client   client.Client
	helper   *multiKueueStoreHelper
	clusters *clustersReconciler
	origin   string
}

var _ reconcile.Reconciler = (*wlReconciler)(nil)

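// jobAdapter abstracts the operations MultiKueue needs to perform on a
// specific job kind (e.g. batch Job or JobSet) in the worker clusters.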
type jobAdapter interface {
	// SyncJob creates the job object in the worker cluster using the remote client,
	// if it is not already created.
	// It copies the status from the remote job if one already exists.
	SyncJob(ctx context.Context, localClient client.Client, remoteClient client.Client, key types.NamespacedName, workloadName, origin string) error
	// DeleteRemoteObject deletes the job in the worker cluster.
	DeleteRemoteObject(ctx context.Context, remoteClient client.Client, key types.NamespacedName) error
	// KeepAdmissionCheckPending returns true if the state of the multikueue admission check should be
	// kept Pending while the job runs in a worker. This might be needed to keep the manager's job
	// suspended and prevent it from starting execution locally.
	KeepAdmissionCheckPending() bool
}

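// wlGroup bundles a local Workload with its remote copies, the clients used
// to reach them, and the adapter for its owner job.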
type wlGroup struct {
	local         *kueue.Workload
	remotes       map[string]*kueue.Workload
	remoteClients map[string]*remoteClient
	acName        string
	jobAdapter    jobAdapter
	controllerKey types.NamespacedName
}

// IsFinished returns true if the local workload is finished.
func (g *wlGroup) IsFinished() bool {
	return apimeta.IsStatusConditionTrue(g.local.Status.Conditions, kueue.WorkloadFinished)
}

// FirstReserving returns true if there is a workload reserving quota; the
// returned string identifies the remote holding the earliest reservation
// ("" identifies the local workload).
func (g *wlGroup) FirstReserving() (bool, string) {
	found := false
	bestMatch := ""
	bestTime := time.Now()
	for remote, wl := range g.remotes {
		if wl == nil {
			continue
		}
		c := apimeta.FindStatusCondition(wl.Status.Conditions, kueue.WorkloadQuotaReserved)
		if c != nil && c.Status == metav1.ConditionTrue && (!found || c.LastTransitionTime.Time.Before(bestTime)) {
			found = true
			bestMatch = remote
			bestTime = c.LastTransitionTime.Time
		}
	}
	return found, bestMatch
}

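// RemoteFinishedCondition returns the Finished condition of the remote
// workload that finished first, together with the name of that remote,
// or nil if no remote workload has finished.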
func (g *wlGroup) RemoteFinishedCondition() (*metav1.Condition, string) {
	var bestMatch *metav1.Condition
	bestMatchRemote := ""
	for remote, wl := range g.remotes {
		if wl == nil {
			continue
		}
		if c := apimeta.FindStatusCondition(wl.Status.Conditions, kueue.WorkloadFinished); c != nil && c.Status == metav1.ConditionTrue && (bestMatch == nil || c.LastTransitionTime.Before(&bestMatch.LastTransitionTime)) {
			bestMatch = c
			bestMatchRemote = remote
		}
	}
	return bestMatch, bestMatchRemote
}

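// RemoveRemoteObjects deletes the controller job object and the workload copy
// from the given worker cluster, removing the workload's finalizer first.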
func (g *wlGroup) RemoveRemoteObjects(ctx context.Context, cluster string) error {
	remWl := g.remotes[cluster]
	if remWl == nil {
		return nil
	}
	if err := g.jobAdapter.DeleteRemoteObject(ctx, g.remoteClients[cluster].client, g.controllerKey); err != nil {
		return fmt.Errorf("deleting remote controller object: %w", err)
	}

	if controllerutil.RemoveFinalizer(remWl, kueue.ResourceInUseFinalizerName) {
		if err := g.remoteClients[cluster].client.Update(ctx, remWl); err != nil {
			return fmt.Errorf("removing remote workload finalizer: %w", err)
		}
	}

	err := g.remoteClients[cluster].client.Delete(ctx, remWl)
	if client.IgnoreNotFound(err) != nil {
		return fmt.Errorf("deleting remote workload: %w", err)
	}
	g.remotes[cluster] = nil
	return nil
}

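// Reconcile fetches the local Workload, builds its group of remote copies
// and reconciles the group.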
func (w *wlReconciler) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) {
	log := ctrl.LoggerFrom(ctx)
	log.V(2).Info("Reconcile Workload")
	wl := &kueue.Workload{}
	if err := w.client.Get(ctx, req.NamespacedName, wl); err != nil {
		return reconcile.Result{}, client.IgnoreNotFound(err)
	}
	// NOTE: a not-found error needs to be handled and should result in the deletion of all the remote workloads.
	// Since the list of remotes can only be taken from the workload's list of admission check states, we need to either
	// 1. use a finalizer, or
	// 2. try to trigger the remote deletion from an event filter.

	grp, err := w.readGroup(ctx, wl)
	if err != nil {
		return reconcile.Result{}, err
	}

	if grp == nil {
		log.V(2).Info("Skip Workload")
		return reconcile.Result{}, nil
	}

	return reconcile.Result{}, w.reconcileGroup(ctx, grp)
}

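// remoteClientsForAC returns the remote clients for the clusters listed in
// the given admission check's config, or errNoActiveClusters if none of
// them is currently active.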
func (w *wlReconciler) remoteClientsForAC(ctx context.Context, acName string) (map[string]*remoteClient, error) {
	cfg, err := w.helper.ConfigForAdmissionCheck(ctx, acName)
	if err != nil {
		return nil, err
	}
	clients := make(map[string]*remoteClient, len(cfg.Spec.Clusters))
	for _, clusterName := range cfg.Spec.Clusters {
		if remClient, found := w.clusters.controllerFor(clusterName); found {
			clients[clusterName] = remClient
		}
	}
	if len(clients) == 0 {
		return nil, errNoActiveClusters
	}
	return clients, nil
}

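// readGroup builds the wlGroup for a local workload, fetching its copies
// from all the remote clusters. It returns nil if the workload has no
// relevant admission check or no known job adapter.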
func (w *wlReconciler) readGroup(ctx context.Context, local *kueue.Workload) (*wlGroup, error) {
	relevantChecks, err := admissioncheck.FilterForController(ctx, w.client, local.Status.AdmissionChecks, ControllerName)
	if err != nil {
		return nil, err
	}

	// If there is no multikueue admission check, skip this workload; if there
	// is more than one (len(relevantChecks) > 1), only the first is used.
	if len(relevantChecks) == 0 {
		return nil, nil
	}

	rClients, err := w.remoteClientsForAC(ctx, relevantChecks[0])
	if err != nil {
		return nil, fmt.Errorf("admission check %q: %w", relevantChecks[0], err)
	}

	// Lookup the adapter.
	var adapter jobAdapter
	controllerKey := types.NamespacedName{}
	if controller := metav1.GetControllerOf(local); controller != nil {
		adapterKey := schema.FromAPIVersionAndKind(controller.APIVersion, controller.Kind).String()
		adapter = adapters[adapterKey]
		controllerKey.Namespace = local.Namespace
		controllerKey.Name = controller.Name
	}

	if adapter == nil {
		return nil, nil
	}

	grp := wlGroup{
		local:         local,
		remotes:       make(map[string]*kueue.Workload, len(rClients)),
		remoteClients: rClients,
		acName:        relevantChecks[0],
		jobAdapter:    adapter,
		controllerKey: controllerKey,
	}

	for remote, rClient := range rClients {
		wl := &kueue.Workload{}
		err := rClient.client.Get(ctx, client.ObjectKeyFromObject(local), wl)
		if client.IgnoreNotFound(err) != nil {
			return nil, err
		}
		if err != nil {
			wl = nil
		}
		grp.remotes[remote] = wl
	}
	return &grp, nil
}

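// reconcileGroup drives a workload group through its lifecycle: it removes
// all remote objects once the local workload is finished or loses its quota
// reservation, propagates a remote Finished condition to the local workload,
// keeps the reserving remote in sync, and creates the missing remote copies.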
func (w *wlReconciler) reconcileGroup(ctx context.Context, group *wlGroup) error {
	log := ctrl.LoggerFrom(ctx).WithValues("op", "reconcileGroup")
	log.V(3).Info("Reconcile Workload Group")

	// 1. delete all remote workloads when the local one is finished or has no quota reservation
	if group.IsFinished() || !workload.HasQuotaReservation(group.local) {
		errs := []error{}
		for rem := range group.remotes {
			if err := group.RemoveRemoteObjects(ctx, rem); err != nil {
				errs = append(errs, err)
				log.V(2).Error(err, "Deleting remote workload", "workerCluster", rem)
			}
		}
		return errors.Join(errs...)
	}

	if remoteFinishedCond, remote := group.RemoteFinishedCondition(); remoteFinishedCond != nil {
		// NOTE: we can have a race condition here: the workload status set below may be
		// concurrently updated by the job controller. That should not be problematic,
		// but the "From remote xxxx:" prefix could be lost ....

		if group.jobAdapter != nil {
			if err := group.jobAdapter.SyncJob(ctx, w.client, group.remoteClients[remote].client, group.controllerKey, group.local.Name, w.origin); err != nil {
				log.V(2).Error(err, "copying remote controller status", "workerCluster", remote)
				// we should retry this
				return err
			}
		} else {
			log.V(3).Info("Group with no adapter, skip owner status copy", "workerCluster", remote)
		}

		// copy the Finished condition to the local workload
		wlPatch := workload.BaseSSAWorkload(group.local)
		apimeta.SetStatusCondition(&wlPatch.Status.Conditions, metav1.Condition{
			Type:    kueue.WorkloadFinished,
			Status:  metav1.ConditionTrue,
			Reason:  remoteFinishedCond.Reason,
			Message: remoteFinishedCond.Message,
		})
		return w.client.Status().Patch(ctx, wlPatch, client.Apply, client.FieldOwner(ControllerName+"-finish"), client.ForceOwnership)
	}

	hasReserving, reservingRemote := group.FirstReserving()

	// 2. delete all remote workloads that are out of sync or not in the chosen worker
	for rem, remWl := range group.remotes {
		if remWl == nil {
			continue
		}
		outOfSync := group.local == nil || !equality.Semantic.DeepEqual(group.local.Spec, remWl.Spec)
		notReservingRemote := hasReserving && reservingRemote != rem
		if outOfSync || notReservingRemote {
			if err := client.IgnoreNotFound(group.RemoveRemoteObjects(ctx, rem)); err != nil {
				log.V(2).Error(err, "Deleting out of sync remote objects", "remote", rem)
				return err
			}
		}
	}

	// 3. sync the job to the first reserving remote and update the admission check state
	if hasReserving {
		acs := workload.FindAdmissionCheck(group.local.Status.AdmissionChecks, group.acName)
		if err := group.jobAdapter.SyncJob(ctx, w.client, group.remoteClients[reservingRemote].client, group.controllerKey, group.local.Name, w.origin); err != nil {
			log.V(2).Error(err, "creating remote controller object", "remote", reservingRemote)
			// We'll retry this in the next reconcile.
			return err
		}

		if acs.State != kueue.CheckStateRetry && acs.State != kueue.CheckStateRejected {
			if group.jobAdapter.KeepAdmissionCheckPending() {
				acs.State = kueue.CheckStatePending
			} else {
				acs.State = kueue.CheckStateReady
			}
			// update the message
			acs.Message = fmt.Sprintf("The workload got reservation on %q", reservingRemote)
			wlPatch := workload.BaseSSAWorkload(group.local)
			workload.SetAdmissionCheckState(&wlPatch.Status.AdmissionChecks, *acs)
			err := w.client.Status().Patch(ctx, wlPatch, client.Apply, client.FieldOwner(ControllerName), client.ForceOwnership)
			if err != nil {
				return err
			}
		}
		// drop this return if we want to create new remote workloads while holding a reservation
		return nil
	}

	// finally, create the missing remote workloads
	var errs []error
	for rem, remWl := range group.remotes {
		if remWl == nil {
			clone := cloneForCreate(group.local, group.remoteClients[rem].origin)
			err := group.remoteClients[rem].client.Create(ctx, clone)
			if err != nil {
				// just log the error for a single remote
				log.V(2).Error(err, "creating remote object", "remote", rem)
				errs = append(errs, err)
			}
		}
	}
	return errors.Join(errs...)
}

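// newWlReconciler creates a workload reconciler for the given origin.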
func newWlReconciler(c client.Client, helper *multiKueueStoreHelper, cRec *clustersReconciler, origin string) *wlReconciler {
	return &wlReconciler{
		client:   c,
		helper:   helper,
		clusters: cRec,
		origin:   origin,
	}
}

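// setupWithManager registers the reconciler with the manager, watching local
// Workloads and the remote workload update events forwarded by the cluster
// reconcilers over wlUpdateCh.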
func (w *wlReconciler) setupWithManager(mgr ctrl.Manager) error {
	syncHndl := handler.Funcs{
		GenericFunc: func(_ context.Context, e event.GenericEvent, q workqueue.RateLimitingInterface) {
			q.Add(reconcile.Request{NamespacedName: types.NamespacedName{
				Namespace: e.Object.GetNamespace(),
				Name:      e.Object.GetName(),
			}})
		},
	}

	return ctrl.NewControllerManagedBy(mgr).
		For(&kueue.Workload{}).
		WatchesRawSource(&source.Channel{Source: w.clusters.wlUpdateCh}, syncHndl).
		Complete(w)
}

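// cleanObjectMeta returns an ObjectMeta that keeps only the name, namespace,
// labels and annotations of the original.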
func cleanObjectMeta(orig *metav1.ObjectMeta) metav1.ObjectMeta {
	// to clone the labels and annotations
	clone := orig.DeepCopy()
	return metav1.ObjectMeta{
		Name:        clone.Name,
		Namespace:   clone.Namespace,
		Labels:      clone.Labels,
		Annotations: clone.Annotations,
	}
}

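// cloneForCreate makes a copy of the workload suitable for creation in a
// remote cluster, labeling it with the origin it was created from.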
func cloneForCreate(orig *kueue.Workload, origin string) *kueue.Workload {
	remoteWl := &kueue.Workload{}
	remoteWl.ObjectMeta = cleanObjectMeta(&orig.ObjectMeta)
	if remoteWl.Labels == nil {
		remoteWl.Labels = make(map[string]string)
	}
	remoteWl.Labels[kueuealpha.MultiKueueOriginLabel] = origin
	orig.Spec.DeepCopyInto(&remoteWl.Spec)
	return remoteWl
}