sigs.k8s.io/kueue@v0.6.2/pkg/controller/jobs/raycluster/raycluster_webhook.go (about)

     1  /*
     2  Copyright 2024 The Kubernetes Authors.
     3  Licensed under the Apache License, Version 2.0 (the "License");
     4  you may not use this file except in compliance with the License.
     5  You may obtain a copy of the License at
     6      http://www.apache.org/licenses/LICENSE-2.0
     7  Unless required by applicable law or agreed to in writing, software
     8  distributed under the License is distributed on an "AS IS" BASIS,
     9  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    10  See the License for the specific language governing permissions and
    11  limitations under the License.
    12  */
    13  
    14  package raycluster
    15  
    16  import (
    17  	"context"
    18  	"fmt"
    19  
    20  	rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
    21  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    22  	"k8s.io/apimachinery/pkg/runtime"
    23  	"k8s.io/apimachinery/pkg/util/validation/field"
    24  	"k8s.io/klog/v2"
    25  	"k8s.io/utils/ptr"
    26  
    27  	ctrl "sigs.k8s.io/controller-runtime"
    28  	"sigs.k8s.io/controller-runtime/pkg/webhook"
    29  	"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
    30  
    31  	"sigs.k8s.io/kueue/pkg/controller/constants"
    32  	"sigs.k8s.io/kueue/pkg/controller/jobframework"
    33  )
    34  
// RayClusterWebhook is the admission webhook handler for rayv1.RayCluster
// objects. A single instance is registered as both the defaulter and the
// validator by SetupRayClusterWebhook.
type RayClusterWebhook struct {
	// manageJobsWithoutQueueName is copied from the jobframework options.
	// When true, the RayCluster-specific validation rules are enforced even if
	// the object carries no queue name, and the value is forwarded to
	// jobframework.ApplyDefaultForSuspend during defaulting.
	manageJobsWithoutQueueName bool
}
    38  
    39  // SetupRayClusterWebhook configures the webhook for rayv1 RayCluster.
    40  func SetupRayClusterWebhook(mgr ctrl.Manager, opts ...jobframework.Option) error {
    41  	options := jobframework.ProcessOptions(opts...)
    42  	for _, opt := range opts {
    43  		opt(&options)
    44  	}
    45  	wh := &RayClusterWebhook{
    46  		manageJobsWithoutQueueName: options.ManageJobsWithoutQueueName,
    47  	}
    48  	return ctrl.NewWebhookManagedBy(mgr).
    49  		For(&rayv1.RayCluster{}).
    50  		WithDefaulter(wh).
    51  		WithValidator(wh).
    52  		Complete()
    53  }
    54  
    55  // +kubebuilder:webhook:path=/mutate-ray-io-v1-raycluster,mutating=true,failurePolicy=fail,sideEffects=None,groups=ray.io,resources=rayclusters,verbs=create,versions=v1,name=mraycluster.kb.io,admissionReviewVersions=v1
    56  
// Compile-time assertion that *RayClusterWebhook satisfies webhook.CustomDefaulter.
var _ webhook.CustomDefaulter = &RayClusterWebhook{}
    58  
    59  // Default implements webhook.CustomDefaulter so a webhook will be registered for the type
    60  func (w *RayClusterWebhook) Default(ctx context.Context, obj runtime.Object) error {
    61  	job := fromObject(obj)
    62  	log := ctrl.LoggerFrom(ctx).WithName("raycluster-webhook")
    63  	log.V(10).Info("Applying defaults", "job", klog.KObj(job))
    64  
    65  	// We don't want to double count for a ray cluster created by a RayJob
    66  	if owner := metav1.GetControllerOf(job.Object()); owner != nil && jobframework.IsOwnerManagedByKueue(owner) {
    67  		log.Info("RayCluster is owned by RayJob")
    68  		if job.Annotations == nil {
    69  			job.Annotations = make(map[string]string)
    70  		}
    71  		if pwName, err := jobframework.GetWorkloadNameForOwnerRef(owner); err != nil {
    72  			return err
    73  		} else {
    74  			job.Annotations[constants.ParentWorkloadAnnotation] = pwName
    75  		}
    76  		return nil
    77  	}
    78  
    79  	jobframework.ApplyDefaultForSuspend((*RayCluster)(job), w.manageJobsWithoutQueueName)
    80  	return nil
    81  }
    82  
    83  // +kubebuilder:webhook:path=/validate-ray-io-v1-raycluster,mutating=false,failurePolicy=fail,sideEffects=None,groups=ray.io,resources=rayclusters,verbs=create;update,versions=v1,name=vraycluster.kb.io,admissionReviewVersions=v1
    84  
// Compile-time assertion that *RayClusterWebhook satisfies webhook.CustomValidator.
var _ webhook.CustomValidator = &RayClusterWebhook{}
    86  
    87  // ValidateCreate implements webhook.CustomValidator so a webhook will be registered for the type
    88  func (w *RayClusterWebhook) ValidateCreate(ctx context.Context, obj runtime.Object) (admission.Warnings, error) {
    89  	job := obj.(*rayv1.RayCluster)
    90  	log := ctrl.LoggerFrom(ctx).WithName("raycluster-webhook")
    91  	log.V(10).Info("Validating create", "job", klog.KObj(job))
    92  	return nil, w.validateCreate(job).ToAggregate()
    93  }
    94  
    95  func (w *RayClusterWebhook) validateCreate(job *rayv1.RayCluster) field.ErrorList {
    96  	var allErrors field.ErrorList
    97  	kueueJob := (*RayCluster)(job)
    98  
    99  	if w.manageJobsWithoutQueueName || jobframework.QueueName(kueueJob) != "" {
   100  		spec := &job.Spec
   101  		specPath := field.NewPath("spec")
   102  
   103  		// TODO revisit once Support dynamically sized (elastic) jobs #77 is implemented
   104  		// Should not use auto scaler. Once the resources are reserved by queue the cluster should do it's best to use them.
   105  		if ptr.Deref(spec.EnableInTreeAutoscaling, false) {
   106  			allErrors = append(allErrors, field.Invalid(specPath.Child("enableInTreeAutoscaling"), spec.EnableInTreeAutoscaling, "a kueue managed job should not use autoscaling"))
   107  		}
   108  
   109  		// Should limit the worker count to 8 - 1 (max podSets num - cluster head)
   110  		if len(spec.WorkerGroupSpecs) > 7 {
   111  			allErrors = append(allErrors, field.TooMany(specPath.Child("workerGroupSpecs"), len(spec.WorkerGroupSpecs), 7))
   112  		}
   113  
   114  		// None of the workerGroups should be named "head"
   115  		for i := range spec.WorkerGroupSpecs {
   116  			if spec.WorkerGroupSpecs[i].GroupName == headGroupPodSetName {
   117  				allErrors = append(allErrors, field.Forbidden(specPath.Child("workerGroupSpecs").Index(i).Child("groupName"), fmt.Sprintf("%q is reserved for the head group", headGroupPodSetName)))
   118  			}
   119  		}
   120  	}
   121  
   122  	allErrors = append(allErrors, jobframework.ValidateCreateForQueueName(kueueJob)...)
   123  	return allErrors
   124  }
   125  
   126  // ValidateUpdate implements webhook.CustomValidator so a webhook will be registered for the type
   127  func (w *RayClusterWebhook) ValidateUpdate(ctx context.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) {
   128  	oldJob := oldObj.(*rayv1.RayCluster)
   129  	newJob := newObj.(*rayv1.RayCluster)
   130  	log := ctrl.LoggerFrom(ctx).WithName("raycluster-webhook")
   131  	if w.manageJobsWithoutQueueName || jobframework.QueueName((*RayCluster)(newJob)) != "" {
   132  		log.Info("Validating update", "job", klog.KObj(newJob))
   133  		allErrors := jobframework.ValidateUpdateForQueueName((*RayCluster)(oldJob), (*RayCluster)(newJob))
   134  		allErrors = append(allErrors, w.validateCreate(newJob)...)
   135  		allErrors = append(allErrors, jobframework.ValidateUpdateForWorkloadPriorityClassName((*RayCluster)(oldJob), (*RayCluster)(newJob))...)
   136  		return nil, allErrors.ToAggregate()
   137  	}
   138  	return nil, nil
   139  }
   140  
// ValidateDelete implements webhook.CustomValidator so a webhook will be registered for the type.
//
// Deletion is never restricted: ctx and obj are ignored and the method always
// returns nil warnings and a nil error.
func (w *RayClusterWebhook) ValidateDelete(ctx context.Context, obj runtime.Object) (admission.Warnings, error) {
	return nil, nil
}