sigs.k8s.io/kueue@v0.6.2/pkg/controller/jobs/rayjob/rayjob_webhook.go (about)

     1  /*
     2  Copyright 2023 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package rayjob
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  
    23  	rayjobapi "github.com/ray-project/kuberay/ray-operator/apis/ray/v1alpha1"
    24  	"k8s.io/apimachinery/pkg/runtime"
    25  	"k8s.io/apimachinery/pkg/util/validation/field"
    26  	"k8s.io/klog/v2"
    27  	"k8s.io/utils/ptr"
    28  
    29  	ctrl "sigs.k8s.io/controller-runtime"
    30  	"sigs.k8s.io/controller-runtime/pkg/webhook"
    31  	"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
    32  
    33  	"sigs.k8s.io/kueue/pkg/controller/jobframework"
    34  )
    35  
// RayJobWebhook is the admission webhook handler for rayjobapi.RayJob objects.
// Its methods provide both the defaulting and the validating hooks.
type RayJobWebhook struct {
	// manageJobsWithoutQueueName is copied from the jobframework option of the
	// same name. When true, the kueue-specific create/update validation runs
	// even for RayJobs that carry no queue name; it is also forwarded to
	// jobframework.ApplyDefaultForSuspend during defaulting.
	manageJobsWithoutQueueName bool
}
    39  
    40  // SetupRayJobWebhook configures the webhook for rayjobapi RayJob.
    41  func SetupRayJobWebhook(mgr ctrl.Manager, opts ...jobframework.Option) error {
    42  	options := jobframework.ProcessOptions(opts...)
    43  	wh := &RayJobWebhook{
    44  		manageJobsWithoutQueueName: options.ManageJobsWithoutQueueName,
    45  	}
    46  	return ctrl.NewWebhookManagedBy(mgr).
    47  		For(&rayjobapi.RayJob{}).
    48  		WithDefaulter(wh).
    49  		WithValidator(wh).
    50  		Complete()
    51  }
    52  
    53  // +kubebuilder:webhook:path=/mutate-ray-io-v1alpha1-rayjob,mutating=true,failurePolicy=fail,sideEffects=None,groups=ray.io,resources=rayjobs,verbs=create,versions=v1alpha1,name=mrayjob.kb.io,admissionReviewVersions=v1
    54  
    55  var _ webhook.CustomDefaulter = &RayJobWebhook{}
    56  
    57  // Default implements webhook.CustomDefaulter so a webhook will be registered for the type
    58  func (w *RayJobWebhook) Default(ctx context.Context, obj runtime.Object) error {
    59  	job := obj.(*rayjobapi.RayJob)
    60  	log := ctrl.LoggerFrom(ctx).WithName("rayjob-webhook")
    61  	log.V(5).Info("Applying defaults", "job", klog.KObj(job))
    62  	jobframework.ApplyDefaultForSuspend((*RayJob)(job), w.manageJobsWithoutQueueName)
    63  	return nil
    64  }
    65  
    66  // +kubebuilder:webhook:path=/validate-ray-io-v1alpha1-rayjob,mutating=false,failurePolicy=fail,sideEffects=None,groups=ray.io,resources=rayjobs,verbs=create;update,versions=v1alpha1,name=vrayjob.kb.io,admissionReviewVersions=v1
    67  
    68  var _ webhook.CustomValidator = &RayJobWebhook{}
    69  
    70  // ValidateCreate implements webhook.CustomValidator so a webhook will be registered for the type
    71  func (w *RayJobWebhook) ValidateCreate(ctx context.Context, obj runtime.Object) (admission.Warnings, error) {
    72  	job := obj.(*rayjobapi.RayJob)
    73  	log := ctrl.LoggerFrom(ctx).WithName("rayjob-webhook")
    74  	log.Info("Validating create", "job", klog.KObj(job))
    75  	return nil, w.validateCreate(job).ToAggregate()
    76  }
    77  
    78  func (w *RayJobWebhook) validateCreate(job *rayjobapi.RayJob) field.ErrorList {
    79  	var allErrors field.ErrorList
    80  	kueueJob := (*RayJob)(job)
    81  
    82  	if w.manageJobsWithoutQueueName || jobframework.QueueName(kueueJob) != "" {
    83  		spec := &job.Spec
    84  		specPath := field.NewPath("spec")
    85  
    86  		// Should always delete the cluster after the sob has ended, otherwise it will continue to the queue's resources.
    87  		if !spec.ShutdownAfterJobFinishes {
    88  			allErrors = append(allErrors, field.Invalid(specPath.Child("shutdownAfterJobFinishes"), spec.ShutdownAfterJobFinishes, "a kueue managed job should delete the cluster after finishing"))
    89  		}
    90  
    91  		// Should not want existing cluster. Keuue (workload) should be able to control the admission of the actual work, not only the trigger.
    92  		if len(spec.ClusterSelector) > 0 {
    93  			allErrors = append(allErrors, field.Invalid(specPath.Child("clusterSelector"), spec.ClusterSelector, "a kueue managed job should not use an existing cluster"))
    94  		}
    95  
    96  		clusterSpec := spec.RayClusterSpec
    97  		clusterSpecPath := specPath.Child("rayClusterSpec")
    98  
    99  		// Should not use auto scaler. Once the resources are reserved by queue the cluster should do it's best to use them.
   100  		if ptr.Deref(clusterSpec.EnableInTreeAutoscaling, false) {
   101  			allErrors = append(allErrors, field.Invalid(clusterSpecPath.Child("enableInTreeAutoscaling"), clusterSpec.EnableInTreeAutoscaling, "a kueue managed job should not use autoscaling"))
   102  		}
   103  
   104  		// Should limit the worker count to 8 - 1 (max podSets num - cluster head)
   105  		if len(clusterSpec.WorkerGroupSpecs) > 7 {
   106  			allErrors = append(allErrors, field.TooMany(clusterSpecPath.Child("workerGroupSpecs"), len(clusterSpec.WorkerGroupSpecs), 7))
   107  		}
   108  
   109  		// None of the workerGroups should be named "head"
   110  		for i := range clusterSpec.WorkerGroupSpecs {
   111  			if clusterSpec.WorkerGroupSpecs[i].GroupName == headGroupPodSetName {
   112  				allErrors = append(allErrors, field.Forbidden(clusterSpecPath.Child("workerGroupSpecs").Index(i).Child("groupName"), fmt.Sprintf("%q is reserved for the head group", headGroupPodSetName)))
   113  			}
   114  		}
   115  	}
   116  
   117  	allErrors = append(allErrors, jobframework.ValidateCreateForQueueName(kueueJob)...)
   118  	return allErrors
   119  }
   120  
   121  // ValidateUpdate implements webhook.CustomValidator so a webhook will be registered for the type
   122  func (w *RayJobWebhook) ValidateUpdate(ctx context.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) {
   123  	oldJob := oldObj.(*rayjobapi.RayJob)
   124  	newJob := newObj.(*rayjobapi.RayJob)
   125  	log := ctrl.LoggerFrom(ctx).WithName("rayjob-webhook")
   126  	if w.manageJobsWithoutQueueName || jobframework.QueueName((*RayJob)(newJob)) != "" {
   127  		log.Info("Validating update", "job", klog.KObj(newJob))
   128  		allErrors := jobframework.ValidateUpdateForQueueName((*RayJob)(oldJob), (*RayJob)(newJob))
   129  		allErrors = append(allErrors, w.validateCreate(newJob)...)
   130  		allErrors = append(allErrors, jobframework.ValidateUpdateForWorkloadPriorityClassName((*RayJob)(oldJob), (*RayJob)(newJob))...)
   131  		return nil, allErrors.ToAggregate()
   132  	}
   133  	return nil, nil
   134  }
   135  
// ValidateDelete implements webhook.CustomValidator so a webhook will be registered for the type.
// No checks are performed on deletion: ctx and obj are ignored and the method
// always returns no warnings and a nil error. The validating webhook marker in
// this file lists only the create and update verbs.
func (w *RayJobWebhook) ValidateDelete(ctx context.Context, obj runtime.Object) (admission.Warnings, error) {
	return nil, nil
}