sigs.k8s.io/kueue@v0.6.2/pkg/controller/jobs/rayjob/rayjob_webhook.go (about) 1 /* 2 Copyright 2023 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package rayjob 18 19 import ( 20 "context" 21 "fmt" 22 23 rayjobapi "github.com/ray-project/kuberay/ray-operator/apis/ray/v1alpha1" 24 "k8s.io/apimachinery/pkg/runtime" 25 "k8s.io/apimachinery/pkg/util/validation/field" 26 "k8s.io/klog/v2" 27 "k8s.io/utils/ptr" 28 29 ctrl "sigs.k8s.io/controller-runtime" 30 "sigs.k8s.io/controller-runtime/pkg/webhook" 31 "sigs.k8s.io/controller-runtime/pkg/webhook/admission" 32 33 "sigs.k8s.io/kueue/pkg/controller/jobframework" 34 ) 35 36 type RayJobWebhook struct { 37 manageJobsWithoutQueueName bool 38 } 39 40 // SetupRayJobWebhook configures the webhook for rayjobapi RayJob. 41 func SetupRayJobWebhook(mgr ctrl.Manager, opts ...jobframework.Option) error { 42 options := jobframework.ProcessOptions(opts...) 43 wh := &RayJobWebhook{ 44 manageJobsWithoutQueueName: options.ManageJobsWithoutQueueName, 45 } 46 return ctrl.NewWebhookManagedBy(mgr). 47 For(&rayjobapi.RayJob{}). 48 WithDefaulter(wh). 49 WithValidator(wh). 50 Complete() 51 } 52 53 // +kubebuilder:webhook:path=/mutate-ray-io-v1alpha1-rayjob,mutating=true,failurePolicy=fail,sideEffects=None,groups=ray.io,resources=rayjobs,verbs=create,versions=v1alpha1,name=mrayjob.kb.io,admissionReviewVersions=v1 54 55 var _ webhook.CustomDefaulter = &RayJobWebhook{} 56 57 // Default implements webhook.CustomDefaulter so a webhook will be registered for the type 58 func (w *RayJobWebhook) Default(ctx context.Context, obj runtime.Object) error { 59 job := obj.(*rayjobapi.RayJob) 60 log := ctrl.LoggerFrom(ctx).WithName("rayjob-webhook") 61 log.V(5).Info("Applying defaults", "job", klog.KObj(job)) 62 jobframework.ApplyDefaultForSuspend((*RayJob)(job), w.manageJobsWithoutQueueName) 63 return nil 64 } 65 66 // +kubebuilder:webhook:path=/validate-ray-io-v1alpha1-rayjob,mutating=false,failurePolicy=fail,sideEffects=None,groups=ray.io,resources=rayjobs,verbs=create;update,versions=v1alpha1,name=vrayjob.kb.io,admissionReviewVersions=v1 67 68 var _ webhook.CustomValidator = &RayJobWebhook{} 69 70 // ValidateCreate implements webhook.CustomValidator so a webhook will be registered for the type 71 func (w *RayJobWebhook) ValidateCreate(ctx context.Context, obj runtime.Object) (admission.Warnings, error) { 72 job := obj.(*rayjobapi.RayJob) 73 log := ctrl.LoggerFrom(ctx).WithName("rayjob-webhook") 74 log.Info("Validating create", "job", klog.KObj(job)) 75 return nil, w.validateCreate(job).ToAggregate() 76 } 77 78 func (w *RayJobWebhook) validateCreate(job *rayjobapi.RayJob) field.ErrorList { 79 var allErrors field.ErrorList 80 kueueJob := (*RayJob)(job) 81 82 if w.manageJobsWithoutQueueName || jobframework.QueueName(kueueJob) != "" { 83 spec := &job.Spec 84 specPath := field.NewPath("spec") 85 86 // Should always delete the cluster after the sob has ended, otherwise it will continue to the queue's resources. 87 if !spec.ShutdownAfterJobFinishes { 88 allErrors = append(allErrors, field.Invalid(specPath.Child("shutdownAfterJobFinishes"), spec.ShutdownAfterJobFinishes, "a kueue managed job should delete the cluster after finishing")) 89 } 90 91 // Should not want existing cluster. Keuue (workload) should be able to control the admission of the actual work, not only the trigger. 92 if len(spec.ClusterSelector) > 0 { 93 allErrors = append(allErrors, field.Invalid(specPath.Child("clusterSelector"), spec.ClusterSelector, "a kueue managed job should not use an existing cluster")) 94 } 95 96 clusterSpec := spec.RayClusterSpec 97 clusterSpecPath := specPath.Child("rayClusterSpec") 98 99 // Should not use auto scaler. Once the resources are reserved by queue the cluster should do it's best to use them. 100 if ptr.Deref(clusterSpec.EnableInTreeAutoscaling, false) { 101 allErrors = append(allErrors, field.Invalid(clusterSpecPath.Child("enableInTreeAutoscaling"), clusterSpec.EnableInTreeAutoscaling, "a kueue managed job should not use autoscaling")) 102 } 103 104 // Should limit the worker count to 8 - 1 (max podSets num - cluster head) 105 if len(clusterSpec.WorkerGroupSpecs) > 7 { 106 allErrors = append(allErrors, field.TooMany(clusterSpecPath.Child("workerGroupSpecs"), len(clusterSpec.WorkerGroupSpecs), 7)) 107 } 108 109 // None of the workerGroups should be named "head" 110 for i := range clusterSpec.WorkerGroupSpecs { 111 if clusterSpec.WorkerGroupSpecs[i].GroupName == headGroupPodSetName { 112 allErrors = append(allErrors, field.Forbidden(clusterSpecPath.Child("workerGroupSpecs").Index(i).Child("groupName"), fmt.Sprintf("%q is reserved for the head group", headGroupPodSetName))) 113 } 114 } 115 } 116 117 allErrors = append(allErrors, jobframework.ValidateCreateForQueueName(kueueJob)...) 118 return allErrors 119 } 120 121 // ValidateUpdate implements webhook.CustomValidator so a webhook will be registered for the type 122 func (w *RayJobWebhook) ValidateUpdate(ctx context.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) { 123 oldJob := oldObj.(*rayjobapi.RayJob) 124 newJob := newObj.(*rayjobapi.RayJob) 125 log := ctrl.LoggerFrom(ctx).WithName("rayjob-webhook") 126 if w.manageJobsWithoutQueueName || jobframework.QueueName((*RayJob)(newJob)) != "" { 127 log.Info("Validating update", "job", klog.KObj(newJob)) 128 allErrors := jobframework.ValidateUpdateForQueueName((*RayJob)(oldJob), (*RayJob)(newJob)) 129 allErrors = append(allErrors, w.validateCreate(newJob)...) 130 allErrors = append(allErrors, jobframework.ValidateUpdateForWorkloadPriorityClassName((*RayJob)(oldJob), (*RayJob)(newJob))...) 131 return nil, allErrors.ToAggregate() 132 } 133 return nil, nil 134 } 135 136 // ValidateDelete implements webhook.CustomValidator so a webhook will be registered for the type 137 func (w *RayJobWebhook) ValidateDelete(ctx context.Context, obj runtime.Object) (admission.Warnings, error) { 138 return nil, nil 139 }