github.com/kubeflow/training-operator@v1.7.0/pkg/apis/kubeflow.org/v1/pytorch_validation.go (about) 1 // Copyright 2018 The Kubeflow Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package v1 16 17 import ( 18 "fmt" 19 20 apimachineryvalidation "k8s.io/apimachinery/pkg/api/validation" 21 ) 22 23 func ValidateV1PyTorchJob(pytorchJob *PyTorchJob) error { 24 if errors := apimachineryvalidation.NameIsDNS1035Label(pytorchJob.ObjectMeta.Name, false); errors != nil { 25 return fmt.Errorf("PyTorchJob name is invalid: %v", errors) 26 } 27 if err := validatePyTorchReplicaSpecs(pytorchJob.Spec.PyTorchReplicaSpecs); err != nil { 28 return err 29 } 30 if err := validateNprocPerNode(pytorchJob); err != nil { 31 return err 32 } 33 return nil 34 } 35 36 func validateNprocPerNode(pytorchJob *PyTorchJob) error { 37 if pytorchJob.Spec.NprocPerNode != nil && pytorchJob.Spec.ElasticPolicy != nil && pytorchJob.Spec.ElasticPolicy.NProcPerNode != nil { 38 return fmt.Errorf(".spec.elasticPolicy.nProcPerNode is deprecated, use .spec.nprocPerNode instead") 39 } 40 return nil 41 } 42 43 func validatePyTorchReplicaSpecs(specs map[ReplicaType]*ReplicaSpec) error { 44 if specs == nil { 45 return fmt.Errorf("PyTorchJobSpec is not valid") 46 } 47 for rType, value := range specs { 48 if value == nil || len(value.Template.Spec.Containers) == 0 { 49 return fmt.Errorf("PyTorchJobSpec is not valid: containers definition expected in %v", rType) 50 } 51 // Make sure the replica type is valid. 52 validReplicaTypes := []ReplicaType{PyTorchJobReplicaTypeMaster, PyTorchJobReplicaTypeWorker} 53 54 isValidReplicaType := false 55 for _, t := range validReplicaTypes { 56 if t == rType { 57 isValidReplicaType = true 58 break 59 } 60 } 61 62 if !isValidReplicaType { 63 return fmt.Errorf("PyTorchReplicaType is %v but must be one of %v", rType, validReplicaTypes) 64 } 65 66 //Make sure the image is defined in the container 67 defaultContainerPresent := false 68 for _, container := range value.Template.Spec.Containers { 69 if container.Image == "" { 70 msg := fmt.Sprintf("PyTorchJobSpec is not valid: Image is undefined in the container of %v", rType) 71 return fmt.Errorf(msg) 72 } 73 if container.Name == PyTorchJobDefaultContainerName { 74 defaultContainerPresent = true 75 } 76 } 77 //Make sure there has at least one container named "pytorch" 78 if !defaultContainerPresent { 79 msg := fmt.Sprintf("PyTorchJobSpec is not valid: There is no container named %s in %v", PyTorchJobDefaultContainerName, rType) 80 return fmt.Errorf(msg) 81 } 82 if rType == PyTorchJobReplicaTypeMaster { 83 if value.Replicas != nil && int(*value.Replicas) != 1 { 84 return fmt.Errorf("PyTorchJobSpec is not valid: There must be only 1 master replica") 85 } 86 } 87 88 } 89 90 return nil 91 92 }