github.com/kubeflow/training-operator@v1.7.0/pkg/apis/kubeflow.org/v1/pytorch_validation.go (about)

     1  // Copyright 2018 The Kubeflow Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package v1
    16  
    17  import (
    18  	"fmt"
    19  
    20  	apimachineryvalidation "k8s.io/apimachinery/pkg/api/validation"
    21  )
    22  
    23  func ValidateV1PyTorchJob(pytorchJob *PyTorchJob) error {
    24  	if errors := apimachineryvalidation.NameIsDNS1035Label(pytorchJob.ObjectMeta.Name, false); errors != nil {
    25  		return fmt.Errorf("PyTorchJob name is invalid: %v", errors)
    26  	}
    27  	if err := validatePyTorchReplicaSpecs(pytorchJob.Spec.PyTorchReplicaSpecs); err != nil {
    28  		return err
    29  	}
    30  	if err := validateNprocPerNode(pytorchJob); err != nil {
    31  		return err
    32  	}
    33  	return nil
    34  }
    35  
    36  func validateNprocPerNode(pytorchJob *PyTorchJob) error {
    37  	if pytorchJob.Spec.NprocPerNode != nil && pytorchJob.Spec.ElasticPolicy != nil && pytorchJob.Spec.ElasticPolicy.NProcPerNode != nil {
    38  		return fmt.Errorf(".spec.elasticPolicy.nProcPerNode is deprecated, use .spec.nprocPerNode instead")
    39  	}
    40  	return nil
    41  }
    42  
    43  func validatePyTorchReplicaSpecs(specs map[ReplicaType]*ReplicaSpec) error {
    44  	if specs == nil {
    45  		return fmt.Errorf("PyTorchJobSpec is not valid")
    46  	}
    47  	for rType, value := range specs {
    48  		if value == nil || len(value.Template.Spec.Containers) == 0 {
    49  			return fmt.Errorf("PyTorchJobSpec is not valid: containers definition expected in %v", rType)
    50  		}
    51  		// Make sure the replica type is valid.
    52  		validReplicaTypes := []ReplicaType{PyTorchJobReplicaTypeMaster, PyTorchJobReplicaTypeWorker}
    53  
    54  		isValidReplicaType := false
    55  		for _, t := range validReplicaTypes {
    56  			if t == rType {
    57  				isValidReplicaType = true
    58  				break
    59  			}
    60  		}
    61  
    62  		if !isValidReplicaType {
    63  			return fmt.Errorf("PyTorchReplicaType is %v but must be one of %v", rType, validReplicaTypes)
    64  		}
    65  
    66  		//Make sure the image is defined in the container
    67  		defaultContainerPresent := false
    68  		for _, container := range value.Template.Spec.Containers {
    69  			if container.Image == "" {
    70  				msg := fmt.Sprintf("PyTorchJobSpec is not valid: Image is undefined in the container of %v", rType)
    71  				return fmt.Errorf(msg)
    72  			}
    73  			if container.Name == PyTorchJobDefaultContainerName {
    74  				defaultContainerPresent = true
    75  			}
    76  		}
    77  		//Make sure there has at least one container named "pytorch"
    78  		if !defaultContainerPresent {
    79  			msg := fmt.Sprintf("PyTorchJobSpec is not valid: There is no container named %s in %v", PyTorchJobDefaultContainerName, rType)
    80  			return fmt.Errorf(msg)
    81  		}
    82  		if rType == PyTorchJobReplicaTypeMaster {
    83  			if value.Replicas != nil && int(*value.Replicas) != 1 {
    84  				return fmt.Errorf("PyTorchJobSpec is not valid: There must be only 1 master replica")
    85  			}
    86  		}
    87  
    88  	}
    89  
    90  	return nil
    91  
    92  }