github.com/kubeflow/training-operator@v1.7.0/pkg/controller.v1/tensorflow/util.go (about)

     1  // Copyright 2018 The Kubeflow Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tensorflow
    16  
import (
	"fmt"

	corev1 "k8s.io/api/core/v1"

	kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
)
    22  
    23  // GetPortFromTFJob gets the port of tensorflow container.
    24  func GetPortFromTFJob(tfJob *kubeflowv1.TFJob, rtype kubeflowv1.ReplicaType) (int32, error) {
    25  	containers := tfJob.Spec.TFReplicaSpecs[rtype].Template.Spec.Containers
    26  	for _, container := range containers {
    27  		if container.Name == kubeflowv1.TFJobDefaultContainerName {
    28  			ports := container.Ports
    29  			for _, port := range ports {
    30  				if port.Name == kubeflowv1.TFJobDefaultPortName {
    31  					return port.ContainerPort, nil
    32  				}
    33  			}
    34  		}
    35  	}
    36  	return kubeflowv1.TFJobDefaultPort, nil
    37  }
    38  
    39  // ContainsChiefOrMasterSpec returns true if the tfjob contains chief or master spec.
    40  func ContainsChiefOrMasterSpec(replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec) bool {
    41  	if _, ok := replicas[kubeflowv1.TFJobReplicaTypeChief]; ok {
    42  		return true
    43  	} else if _, ok := replicas[kubeflowv1.TFJobReplicaTypeMaster]; ok {
    44  		return true
    45  	}
    46  	return false
    47  }
    48  
    49  // originally from pkg/controller.v1/tensorflow/pod.go (deleted)
    50  func getContainerExitCode(pod *corev1.Pod) int32 {
    51  	var exitCode int32 = 0xbeef // magic number
    52  	for _, status := range pod.Status.ContainerStatuses {
    53  		state := status.State
    54  		if status.Name == kubeflowv1.TFJobDefaultContainerName && state.Terminated != nil {
    55  			exitCode = state.Terminated.ExitCode
    56  		}
    57  	}
    58  	return exitCode
    59  }
    60  
    61  // originally from pkg/controller.v1/tensorflow/pod.go (deleted)
    62  func setRestartPolicy(podTemplateSpec *corev1.PodTemplateSpec, spec *kubeflowv1.ReplicaSpec) {
    63  	// This is necessary since restartPolicyExitCode is not supported in v1.PodTemplateSpec
    64  	if spec.RestartPolicy == kubeflowv1.RestartPolicyExitCode {
    65  		podTemplateSpec.Spec.RestartPolicy = corev1.RestartPolicyNever
    66  	} else {
    67  		podTemplateSpec.Spec.RestartPolicy = corev1.RestartPolicy(spec.RestartPolicy)
    68  	}
    69  }
    70  
    71  // isDistributed returns if the TFJob is a distributed training job.
    72  // Ref https://github.com/kubeflow/training-operator/issues/1078.
    73  // originally from pkg/controller.v1/tensorflow/pod.go (deleted)
    74  func isDistributed(tfjob *kubeflowv1.TFJob) bool {
    75  	replicas := tfjob.Spec.TFReplicaSpecs
    76  	distributionCount := 0
    77  	allTypes := []kubeflowv1.ReplicaType{
    78  		kubeflowv1.TFJobReplicaTypeChief,
    79  		kubeflowv1.TFJobReplicaTypeEval,
    80  		kubeflowv1.TFJobReplicaTypeMaster,
    81  		kubeflowv1.TFJobReplicaTypePS,
    82  		kubeflowv1.TFJobReplicaTypeWorker,
    83  	}
    84  	// Check if there is only one replica.
    85  	for _, typ := range allTypes {
    86  		if replicas[typ] != nil {
    87  			if replicas[typ].Replicas == nil {
    88  				distributionCount++
    89  			} else {
    90  				distributionCount += int(*replicas[typ].Replicas)
    91  			}
    92  		}
    93  	}
    94  	return distributionCount != 1
    95  }
    96  
    97  // initializeReplicaStatuses initializes the ReplicaStatuses for replica.
    98  // originally from pkg/controller.v1/tensorflow/status.go (deleted)
    99  func initializeReplicaStatuses(jobStatus *kubeflowv1.JobStatus, rtype kubeflowv1.ReplicaType) {
   100  	if jobStatus.ReplicaStatuses == nil {
   101  		jobStatus.ReplicaStatuses = make(map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaStatus)
   102  	}
   103  
   104  	jobStatus.ReplicaStatuses[rtype] = &kubeflowv1.ReplicaStatus{}
   105  }
   106  
   107  // updateJobReplicaStatuses updates the JobReplicaStatuses according to the pod.
   108  // originally from pkg/controller.v1/tensorflow/status.go (deleted)
   109  func updateJobReplicaStatuses(jobStatus *kubeflowv1.JobStatus, rtype kubeflowv1.ReplicaType, pod *corev1.Pod) {
   110  	switch pod.Status.Phase {
   111  	case corev1.PodRunning:
   112  		jobStatus.ReplicaStatuses[rtype].Active++
   113  	case corev1.PodSucceeded:
   114  		jobStatus.ReplicaStatuses[rtype].Succeeded++
   115  	case corev1.PodFailed:
   116  		jobStatus.ReplicaStatuses[rtype].Failed++
   117  	}
   118  }