github.com/kubeflow/training-operator@v1.7.0/pkg/controller.v1/tensorflow/util.go (about) 1 // Copyright 2018 The Kubeflow Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tensorflow 16 17 import ( 18 corev1 "k8s.io/api/core/v1" 19 20 kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" 21 ) 22 23 // GetPortFromTFJob gets the port of tensorflow container. 24 func GetPortFromTFJob(tfJob *kubeflowv1.TFJob, rtype kubeflowv1.ReplicaType) (int32, error) { 25 containers := tfJob.Spec.TFReplicaSpecs[rtype].Template.Spec.Containers 26 for _, container := range containers { 27 if container.Name == kubeflowv1.TFJobDefaultContainerName { 28 ports := container.Ports 29 for _, port := range ports { 30 if port.Name == kubeflowv1.TFJobDefaultPortName { 31 return port.ContainerPort, nil 32 } 33 } 34 } 35 } 36 return kubeflowv1.TFJobDefaultPort, nil 37 } 38 39 // ContainsChiefOrMasterSpec returns true if the tfjob contains chief or master spec. 40 func ContainsChiefOrMasterSpec(replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec) bool { 41 if _, ok := replicas[kubeflowv1.TFJobReplicaTypeChief]; ok { 42 return true 43 } else if _, ok := replicas[kubeflowv1.TFJobReplicaTypeMaster]; ok { 44 return true 45 } 46 return false 47 } 48 49 // originally from pkg/controller.v1/tensorflow/pod.go (deleted) 50 func getContainerExitCode(pod *corev1.Pod) int32 { 51 var exitCode int32 = 0xbeef // magic number 52 for _, status := range pod.Status.ContainerStatuses { 53 state := status.State 54 if status.Name == kubeflowv1.TFJobDefaultContainerName && state.Terminated != nil { 55 exitCode = state.Terminated.ExitCode 56 } 57 } 58 return exitCode 59 } 60 61 // originally from pkg/controller.v1/tensorflow/pod.go (deleted) 62 func setRestartPolicy(podTemplateSpec *corev1.PodTemplateSpec, spec *kubeflowv1.ReplicaSpec) { 63 // This is necessary since restartPolicyExitCode is not supported in v1.PodTemplateSpec 64 if spec.RestartPolicy == kubeflowv1.RestartPolicyExitCode { 65 podTemplateSpec.Spec.RestartPolicy = corev1.RestartPolicyNever 66 } else { 67 podTemplateSpec.Spec.RestartPolicy = corev1.RestartPolicy(spec.RestartPolicy) 68 } 69 } 70 71 // isDistributed returns if the TFJob is a distributed training job. 72 // Ref https://github.com/kubeflow/training-operator/issues/1078. 73 // originally from pkg/controller.v1/tensorflow/pod.go (deleted) 74 func isDistributed(tfjob *kubeflowv1.TFJob) bool { 75 replicas := tfjob.Spec.TFReplicaSpecs 76 distributionCount := 0 77 allTypes := []kubeflowv1.ReplicaType{ 78 kubeflowv1.TFJobReplicaTypeChief, 79 kubeflowv1.TFJobReplicaTypeEval, 80 kubeflowv1.TFJobReplicaTypeMaster, 81 kubeflowv1.TFJobReplicaTypePS, 82 kubeflowv1.TFJobReplicaTypeWorker, 83 } 84 // Check if there is only one replica. 85 for _, typ := range allTypes { 86 if replicas[typ] != nil { 87 if replicas[typ].Replicas == nil { 88 distributionCount++ 89 } else { 90 distributionCount += int(*replicas[typ].Replicas) 91 } 92 } 93 } 94 return distributionCount != 1 95 } 96 97 // initializeReplicaStatuses initializes the ReplicaStatuses for replica. 98 // originally from pkg/controller.v1/tensorflow/status.go (deleted) 99 func initializeReplicaStatuses(jobStatus *kubeflowv1.JobStatus, rtype kubeflowv1.ReplicaType) { 100 if jobStatus.ReplicaStatuses == nil { 101 jobStatus.ReplicaStatuses = make(map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaStatus) 102 } 103 104 jobStatus.ReplicaStatuses[rtype] = &kubeflowv1.ReplicaStatus{} 105 } 106 107 // updateJobReplicaStatuses updates the JobReplicaStatuses according to the pod. 108 // originally from pkg/controller.v1/tensorflow/status.go (deleted) 109 func updateJobReplicaStatuses(jobStatus *kubeflowv1.JobStatus, rtype kubeflowv1.ReplicaType, pod *corev1.Pod) { 110 switch pod.Status.Phase { 111 case corev1.PodRunning: 112 jobStatus.ReplicaStatuses[rtype].Active++ 113 case corev1.PodSucceeded: 114 jobStatus.ReplicaStatuses[rtype].Succeeded++ 115 case corev1.PodFailed: 116 jobStatus.ReplicaStatuses[rtype].Failed++ 117 } 118 }