github.com/kubeflow/training-operator@v1.7.0/pkg/controller.v1/paddlepaddle/envvar.go (about) 1 // Copyright 2022 The Kubeflow Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License 14 15 package paddle 16 17 import ( 18 "fmt" 19 "strconv" 20 "strings" 21 22 corev1 "k8s.io/api/core/v1" 23 24 kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" 25 ) 26 27 const ( 28 EnvMasterEndpoint = "PADDLE_MASTER" 29 EnvNumNodes = "PADDLE_NNODES" 30 EnvJobID = "PADDLE_JOB_ID" 31 EnvServerNum = "PADDLE_SERVER_NUM" 32 EnvTrainerNum = "PADDLE_TRAINER_NUM" 33 ) 34 35 // EnvVarGenerator is the environment variable generator interface. 36 type EnvVarGenerator interface { 37 Generate(job *kubeflowv1.PaddleJob) ([]corev1.EnvVar, error) 38 } 39 40 func setPodEnv(obj interface{}, podTemplateSpec *corev1.PodTemplateSpec, rtype, index string) error { 41 paddlejob, ok := obj.(*kubeflowv1.PaddleJob) 42 if !ok { 43 return fmt.Errorf("%+v is not a type of PaddleJob", obj) 44 } 45 46 rank, err := strconv.Atoi(index) 47 if err != nil { 48 return err 49 } 50 51 totalReplicas := getTotalReplicas(paddlejob) 52 53 for i := range podTemplateSpec.Spec.Containers { 54 // Initialize the environment variables. 55 if len(podTemplateSpec.Spec.Containers[i].Env) == 0 { 56 podTemplateSpec.Spec.Containers[i].Env = make([]corev1.EnvVar, 0) 57 } 58 // Set PYTHONUNBUFFERED to true, to disable output buffering. 59 // Ref https://stackoverflow.com/questions/59812009/what-is-the-use-of-pythonunbuffered-in-docker-file. 60 podTemplateSpec.Spec.Containers[i].Env = append(podTemplateSpec.Spec.Containers[i].Env, corev1.EnvVar{ 61 Name: "PYTHONUNBUFFERED", 62 Value: "1", 63 }) 64 65 podTemplateSpec.Spec.Containers[i].Env = append(podTemplateSpec.Spec.Containers[i].Env, corev1.EnvVar{ 66 Name: EnvJobID, 67 Value: paddlejob.Name, 68 }) 69 podTemplateSpec.Spec.Containers[i].Env = append(podTemplateSpec.Spec.Containers[i].Env, corev1.EnvVar{ 70 Name: EnvNumNodes, 71 Value: strconv.Itoa(int(totalReplicas)), 72 }) 73 74 // If the master is null, run in Collective mode 75 if paddlejob.Spec.PaddleReplicaSpecs[kubeflowv1.PaddleJobReplicaTypeMaster] == nil { 76 77 // We pick the worker 0 as the rendezvous endpoint 78 masterAddr := replicaName(paddlejob.Name, kubeflowv1.PaddleJobReplicaTypeWorker, 0) 79 masterPort := getPortFromPaddleJob(paddlejob, kubeflowv1.PaddleJobReplicaTypeWorker) 80 if rank == 0 { 81 podTemplateSpec.Spec.Containers[i].Env = append(podTemplateSpec.Spec.Containers[i].Env, corev1.EnvVar{ 82 Name: "POD_IP_DUMMY", 83 ValueFrom: &corev1.EnvVarSource{ 84 FieldRef: &corev1.ObjectFieldSelector{ 85 FieldPath: "status.podIP", 86 }, 87 }, 88 }) 89 podTemplateSpec.Spec.Containers[i].Env = append(podTemplateSpec.Spec.Containers[i].Env, corev1.EnvVar{ 90 Name: EnvMasterEndpoint, 91 Value: fmt.Sprintf("$(POD_IP_DUMMY):%d", masterPort), 92 }) 93 } else { 94 // NOTE(kuizhiqing): no need to ensure master ready by initcontainer or alternative methods, paddle launch will handle it. 95 podTemplateSpec.Spec.Containers[i].Env = append(podTemplateSpec.Spec.Containers[i].Env, corev1.EnvVar{ 96 Name: EnvMasterEndpoint, 97 Value: fmt.Sprintf("%s:%d", masterAddr, masterPort), 98 }) 99 } 100 101 // Otherwise, run in PS mode 102 } else { 103 104 // We pick the master 0 as the rendezvous endpoint 105 masterAddr := replicaName(paddlejob.Name, kubeflowv1.PaddleJobReplicaTypeMaster, 0) 106 masterPort := getPortFromPaddleJob(paddlejob, kubeflowv1.PaddleJobReplicaTypeMaster) 107 if rank == 0 && rtype == strings.ToLower(string(kubeflowv1.PaddleJobReplicaTypeMaster)) { 108 podTemplateSpec.Spec.Containers[i].Env = append(podTemplateSpec.Spec.Containers[i].Env, corev1.EnvVar{ 109 Name: "POD_IP_DUMMY", 110 ValueFrom: &corev1.EnvVarSource{ 111 FieldRef: &corev1.ObjectFieldSelector{ 112 FieldPath: "status.podIP", 113 }, 114 }, 115 }) 116 podTemplateSpec.Spec.Containers[i].Env = append(podTemplateSpec.Spec.Containers[i].Env, corev1.EnvVar{ 117 Name: EnvMasterEndpoint, 118 Value: fmt.Sprintf("$(POD_IP_DUMMY):%d", masterPort), 119 }) 120 } else { 121 podTemplateSpec.Spec.Containers[i].Env = append(podTemplateSpec.Spec.Containers[i].Env, corev1.EnvVar{ 122 Name: EnvMasterEndpoint, 123 Value: fmt.Sprintf("%s:%d", masterAddr, masterPort), 124 }) 125 } 126 127 // Each pod will have only one server or trainer. 128 if rtype == strings.ToLower(string(kubeflowv1.PaddleJobReplicaTypeMaster)) { 129 podTemplateSpec.Spec.Containers[i].Env = append(podTemplateSpec.Spec.Containers[i].Env, corev1.EnvVar{ 130 Name: EnvServerNum, 131 Value: "1", 132 }) 133 } else { 134 podTemplateSpec.Spec.Containers[i].Env = append(podTemplateSpec.Spec.Containers[i].Env, corev1.EnvVar{ 135 Name: EnvTrainerNum, 136 Value: "1", 137 }) 138 } 139 140 } 141 } 142 143 return nil 144 } 145 146 func getTotalReplicas(job *kubeflowv1.PaddleJob) int32 { 147 jobReplicas := int32(0) 148 for _, r := range job.Spec.PaddleReplicaSpecs { 149 jobReplicas += *r.Replicas 150 } 151 return jobReplicas 152 } 153 154 func replicaName(jobName string, rtype kubeflowv1.ReplicaType, index int) string { 155 n := jobName + "-" + strings.ToLower(string(rtype)) + "-" + strconv.Itoa(index) 156 return strings.Replace(n, "/", "-", -1) 157 } 158 159 func getPortFromPaddleJob(job *kubeflowv1.PaddleJob, rtype kubeflowv1.ReplicaType) int32 { 160 containers := job.Spec.PaddleReplicaSpecs[rtype].Template.Spec.Containers 161 for _, container := range containers { 162 if container.Name == kubeflowv1.PaddleJobDefaultContainerName { 163 ports := container.Ports 164 for _, port := range ports { 165 if port.Name == kubeflowv1.PaddleJobDefaultPortName { 166 return port.ContainerPort 167 } 168 } 169 } 170 } 171 return kubeflowv1.PaddleJobDefaultPort 172 }