github.com/kubeflow/training-operator@v1.7.0/pkg/controller.v1/paddlepaddle/envvar.go (about)

     1  // Copyright 2022 The Kubeflow Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License
    14  
    15  package paddle
    16  
    17  import (
    18  	"fmt"
    19  	"strconv"
    20  	"strings"
    21  
    22  	corev1 "k8s.io/api/core/v1"
    23  
    24  	kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
    25  )
    26  
// Names of the environment variables that setPodEnv appends to the containers
// of a PaddleJob pod:
//
//   - EnvMasterEndpoint: the rendezvous endpoint, formatted as "<host>:<port>".
//   - EnvNumNodes: the replica count summed over all replica specs of the job.
//   - EnvJobID: the name of the PaddleJob.
//   - EnvServerNum: set to "1" on master pods when the job has a Master spec.
//   - EnvTrainerNum: set to "1" on all other pods when the job has a Master spec.
const (
	EnvMasterEndpoint = "PADDLE_MASTER"
	EnvNumNodes       = "PADDLE_NNODES"
	EnvJobID          = "PADDLE_JOB_ID"
	EnvServerNum      = "PADDLE_SERVER_NUM"
	EnvTrainerNum     = "PADDLE_TRAINER_NUM"
)
    34  
// EnvVarGenerator is the environment variable generator interface.
//
// NOTE(review): no implementation or caller of this interface is visible in
// this file; the contract below is limited to what the signature states.
type EnvVarGenerator interface {
	// Generate takes a PaddleJob and returns a list of environment variables,
	// or an error.
	Generate(job *kubeflowv1.PaddleJob) ([]corev1.EnvVar, error)
}
    39  
    40  func setPodEnv(obj interface{}, podTemplateSpec *corev1.PodTemplateSpec, rtype, index string) error {
    41  	paddlejob, ok := obj.(*kubeflowv1.PaddleJob)
    42  	if !ok {
    43  		return fmt.Errorf("%+v is not a type of PaddleJob", obj)
    44  	}
    45  
    46  	rank, err := strconv.Atoi(index)
    47  	if err != nil {
    48  		return err
    49  	}
    50  
    51  	totalReplicas := getTotalReplicas(paddlejob)
    52  
    53  	for i := range podTemplateSpec.Spec.Containers {
    54  		// Initialize the environment variables.
    55  		if len(podTemplateSpec.Spec.Containers[i].Env) == 0 {
    56  			podTemplateSpec.Spec.Containers[i].Env = make([]corev1.EnvVar, 0)
    57  		}
    58  		// Set PYTHONUNBUFFERED to true, to disable output buffering.
    59  		// Ref https://stackoverflow.com/questions/59812009/what-is-the-use-of-pythonunbuffered-in-docker-file.
    60  		podTemplateSpec.Spec.Containers[i].Env = append(podTemplateSpec.Spec.Containers[i].Env, corev1.EnvVar{
    61  			Name:  "PYTHONUNBUFFERED",
    62  			Value: "1",
    63  		})
    64  
    65  		podTemplateSpec.Spec.Containers[i].Env = append(podTemplateSpec.Spec.Containers[i].Env, corev1.EnvVar{
    66  			Name:  EnvJobID,
    67  			Value: paddlejob.Name,
    68  		})
    69  		podTemplateSpec.Spec.Containers[i].Env = append(podTemplateSpec.Spec.Containers[i].Env, corev1.EnvVar{
    70  			Name:  EnvNumNodes,
    71  			Value: strconv.Itoa(int(totalReplicas)),
    72  		})
    73  
    74  		// If the master is null, run in Collective mode
    75  		if paddlejob.Spec.PaddleReplicaSpecs[kubeflowv1.PaddleJobReplicaTypeMaster] == nil {
    76  
    77  			// We pick the worker 0 as the rendezvous endpoint
    78  			masterAddr := replicaName(paddlejob.Name, kubeflowv1.PaddleJobReplicaTypeWorker, 0)
    79  			masterPort := getPortFromPaddleJob(paddlejob, kubeflowv1.PaddleJobReplicaTypeWorker)
    80  			if rank == 0 {
    81  				podTemplateSpec.Spec.Containers[i].Env = append(podTemplateSpec.Spec.Containers[i].Env, corev1.EnvVar{
    82  					Name: "POD_IP_DUMMY",
    83  					ValueFrom: &corev1.EnvVarSource{
    84  						FieldRef: &corev1.ObjectFieldSelector{
    85  							FieldPath: "status.podIP",
    86  						},
    87  					},
    88  				})
    89  				podTemplateSpec.Spec.Containers[i].Env = append(podTemplateSpec.Spec.Containers[i].Env, corev1.EnvVar{
    90  					Name:  EnvMasterEndpoint,
    91  					Value: fmt.Sprintf("$(POD_IP_DUMMY):%d", masterPort),
    92  				})
    93  			} else {
    94  				// NOTE(kuizhiqing): no need to ensure master ready by initcontainer or alternative methods, paddle launch will handle it.
    95  				podTemplateSpec.Spec.Containers[i].Env = append(podTemplateSpec.Spec.Containers[i].Env, corev1.EnvVar{
    96  					Name:  EnvMasterEndpoint,
    97  					Value: fmt.Sprintf("%s:%d", masterAddr, masterPort),
    98  				})
    99  			}
   100  
   101  			// Otherwise, run in PS mode
   102  		} else {
   103  
   104  			// We pick the master 0 as the rendezvous endpoint
   105  			masterAddr := replicaName(paddlejob.Name, kubeflowv1.PaddleJobReplicaTypeMaster, 0)
   106  			masterPort := getPortFromPaddleJob(paddlejob, kubeflowv1.PaddleJobReplicaTypeMaster)
   107  			if rank == 0 && rtype == strings.ToLower(string(kubeflowv1.PaddleJobReplicaTypeMaster)) {
   108  				podTemplateSpec.Spec.Containers[i].Env = append(podTemplateSpec.Spec.Containers[i].Env, corev1.EnvVar{
   109  					Name: "POD_IP_DUMMY",
   110  					ValueFrom: &corev1.EnvVarSource{
   111  						FieldRef: &corev1.ObjectFieldSelector{
   112  							FieldPath: "status.podIP",
   113  						},
   114  					},
   115  				})
   116  				podTemplateSpec.Spec.Containers[i].Env = append(podTemplateSpec.Spec.Containers[i].Env, corev1.EnvVar{
   117  					Name:  EnvMasterEndpoint,
   118  					Value: fmt.Sprintf("$(POD_IP_DUMMY):%d", masterPort),
   119  				})
   120  			} else {
   121  				podTemplateSpec.Spec.Containers[i].Env = append(podTemplateSpec.Spec.Containers[i].Env, corev1.EnvVar{
   122  					Name:  EnvMasterEndpoint,
   123  					Value: fmt.Sprintf("%s:%d", masterAddr, masterPort),
   124  				})
   125  			}
   126  
   127  			// Each pod will have only one server or trainer.
   128  			if rtype == strings.ToLower(string(kubeflowv1.PaddleJobReplicaTypeMaster)) {
   129  				podTemplateSpec.Spec.Containers[i].Env = append(podTemplateSpec.Spec.Containers[i].Env, corev1.EnvVar{
   130  					Name:  EnvServerNum,
   131  					Value: "1",
   132  				})
   133  			} else {
   134  				podTemplateSpec.Spec.Containers[i].Env = append(podTemplateSpec.Spec.Containers[i].Env, corev1.EnvVar{
   135  					Name:  EnvTrainerNum,
   136  					Value: "1",
   137  				})
   138  			}
   139  
   140  		}
   141  	}
   142  
   143  	return nil
   144  }
   145  
   146  func getTotalReplicas(job *kubeflowv1.PaddleJob) int32 {
   147  	jobReplicas := int32(0)
   148  	for _, r := range job.Spec.PaddleReplicaSpecs {
   149  		jobReplicas += *r.Replicas
   150  	}
   151  	return jobReplicas
   152  }
   153  
   154  func replicaName(jobName string, rtype kubeflowv1.ReplicaType, index int) string {
   155  	n := jobName + "-" + strings.ToLower(string(rtype)) + "-" + strconv.Itoa(index)
   156  	return strings.Replace(n, "/", "-", -1)
   157  }
   158  
   159  func getPortFromPaddleJob(job *kubeflowv1.PaddleJob, rtype kubeflowv1.ReplicaType) int32 {
   160  	containers := job.Spec.PaddleReplicaSpecs[rtype].Template.Spec.Containers
   161  	for _, container := range containers {
   162  		if container.Name == kubeflowv1.PaddleJobDefaultContainerName {
   163  			ports := container.Ports
   164  			for _, port := range ports {
   165  				if port.Name == kubeflowv1.PaddleJobDefaultPortName {
   166  					return port.ContainerPort
   167  				}
   168  			}
   169  		}
   170  	}
   171  	return kubeflowv1.PaddleJobDefaultPort
   172  }