github.com/kubeflow/training-operator@v1.7.0/pkg/controller.v1/pytorch/elastic.go (about)

     1  // Copyright 2021 The Kubeflow Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License
    14  
    15  package pytorch
    16  
    17  import (
    18  	"fmt"
    19  	"strconv"
    20  	"sync"
    21  
    22  	corev1 "k8s.io/api/core/v1"
    23  
    24  	kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
    25  )
    26  
const (
	// Rendezvous related arguments.

	// EnvRDZVBackend is the environment variable name for the rdzv backend.
	EnvRDZVBackend = "PET_RDZV_BACKEND"
	// EnvRDZVID is the environment variable name for the rdzv id.
	EnvRDZVID = "PET_RDZV_ID"
	// EnvRDZVConf is the environment variable name for the rdzv conf.
	EnvRDZVConf = "PET_RDZV_CONF"
	// EnvRDZVEndpoint is the environment variable name for the rdzv endpoint.
	EnvRDZVEndpoint = "PET_RDZV_ENDPOINT"
	// EnvStandalone is the environment variable name for the standalone mode.
	EnvStandalone = "PET_STANDALONE"

	// User-code launch related arguments.

	// EnvMaxRestarts is the environment variable name for the maximum number of worker group restarts before failing.
	EnvMaxRestarts = "PET_MAX_RESTARTS"
	// EnvMonitorInterval is the environment variable name for the interval, in seconds, to monitor the state of workers.
	EnvMonitorInterval = "PET_MONITOR_INTERVAL"
	// EnvStartMethod is the environment variable name for the multiprocessing start method to use when creating workers, which could be fork, spawn and forkserver.
	EnvStartMethod = "PET_START_METHOD"

	// NOTE(review): EnvNnodes, the common node-count environment variable
	// referenced by generateEnvNnodes below, is declared elsewhere in this
	// package — only its companion EnvNProcPerNode lives here.

	// EnvNProcPerNode is the environment variable name for the number of processes per node.
	EnvNProcPerNode = "PET_NPROC_PER_NODE"
)
    55  
var (
	// elasticGenerator is the process-wide singleton instance, created
	// lazily by GetElasticEnvVarGenerator.
	elasticGenerator EnvVarGenerator
	// onceElastic guards the one-time initialization of elasticGenerator.
	onceElastic sync.Once
)

// ElasticEnvVarGenerator is the environment variable generator for Elastic related arguments.
type ElasticEnvVarGenerator struct{}

// GetElasticEnvVarGenerator returns the shared ElasticEnvVarGenerator,
// constructing it exactly once across all callers via sync.Once.
func GetElasticEnvVarGenerator() EnvVarGenerator {
	onceElastic.Do(func() {
		elasticGenerator = &ElasticEnvVarGenerator{}
	})
	return elasticGenerator
}
    70  
    71  func (e ElasticEnvVarGenerator) Generate(
    72  	job *kubeflowv1.PyTorchJob) ([]corev1.EnvVar, error) {
    73  	envVars := []corev1.EnvVar{}
    74  
    75  	elasticPolicy := job.Spec.ElasticPolicy
    76  	if elasticPolicy == nil {
    77  		// Return empty env vars.
    78  		return nil, nil
    79  	}
    80  
    81  	// Generate RDZV_ENDPOINT.
    82  	if envVar, err := e.generateEnvRDZVEndpoint(job); err != nil {
    83  		return nil, err
    84  	} else {
    85  		envVars = append(envVars, *envVar)
    86  	}
    87  	// Generate RDZV_BACKEND.
    88  	envVars = append(envVars, e.generateEnvBackend(elasticPolicy))
    89  	// Generate NNODES.
    90  	if envVar, err := e.generateEnvNnodes(job); err != nil {
    91  		return nil, err
    92  	} else {
    93  		envVars = append(envVars, *envVar)
    94  	}
    95  
    96  	if elasticPolicy.MaxRestarts != nil {
    97  		envVars = append(envVars, corev1.EnvVar{
    98  			Name:  EnvMaxRestarts,
    99  			Value: strconv.Itoa(int(*elasticPolicy.MaxRestarts)),
   100  		})
   101  	}
   102  	if elasticPolicy.NProcPerNode != nil {
   103  		envVars = append(envVars, corev1.EnvVar{
   104  			Name:  EnvNProcPerNode,
   105  			Value: strconv.Itoa(int(*elasticPolicy.NProcPerNode)),
   106  		})
   107  	}
   108  	if elasticPolicy.RDZVID != nil {
   109  		envVars = append(envVars, corev1.EnvVar{
   110  			Name:  EnvRDZVID,
   111  			Value: *elasticPolicy.RDZVID,
   112  		})
   113  	}
   114  	if envVar := e.generateEnvRDZVConf(elasticPolicy); envVar != nil {
   115  		envVars = append(envVars, *envVar)
   116  	}
   117  	if elasticPolicy.Standalone != nil && *elasticPolicy.Standalone {
   118  		envVars = append(envVars, corev1.EnvVar{
   119  			Name:  EnvStandalone,
   120  			Value: "",
   121  		})
   122  	}
   123  
   124  	return envVars, nil
   125  }
   126  
   127  func (e ElasticEnvVarGenerator) generateEnvNnodes(job *kubeflowv1.PyTorchJob) (*corev1.EnvVar, error) {
   128  	// Return worker.replicas if there is no max and min replicas specified.
   129  	if job.Spec.ElasticPolicy.MinReplicas == nil &&
   130  		job.Spec.ElasticPolicy.MaxReplicas == nil {
   131  		if job.Spec.PyTorchReplicaSpecs[kubeflowv1.PyTorchJobReplicaTypeWorker] == nil {
   132  			return nil, fmt.Errorf("cannot find the worker spec")
   133  		}
   134  		return &corev1.EnvVar{
   135  			Name: EnvNnodes,
   136  			Value: strconv.Itoa(
   137  				int(*job.Spec.PyTorchReplicaSpecs[kubeflowv1.PyTorchJobReplicaTypeWorker].
   138  					Replicas)),
   139  		}, nil
   140  	}
   141  
   142  	return &corev1.EnvVar{
   143  		Name: EnvNnodes,
   144  		Value: fmt.Sprintf("%d:%d",
   145  			*job.Spec.ElasticPolicy.MinReplicas, *job.Spec.ElasticPolicy.MaxReplicas),
   146  	}, nil
   147  }
   148  
   149  func (e ElasticEnvVarGenerator) generateEnvRDZVEndpoint(job *kubeflowv1.PyTorchJob) (*corev1.EnvVar, error) {
   150  	var err error
   151  	host := ""
   152  	if job.Spec.ElasticPolicy.RDZVHost == nil {
   153  		host = fmt.Sprintf("%s-worker-0", job.Name)
   154  	} else {
   155  		host = *job.Spec.ElasticPolicy.RDZVHost
   156  	}
   157  
   158  	var port int32
   159  	if job.Spec.ElasticPolicy.RDZVPort == nil {
   160  		// Generate RDZV_Endpoint.
   161  		port, err = getPortFromPyTorchJob(job, kubeflowv1.PyTorchJobReplicaTypeWorker)
   162  		if err != nil {
   163  			return nil, err
   164  		}
   165  	} else {
   166  		port = *job.Spec.ElasticPolicy.RDZVPort
   167  	}
   168  	return &corev1.EnvVar{
   169  		Name:  EnvRDZVEndpoint,
   170  		Value: fmt.Sprintf("%s:%d", host, port),
   171  	}, nil
   172  }
   173  
   174  func (e ElasticEnvVarGenerator) generateEnvRDZVConf(elasticPolicy *kubeflowv1.ElasticPolicy) *corev1.EnvVar {
   175  	if elasticPolicy.RDZVConf == nil {
   176  		return nil
   177  	}
   178  	val := ""
   179  	for _, conf := range elasticPolicy.RDZVConf {
   180  		val += fmt.Sprintf("%s=%s,", conf.Key, conf.Value)
   181  	}
   182  	return &corev1.EnvVar{
   183  		Name: EnvRDZVConf,
   184  		// Remove the last comma.
   185  		Value: val[:len(val)-1],
   186  	}
   187  }
   188  
   189  func (e ElasticEnvVarGenerator) generateEnvBackend(elasticPolicy *kubeflowv1.ElasticPolicy) corev1.EnvVar {
   190  	if elasticPolicy.RDZVBackend != nil {
   191  		return corev1.EnvVar{
   192  			Name:  EnvRDZVBackend,
   193  			Value: string(*elasticPolicy.RDZVBackend),
   194  		}
   195  	}
   196  	return corev1.EnvVar{
   197  		Name:  EnvRDZVBackend,
   198  		Value: string(kubeflowv1.BackendC10D),
   199  	}
   200  }