github.com/kubeflow/training-operator@v1.7.0/pkg/controller.v1/pytorch/initcontainer.go (about)

     1  // Copyright 2021 The Kubeflow Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License
    14  
    15  package pytorch
    16  
    17  import (
    18  	"bytes"
    19  	"fmt"
    20  	"html/template"
    21  	"os"
    22  	"strings"
    23  	"sync"
    24  
    25  	"github.com/go-logr/logr"
    26  	corev1 "k8s.io/api/core/v1"
    27  	"k8s.io/apimachinery/pkg/types"
    28  	"sigs.k8s.io/yaml"
    29  
    30  	kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
    31  	"github.com/kubeflow/training-operator/pkg/config"
    32  )
    33  
    34  var (
    35  	initContainerTemplate = `
    36  - name: init-pytorch
    37    image: {{.InitContainerImage}}
    38    imagePullPolicy: IfNotPresent
    39    resources:
    40      limits:
    41        cpu: 100m
    42        memory: 20Mi
    43      requests:
    44        cpu: 50m
    45        memory: 10Mi
    46    command: ['sh', '-c', 'err=1;for i in $(seq {{.MaxTries}}); do if nslookup {{.MasterAddr}}; then err=0 && break; fi;echo waiting for master; sleep 2; done; exit $err']`
    47  	onceInitContainer sync.Once
    48  	icGenerator       *initContainerGenerator
    49  )
    50  
    51  type initContainerGenerator struct {
    52  	template string
    53  	image    string
    54  	maxTries int
    55  }
    56  
    57  func getInitContainerGenerator() *initContainerGenerator {
    58  	onceInitContainer.Do(func() {
    59  		icGenerator = &initContainerGenerator{
    60  			template: getInitContainerTemplateOrDefault(config.Config.PyTorchInitContainerTemplateFile),
    61  			image:    config.Config.PyTorchInitContainerImage,
    62  			maxTries: config.Config.PyTorchInitContainerMaxTries,
    63  		}
    64  	})
    65  	return icGenerator
    66  }
    67  
    68  func (i *initContainerGenerator) GetInitContainer(masterAddr string) ([]corev1.Container, error) {
    69  	var buf bytes.Buffer
    70  	tpl, err := template.New("container").Parse(i.template)
    71  	if err != nil {
    72  		return nil, err
    73  	}
    74  	if err := tpl.Execute(&buf, struct {
    75  		MasterAddr         string
    76  		InitContainerImage string
    77  		MaxTries           int
    78  	}{
    79  		MasterAddr:         masterAddr,
    80  		InitContainerImage: i.image,
    81  		MaxTries:           i.maxTries,
    82  	}); err != nil {
    83  		return nil, err
    84  	}
    85  
    86  	var result []corev1.Container
    87  	err = yaml.Unmarshal(buf.Bytes(), &result)
    88  	if err != nil {
    89  		return nil, err
    90  	}
    91  
    92  	return result, nil
    93  }
    94  
    95  // getInitContainerTemplateOrDefault returns the init container template file if
    96  // it exists, or return initContainerTemplate by default.
    97  func getInitContainerTemplateOrDefault(file string) string {
    98  	b, err := os.ReadFile(file)
    99  	if err == nil {
   100  		return string(b)
   101  	}
   102  	return initContainerTemplate
   103  }
   104  
   105  func setInitContainer(obj interface{}, podTemplate *corev1.PodTemplateSpec,
   106  	rtype, index string, log logr.Logger) error {
   107  	pytorchJob, ok := obj.(*kubeflowv1.PyTorchJob)
   108  	if !ok {
   109  		return fmt.Errorf("%+v is not a type of PyTorchJob", obj)
   110  	}
   111  	logger := log.WithValues(kubeflowv1.PyTorchJobSingular, types.NamespacedName{
   112  		Namespace: pytorchJob.Namespace,
   113  		Name:      pytorchJob.Name,
   114  	})
   115  
   116  	// There is no need to set init container if no master is specified.
   117  	if pytorchJob.Spec.PyTorchReplicaSpecs[kubeflowv1.PyTorchJobReplicaTypeMaster] == nil {
   118  		logger.V(1).Info("No master is specified, skip setting init container")
   119  		return nil
   120  	}
   121  
   122  	// Set the init container only if the master is specified and the current
   123  	// rtype is worker.
   124  	if rtype == strings.ToLower(string(kubeflowv1.PyTorchJobReplicaTypeWorker)) {
   125  		g := getInitContainerGenerator()
   126  		initContainers, err := g.GetInitContainer(replicaName(pytorchJob.Name,
   127  			kubeflowv1.PyTorchJobReplicaTypeMaster, 0))
   128  		if err != nil {
   129  			return err
   130  		}
   131  		podTemplate.Spec.InitContainers = append(podTemplate.Spec.InitContainers,
   132  			initContainers...)
   133  
   134  	}
   135  	return nil
   136  }