github.com/kubeflow/training-operator@v1.7.0/pkg/controller.v1/pytorch/initcontainer.go (about) 1 // Copyright 2021 The Kubeflow Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License 14 15 package pytorch 16 17 import ( 18 "bytes" 19 "fmt" 20 "html/template" 21 "os" 22 "strings" 23 "sync" 24 25 "github.com/go-logr/logr" 26 corev1 "k8s.io/api/core/v1" 27 "k8s.io/apimachinery/pkg/types" 28 "sigs.k8s.io/yaml" 29 30 kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" 31 "github.com/kubeflow/training-operator/pkg/config" 32 ) 33 34 var ( 35 initContainerTemplate = ` 36 - name: init-pytorch 37 image: {{.InitContainerImage}} 38 imagePullPolicy: IfNotPresent 39 resources: 40 limits: 41 cpu: 100m 42 memory: 20Mi 43 requests: 44 cpu: 50m 45 memory: 10Mi 46 command: ['sh', '-c', 'err=1;for i in $(seq {{.MaxTries}}); do if nslookup {{.MasterAddr}}; then err=0 && break; fi;echo waiting for master; sleep 2; done; exit $err']` 47 onceInitContainer sync.Once 48 icGenerator *initContainerGenerator 49 ) 50 51 type initContainerGenerator struct { 52 template string 53 image string 54 maxTries int 55 } 56 57 func getInitContainerGenerator() *initContainerGenerator { 58 onceInitContainer.Do(func() { 59 icGenerator = &initContainerGenerator{ 60 template: getInitContainerTemplateOrDefault(config.Config.PyTorchInitContainerTemplateFile), 61 image: config.Config.PyTorchInitContainerImage, 62 maxTries: config.Config.PyTorchInitContainerMaxTries, 63 } 64 }) 65 return icGenerator 66 } 67 68 func (i *initContainerGenerator) GetInitContainer(masterAddr string) ([]corev1.Container, error) { 69 var buf bytes.Buffer 70 tpl, err := template.New("container").Parse(i.template) 71 if err != nil { 72 return nil, err 73 } 74 if err := tpl.Execute(&buf, struct { 75 MasterAddr string 76 InitContainerImage string 77 MaxTries int 78 }{ 79 MasterAddr: masterAddr, 80 InitContainerImage: i.image, 81 MaxTries: i.maxTries, 82 }); err != nil { 83 return nil, err 84 } 85 86 var result []corev1.Container 87 err = yaml.Unmarshal(buf.Bytes(), &result) 88 if err != nil { 89 return nil, err 90 } 91 92 return result, nil 93 } 94 95 // getInitContainerTemplateOrDefault returns the init container template file if 96 // it exists, or return initContainerTemplate by default. 97 func getInitContainerTemplateOrDefault(file string) string { 98 b, err := os.ReadFile(file) 99 if err == nil { 100 return string(b) 101 } 102 return initContainerTemplate 103 } 104 105 func setInitContainer(obj interface{}, podTemplate *corev1.PodTemplateSpec, 106 rtype, index string, log logr.Logger) error { 107 pytorchJob, ok := obj.(*kubeflowv1.PyTorchJob) 108 if !ok { 109 return fmt.Errorf("%+v is not a type of PyTorchJob", obj) 110 } 111 logger := log.WithValues(kubeflowv1.PyTorchJobSingular, types.NamespacedName{ 112 Namespace: pytorchJob.Namespace, 113 Name: pytorchJob.Name, 114 }) 115 116 // There is no need to set init container if no master is specified. 117 if pytorchJob.Spec.PyTorchReplicaSpecs[kubeflowv1.PyTorchJobReplicaTypeMaster] == nil { 118 logger.V(1).Info("No master is specified, skip setting init container") 119 return nil 120 } 121 122 // Set the init container only if the master is specified and the current 123 // rtype is worker. 124 if rtype == strings.ToLower(string(kubeflowv1.PyTorchJobReplicaTypeWorker)) { 125 g := getInitContainerGenerator() 126 initContainers, err := g.GetInitContainer(replicaName(pytorchJob.Name, 127 kubeflowv1.PyTorchJobReplicaTypeMaster, 0)) 128 if err != nil { 129 return err 130 } 131 podTemplate.Spec.InitContainers = append(podTemplate.Spec.InitContainers, 132 initContainers...) 133 134 } 135 return nil 136 }