github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ext/etl/boot.go

// Package etl provides utilities to initialize and use transformation pods.
/*
 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
 */
package etl

import (
	"context"
	"fmt"
	"net"
	"time"

	"github.com/NVIDIA/aistore/api/apc"
	"github.com/NVIDIA/aistore/cmn"
	"github.com/NVIDIA/aistore/cmn/cos"
	"github.com/NVIDIA/aistore/cmn/debug"
	"github.com/NVIDIA/aistore/cmn/k8s"
	"github.com/NVIDIA/aistore/cmn/nlog"
	"github.com/NVIDIA/aistore/core"
	"github.com/NVIDIA/aistore/xact/xreg"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/wait"
)

const appLabel = "app"

type etlBootstrapper struct {
	// construction
	errCtx *cmn.ETLErrCtx
	config *cmn.Config
	msg    InitSpecMsg
	env    map[string]string

	// runtime
	xctn            core.Xact
	pod             *corev1.Pod
	svc             *corev1.Service
	uri             string
	originalPodName string
	originalCommand []string
}

func (b *etlBootstrapper) createPodSpec() (err error) {
	if b.pod, err = ParsePodSpec(b.errCtx, b.msg.Spec); err != nil {
		return
	}
	b.originalPodName = b.pod.GetName()
	b.errCtx.ETLName = b.originalPodName
	return b._prepSpec()
}

func (b *etlBootstrapper) _prepSpec() (err error) {
	// Override the pod name: append the target ID
	// (K8s doesn't allow `_` and uppercase).
	b.pod.SetName(k8s.CleanName(b.msg.IDX + "-" + core.T.SID()))
	b.errCtx.PodName = b.pod.GetName()
	b.pod.APIVersion = "v1"

	// The following combination of affinity and anti-affinity ensures that:
	// 1. The ETL container is always scheduled on the node of the target that invokes it.
	// 2. At any given point in time, no more than one ETL container for the same target is
	//    scheduled on that node.
	if err = b._setAffinity(); err != nil {
		return
	}
	if err = b._setAntiAffinity(); err != nil {
		return
	}

	b._updPodCommand()
	b._updPodLabels()
	b._updReady()

	b._setPodEnv()

	if cmn.Rom.FastV(4, cos.SmoduleETL) {
		nlog.Infof("prep pod spec: %s, %+v", b.msg.String(), b.errCtx)
	}
	return
}

func (b *etlBootstrapper) createServiceSpec() {
	b.svc = &corev1.Service{
		TypeMeta: metav1.TypeMeta{
			Kind:       "Service",
			APIVersion: "v1",
		},
		ObjectMeta: metav1.ObjectMeta{
			Name: b.pod.GetName(),
		},
		Spec: corev1.ServiceSpec{
			Ports: []corev1.ServicePort{
				{Port: b.pod.Spec.Containers[0].Ports[0].ContainerPort},
			},
			Selector: map[string]string{
				podNameLabel: b.pod.Labels[podNameLabel],
				appLabel:     b.pod.Labels[appLabel],
			},
			Type: corev1.ServiceTypeNodePort,
		},
	}
	b._setSvcLabels()
	b.errCtx.SvcName = b.svc.Name
}

func (b *etlBootstrapper) setupConnection() (err error) {
	// Retrieve the host IP of the pod.
	var hostIP string
	if hostIP, err = b._getHost(); err != nil {
		return
	}

	// Retrieve the port assigned by the service.
	var nodePort uint
	if nodePort, err = b._getPort(); err != nil {
		return
	}

	// Make sure the pod is reachable from this target via its TCP socket address.
	etlSocketAddr := fmt.Sprintf("%s:%d", hostIP, nodePort)
	if err = b._dial(etlSocketAddr); err != nil {
		if cmn.Rom.FastV(4, cos.SmoduleETL) {
			nlog.Warningf("failed to dial -> %s: %s, %+v, %s", etlSocketAddr, b.msg.String(), b.errCtx, b.uri)
		}
		err = cmn.NewErrETL(b.errCtx, err.Error())
		return
	}

	b.uri = "http://" + etlSocketAddr
	if cmn.Rom.FastV(4, cos.SmoduleETL) {
		nlog.Infof("setup connection -> %s, %+v, %s", b.uri, b.msg.String(), b.errCtx)
	}
	return nil
}

func (b *etlBootstrapper) _dial(socketAddr string) error {
	probeInterval := cmn.Rom.MaxKeepalive()
	err := cmn.NetworkCallWithRetry(&cmn.RetryArgs{
		Call: func() (int, error) {
			conn, err := net.DialTimeout("tcp", socketAddr, probeInterval)
			if err != nil {
				return 0, err
			}
			cos.Close(conn)
			return 0, nil
		},
		SoftErr: 10,
		HardErr: 2,
		Sleep:   3 * time.Second,
		Action:  "dial POD " + b.pod.Name + " at " + socketAddr,
	})
	if err != nil {
		return fmt.Errorf("failed to wait for ETL Service/Pod %q to respond, err: %v", b.pod.Name, err)
	}
	return nil
}

func (b *etlBootstrapper) createEntity(entity string) error {
	client, err := k8s.GetClient()
	if err != nil {
		return err
	}
	switch entity {
	case k8s.Pod:
		err = client.Create(b.pod)
	case k8s.Svc:
		err = client.Create(b.svc)
	default:
		cos.AssertMsg(false, "invalid K8s entity: "+entity)
	}

	if err != nil {
		err = cmn.NewErrETL(b.errCtx, "failed to create %s (err: %v)", entity, err)
	}
	return err
}

// waitPodReady waits until the ETL Pod becomes `Ready`. This happens only after
// the Pod's containers have started and the Pod's `readinessProbe` request (made
// by Kubernetes itself) returns OK. If the Pod doesn't have a `readinessProbe`
// configured, the last step is skipped.
//
// NOTE: currently, we do require a readinessProbe config in the ETL spec.
func (b *etlBootstrapper) waitPodReady() error {
	var (
		timeout     = b.msg.Timeout.D()
		interval    = cos.ProbingFrequency(timeout)
		client, err = k8s.GetClient()
	)
	if err != nil {
		return cmn.NewErrETL(b.errCtx, "%v", err)
	}
	if cmn.Rom.FastV(4, cos.SmoduleETL) {
		nlog.Infof("waiting pod %q ready (%+v, %s) timeout=%v ival=%v",
			b.pod.Name, b.msg.String(), b.errCtx, timeout, interval)
	}
	// wait
	err = wait.PollUntilContextTimeout(context.Background(), interval, timeout, false /*immediate*/,
		func(context.Context) (ready bool, err error) {
			return checkPodReady(client, b.pod.Name)
		},
	)

	if err == nil {
		return nil
	}
	pod, _ := client.Pod(b.pod.Name)
	if pod == nil {
		return cmn.NewErrETL(b.errCtx, "%v", err)
	}
	err = cmn.NewErrETL(b.errCtx,
		`%v (pod phase: %q, pod conditions: %s; expected condition: %s)`,
		err, pod.Status.Phase, podConditionsToString(pod.Status.Conditions),
		podConditionToString(&corev1.PodCondition{Type: corev1.PodReady, Status: corev1.ConditionTrue}),
	)
	return err
}

// setupXaction registers (renews) the ETL xaction and keeps a reference to it.
func (b *etlBootstrapper) setupXaction(xid string) {
	rns := xreg.RenewETL(b.msg, xid)
	debug.AssertNoErr(rns.Err)
	debug.Assert(!rns.IsRunning())
	b.xctn = rns.Entry.Get()
	debug.Assertf(b.xctn.ID() == xid, "%s vs %s", b.xctn.ID(), xid)
}

func (b *etlBootstrapper) _updPodCommand() {
	if b.msg.CommTypeX != HpushStdin {
		return
	}

	b.originalCommand = b.pod.Spec.Containers[0].Command
	b.pod.Spec.Containers[0].Command = []string{"sh", "-c", "/server"}
}

// Sets the pod's node affinity, so that the pod is scheduled on the same node as the target creating it.
func (b *etlBootstrapper) _setAffinity() error {
	if b.pod.Spec.Affinity == nil {
		b.pod.Spec.Affinity = &corev1.Affinity{}
	}
	if b.pod.Spec.Affinity.NodeAffinity == nil {
		b.pod.Spec.Affinity.NodeAffinity = &corev1.NodeAffinity{}
	}

	reqAffinity := b.pod.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution
	prefAffinity := b.pod.Spec.Affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution
	if (reqAffinity != nil && len(reqAffinity.NodeSelectorTerms) > 0) || len(prefAffinity) > 0 {
		return cmn.NewErrETL(b.errCtx, "error in YAML spec: pod should not have any NodeAffinities defined")
	}

	nodeSelector := &corev1.NodeSelector{
		NodeSelectorTerms: []corev1.NodeSelectorTerm{
			{
				MatchExpressions: []corev1.NodeSelectorRequirement{{
					Key:      nodeNameLabel,
					Operator: corev1.NodeSelectorOpIn,
					Values:   []string{k8s.NodeName},
				}},
			},
		},
	}
	// RequiredDuringSchedulingIgnoredDuringExecution means that the ETL container is placed on the same
	// machine as the target that creates it. This guarantee holds only during scheduling - the initial
	// pod start-up sequence. However, a target removes its ETL pod when it goes down, so this guarantee
	// is sufficient. Additionally, when targets notice that another target went down, they all stop
	// their running ETL pods.
	b.pod.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution = nodeSelector
	return nil
}

// Sets the pod's anti-affinity, so that no two pods matching the criteria are scheduled on the same node
// at the same time. (Currently, this only validates that the user-provided spec does not define its own
// pod anti-affinity.)
func (b *etlBootstrapper) _setAntiAffinity() error {
	if b.pod.Spec.Affinity == nil {
		b.pod.Spec.Affinity = &corev1.Affinity{}
	}
	if b.pod.Spec.Affinity.PodAntiAffinity == nil {
		b.pod.Spec.Affinity.PodAntiAffinity = &corev1.PodAntiAffinity{}
	}

	reqAntiAffinities := b.pod.Spec.Affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution
	prefAntiAffinity := b.pod.Spec.Affinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution

	if len(reqAntiAffinities) > 0 || len(prefAntiAffinity) > 0 {
		return cmn.NewErrETL(b.errCtx, "error in YAML spec, pod should not have any NodeAntiAffinities defined")
	}

	return nil
}

func (b *etlBootstrapper) _updPodLabels() {
	if b.pod.Labels == nil {
		b.pod.Labels = make(map[string]string, 6)
	}

	b.pod.Labels[appLabel] = "ais"
	b.pod.Labels[podNameLabel] = b.pod.GetName()
	b.pod.Labels[podNodeLabel] = k8s.NodeName
	b.pod.Labels[podTargetLabel] = core.T.SID()
	b.pod.Labels[appK8sNameLabel] = "etl"
	b.pod.Labels[appK8sComponentLabel] = "server"
}

func (b *etlBootstrapper) _setSvcLabels() {
	if b.svc.Labels == nil {
		b.svc.Labels = make(map[string]string, 4)
	}
	b.svc.Labels[appLabel] = "ais"
	b.svc.Labels[svcNameLabel] = b.svc.GetName()
	b.svc.Labels[appK8sNameLabel] = "etl"
	b.svc.Labels[appK8sComponentLabel] = "server"
}

func (b *etlBootstrapper) _updReady() {
	// NOTE: a readinessProbe is required in the ETL spec (see waitPodReady above).
	probe := b.pod.Spec.Containers[0].ReadinessProbe

	// If someone has already set these values, we don't want to touch them.
	if probe.TimeoutSeconds != 0 || probe.PeriodSeconds != 0 {
		return
	}

	// Set default values.
	probe.TimeoutSeconds = 5
	probe.PeriodSeconds = 10
}

// Sets environment variables that can be accessed inside the container.
func (b *etlBootstrapper) _setPodEnv() {
	containers := b.pod.Spec.Containers
	debug.Assert(len(containers) > 0)
	for idx := range containers {
		containers[idx].Env = append(containers[idx].Env, corev1.EnvVar{
			Name:  "AIS_TARGET_URL",
			Value: core.T.Snode().URL(cmn.NetPublic) + apc.URLPathETLObject.Join(reqSecret),
		})
		for k, v := range b.env {
			containers[idx].Env = append(containers[idx].Env, corev1.EnvVar{
				Name:  k,
				Value: v,
			})
		}
	}
	for idx := range b.pod.Spec.InitContainers {
		for k, v := range b.env {
			b.pod.Spec.InitContainers[idx].Env = append(b.pod.Spec.InitContainers[idx].Env, corev1.EnvVar{
				Name:  k,
				Value: v,
			})
		}
	}
}

// _getHost returns the host IP of the (scheduled) ETL pod.
func (b *etlBootstrapper) _getHost() (string, error) {
	client, err := k8s.GetClient()
	if err != nil {
		return "", cmn.NewErrETL(b.errCtx, err.Error())
	}
	p, err := client.Pod(b.pod.Name)
	if err != nil {
		return "", err
	}
	return p.Status.HostIP, nil
}

// _getPort returns the NodePort assigned to the ETL service.
func (b *etlBootstrapper) _getPort() (uint, error) {
	client, err := k8s.GetClient()
	if err != nil {
		return 0, cmn.NewErrETL(b.errCtx, err.Error())
	}

	s, err := client.Service(b.svc.Name)
	if err != nil {
		return 0, cmn.NewErrETL(b.errCtx, err.Error())
	}

	nodePort := int(s.Spec.Ports[0].NodePort)
	port, err := cmn.ValidatePort(nodePort)
	if err != nil {
		return 0, cmn.NewErrETL(b.errCtx, err.Error())
	}
	return uint(port), nil
}
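
// Illustrative call sequence (a sketch, not part of this file's API): the methods above are
// expected to be driven by the ETL start-up code elsewhere in this package. The exact
// orchestration, error handling, and cleanup below are assumptions, inferred only from the
// dependencies between the methods in this file; `errCtx`, `config`, `msg`, `env`, and `xid`
// are hypothetical placeholders supplied by the caller.
//
//	b := &etlBootstrapper{errCtx: errCtx, config: config, msg: msg, env: env}
//	if err := b.createPodSpec(); err != nil { /* handle */ }
//	b.createServiceSpec() // requires the prepared pod spec
//	if err := b.createEntity(k8s.Svc); err != nil { /* handle */ }
//	if err := b.createEntity(k8s.Pod); err != nil { /* handle */ }
//	if err := b.waitPodReady(); err != nil { /* handle */ }
//	if err := b.setupConnection(); err != nil { /* handle */ } // needs host IP and NodePort
//	b.setupXaction(xid)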