// Package etl provides utilities to initialize and use transformation pods.
/*
 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
 */
package etl

import (
	"context"
	"fmt"
	"strconv"
	"strings"
	"sync"

	"github.com/NVIDIA/aistore/api/apc"
	"github.com/NVIDIA/aistore/cmn"
	"github.com/NVIDIA/aistore/cmn/cos"
	"github.com/NVIDIA/aistore/cmn/debug"
	"github.com/NVIDIA/aistore/cmn/k8s"
	"github.com/NVIDIA/aistore/cmn/nlog"
	"github.com/NVIDIA/aistore/core"
	"github.com/NVIDIA/aistore/core/meta"
	"github.com/NVIDIA/aistore/ext/etl/runtime"
	"github.com/NVIDIA/aistore/xact/xreg"
	corev1 "k8s.io/api/core/v1"
	k8sErrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/util/wait"
)

const (
	// Built-in label: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#built-in-node-labels.
	nodeNameLabel = "kubernetes.io/hostname"

	// Recommended labels: https://kubernetes.io/docs/concepts/overview/working-with-objects/common-labels/.
	appK8sNameLabel      = "app.kubernetes.io/name"
	appK8sComponentLabel = "app.kubernetes.io/component"

	// ETL custom labels.
	podNameLabel = "nvidia.com/ais-etl-name"
	svcNameLabel = "nvidia.com/ais-etl-name"

	// ETL pod labels describing the K8s node and the ais target this ETL is associated with.
	podNodeLabel   = "nvidia.com/ais-etl-node"
	podTargetLabel = "nvidia.com/ais-etl-target"
)

// Definitions:
//
// ETL:
//   Refers to Extract-Transform-Load, which allows a user to transform
//   objects. The transformation is defined by an ETL spec - a K8s YAML
//   spec file. The ETL's operations are executed in the ETL container.
//
// ETL container:
//   The user's K8s pod that runs the container performing the transformation
//   of the objects. It is initiated by a target and runs on the same K8s node
//   as that target.
//
// On-the-fly transformation flow:
// 1. User initiates a custom ETL workload by executing one of the documented APIs
//    and providing either the corresponding docker image or a *transforming function* -
//    a piece of code that we then run using one of the pre-built `runtimes`
//    (see https://github.com/NVIDIA/aistore/blob/main/docs/etl.md).
// 2. The API call results in deploying multiple ETL containers (K8s pods)
//    simultaneously: one container per storage target.
// 3. Each target creates a local `Communicator` instance based on the specified
//    `communication type`.
// 4. Client-side application (e.g., a PyTorch or TensorFlow based training model)
//    starts (randomly) reading the data from a given dataset.
// 5. The user-defined transformation is then performed via `Communicator.Do()`
//    on each object read, on a per-object (or per-shard) basis.
// 6. Finally, the ETL container is stopped using the `Stop` API. In response,
//    each ais target in the cluster deletes its local ETL container (K8s pod).
//
// Limitations of the current implementation (soon to be removed):
//
// * No idle timeout for an ETL container: it keeps running unless explicitly
//   stopped by invoking the `Stop` API.
//
// * Deleting an ETL container is done in two stages: first, we try to
//   gracefully terminate the pod with a 30s timeout; upon failure to do so,
//   we perform a force delete.
//
// * Recreating an ETL container with the same name will delete all running
//   containers with the same name.
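// A minimal, hypothetical usage sketch (not part of the production code
// paths) illustrating step 6 of the flow above with only the exported
// helpers defined in this file: enumerate the locally registered ETLs, dump
// each one's pod logs, and stop it. The function name is made up for
// illustration.
func dumpLogsAndStopAllSketch() {
	for _, inst := range List() {
		if logs, err := PodLogs(inst.Name); err == nil {
			nlog.Infof("etl[%s] logs from target %s:\n%s", inst.Name, logs.TargetID, logs.Logs)
		}
		// Stop deletes the ETL's pod and service and unregisters its smap listener.
		if err := Stop(inst.Name, nil); err != nil {
			nlog.Errorln(err)
		}
	}
}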
type (
	// Aborter listens to smap changes and aborts the ETL on the target when
	// there is any change in targets membership. Aborter should be registered
	// on ETL init; it is unregistered by the Stop function. There is no
	// synchronization between aborters on different targets. It is assumed that
	// if one target received an smap with changed targets membership, eventually
	// each of the targets will receive it as well. Hence, all ETL containers
	// will be stopped.
	Aborter struct {
		currentSmap *meta.Smap
		name        string
		mtx         sync.Mutex
	}

	StartOpts struct {
		Env map[string]string
	}
)

// interface guard
var _ meta.Slistener = (*Aborter)(nil)

func newAborter(name string) *Aborter {
	return &Aborter{
		name:        name,
		currentSmap: core.T.Sowner().Get(),
	}
}

func (e *Aborter) String() string {
	return "etl-aborter-" + e.name
}

func (e *Aborter) ListenSmapChanged() {
	// Run in a new goroutine, as kubectl calls can take a long time,
	// making other listeners wait.
	go func() {
		e.mtx.Lock()
		defer e.mtx.Unlock()
		newSmap := core.T.Sowner().Get()

		if newSmap.Version <= e.currentSmap.Version {
			return
		}

		if !newSmap.CompareTargets(e.currentSmap) {
			err := cmn.NewErrETL(&cmn.ETLErrCtx{
				TID:     core.T.SID(),
				ETLName: e.name,
			}, "targets have changed, aborting...")
			nlog.Warningln(err)
			// Stop will unregister `e` from smap listeners.
			if err := Stop(e.name, err); err != nil {
				nlog.Errorln(err)
			}
		}

		e.currentSmap = newSmap
	}()
}

// (common for both `InitCode` and `InitSpec` flows)
func InitSpec(msg *InitSpecMsg, etlName string, opts StartOpts) error {
	config := cmn.GCO.Get()
	errCtx, podName, svcName, err := start(msg, etlName, opts, config)
	if err == nil {
		if cmn.Rom.FastV(4, cos.SmoduleETL) {
			nlog.Infof("started etl[%s], msg %s, pod %s", etlName, msg, podName)
		}
		return nil
	}
	// cleanup
	s := fmt.Sprintf("failed to start etl[%s], msg %s, err %v - cleaning up..", etlName, msg, err)
	nlog.Warningln(cmn.NewErrETL(errCtx, s))
	if errV := cleanupEntities(errCtx, podName, svcName); errV != nil {
		nlog.Errorln(errV)
	}
	return err
}

// Given user message `InitCodeMsg`:
// - make the corresponding assorted substitutions in the etl/runtime/podspec.yaml spec, and
// - execute `InitSpec` with the modified podspec
// See also: etl/runtime/podspec.yaml
func InitCode(msg *InitCodeMsg, xid string) error {
	var (
		ftp      = fromToPairs(msg)
		replacer = strings.NewReplacer(ftp...)
	)
	r, exists := runtime.Get(msg.Runtime)
	debug.Assert(exists, msg.Runtime) // must've been checked by proxy

	podSpec := replacer.Replace(r.PodSpec())

	// Start the ETL
	// (the point where the InitCode flow converges with InitSpec)
	return InitSpec(
		&InitSpecMsg{msg.InitMsgBase, []byte(podSpec)},
		xid,
		StartOpts{Env: map[string]string{
			r.CodeEnvName(): string(msg.Code),
			r.DepsEnvName(): string(msg.Deps),
		}})
}
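// To illustrate the substitution that `fromToPairs` (below) drives: the
// returned slice is a flat [from1, to1, from2, to2, ...] sequence consumed
// by strings.NewReplacer. A minimal sketch with made-up values and a
// made-up pod-spec fragment (the real template is etl/runtime/podspec.yaml):
func substitutionSketch() string {
	ftp := []string{"<NAME>", "my-etl", "<COMM_TYPE>", Hpush, "<CHUNK_SIZE>", "\"32768\""}
	spec := "name: <NAME>\ncommunication: <COMM_TYPE>\nchunk: <CHUNK_SIZE>"
	// all three placeholders get replaced in a single pass
	return strings.NewReplacer(ftp...).Replace(spec)
}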
// generate (from => to) replacements
func fromToPairs(msg *InitCodeMsg) (ftp []string) {
	var (
		chunk string
		flags string
		name  = msg.IDX
	)
	ftp = make([]string, 0, 16)
	ftp = append(ftp, "<NAME>", name, "<COMM_TYPE>", msg.CommTypeX, "<ARG_TYPE>", msg.ArgTypeX)

	// ChunkSize == 0 means no chunks (and no streaming) - i.e.,
	// reading the entire payload into memory and then transforming it in one shot
	if msg.ChunkSize > 0 {
		chunk = "\"" + strconv.FormatInt(msg.ChunkSize, 10) + "\""
	}
	ftp = append(ftp, "<CHUNK_SIZE>", chunk)

	if msg.Flags > 0 {
		flags = "\"" + strconv.FormatInt(msg.Flags, 10) + "\""
	}
	ftp = append(ftp, "<FLAGS>", flags, "<FUNC_TRANSFORM>", msg.Funcs.Transform)

	switch msg.CommTypeX {
	case Hpush, Hpull, Hrev:
		ftp = append(ftp, "<COMMAND>", "['sh', '-c', 'python /server.py']")
	case HpushStdin:
		ftp = append(ftp, "<COMMAND>", "['python /code/code.py']")
	default:
		debug.Assert(false, msg.CommTypeX)
	}
	return
}

// cleanupEntities removes the provided entities. It is best-effort: it tries
// to remove all entities and does not stop upon encountering an error.
func cleanupEntities(errCtx *cmn.ETLErrCtx, podName, svcName string) (err error) {
	if svcName != "" {
		if deleteErr := deleteEntity(errCtx, k8s.Svc, svcName); deleteErr != nil {
			err = deleteErr
		}
	}

	if podName != "" {
		if deleteErr := deleteEntity(errCtx, k8s.Pod, podName); deleteErr != nil {
			err = deleteErr
		}
	}

	return
}

// (does the heavy lifting)
// Returns:
// * errCtx - ETL error context
// * podName - non-empty if at least one attempt to create the pod was made
// * svcName - non-empty if at least one attempt to create the service was made
// * err - any error that occurred and should be passed on
func start(msg *InitSpecMsg, xid string, opts StartOpts, config *cmn.Config) (errCtx *cmn.ETLErrCtx,
	podName, svcName string, err error) {
	debug.Assert(k8s.NodeName != "") // checked above

	errCtx = &cmn.ETLErrCtx{TID: core.T.SID(), ETLName: msg.IDX}
	boot := &etlBootstrapper{errCtx: errCtx, config: config, env: opts.Env}
	boot.msg = *msg

	// Parse the spec template and fill the Pod object with the necessary fields.
	if err = boot.createPodSpec(); err != nil {
		return
	}

	boot.createServiceSpec()

	// 1. Clean up previously started entities, if any.
	errCleanup := cleanupEntities(errCtx, boot.pod.Name, boot.svc.Name)
	debug.AssertNoErr(errCleanup)

	// 2. Create the service.
	svcName = boot.svc.GetName()
	if err = boot.createEntity(k8s.Svc); err != nil {
		return
	}

	// 3. Create the pod.
	podName = boot.pod.GetName()
	if err = boot.createEntity(k8s.Pod); err != nil {
		return
	}
	if err = boot.waitPodReady(); err != nil {
		return
	}
	if cmn.Rom.FastV(4, cos.SmoduleETL) {
		nlog.Infof("pod %q is ready, %+v, %s", podName, msg, boot.errCtx)
	}
	if err = boot.setupConnection(); err != nil {
		return
	}

	boot.setupXaction(xid)

	// Finally, add the Communicator to the runtime registry.
	comm := newCommunicator(newAborter(msg.IDX), boot)
	if err = reg.add(msg.IDX, comm); err != nil {
		return
	}
	core.T.Sowner().Listeners().Reg(comm)
	return
}
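// Usage sketch (hypothetical; mirrors what InitSpec above actually does):
// callers of start are expected to clean up partially created entities on
// failure - podName/svcName are non-empty as soon as the corresponding
// create attempt was made, even if that attempt failed.
func startWithCleanupSketch(msg *InitSpecMsg, xid string) error {
	errCtx, podName, svcName, err := start(msg, xid, StartOpts{}, cmn.GCO.Get())
	if err != nil {
		if errV := cleanupEntities(errCtx, podName, svcName); errV != nil {
			nlog.Errorln(errV)
		}
	}
	return err
}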
// Stop deletes all resources occupied by the ETL, including the pod and the
// service. It also unregisters the ETL smap listener.
func Stop(id string, errCause error) error {
	errCtx := &cmn.ETLErrCtx{
		TID:     core.T.SID(),
		ETLName: id,
	}

	// Abort all running offline ETLs.
	xreg.AbortKind(errCause, apc.ActETLBck)

	c, err := GetCommunicator(id)
	if err != nil {
		return cmn.NewErrETL(errCtx, err.Error())
	}
	errCtx.PodName = c.PodName()
	errCtx.SvcName = c.SvcName()

	if err := cleanupEntities(errCtx, c.PodName(), c.SvcName()); err != nil {
		return err
	}

	if c := reg.del(id); c != nil {
		core.T.Sowner().Listeners().Unreg(c)
	}

	c.Stop()

	return nil
}

// StopAll terminates all running ETLs.
func StopAll() {
	if !k8s.IsK8s() {
		return
	}
	for _, e := range List() {
		if err := Stop(e.Name, nil); err != nil {
			nlog.Errorln(err)
		}
	}
}

func GetCommunicator(etlName string) (Communicator, error) {
	c, exists := reg.get(etlName)
	if !exists {
		return nil, cos.NewErrNotFound(core.T, "etl job "+etlName)
	}
	return c, nil
}

func List() []Info { return reg.list() }

func PodLogs(transformID string) (logs Logs, err error) {
	c, err := GetCommunicator(transformID)
	if err != nil {
		return logs, err
	}
	client, err := k8s.GetClient()
	if err != nil {
		return logs, err
	}
	b, err := client.Logs(c.PodName())
	if err != nil {
		return logs, err
	}
	return Logs{
		TargetID: core.T.SID(),
		Logs:     b,
	}, nil
}

func PodHealth(etlName string) (string, error) {
	c, err := GetCommunicator(etlName)
	if err != nil {
		return "", err
	}
	client, err := k8s.GetClient()
	if err != nil {
		return "", err
	}
	return client.Health(c.PodName())
}

func PodMetrics(etlName string) (*CPUMemUsed, error) {
	c, err := GetCommunicator(etlName)
	if err != nil {
		return nil, err
	}
	client, err := k8s.GetClient()
	if err != nil {
		return nil, err
	}
	cpuUsed, memUsed, err := k8s.Metrics(c.PodName())
	if err == nil {
		return &CPUMemUsed{TargetID: core.T.SID(), CPU: cpuUsed, Mem: memUsed}, nil
	}
	if cos.IsErrNotFound(err) {
		return nil, err
	}
	if metricsErr := client.CheckMetricsAvailability(); metricsErr != nil {
		err = fmt.Errorf("%v; failed to fetch metrics from Kubernetes: %v", metricsErr, err)
	}
	return nil, err
}
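// A hypothetical polling sketch: the actual readiness wait is implemented by
// the bootstrapper (see boot.waitPodReady in start above); this merely shows
// how checkPodReady (below) composes with the same wait.PollUntilContextTimeout
// pattern that deleteEntity uses.
func waitPodReadySketch(client k8s.Client, podName string) error {
	interval := cos.ProbingFrequency(DefaultTimeout)
	return wait.PollUntilContextTimeout(context.Background(), interval, DefaultTimeout, true, /*immediate*/
		func(context.Context) (done bool, err error) {
			// checkPodReady returns a non-nil error once the pod has run to
			// completion, which also terminates the poll early.
			return checkPodReady(client, podName)
		},
	)
}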
// Pod conditions include enumerated lifecycle states, such as `PodScheduled`,
// `ContainersReady`, `Initialized`, `Ready`
// (see https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle).
// First, we check that the Pod is still running (has neither succeeded nor
// failed), and second, whether its conditions include `Ready`.
func checkPodReady(client k8s.Client, podName string) (ready bool, err error) {
	var p *corev1.Pod
	if p, err = client.Pod(podName); err != nil {
		return false, err
	}

	// Pod has run to completion, either by failing or by succeeding. We don't
	// expect either to happen, as ETL containers are supposed to constantly
	// listen for incoming requests and never terminate.
	switch p.Status.Phase {
	case corev1.PodFailed, corev1.PodSucceeded:
		return false, fmt.Errorf(
			"pod ran to completion (phase: %s), state message: %q",
			p.Status.Phase, p.Status.Message,
		)
	}

	for _, cond := range p.Status.Conditions {
		if cond.Type == corev1.PodReady && cond.Status == corev1.ConditionTrue {
			return true, nil
		}
	}

	return false, nil
}

func deleteEntity(errCtx *cmn.ETLErrCtx, entityType, entityName string) error {
	client, err := k8s.GetClient()
	if err != nil {
		return cmn.NewErrETL(errCtx, err.Error())
	}

	// Remove the entity immediately (ignoring not-found).
	if err = client.Delete(entityType, entityName); err != nil {
		if k8sErrors.IsNotFound(err) {
			return nil
		}
		return cmn.NewErrETL(errCtx, err.Error())
	}

	// Wait for the entity to actually disappear.
	interval := cos.ProbingFrequency(DefaultTimeout)
	err = wait.PollUntilContextTimeout(context.Background(), interval, DefaultTimeout, false, /*immediate*/
		func(context.Context) (done bool, err error) {
			var exists bool
			exists, err = client.CheckExists(entityType, entityName)
			if err == nil {
				done = !exists
			}
			return
		},
	)
	if err != nil {
		return cmn.NewErrETL(errCtx, err.Error())
	}
	return nil
}

func podConditionsToString(conditions []corev1.PodCondition) string {
	parts := make([]string, 0, len(conditions))
	for i := range conditions {
		parts = append(parts, podConditionToString(&conditions[i]))
	}
	return "[" + strings.Join(parts, ", ") + "]"
}

func podConditionToString(cond *corev1.PodCondition) string {
	parts := []string{
		fmt.Sprintf("type: %q", cond.Type),
		fmt.Sprintf("status: %q", cond.Status),
	}
	if cond.Reason != "" {
		parts = append(parts, fmt.Sprintf("reason: %q", cond.Reason))
	}
	if cond.Message != "" {
		parts = append(parts, fmt.Sprintf("msg: %q", cond.Message))
	}
	return "{" + strings.Join(parts, ", ") + "}"
}
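// A hypothetical debugging aid showing how the helpers above compose: fetch
// the pod and render its conditions for a log line or an error message.
func podConditionsSketch(podName string) (string, error) {
	client, err := k8s.GetClient()
	if err != nil {
		return "", err
	}
	p, err := client.Pod(podName)
	if err != nil {
		return "", err
	}
	return podConditionsToString(p.Status.Conditions), nil
}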