github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ext/etl/transform.go

     1  // Package etl provides utilities to initialize and use transformation pods.
     2  /*
     3   * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
     4   */
     5  package etl
     6  
     7  import (
     8  	"context"
     9  	"fmt"
    10  	"strconv"
    11  	"strings"
    12  	"sync"
    13  
    14  	"github.com/NVIDIA/aistore/api/apc"
    15  	"github.com/NVIDIA/aistore/cmn"
    16  	"github.com/NVIDIA/aistore/cmn/cos"
    17  	"github.com/NVIDIA/aistore/cmn/debug"
    18  	"github.com/NVIDIA/aistore/cmn/k8s"
    19  	"github.com/NVIDIA/aistore/cmn/nlog"
    20  	"github.com/NVIDIA/aistore/core"
    21  	"github.com/NVIDIA/aistore/core/meta"
    22  	"github.com/NVIDIA/aistore/ext/etl/runtime"
    23  	"github.com/NVIDIA/aistore/xact/xreg"
    24  	corev1 "k8s.io/api/core/v1"
    25  	k8sErrors "k8s.io/apimachinery/pkg/api/errors"
    26  	"k8s.io/apimachinery/pkg/util/wait"
    27  )
    28  
    29  const (
    30  	// Built-in label: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#built-in-node-labels.
    31  	nodeNameLabel = "kubernetes.io/hostname"
    32  
    33  	// Recommended labels: https://kubernetes.io/docs/concepts/overview/working-with-objects/common-labels/.
    34  	appK8sNameLabel      = "app.kubernetes.io/name"
    35  	appK8sComponentLabel = "app.kubernetes.io/component"
    36  
    37  	// ETL Custom labels.
    38  	podNameLabel = "nvidia.com/ais-etl-name"
    39  	svcNameLabel = "nvidia.com/ais-etl-name"
    40  
    41  	// Labels describing the K8s node and the target that the ETL Pod is associated with.
    42  	podNodeLabel   = "nvidia.com/ais-etl-node"
    43  	podTargetLabel = "nvidia.com/ais-etl-target"
    44  )
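
        // For illustration, a hypothetical kubectl-style label selector built from the
        // custom labels above ("my-etl" is a made-up ETL name, not part of this file):
        //
        //	selector := podNameLabel + "=my-etl," + podTargetLabel + "=" + core.T.SID()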
    45  
    46  // Definitions:
    47  //
    48  // ETL:
    49  //     Refers to Extract-Transform-Load, which allows a user to transform
    50  //     objects. The transformation is defined by an ETL spec - a K8s YAML
    51  //     spec file. The operations of an ETL are executed by the ETL container.
    52  //
    53  // ETL container:
    54  //     The user's K8s pod that runs the container performing the transformation
    55  //     of the objects. It is initiated by a target and runs on the same K8s node
    56  //     as that target.
    57  //
    58  // On-the-fly transformation flow:
    59  // 1. User initiates a custom ETL workload by executing one of the documented APIs
    60  //    and providing either the corresponding docker image or a *transforming function* -
    61  //    a piece of code that we further run using one of the pre-built `runtimes`
    62  //    (see https://github.com/NVIDIA/aistore/blob/main/docs/etl.md).
    63  // 2. The API call results in deploying multiple ETL containers (K8s pods)
    64  //    simultaneously: one container per storage target.
    65  // 3. Each target creates a local `Communicator` instance that is based on the specified
    66  //    `communication type`.
    67  // 4. Client-side application (e.g., PyTorch or TensorFlow based training model)
    68  //    starts (randomly) reading the data from a given dataset.
    69  // 5. User-defined transformation is then performed using `Communicator.Do()`
    70  //    on each object that is read, on a per-object (or per-shard) basis.
    71  // 6. Finally, the ETL container is stopped using the `Stop` API. In response,
    72  //    each ais target in the cluster deletes its local ETL container (K8s pod).
    73  //
    74  // Limitations of the current implementation (soon to be removed):
    75  //
    76  // * No idle timeout for an ETL container. It keeps running unless explicitly
    77  //   stopped by invoking the `Stop` API.
    78  //
    79  // * Deleting an ETL container is done in two stages. First, we try to gracefully
    80  //   terminate the pod with a 30s timeout. If that fails, we perform
    81  //   a force delete.
    82  //
    83  // * Recreating an ETL container with the same name will delete all running
    84  //   containers with the same name.
    85  
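        // To tie the above together, a minimal in-package sketch of the lifecycle on a
        // single target (the "my-etl" name and the spec contents are illustrative only):
        //
        //	var base InitMsgBase                   // ETL name, communication type, etc.
        //	var spec []byte                        // user-provided K8s pod spec (YAML)
        //	err := InitSpec(&InitSpecMsg{base, spec}, "my-etl", StartOpts{}) // steps 2-3
        //	...
        //	comm, err := GetCommunicator("my-etl") // Communicator.Do() performs step 5
        //	...
        //	err = Stop("my-etl", nil)              // step 6: delete the local pod and service
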
    86  type (
    87  	// Aborter listens to Smap changes and aborts the ETL on the target when
    88  	// there is any change in target membership. An Aborter is registered on
    89  	// ETL init and unregistered by the Stop function. There is no
    90  	// synchronization between aborters on different targets: it is assumed that
    91  	// if one target receives an Smap with changed target membership, eventually
    92  	// each of the targets will receive it as well. Hence, all ETL containers
    93  	// will be stopped.
    94  	Aborter struct {
    95  		currentSmap *meta.Smap
    96  		name        string
    97  		mtx         sync.Mutex
    98  	}
    99  
   100  	StartOpts struct {
   101  		Env map[string]string
   102  	}
   103  )
   104  
   105  // interface guard
   106  var _ meta.Slistener = (*Aborter)(nil)
   107  
   108  func newAborter(name string) *Aborter {
   109  	return &Aborter{
   110  		name:        name,
   111  		currentSmap: core.T.Sowner().Get(),
   112  	}
   113  }
   114  
   115  func (e *Aborter) String() string {
   116  	return "etl-aborter-" + e.name
   117  }
   118  
   119  func (e *Aborter) ListenSmapChanged() {
   120  	// Run in a new goroutine, as K8s API calls can take a lot of time
   121  	// and would otherwise make other listeners wait.
   122  	go func() {
   123  		e.mtx.Lock()
   124  		defer e.mtx.Unlock()
   125  		newSmap := core.T.Sowner().Get()
   126  
   127  		if newSmap.Version <= e.currentSmap.Version {
   128  			return
   129  		}
   130  
   131  		if !newSmap.CompareTargets(e.currentSmap) {
   132  			err := cmn.NewErrETL(&cmn.ETLErrCtx{
   133  				TID:     core.T.SID(),
   134  				ETLName: e.name,
   135  			}, "targets have changed, aborting...")
   136  			nlog.Warningln(err)
   137  			// Stop will unregister `e` from smap listeners.
   138  			if err := Stop(e.name, err); err != nil {
   139  				nlog.Errorln(err)
   140  			}
   141  		}
   142  
   143  		e.currentSmap = newSmap
   144  	}()
   145  }
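
        // A minimal in-package sketch of the Aborter lifecycle. In practice (see start()
        // and Stop() below) the registered listener is the Communicator, which is
        // constructed with the Aborter; the direct registration here is for illustration:
        //
        //	ab := newAborter("my-etl")             // "my-etl" is illustrative
        //	core.T.Sowner().Listeners().Reg(ab)    // begin listening to Smap changes
        //	...
        //	core.T.Sowner().Listeners().Unreg(ab)  // once the ETL is stopped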
   146  
   147  // (common for both `InitCode` and `InitSpec` flows)
   148  func InitSpec(msg *InitSpecMsg, etlName string, opts StartOpts) error {
   149  	config := cmn.GCO.Get()
   150  	errCtx, podName, svcName, err := start(msg, etlName, opts, config)
   151  	if err == nil {
   152  		if cmn.Rom.FastV(4, cos.SmoduleETL) {
   153  			nlog.Infof("started etl[%s], msg %s, pod %s", etlName, msg, podName)
   154  		}
   155  		return nil
   156  	}
   157  	// cleanup
   158  	s := fmt.Sprintf("failed to start etl[%s], msg %s, err %v - cleaning up..", etlName, msg, err)
   159  	nlog.Warningln(cmn.NewErrETL(errCtx, s))
   160  	if errV := cleanupEntities(errCtx, podName, svcName); errV != nil {
   161  		nlog.Errorln(errV)
   162  	}
   163  	return err
   164  }
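
        // A hedged usage sketch for InitSpec (all values are illustrative; the fields of
        // InitMsgBase are not spelled out here):
        //
        //	var base InitMsgBase       // ETL name, communication type, argument type, ...
        //	var specYAML []byte        // contents of a K8s pod spec (YAML) file
        //	opts := StartOpts{Env: map[string]string{"MY_OPTION": "value"}} // extra env vars, optional
        //	if err := InitSpec(&InitSpecMsg{base, specYAML}, "my-etl", opts); err != nil {
        //		// InitSpec has already logged the failure and cleaned up partially created entities
        //	}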
   165  
   166  // Given user message `InitCodeMsg`:
   167  // - make the corresponding assorted substitutions in the etl/runtime/podspec.yaml spec, and
   168  // - execute `InitSpec` with the modified podspec
   169  // See also: etl/runtime/podspec.yaml
   170  func InitCode(msg *InitCodeMsg, xid string) error {
   171  	var (
   172  		ftp      = fromToPairs(msg)
   173  		replacer = strings.NewReplacer(ftp...)
   174  	)
   175  	r, exists := runtime.Get(msg.Runtime)
   176  	debug.Assert(exists, msg.Runtime) // must've been checked by proxy
   177  
   178  	podSpec := replacer.Replace(r.PodSpec())
   179  
   180  	// Start ETL
   181  	// (the point where InitCode flow converges w/ InitSpec)
   182  	return InitSpec(
   183  		&InitSpecMsg{msg.InitMsgBase, []byte(podSpec)},
   184  		xid,
   185  		StartOpts{Env: map[string]string{
   186  			r.CodeEnvName(): string(msg.Code),
   187  			r.DepsEnvName(): string(msg.Deps),
   188  		}})
   189  }
   190  
   191  // generate (from => to) replacements
   192  func fromToPairs(msg *InitCodeMsg) (ftp []string) {
   193  	var (
   194  		chunk string
   195  		flags string
   196  		name  = msg.IDX
   197  	)
   198  	ftp = make([]string, 0, 16)
   199  	ftp = append(ftp, "<NAME>", name, "<COMM_TYPE>", msg.CommTypeX, "<ARG_TYPE>", msg.ArgTypeX)
   200  
   201  	// chunk == 0 means no chunking (and no streaming), i.e.,
   202  	// reading the entire payload into memory and then transforming it in one shot
   203  	if msg.ChunkSize > 0 {
   204  		chunk = "\"" + strconv.FormatInt(msg.ChunkSize, 10) + "\""
   205  	}
   206  	ftp = append(ftp, "<CHUNK_SIZE>", chunk)
   207  
   208  	if msg.Flags > 0 {
   209  		flags = "\"" + strconv.FormatInt(msg.Flags, 10) + "\""
   210  	}
   211  	ftp = append(ftp, "<FLAGS>", flags, "<FUNC_TRANSFORM>", msg.Funcs.Transform)
   212  
   213  	switch msg.CommTypeX {
   214  	case Hpush, Hpull, Hrev:
   215  		ftp = append(ftp, "<COMMAND>", "['sh', '-c', 'python /server.py']")
   216  	case HpushStdin:
   217  		ftp = append(ftp, "<COMMAND>", "['python /code/code.py']")
   218  	default:
   219  		debug.Assert(false, msg.CommTypeX)
   220  	}
   221  	return
   222  }
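
        // To illustrate the mechanism, a small self-contained sketch of how the generated
        // pairs drive the template substitution (the template and values are made up):
        //
        //	pairs := []string{"<NAME>", "my-etl", "<COMM_TYPE>", Hpush, "<CHUNK_SIZE>", ""}
        //	tmpl := "name: <NAME>\ncommunication: <COMM_TYPE>\nchunk_size: <CHUNK_SIZE>"
        //	podSpec := strings.NewReplacer(pairs...).Replace(tmpl)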
   223  
   224  // cleanupEntities removes the provided entities. It tries its best to remove
   225  // all of them and does not stop upon encountering an error.
   226  func cleanupEntities(errCtx *cmn.ETLErrCtx, podName, svcName string) (err error) {
   227  	if svcName != "" {
   228  		if deleteErr := deleteEntity(errCtx, k8s.Svc, svcName); deleteErr != nil {
   229  			err = deleteErr
   230  		}
   231  	}
   232  
   233  	if podName != "" {
   234  		if deleteErr := deleteEntity(errCtx, k8s.Pod, podName); deleteErr != nil {
   235  			err = deleteErr
   236  		}
   237  	}
   238  
   239  	return
   240  }
   241  
   242  // (does the heavy-lifting)
   243  // Returns:
   244  // * errCtx - ETL error context
   245  // * podName - non-empty if at least one attempt to create the pod was made
   246  // * svcName - non-empty if at least one attempt to create the service was made
   247  // * err - any error that occurred and should be passed on.
   248  func start(msg *InitSpecMsg, xid string, opts StartOpts, config *cmn.Config) (errCtx *cmn.ETLErrCtx,
   249  	podName, svcName string, err error) {
   250  	debug.Assert(k8s.NodeName != "") // checked above
   251  
   252  	errCtx = &cmn.ETLErrCtx{TID: core.T.SID(), ETLName: msg.IDX}
   253  	boot := &etlBootstrapper{errCtx: errCtx, config: config, env: opts.Env}
   254  	boot.msg = *msg
   255  
   256  	// Parse spec template and fill Pod object with necessary fields.
   257  	if err = boot.createPodSpec(); err != nil {
   258  		return
   259  	}
   260  
   261  	boot.createServiceSpec()
   262  
   263  	// 1. Cleanup previously started entities, if any.
   264  	errCleanup := cleanupEntities(errCtx, boot.pod.Name, boot.svc.Name)
   265  	debug.AssertNoErr(errCleanup)
   266  
   267  	// 2. Creating service.
   268  	svcName = boot.svc.GetName()
   269  	if err = boot.createEntity(k8s.Svc); err != nil {
   270  		return
   271  	}
   272  	// 3. Creating pod.
   273  	podName = boot.pod.GetName()
   274  	if err = boot.createEntity(k8s.Pod); err != nil {
   275  		return
   276  	}
   277  	if err = boot.waitPodReady(); err != nil {
   278  		return
   279  	}
   280  	if cmn.Rom.FastV(4, cos.SmoduleETL) {
   281  		nlog.Infof("pod %q is ready, %+v, %s", podName, msg, boot.errCtx)
   282  	}
   283  	if err = boot.setupConnection(); err != nil {
   284  		return
   285  	}
   286  
   287  	boot.setupXaction(xid)
   288  
   289  	// finally, add Communicator to the runtime registry
   290  	comm := newCommunicator(newAborter(msg.IDX), boot)
   291  	if err = reg.add(msg.IDX, comm); err != nil {
   292  		return
   293  	}
   294  	core.T.Sowner().Listeners().Reg(comm)
   295  	return
   296  }
   297  
   298  // Stop deletes all resources occupied by the ETL, including its Pod and Service.
   299  // It also unregisters the ETL's Smap listener.
   300  func Stop(id string, errCause error) error {
   301  	errCtx := &cmn.ETLErrCtx{
   302  		TID:     core.T.SID(),
   303  		ETLName: id,
   304  	}
   305  
   306  	// Abort all running offline ETLs.
   307  	xreg.AbortKind(errCause, apc.ActETLBck)
   308  
   309  	c, err := GetCommunicator(id)
   310  	if err != nil {
   311  		return cmn.NewErrETL(errCtx, err.Error())
   312  	}
   313  	errCtx.PodName = c.PodName()
   314  	errCtx.SvcName = c.SvcName()
   315  
   316  	if err := cleanupEntities(errCtx, c.PodName(), c.SvcName()); err != nil {
   317  		return err
   318  	}
   319  
   320  	if c := reg.del(id); c != nil {
   321  		core.T.Sowner().Listeners().Unreg(c)
   322  	}
   323  
   324  	c.Stop()
   325  
   326  	return nil
   327  }
   328  
   329  // StopAll terminates all running ETLs.
   330  func StopAll() {
   331  	if !k8s.IsK8s() {
   332  		return
   333  	}
   334  	for _, e := range List() {
   335  		if err := Stop(e.Name, nil); err != nil {
   336  			nlog.Errorln(err)
   337  		}
   338  	}
   339  }
   340  
   341  func GetCommunicator(etlName string) (Communicator, error) {
   342  	c, exists := reg.get(etlName)
   343  	if !exists {
   344  		return nil, cos.NewErrNotFound(core.T, "etl job "+etlName)
   345  	}
   346  	return c, nil
   347  }
   348  
   349  func List() []Info { return reg.list() }
   350  
   351  func PodLogs(transformID string) (logs Logs, err error) {
   352  	c, err := GetCommunicator(transformID)
   353  	if err != nil {
   354  		return logs, err
   355  	}
   356  	client, err := k8s.GetClient()
   357  	if err != nil {
   358  		return logs, err
   359  	}
   360  	b, err := client.Logs(c.PodName())
   361  	if err != nil {
   362  		return logs, err
   363  	}
   364  	return Logs{
   365  		TargetID: core.T.SID(),
   366  		Logs:     b,
   367  	}, nil
   368  }
   369  
   370  func PodHealth(etlName string) (string, error) {
   371  	c, err := GetCommunicator(etlName)
   372  	if err != nil {
   373  		return "", err
   374  	}
   375  	client, err := k8s.GetClient()
   376  	if err != nil {
   377  		return "", err
   378  	}
   379  	return client.Health(c.PodName())
   380  }
   381  
   382  func PodMetrics(etlName string) (*CPUMemUsed, error) {
   383  	c, err := GetCommunicator(etlName)
   384  	if err != nil {
   385  		return nil, err
   386  	}
   387  	client, err := k8s.GetClient()
   388  	if err != nil {
   389  		return nil, err
   390  	}
   391  	cpuUsed, memUsed, err := k8s.Metrics(c.PodName())
   392  	if err == nil {
   393  		return &CPUMemUsed{TargetID: core.T.SID(), CPU: cpuUsed, Mem: memUsed}, nil
   394  	}
   395  	if cos.IsErrNotFound(err) {
   396  		return nil, err
   397  	}
   398  	if metricsErr := client.CheckMetricsAvailability(); metricsErr != nil {
   399  		err = fmt.Errorf("%v; failed to fetch metrics from Kubernetes: %v", metricsErr, err)
   400  	}
   401  	return nil, err
   402  }
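
        // A minimal sketch of inspecting a running ETL on this target using the helpers
        // above (the ETL name is illustrative):
        //
        //	logs, _ := PodLogs("my-etl")
        //	health, _ := PodHealth("my-etl")
        //	if usage, err := PodMetrics("my-etl"); err == nil {
        //		nlog.Infof("%s: cpu %v, mem %v, health %q, %d log bytes",
        //			usage.TargetID, usage.CPU, usage.Mem, health, len(logs.Logs))
        //	}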
   403  
   404  // Pod conditions include enumerated lifecycle states, such as `PodScheduled`,
   405  // `ContainersReady`, `Initialized`, `Ready`
   406  // (see https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle).
   407  // First, we check that the Pod is still running (has neither succeeded nor failed),
   408  // and second, whether it has the `Ready` condition.
   409  func checkPodReady(client k8s.Client, podName string) (ready bool, err error) {
   410  	var p *corev1.Pod
   411  	if p, err = client.Pod(podName); err != nil {
   412  		return false, err
   413  	}
   414  
   415  	// Pod has run to completion, either by failing or by succeeding. We don't
   416  	// expect either to happen, as ETL containers are supposed to constantly
   417  	// listen for incoming requests and never terminate.
   418  	switch p.Status.Phase {
   419  	case corev1.PodFailed, corev1.PodSucceeded:
   420  		return false, fmt.Errorf(
   421  			"pod ran to completion (phase: %s), state message: %q",
   422  			p.Status.Phase, p.Status.Message,
   423  		)
   424  	}
   425  
   426  	for _, cond := range p.Status.Conditions {
   427  		if cond.Type == corev1.PodReady && cond.Status == corev1.ConditionTrue {
   428  			return true, nil
   429  		}
   430  	}
   431  
   432  	return false, nil
   433  }
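
        // A hypothetical polling loop on top of checkPodReady, mirroring the wait pattern
        // used by deleteEntity below (not necessarily how the bootstrapper's waitPodReady
        // is implemented):
        //
        //	podName := "my-etl-pod" // illustrative; e.g., boot.pod.GetName() in start() above
        //	client, err := k8s.GetClient()
        //	...
        //	err = wait.PollUntilContextTimeout(context.Background(),
        //		cos.ProbingFrequency(DefaultTimeout), DefaultTimeout, false /*immediate*/,
        //		func(context.Context) (bool, error) { return checkPodReady(client, podName) },
        //	)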
   434  
   435  func deleteEntity(errCtx *cmn.ETLErrCtx, entityType, entityName string) error {
   436  	client, err := k8s.GetClient()
   437  	if err != nil {
   438  		return cmn.NewErrETL(errCtx, err.Error())
   439  	}
   440  
   441  	// Remove entity immediately (ignoring not found).
   442  	if err = client.Delete(entityType, entityName); err != nil {
   443  		if k8sErrors.IsNotFound(err) {
   444  			return nil
   445  		}
   446  		return cmn.NewErrETL(errCtx, err.Error())
   447  	}
   448  
   449  	// wait for the entity to be removed
   450  	interval := cos.ProbingFrequency(DefaultTimeout)
   451  	err = wait.PollUntilContextTimeout(context.Background(), interval, DefaultTimeout, false, /*immediate*/
   452  		func(context.Context) (done bool, err error) {
   453  			var exists bool
   454  			exists, err = client.CheckExists(entityType, entityName)
   455  			if err == nil {
   456  				done = !exists
   457  			}
   458  			return
   459  		},
   460  	)
   461  	if err != nil {
   462  		return cmn.NewErrETL(errCtx, err.Error())
   463  	}
   464  	return nil
   465  }
   466  
   467  func podConditionsToString(conditions []corev1.PodCondition) string {
   468  	parts := make([]string, 0, len(conditions))
   469  	for i := range conditions {
   470  		parts = append(parts, podConditionToString(&conditions[i]))
   471  	}
   472  	return "[" + strings.Join(parts, ", ") + "]"
   473  }
   474  
   475  func podConditionToString(cond *corev1.PodCondition) string {
   476  	parts := []string{
   477  		fmt.Sprintf("type: %q", cond.Type),
   478  		fmt.Sprintf("status: %q", cond.Status),
   479  	}
   480  	if cond.Reason != "" {
   481  		parts = append(parts, fmt.Sprintf("reason: %q", cond.Reason))
   482  	}
   483  	if cond.Message != "" {
   484  		parts = append(parts, fmt.Sprintf("msg: %q", cond.Message))
   485  	}
   486  	return "{" + strings.Join(parts, ", ") + "}"
   487  }