github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ais/test/etl_stress_test.go

// Package integration_test.
/*
 * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
 */
package integration_test

import (
	"math/rand"
	"testing"
	"time"

	"github.com/NVIDIA/aistore/api"
	"github.com/NVIDIA/aistore/api/apc"
	"github.com/NVIDIA/aistore/cmn"
	"github.com/NVIDIA/aistore/cmn/cos"
	"github.com/NVIDIA/aistore/cmn/debug"
	"github.com/NVIDIA/aistore/ext/etl"
	"github.com/NVIDIA/aistore/ext/etl/runtime"
	"github.com/NVIDIA/aistore/tools"
	"github.com/NVIDIA/aistore/tools/tassert"
	"github.com/NVIDIA/aistore/tools/tetl"
	"github.com/NVIDIA/aistore/tools/tlog"
	"github.com/NVIDIA/aistore/tools/trand"
	"github.com/NVIDIA/aistore/xact"
)

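// etlBucketTimeout is the common timeout used by the tests in this file (passed both in
// ETL init messages and to the offline bucket-to-bucket transforms).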
const etlBucketTimeout = cos.Duration(3 * time.Minute)

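// TestETLConnectionError injects a Python transformer that sporadically raises a fake
// ConnectionError (at most twice per object). The offline ETL is expected to tolerate
// these transient failures and still transform all 10K objects.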
func TestETLConnectionError(t *testing.T) {
	tools.CheckSkip(t, &tools.SkipTestArgs{RequiredDeployment: tools.ClusterTypeK8s, Long: true})
	tetl.CheckNoRunningETLContainers(t, baseParams)

	// The ETL should survive occasional failures and successfully transform all objects.
	const timeoutFunc = `
import random, requests, hashlib

failures = {}

def transform(input_bytes):
	md5 = hashlib.md5(input_bytes).hexdigest()
	failures_cnt = failures.get(md5, 0)
	# Fail each object at most 2 times; more failures would cause the ETL to be stopped.
	if random.randint(0, 50) == 0 and failures_cnt < 2:
		failures[md5] = failures_cnt + 1
		raise requests.exceptions.ConnectionError("fake connection error")

	return input_bytes
`

	m := ioContext{
		t:        t,
		num:      10_000,
		fileSize: cos.KiB,
		bck:      cmn.Bck{Name: "etl_build_connection_err", Provider: apc.AIS},
	}

	tlog.Logln("Preparing source bucket")
	tools.CreateBucket(t, proxyURL, m.bck, nil, true /*cleanup*/)

	m.init(true /*cleanup*/)
	m.puts()

	msg := etl.InitCodeMsg{
		InitMsgBase: etl.InitMsgBase{IDX: "etl-build-conn-err", Timeout: etlBucketTimeout},
		Code:        []byte(timeoutFunc),
		Runtime:     runtime.Py38,
		ChunkSize:   0,
	}
	msg.Funcs.Transform = "transform"

	_ = tetl.InitCode(t, baseParams, &msg)

	bckTo := cmn.Bck{Name: "etldst_" + cos.GenTie(), Provider: apc.AIS}
	testETLBucket(t, baseParams, msg.Name(), &m, bckTo, time.Duration(etlBucketTimeout),
		true /*skip byte-count check*/, false /*remote src evicted*/)
}

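// TestETLBucketAbort starts an offline (bucket-to-bucket) ETL and aborts the xaction a few
// seconds in. The ETL instance itself is expected to survive the abort and remain listed;
// it is stopped and deleted later by etlPrepareAndStart's cleanup.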
func TestETLBucketAbort(t *testing.T) {
	tools.CheckSkip(t, &tools.SkipTestArgs{RequiredDeployment: tools.ClusterTypeK8s, Long: true})
	tetl.CheckNoRunningETLContainers(t, baseParams)

	m := &ioContext{
		t:         t,
		num:       1000,
		fileSize:  512,
		fixedSize: true,
	}

	xid := etlPrepareAndStart(t, m, tetl.Echo, etl.Hpull)

	time.Sleep(time.Duration(rand.Intn(5)) * time.Second)

	tlog.Logf("Aborting etl[%s]\n", xid)
	args := xact.ArgsMsg{ID: xid, Kind: apc.ActETLBck}
	err := api.AbortXaction(baseParams, &args)
	tassert.CheckFatal(t, err)

	err = tetl.WaitForAborted(baseParams, xid, apc.ActETLBck, 2*time.Minute)
	tassert.CheckFatal(t, err)
	etls, err := api.ETLList(baseParams)
	tassert.CheckFatal(t, err)
	// The ETL instance is stopped and deleted later, via etlPrepareAndStart's cleanup;
	// at this point it must still be listed as the only running ETL.
	tassert.Fatalf(t, len(etls) == 1, "expected exactly 1 etl running, got %+v", etls)
}

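// TestETLTargetDown kills a random target in the middle of an offline ETL and expects the
// transforming xaction to abort and all ETL containers to stop. The cleanup restores the
// node and waits for the cluster to rebalance.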
func TestETLTargetDown(t *testing.T) {
	tools.CheckSkip(t, &tools.SkipTestArgs{RequiredDeployment: tools.ClusterTypeK8s, MinTargets: 2})
	tetl.CheckNoRunningETLContainers(t, baseParams)

	m := &ioContext{
		t:         t,
		num:       10000,
		fileSize:  512,
		fixedSize: true,
	}
	if testing.Short() {
		m.num /= 100
	} else {
		// TODO: otherwise, error executing LSOF command
		t.Skipf("skipping %s long test (kill-node vs maintenance vs ETL)", t.Name())
	}
	m.initAndSaveState(true /*cleanup*/)
	xid := etlPrepareAndStart(t, m, tetl.Echo, etl.Hpull)

	tlog.Logln("Waiting for ETL to process a few objects...")
	time.Sleep(5 * time.Second)

	targetNode, _ := m.smap.GetRandTarget()
	tlog.Logf("Killing %s\n", targetNode.StringEx())
	tcmd, err := tools.KillNode(targetNode) // TODO: alternatively, m.startMaintenanceNoRebalance()
	tassert.CheckFatal(t, err)

	t.Cleanup(func() {
		time.Sleep(4 * time.Second)
		tools.RestoreNode(tcmd, false, "target")
		m.waitAndCheckCluState()

		args := xact.ArgsMsg{Kind: apc.ActRebalance, Timeout: tools.RebalanceTimeout}
		_, _ = api.WaitForXactionIC(baseParams, &args)

		tetl.CheckNoRunningETLContainers(t, baseParams)
	})

	err = tetl.WaitForAborted(baseParams, xid, apc.ActETLBck, 5*time.Minute)
	tassert.CheckFatal(t, err)
	tetl.WaitForContainersStopped(t, baseParams)
}

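// TestETLBigBucket transforms a large bucket (200K objects, ~4GiB) using both spec-based
// (echo, echo-golang) and code-based (Python echo) ETLs, and verifies that the destination
// bucket contains the expected number of objects.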
func TestETLBigBucket(t *testing.T) {
	// The test takes a lot of time if it's run against a single target deployment.
	tools.CheckSkip(t, &tools.SkipTestArgs{RequiredDeployment: tools.ClusterTypeK8s, Long: true, MinTargets: 2})

	const echoPythonTransform = `
def transform(input_bytes):
	return input_bytes
`

	var (
		bckFrom = cmn.Bck{Provider: apc.AIS, Name: "etlbig"}
		bckTo   = cmn.Bck{Provider: apc.AIS, Name: "etlbigout-" + trand.String(5)}

		m = ioContext{
			t:         t,
			num:       200_000,
			fileSize:  20 * cos.KiB, // ~4GiB total
			fixedSize: true,
			bck:       bckFrom,
		}

		// ETL flavors to exercise: spec-based (echo containers) and code-based (Python echo).
		tests = []struct {
			name        string
			ty          string
			etlSpecName string
			etlCodeMsg  etl.InitCodeMsg
		}{
			{name: "spec-echo-python", ty: etl.Spec, etlSpecName: tetl.Echo},
			{name: "spec-echo-golang", ty: etl.Spec, etlSpecName: tetl.EchoGolang},

			{
				name: "code-echo-py38",
				ty:   etl.Code,
				etlCodeMsg: etl.InitCodeMsg{
					Code:      []byte(echoPythonTransform),
					Runtime:   runtime.Py38,
					ChunkSize: 0,
				},
			},
			{
				name: "code-echo-py310",
				ty:   etl.Code,
				etlCodeMsg: etl.InitCodeMsg{
					Code:      []byte(echoPythonTransform),
					Runtime:   runtime.Py310,
					ChunkSize: 0,
				},
			},
		}
	)

	tlog.Logf("Preparing source bucket (%d objects, %s each)\n", m.num, cos.ToSizeIEC(int64(m.fileSize), 2))
	tools.CreateBucket(t, proxyURL, bckFrom, nil, true /*cleanup*/)
	m.initAndSaveState(true /*cleanup*/)

	m.puts()

	for i := range tests {
		test := tests[i]
		t.Run(test.name, func(t *testing.T) {
			tetl.CheckNoRunningETLContainers(t, baseParams)
			var (
				err            error
				etlName        string
				etlDoneCh      = cos.NewStopCh()
				requestTimeout = 30 * time.Second
			)
			switch test.ty {
			case etl.Spec:
				etlName = test.etlSpecName
				_ = tetl.InitSpec(t, baseParams, etlName, etl.Hpull)
			case etl.Code:
				etlName = test.name
				{
					test.etlCodeMsg.IDX = etlName
					test.etlCodeMsg.Timeout = etlBucketTimeout
					test.etlCodeMsg.Funcs.Transform = "transform"
				}
				_ = tetl.InitCode(t, baseParams, &test.etlCodeMsg)
			default:
				debug.Assert(false, test.ty)
			}
			t.Cleanup(func() {
				tetl.StopAndDeleteETL(t, baseParams, etlName)
				tetl.WaitForContainersStopped(t, baseParams)
			})

			tlog.Logf("Start offline ETL[%s]\n", etlName)
			msg := &apc.TCBMsg{
				Transform: apc.Transform{
					Name:    etlName,
					Timeout: cos.Duration(requestTimeout),
				},
				CopyBckMsg: apc.CopyBckMsg{Force: true},
			}
			xid := tetl.ETLBucketWithCleanup(t, baseParams, bckFrom, bckTo, msg)
			tetl.ReportXactionStatus(baseParams, xid, etlDoneCh, 2*time.Minute, m.num)

			tlog.Logln("Waiting for ETL to finish")
			err = tetl.WaitForFinished(baseParams, xid, apc.ActETLBck, 15*time.Minute)
			etlDoneCh.Close()
			tassert.CheckFatal(t, err)

			snaps, err := api.QueryXactionSnaps(baseParams, &xact.ArgsMsg{ID: xid})
			tassert.CheckFatal(t, err)
			total, err := snaps.TotalRunningTime(xid)
			tassert.CheckFatal(t, err)
			tlog.Logf("Transforming bucket %s took %v\n", bckFrom.Cname(""), total)

			objList, err := api.ListObjects(baseParams, bckTo, nil, api.ListArgs{})
			tassert.CheckFatal(t, err)
			tassert.Fatalf(
				t, len(objList.Entries) == m.num,
				"expected %d objects to be transformed, got %d", m.num, len(objList.Entries),
			)
		})
	}
}

// etlPrepareAndStart creates the source bucket, PUTs objects, initializes the specified ETL,
// and starts an offline (bucket-to-bucket) transform, returning the xaction ID.
// It is responsible for cleaning up all resources except the ETL xaction itself.
func etlPrepareAndStart(t *testing.T, m *ioContext, etlName, comm string) (xid string) {
	var (
		bckFrom = cmn.Bck{Name: "etl-in-" + trand.String(5), Provider: apc.AIS}
		bckTo   = cmn.Bck{Name: "etl-out-" + trand.String(5), Provider: apc.AIS}
	)
	m.bck = bckFrom

	tlog.Logf("Preparing source bucket %s\n", bckFrom.Cname(""))
	tools.CreateBucket(t, proxyURL, bckFrom, nil, true /*cleanup*/)
	m.initAndSaveState(true /*cleanup*/)

	m.puts()

	_ = tetl.InitSpec(t, baseParams, etlName, comm)
	t.Cleanup(func() {
		tetl.StopAndDeleteETL(t, baseParams, etlName)
	})

	tlog.Logf("Start offline ETL[%s] => %s\n", etlName, bckTo.Cname(""))
	msg := &apc.TCBMsg{Transform: apc.Transform{Name: etlName}, CopyBckMsg: apc.CopyBckMsg{Force: true}}
	xid = tetl.ETLBucketWithCleanup(t, baseParams, bckFrom, bckTo, msg)
	return
}