github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ais/test/etl_stress_test.go (about) 1 // Package integration_test. 2 /* 3 * Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved. 4 */ 5 package integration_test 6 7 import ( 8 "math/rand" 9 "testing" 10 "time" 11 12 "github.com/NVIDIA/aistore/api" 13 "github.com/NVIDIA/aistore/api/apc" 14 "github.com/NVIDIA/aistore/cmn" 15 "github.com/NVIDIA/aistore/cmn/cos" 16 "github.com/NVIDIA/aistore/cmn/debug" 17 "github.com/NVIDIA/aistore/ext/etl" 18 "github.com/NVIDIA/aistore/ext/etl/runtime" 19 "github.com/NVIDIA/aistore/tools" 20 "github.com/NVIDIA/aistore/tools/tassert" 21 "github.com/NVIDIA/aistore/tools/tetl" 22 "github.com/NVIDIA/aistore/tools/tlog" 23 "github.com/NVIDIA/aistore/tools/trand" 24 "github.com/NVIDIA/aistore/xact" 25 ) 26 27 const etlBucketTimeout = cos.Duration(3 * time.Minute) 28 29 func TestETLConnectionError(t *testing.T) { 30 tools.CheckSkip(t, &tools.SkipTestArgs{RequiredDeployment: tools.ClusterTypeK8s, Long: true}) 31 tetl.CheckNoRunningETLContainers(t, baseParams) 32 33 // ETL should survive occasional failures and successfully transform all objects. 34 const timeoutFunc = ` 35 import random, requests, hashlib 36 37 failures = {} 38 39 def transform(input_bytes): 40 md5 = hashlib.md5(input_bytes).hexdigest() 41 failures_cnt = failures.get(md5, 0) 42 # Fail at most 2 times, otherwise ETL will be stopped. 43 if random.randint(0,50) == 0 and failures_cnt < 2: 44 failures[md5] = failures_cnt + 1 45 raise requests.exceptions.ConnectionError("fake connection error") 46 47 return input_bytes 48 ` 49 50 m := ioContext{ 51 t: t, 52 num: 10_000, 53 fileSize: cos.KiB, 54 bck: cmn.Bck{Name: "etl_build_connection_err", Provider: apc.AIS}, 55 } 56 57 tlog.Logln("Preparing source bucket") 58 tools.CreateBucket(t, proxyURL, m.bck, nil, true /*cleanup*/) 59 60 m.init(true /*cleanup*/) 61 m.puts() 62 63 msg := etl.InitCodeMsg{ 64 InitMsgBase: etl.InitMsgBase{IDX: "etl-build-conn-err", Timeout: etlBucketTimeout}, 65 Code: []byte(timeoutFunc), 66 Runtime: runtime.Py38, 67 ChunkSize: 0, 68 } 69 msg.Funcs.Transform = "transform" 70 71 _ = tetl.InitCode(t, baseParams, &msg) 72 73 bckTo := cmn.Bck{Name: "etldst_" + cos.GenTie(), Provider: apc.AIS} 74 testETLBucket(t, baseParams, msg.Name(), &m, bckTo, time.Duration(etlBucketTimeout), 75 true /* skip byte-count check*/, false /* remote src evicted */) 76 } 77 78 func TestETLBucketAbort(t *testing.T) { 79 tools.CheckSkip(t, &tools.SkipTestArgs{RequiredDeployment: tools.ClusterTypeK8s, Long: true}) 80 tetl.CheckNoRunningETLContainers(t, baseParams) 81 82 m := &ioContext{ 83 t: t, 84 num: 1000, 85 fileSize: 512, 86 fixedSize: true, 87 } 88 89 xid := etlPrepareAndStart(t, m, tetl.Echo, etl.Hpull) 90 91 time.Sleep(time.Duration(rand.Intn(5)) * time.Second) 92 93 tlog.Logf("Aborting etl[%s]\n", xid) 94 args := xact.ArgsMsg{ID: xid, Kind: apc.ActETLBck} 95 err := api.AbortXaction(baseParams, &args) 96 tassert.CheckFatal(t, err) 97 98 err = tetl.WaitForAborted(baseParams, xid, apc.ActETLBck, 2*time.Minute) 99 tassert.CheckFatal(t, err) 100 etls, err := api.ETLList(baseParams) 101 tassert.CheckFatal(t, err) 102 // ETL stopped via etlPrepareAndStart. 103 tassert.Fatalf(t, len(etls) == 1, "expected exactly 1 etl running, got %+v", etls) 104 } 105 106 func TestETLTargetDown(t *testing.T) { 107 tools.CheckSkip(t, &tools.SkipTestArgs{RequiredDeployment: tools.ClusterTypeK8s, MinTargets: 2}) 108 tetl.CheckNoRunningETLContainers(t, baseParams) 109 110 m := &ioContext{ 111 t: t, 112 num: 10000, 113 fileSize: 512, 114 fixedSize: true, 115 } 116 if testing.Short() { 117 m.num /= 100 118 } else { 119 // TODO: otherwise, error executing LSOF command 120 t.Skipf("skipping %s long test (kill-node vs maintenance vs ETL)", t.Name()) 121 } 122 m.initAndSaveState(true /*cleanup*/) 123 xid := etlPrepareAndStart(t, m, tetl.Echo, etl.Hpull) 124 125 tlog.Logln("Waiting for ETL to process a few objects...") 126 time.Sleep(5 * time.Second) 127 128 targetNode, _ := m.smap.GetRandTarget() 129 tlog.Logf("Killing %s\n", targetNode.StringEx()) 130 tcmd, err := tools.KillNode(targetNode) // TODO: alternatively, m.startMaintenanceNoRebalance() 131 tassert.CheckFatal(t, err) 132 133 t.Cleanup(func() { 134 time.Sleep(4 * time.Second) 135 tools.RestoreNode(tcmd, false, "target") 136 m.waitAndCheckCluState() 137 138 args := xact.ArgsMsg{Kind: apc.ActRebalance, Timeout: tools.RebalanceTimeout} 139 _, _ = api.WaitForXactionIC(baseParams, &args) 140 141 tetl.CheckNoRunningETLContainers(t, baseParams) 142 }) 143 144 err = tetl.WaitForAborted(baseParams, xid, apc.ActETLBck, 5*time.Minute) 145 tassert.CheckFatal(t, err) 146 tetl.WaitForContainersStopped(t, baseParams) 147 } 148 149 func TestETLBigBucket(t *testing.T) { 150 // The test takes a lot of time if it's run against a single target deployment. 151 tools.CheckSkip(t, &tools.SkipTestArgs{RequiredDeployment: tools.ClusterTypeK8s, Long: true, MinTargets: 2}) 152 153 const echoPythonTransform = ` 154 def transform(input_bytes): 155 return input_bytes 156 ` 157 158 var ( 159 bckFrom = cmn.Bck{Provider: apc.AIS, Name: "etlbig"} 160 bckTo = cmn.Bck{Provider: apc.AIS, Name: "etlbigout-" + trand.String(5)} 161 162 m = ioContext{ 163 t: t, 164 num: 200_000, 165 fileSize: 20 * cos.KiB, // 4GiB total 166 fixedSize: true, 167 bck: bckFrom, 168 } 169 170 tests = []struct { 171 name string 172 ty string 173 etlSpecName string 174 etlCodeMsg etl.InitCodeMsg 175 }{ 176 {name: "spec-echo-python", ty: etl.Spec, etlSpecName: tetl.Echo}, 177 {name: "spec-echo-golang", ty: etl.Spec, etlSpecName: tetl.EchoGolang}, 178 179 { 180 name: "code-echo-py38", 181 ty: etl.Code, 182 etlCodeMsg: etl.InitCodeMsg{ 183 Code: []byte(echoPythonTransform), 184 Runtime: runtime.Py38, 185 ChunkSize: 0, 186 }, 187 }, 188 { 189 name: "code-echo-py310", 190 ty: etl.Code, 191 etlCodeMsg: etl.InitCodeMsg{ 192 Code: []byte(echoPythonTransform), 193 Runtime: runtime.Py310, 194 ChunkSize: 0, 195 }, 196 }, 197 } 198 ) 199 200 tlog.Logf("Preparing source bucket (%d objects, %s each)\n", m.num, cos.ToSizeIEC(int64(m.fileSize), 2)) 201 tools.CreateBucket(t, proxyURL, bckFrom, nil, true /*cleanup*/) 202 m.initAndSaveState(true /*cleanup*/) 203 204 m.puts() 205 206 for i := range tests { 207 test := tests[i] 208 t.Run(test.name, func(t *testing.T) { 209 tetl.CheckNoRunningETLContainers(t, baseParams) 210 var ( 211 err error 212 etlName string 213 etlDoneCh = cos.NewStopCh() 214 requestTimeout = 30 * time.Second 215 ) 216 switch test.ty { 217 case etl.Spec: 218 etlName = test.etlSpecName 219 _ = tetl.InitSpec(t, baseParams, etlName, etl.Hpull) 220 case etl.Code: 221 etlName = test.name 222 { 223 test.etlCodeMsg.IDX = etlName 224 test.etlCodeMsg.Timeout = etlBucketTimeout 225 test.etlCodeMsg.Funcs.Transform = "transform" 226 } 227 _ = tetl.InitCode(t, baseParams, &test.etlCodeMsg) 228 default: 229 debug.Assert(false, test.ty) 230 } 231 t.Cleanup(func() { 232 tetl.StopAndDeleteETL(t, baseParams, etlName) 233 tetl.WaitForContainersStopped(t, baseParams) 234 }) 235 236 tlog.Logf("Start offline ETL[%s]\n", etlName) 237 msg := &apc.TCBMsg{ 238 Transform: apc.Transform{ 239 Name: etlName, 240 Timeout: cos.Duration(requestTimeout), 241 }, 242 CopyBckMsg: apc.CopyBckMsg{Force: true}, 243 } 244 xid := tetl.ETLBucketWithCleanup(t, baseParams, bckFrom, bckTo, msg) 245 tetl.ReportXactionStatus(baseParams, xid, etlDoneCh, 2*time.Minute, m.num) 246 247 tlog.Logln("Waiting for ETL to finish") 248 err = tetl.WaitForFinished(baseParams, xid, apc.ActETLBck, 15*time.Minute) 249 etlDoneCh.Close() 250 tassert.CheckFatal(t, err) 251 252 snaps, err := api.QueryXactionSnaps(baseParams, &xact.ArgsMsg{ID: xid}) 253 tassert.CheckFatal(t, err) 254 total, err := snaps.TotalRunningTime(xid) 255 tassert.CheckFatal(t, err) 256 tlog.Logf("Transforming bucket %s took %v\n", bckFrom.Cname(""), total) 257 258 objList, err := api.ListObjects(baseParams, bckTo, nil, api.ListArgs{}) 259 tassert.CheckFatal(t, err) 260 tassert.Fatalf( 261 t, len(objList.Entries) == m.num, 262 "expected %d objects to be transformed, got %d", m.num, len(objList.Entries), 263 ) 264 }) 265 } 266 } 267 268 // Responsible for cleaning all resources, except ETL xact. 269 func etlPrepareAndStart(t *testing.T, m *ioContext, etlName, comm string) (xid string) { 270 var ( 271 bckFrom = cmn.Bck{Name: "etl-in-" + trand.String(5), Provider: apc.AIS} 272 bckTo = cmn.Bck{Name: "etl-out-" + trand.String(5), Provider: apc.AIS} 273 ) 274 m.bck = bckFrom 275 276 tlog.Logf("Preparing source bucket %s\n", bckFrom.Cname("")) 277 tools.CreateBucket(t, proxyURL, bckFrom, nil, true /*cleanup*/) 278 m.initAndSaveState(true /*cleanup*/) 279 280 m.puts() 281 282 _ = tetl.InitSpec(t, baseParams, etlName, comm) 283 t.Cleanup(func() { 284 tetl.StopAndDeleteETL(t, baseParams, etlName) 285 }) 286 287 tlog.Logf("Start offline ETL[%s] => %s\n", etlName, bckTo.Cname("")) 288 msg := &apc.TCBMsg{Transform: apc.Transform{Name: etlName}, CopyBckMsg: apc.CopyBckMsg{Force: true}} 289 xid = tetl.ETLBucketWithCleanup(t, baseParams, bckFrom, bckTo, msg) 290 return 291 }