github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/tools/tetl/etl.go (about) 1 // Package tetl provides helpers for ETL. 2 /* 3 * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved. 4 */ 5 package tetl 6 7 import ( 8 "bytes" 9 "fmt" 10 "io" 11 "net/http" 12 "os" 13 "strings" 14 "testing" 15 "time" 16 17 "github.com/NVIDIA/aistore/api" 18 "github.com/NVIDIA/aistore/api/apc" 19 "github.com/NVIDIA/aistore/cmn" 20 "github.com/NVIDIA/aistore/cmn/cos" 21 "github.com/NVIDIA/aistore/cmn/k8s" 22 "github.com/NVIDIA/aistore/core/meta" 23 "github.com/NVIDIA/aistore/ext/etl" 24 "github.com/NVIDIA/aistore/tools" 25 "github.com/NVIDIA/aistore/tools/tassert" 26 "github.com/NVIDIA/aistore/tools/tlog" 27 "github.com/NVIDIA/aistore/xact" 28 corev1 "k8s.io/api/core/v1" 29 ) 30 31 const ( 32 commTypeAnnotation = "communication_type" 33 waitTimeoutAnnotation = "wait_timeout" 34 35 Tar2TF = "tar2tf" 36 Echo = "transformer-echo" 37 EchoGolang = "echo-go" 38 MD5 = "transformer-md5" 39 Tar2tfFilters = "tar2tf-filters" 40 tar2tfFilter = ` 41 { 42 "conversions": [ 43 { "type": "Decode", "ext_name": "png"}, 44 { "type": "Rotate", "ext_name": "png"} 45 ], 46 "selections": [ 47 { "ext_name": "png" }, 48 { "ext_name": "cls" } 49 ] 50 } 51 ` 52 ) 53 54 var ( 55 links = map[string]string{ 56 MD5: "https://raw.githubusercontent.com/NVIDIA/ais-etl/master/transformers/md5/pod.yaml", 57 Tar2TF: "https://raw.githubusercontent.com/NVIDIA/ais-etl/master/transformers/tar2tf/pod.yaml", 58 Tar2tfFilters: "https://raw.githubusercontent.com/NVIDIA/ais-etl/master/transformers/tar2tf/pod.yaml", 59 Echo: "https://raw.githubusercontent.com/NVIDIA/ais-etl/master/transformers/echo/pod.yaml", 60 EchoGolang: "https://raw.githubusercontent.com/NVIDIA/ais-etl/master/transformers/go_echo/pod.yaml", 61 } 62 63 client = &http.Client{} 64 ) 65 66 func validateETLName(name string) error { 67 if _, ok := links[name]; !ok { 68 return fmt.Errorf("%s is invalid etlName, expected predefined (%s, %s, %s)", name, Echo, Tar2TF, MD5) 69 } 70 return nil 71 } 72 73 func GetTransformYaml(etlName string) ([]byte, error) { 74 if err := validateETLName(etlName); err != nil { 75 return nil, err 76 } 77 78 var resp *http.Response 79 // with retry in case github in unavailable for a moment 80 err := cmn.NetworkCallWithRetry(&cmn.RetryArgs{ 81 Call: func() (code int, err error) { 82 resp, err = client.Get(links[etlName]) //nolint:bodyclose // see defer close below 83 return 84 }, 85 Action: "get transform yaml for ETL[" + etlName + "]", 86 SoftErr: 3, 87 HardErr: 1, 88 IsClient: true, 89 }) 90 if err != nil { 91 return nil, err 92 } 93 defer resp.Body.Close() 94 95 b, err := io.ReadAll(resp.Body) 96 if err != nil { 97 return nil, err 98 } 99 100 if resp.StatusCode != http.StatusOK { 101 return nil, fmt.Errorf("%s: %s", resp.Status, string(b)) 102 } 103 104 specStr := os.Expand(string(b), func(v string) string { 105 // Hack: Neither os.Expand, nor os.ExpandEnv supports bash env variable default-value 106 // syntax. The whole ${VAR:-default} is matched as v. 107 if strings.Contains(v, "COMMUNICATION_TYPE") { 108 return etl.Hpull 109 } 110 if strings.Contains(v, "DOCKER_REGISTRY_URL") { 111 return "aistore" 112 } 113 if etlName == Tar2tfFilters { 114 if strings.Contains(v, "OPTION_KEY") { 115 return "--spec" 116 } 117 if strings.Contains(v, "OPTION_VALUE") { 118 return tar2tfFilter 119 } 120 } 121 return "" 122 }) 123 124 return []byte(specStr), nil 125 } 126 127 func StopAndDeleteETL(t *testing.T, bp api.BaseParams, etlName string) { 128 if t.Failed() { 129 tlog.Logln("Fetching logs from ETL containers") 130 if logsByTarget, err := api.ETLLogs(bp, etlName); err == nil { 131 for _, etlLogs := range logsByTarget { 132 tlog.Logln(headETLLogs(etlLogs, 10*cos.KiB)) 133 } 134 } else { 135 tlog.Logf("Error retrieving ETL[%s] logs: %v\n", etlName, err) 136 } 137 } 138 tlog.Logf("Stopping ETL[%s]\n", etlName) 139 140 if err := api.ETLStop(bp, etlName); err != nil { 141 tlog.Logf("Stopping ETL[%s] failed; err %v\n", etlName, err) 142 } else { 143 tlog.Logf("ETL[%s] stopped\n", etlName) 144 } 145 err := api.ETLDelete(bp, etlName) 146 tassert.CheckFatal(t, err) 147 } 148 149 func headETLLogs(etlLogs etl.Logs, maxLen int) string { 150 logs, l := etlLogs.Logs, len(etlLogs.Logs) 151 if maxLen < l { 152 logs = logs[:maxLen] 153 } 154 str := fmt.Sprintf("%s logs:\n%s", meta.Tname(etlLogs.TargetID), string(logs)) 155 if maxLen < l { 156 str += fmt.Sprintf("\nand %d bytes more...", l-maxLen) 157 } 158 return str 159 } 160 161 func WaitForContainersStopped(t *testing.T, bp api.BaseParams) { 162 tlog.Logln("Waiting for ETL containers to stop...") 163 var ( 164 etls etl.InfoList 165 stopDeadline = time.Now().Add(20 * time.Second) 166 interval = 2 * time.Second 167 err error 168 ) 169 170 for { 171 etls, err = api.ETLList(bp) 172 tassert.CheckFatal(t, err) 173 if len(etls) == 0 { 174 tlog.Logln("ETL containers stopped successfully") 175 return 176 } 177 if time.Now().After(stopDeadline) { 178 break 179 } 180 tlog.Logf("ETLs %+v still running, waiting %s... \n", etls, interval) 181 time.Sleep(interval) 182 } 183 184 err = fmt.Errorf("expected all ETLs to stop, got %+v still running", etls) 185 tassert.CheckFatal(t, err) 186 } 187 188 func WaitForAborted(bp api.BaseParams, xid, kind string, timeout time.Duration) error { 189 tlog.Logf("Waiting for ETL x-%s[%s] to abort...\n", kind, xid) 190 args := xact.ArgsMsg{ID: xid, Kind: kind, Timeout: timeout /* total timeout */} 191 status, err := api.WaitForXactionIC(bp, &args) 192 if err == nil { 193 if !status.Aborted() { 194 err = fmt.Errorf("expected ETL x-%s[%s] status to indicate 'abort', got: %+v", kind, xid, status) 195 } 196 return err 197 } 198 tlog.Logf("Aborting ETL x-%s[%s]\n", kind, xid) 199 if abortErr := api.AbortXaction(bp, &args); abortErr != nil { 200 tlog.Logf("Nested error: failed to abort upon api.wait failure: %v\n", abortErr) 201 } 202 return err 203 } 204 205 // NOTE: relies on x-kind to choose the waiting method 206 // TODO -- FIXME: remove and simplify - here and everywhere 207 func WaitForFinished(bp api.BaseParams, xid, kind string, timeout time.Duration) (err error) { 208 tlog.Logf("Waiting for ETL x-%s[%s] to finish...\n", kind, xid) 209 args := xact.ArgsMsg{ID: xid, Kind: kind, Timeout: timeout /* total timeout */} 210 if xact.IdlesBeforeFinishing(kind) { 211 err = api.WaitForXactionIdle(bp, &args) 212 } else { 213 _, err = api.WaitForXactionIC(bp, &args) 214 } 215 if err == nil { 216 return 217 } 218 tlog.Logf("Aborting ETL x-%s[%s]\n", kind, xid) 219 if abortErr := api.AbortXaction(bp, &args); abortErr != nil { 220 tlog.Logf("Nested error: failed to abort upon api.wait failure: %v\n", abortErr) 221 } 222 return err 223 } 224 225 func ReportXactionStatus(bp api.BaseParams, xid string, stopCh *cos.StopCh, interval time.Duration, totalObj int) { 226 go func() { 227 var ( 228 xactStart = time.Now() 229 etlTicker = time.NewTicker(interval) 230 ) 231 defer etlTicker.Stop() 232 for { 233 select { 234 case <-etlTicker.C: 235 // Check number of objects transformed. 236 xs, err := api.QueryXactionSnaps(bp, &xact.ArgsMsg{ID: xid}) 237 if err != nil { 238 tlog.Logf("Failed to get x-etl[%s] stats: %v\n", xid, err) 239 continue 240 } 241 locObjs, outObjs, inObjs := xs.ObjCounts(xid) 242 tlog.Logf("ETL[%s] progress: (objs=%d, outObjs=%d, inObjs=%d) out of %d objects\n", 243 xid, locObjs, outObjs, inObjs, totalObj) 244 locBytes, outBytes, inBytes := xs.ByteCounts(xid) 245 bps := float64(locBytes+outBytes) / time.Since(xactStart).Seconds() 246 bpsStr := cos.ToSizeIEC(int64(bps), 2) + "/s" 247 tlog.Logf("ETL[%s] progress: (bytes=%d, outBytes=%d, inBytes=%d), %sBps\n", 248 xid, locBytes, outBytes, inBytes, bpsStr) 249 case <-stopCh.Listen(): 250 return 251 } 252 } 253 }() 254 } 255 256 func InitSpec(t *testing.T, bp api.BaseParams, etlName, comm string) (xid string) { 257 tlog.Logf("InitSpec ETL[%s], communicator %s\n", etlName, comm) 258 259 msg := &etl.InitSpecMsg{} 260 msg.IDX = etlName 261 msg.CommTypeX = comm 262 spec, err := GetTransformYaml(etlName) 263 tassert.CheckFatal(t, err) 264 msg.Spec = spec 265 tassert.Fatalf(t, msg.Name() == etlName, "%q vs %q", msg.Name(), etlName) // assert 266 267 xid, err = api.ETLInit(bp, msg) 268 tassert.CheckFatal(t, err) 269 tassert.Errorf(t, cos.IsValidUUID(xid), "expected valid xaction ID, got %q", xid) 270 271 tlog.Logf("ETL %q: running x-etl-spec[%s]\n", etlName, xid) 272 273 // reread `InitMsg` and compare with the specified 274 etlMsg, err := api.ETLGetInitMsg(bp, etlName) 275 tassert.CheckFatal(t, err) 276 277 initSpec := etlMsg.(*etl.InitSpecMsg) 278 tassert.Errorf(t, initSpec.Name() == etlName, "expected etlName %s != %s", etlName, initSpec.Name()) 279 tassert.Errorf(t, initSpec.CommType() == comm, "expected communicator type %s != %s", comm, initSpec.CommType()) 280 tassert.Errorf(t, bytes.Equal(spec, initSpec.Spec), "pod specs differ") 281 282 return 283 } 284 285 func InitCode(t *testing.T, bp api.BaseParams, msg *etl.InitCodeMsg) (xid string) { 286 id, err := api.ETLInit(bp, msg) 287 tassert.CheckFatal(t, err) 288 tassert.Errorf(t, cos.IsValidUUID(id), "expected valid xaction ID, got %q", xid) 289 xid = id 290 291 // reread `InitMsg` and compare with the specified 292 etlMsg, err := api.ETLGetInitMsg(bp, msg.Name()) 293 tassert.CheckFatal(t, err) 294 295 initCode := etlMsg.(*etl.InitCodeMsg) 296 tassert.Errorf(t, initCode.Name() == msg.Name(), "expected etlName %q != %q", msg.Name(), initCode.Name()) 297 tassert.Errorf(t, msg.CommType() == "" || initCode.CommType() == msg.CommType(), 298 "expected communicator type %s != %s", msg.CommType(), initCode.CommType()) 299 tassert.Errorf(t, msg.Runtime == initCode.Runtime, "expected runtime %s != %s", msg.Runtime, initCode.Runtime) 300 tassert.Errorf(t, bytes.Equal(msg.Code, initCode.Code), "ETL codes differ") 301 tassert.Errorf(t, bytes.Equal(msg.Deps, initCode.Deps), "ETL dependencies differ") 302 303 return 304 } 305 306 func ETLBucketWithCleanup(t *testing.T, bp api.BaseParams, bckFrom, bckTo cmn.Bck, msg *apc.TCBMsg) string { 307 xid, err := api.ETLBucket(bp, bckFrom, bckTo, msg) 308 tassert.CheckFatal(t, err) 309 310 t.Cleanup(func() { 311 tools.DestroyBucket(t, bp.URL, bckTo) 312 }) 313 314 tlog.Logf("ETL[%s]: running %s => %s xaction %q\n", 315 msg.Transform.Name, bckFrom.Cname(""), bckTo.Cname(""), xid) 316 return xid 317 } 318 319 func ETLShouldBeRunning(t *testing.T, params api.BaseParams, etlName string) { 320 etls, err := api.ETLList(params) 321 tassert.CheckFatal(t, err) 322 for _, etl := range etls { 323 if etlName == etl.Name { 324 return 325 } 326 } 327 t.Fatalf("etl[%s] is not running (%v)", etlName, etls) 328 } 329 330 func ETLShouldNotBeRunning(t *testing.T, params api.BaseParams, etlName string) { 331 etls, err := api.ETLList(params) 332 tassert.CheckFatal(t, err) 333 for _, etl := range etls { 334 if etlName == etl.Name { 335 t.Fatalf("expected etl[%s] to be stopped (%v)", etlName, etls) 336 } 337 } 338 } 339 340 func CheckNoRunningETLContainers(t *testing.T, params api.BaseParams) { 341 etls, err := api.ETLList(params) 342 tassert.CheckFatal(t, err) 343 tassert.Fatalf(t, len(etls) == 0, "Expected no ETL running, got %+v", etls) 344 } 345 346 func SpecToInitMsg(spec []byte /*yaml*/) (msg *etl.InitSpecMsg, err error) { 347 errCtx := &cmn.ETLErrCtx{} 348 msg = &etl.InitSpecMsg{Spec: spec} 349 pod, err := etl.ParsePodSpec(errCtx, msg.Spec) 350 if err != nil { 351 return msg, err 352 } 353 errCtx.ETLName = pod.GetName() 354 msg.IDX = pod.GetName() 355 356 if err := k8s.ValidateEtlName(msg.IDX); err != nil { 357 return msg, err 358 } 359 // Check annotations. 360 msg.CommTypeX = podTransformCommType(pod) 361 if msg.Timeout, err = podTransformTimeout(errCtx, pod); err != nil { 362 return msg, err 363 } 364 365 return msg, msg.Validate() 366 } 367 368 func podTransformCommType(pod *corev1.Pod) string { 369 if pod.Annotations == nil || pod.Annotations[commTypeAnnotation] == "" { 370 // By default assume `Hpush`. 371 return etl.Hpush 372 } 373 return pod.Annotations[commTypeAnnotation] 374 } 375 376 func podTransformTimeout(errCtx *cmn.ETLErrCtx, pod *corev1.Pod) (cos.Duration, error) { 377 if pod.Annotations == nil || pod.Annotations[waitTimeoutAnnotation] == "" { 378 return 0, nil 379 } 380 381 v, err := time.ParseDuration(pod.Annotations[waitTimeoutAnnotation]) 382 if err != nil { 383 return cos.Duration(v), cmn.NewErrETL(errCtx, err.Error()).WithPodName(pod.Name) 384 } 385 return cos.Duration(v), nil 386 }