# %% Set up constants and initialize the client
import hashlib
import os
from itertools import cycle

from aistore import Client
from aistore.sdk.etl_templates import MD5

BUCKET_NAME = "bucket-demo"
SPEC_ETL_NAME = "etl-spec-demo"
CODE_ETL_NAME = "etl-code-demo"

# Note: AIS-ETLs require Kubernetes.
# The cluster endpoint is taken from the AIS_ENDPOINT environment variable when
# it is set; otherwise the address this tutorial originally hard-coded is used,
# so running without the variable behaves exactly as before.
AIS_ENDPOINT = os.environ.get("AIS_ENDPOINT", "http://192.168.49.2:8080")

client = Client(AIS_ENDPOINT)
client.bucket(bck_name=BUCKET_NAME).create(exist_ok=True)
# %% Initialize an ETL with code
def transform(input_bytes):
    """Return the hexadecimal MD5 digest of ``input_bytes``, encoded as bytes."""
    return hashlib.md5(input_bytes).hexdigest().encode()


# Create the ETL handle, then register transform() as its code.
md5_code_etl = client.etl(etl_name=CODE_ETL_NAME)
md5_code_etl.init_code(transform=transform)

# %% Initialize ETL with spec
md5_spec_etl = client.etl(etl_name=SPEC_ETL_NAME)
# Fill the communication type into the MD5 template imported from the SDK.
template = MD5.format(communication_type="hpush")
md5_spec_etl.init_spec(template=template)
# %% List ETLs
# Once initialized, we can verify the ETLs are running.
client.cluster().list_running_etls()

# %% View ETLs
md5_code_etl.view()

# %%
md5_spec_etl.view()

# %% First, create some objects to transform
import random
import string
import tempfile


def create_and_put_object(
    client: Client,
    bck_name: str,
    obj_name: str,
    provider: str = "ais",
    obj_size: int = 0,
) -> bytes:
    """Upload an object filled with random ASCII letters and return its content.

    Args:
        client: Client used to reach the bucket.
        bck_name: Name of the bucket that receives the object.
        obj_name: Name of the object to write.
        provider: Bucket provider passed to ``client.bucket``.
        obj_size: Number of letters to generate; a falsy value (the default 0)
            picks a random size in the range [10, 20).

    Returns:
        The UTF-8 encoded bytes that were uploaded.
    """
    obj_size = obj_size if obj_size else random.randrange(10, 20)
    obj_body = "".join(random.choices(string.ascii_letters, k=obj_size))
    content = obj_body.encode("utf-8")
    with tempfile.NamedTemporaryFile() as file:
        file.write(content)
        # Flush so the bytes are on disk before put_file reads the file by name.
        file.flush()
        client.bucket(bck_name, provider=provider).object(obj_name).put_file(file.name)
    return content


# %%
content = create_and_put_object(
    client=client, bck_name=BUCKET_NAME, obj_name="object-demo.jpg"
)

# %% Get single object with ETL code transformation
client.bucket(BUCKET_NAME).object("object-demo.jpg").get(
    etl_name=md5_code_etl.name
).read_all()

# %% Get single object with ETL spec transformation
client.bucket(BUCKET_NAME).object("object-demo.jpg").get(
    etl_name=md5_spec_etl.name
).read_all()

# %% Transform entire bucket with ETL
# Create bucket to store transformed objects
dest_bucket = client.bucket("transform-destination-bucket").create(exist_ok=True)

# Transform bucket contents (with on-the-fly object renames)
transform_job_id = client.bucket(BUCKET_NAME).transform(
    etl_name=md5_spec_etl.name,
    to_bck=dest_bucket,
    prepend="transformed-",
    ext={"jpg": "txt"},
)
# The bucket transform runs as an asynchronous job; wait for it to finish so
# the listing in the next cell sees the transformed objects.
client.job(job_id=transform_job_id).wait()
transform_job_id

# %%
# Verify rename operations for transformed objects
dest_bucket.list_objects().get_entries()

# %% Stop ETLs
md5_code_etl.stop()
md5_spec_etl.stop()
client.cluster().list_running_etls()

# %% Restart Stopped ETLs
md5_code_etl.start()
md5_spec_etl.start()
client.cluster().list_running_etls()
# %% Stop & Delete ETLs
md5_code_etl.stop()
md5_spec_etl.stop()

md5_code_etl.delete()
md5_spec_etl.delete()

# %% Starting Deleted ETL Raises Exception
# The failure is the demonstration, so it is caught and displayed; an uncaught
# exception here would abort "Run All" before the XOR section below.
# NOTE(review): narrow `Exception` to the SDK's error type once confirmed.
for deleted_etl in (md5_code_etl, md5_spec_etl):
    try:
        deleted_etl.start()
    except Exception as err:
        print(f"Starting deleted ETL '{deleted_etl.name}' failed as expected: {err!r}")
    else:
        print(f"Unexpected: deleted ETL '{deleted_etl.name}' started without error")

# %% Initialize ETL XOR+Checksum with streaming data
content = create_and_put_object(
    client=client, bck_name=BUCKET_NAME, obj_name="object-xor-demo.jpg", obj_size=256
)


# %%
# Named distinctly so it does not shadow the MD5 transform() defined earlier.
def xor_md5_transform(reader, writer):
    """XOR each chunk from ``reader`` with a repeating key and append a checksum.

    Every chunk is XOR-ed byte by byte against the key b"AISTORE" (the key
    restarts at the beginning of each chunk) and written to ``writer``. After
    the last chunk, the MD5 hex digest of all XOR-ed output is written as the
    final bytes.
    """
    checksum = hashlib.md5()
    key = b"AISTORE"
    for chunk in reader:
        out = bytes(data_byte ^ key_byte for data_byte, key_byte in zip(chunk, cycle(key)))
        writer.write(out)
        checksum.update(out)
    writer.write(checksum.hexdigest().encode())


xor_stream_etl = client.etl("xor-md5-stream")
xor_stream_etl.init_code(
    transform=xor_md5_transform,
    chunk_size=32,
)
# %% Get object with XOR+Checksum ETL and verify checksum
xor_obj = (
    client.bucket(BUCKET_NAME)
    .object("object-xor-demo.jpg")
    .get(etl_name=xor_stream_etl.name)
    .read_all()
)
# The streaming transform appends an MD5 hex digest (32 characters) after the
# XOR-ed payload, so the last 32 bytes are the checksum.
data, checksum = xor_obj[:-32], xor_obj[-32:]
computed_checksum = hashlib.md5(data).hexdigest().encode()
computed_checksum == checksum

# %%
xor_stream_etl.stop()
xor_stream_etl.delete()

# %% Cleanup buckets
# Delete only the buckets this tutorial created. Iterating over every bucket
# in the cluster would also destroy data that has nothing to do with the
# tutorial. missing_ok=True keeps the cell re-runnable.
for tutorial_bucket in (BUCKET_NAME, "transform-destination-bucket"):
    client.bucket(tutorial_bucket).delete(missing_ok=True)