github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/python/examples/sdk/sdk-etl-tutorial.ipynb (about)

     1  {
     2   "cells": [
     3    {
     4     "cell_type": "markdown",
     5     "source": [
     6      "# AIStore Python SDK ETL Tutorial"
     7     ],
     8     "metadata": {
     9      "collapsed": false
    10     }
    11    },
    12    {
    13     "cell_type": "markdown",
    14     "id": "e616503b",
    15     "metadata": {},
    16     "source": [
    17      "### Set up constants and initialize the client\n"
    18     ]
    19    },
    20    {
    21     "cell_type": "code",
    22     "execution_count": null,
    23     "id": "45d2c741",
    24     "metadata": {},
    25     "outputs": [],
    26     "source": [
    27      "from aistore import Client\n",
    28      "from aistore.sdk.etl_templates import MD5\n",
    29      "import hashlib\n",
    30      "from itertools import cycle\n",
    31      "\n",
    32      "BUCKET_NAME = \"bucket-demo\"\n",
    33      "SPEC_ETL_NAME = \"etl-spec-demo\"\n",
    34      "CODE_ETL_NAME = \"etl-code-demo\"\n",
    35      "\n",
    36      "# Note: ETL requires an AIStore cluster deployed on Kubernetes.\n",
    37      "client = Client(\"http://192.168.49.2:8080\")\n",
    38      "client.bucket(bck_name=BUCKET_NAME).create(exist_ok=True)"
    39     ]
    40    },
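          {
           "cell_type": "markdown",
           "source": [
            "Optionally, verify that the cluster is reachable before going further. This is a minimal sanity check, assuming `Cluster.get_info()` is available in this version of the SDK; it simply fetches the cluster map from the endpoint configured above."
           ],
           "metadata": {
            "collapsed": false
           }
          },
          {
           "cell_type": "code",
           "execution_count": null,
           "outputs": [],
           "source": [
            "# Fetch the cluster map to confirm the endpoint is reachable\n",
            "# (assumes Cluster.get_info() is available in this SDK version)\n",
            "client.cluster().get_info()"
           ],
           "metadata": {
            "collapsed": false
           }
          },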
    41    {
    42     "cell_type": "markdown",
    43     "source": [
    44      "### We can initialize ETLs with either [code](https://aiatscale.org/docs/etl#init-code-request) or [spec](https://aiatscale.org/docs/etl#init-spec-request)."
    45     ],
    46     "metadata": {
    47      "collapsed": false
    48     }
    49    },
    50    {
    51     "cell_type": "markdown",
    52     "source": [
    53      "#### Initialize an ETL with code"
    54     ],
    55     "metadata": {
    56      "collapsed": false
    57     }
    58    },
    59    {
    60     "cell_type": "code",
    61     "execution_count": null,
    62     "outputs": [],
    63     "source": [
    64      "# Defining ETL transformation code\n",
    65      "def transform(input_bytes):\n",
    66      "    md5 = hashlib.md5()\n",
    67      "    md5.update(input_bytes)\n",
    68      "    return md5.hexdigest().encode()\n",
    69      "\n",
    70      "\n",
    71      "md5_code_etl = client.etl(etl_name=CODE_ETL_NAME)\n",
    72      "# Initializing ETL with transform()\n",
    73      "md5_code_etl.init_code(transform=transform)"
    74     ],
    75     "metadata": {
    76      "collapsed": false
    77     }
    78    },
    79    {
    80     "cell_type": "markdown",
    81     "source": [
    82      "#### Initialize an ETL with a spec"
    83     ],
    84     "metadata": {
    85      "collapsed": false
    86     }
    87    },
    88    {
    89     "cell_type": "code",
    90     "execution_count": null,
    91     "outputs": [],
    92     "source": [
    93      "# Use the provided template and substitute in the communication type\n",
    94      "template = MD5.format(communication_type=\"hpush\")\n",
    95      "md5_spec_etl = client.etl(etl_name=SPEC_ETL_NAME)\n",
    96      "md5_spec_etl.init_spec(template=template)"
    97     ],
    98     "metadata": {
    99      "collapsed": false
   100     }
   101    },
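          {
           "cell_type": "markdown",
           "source": [
            "The rendered `template` is just a string holding the Kubernetes pod spec for the transformer, so it can be inspected directly. \"hpush\" is one of the supported ETL communication types (see the [ETL docs](https://aiatscale.org/docs/etl)); \"hpull\" can be substituted in the same way."
           ],
           "metadata": {
            "collapsed": false
           }
          },
          {
           "cell_type": "code",
           "execution_count": null,
           "outputs": [],
           "source": [
            "# Print the rendered pod spec passed to init_spec()\n",
            "print(template)"
           ],
           "metadata": {
            "collapsed": false
           }
          },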
   102    {
   103     "cell_type": "markdown",
   104     "source": [
   105      "Refer to more ETL templates [here](https://github.com/NVIDIA/aistore/blob/master/python/aistore/sdk/etl_templates.py)."
   106     ],
   107     "metadata": {
   108      "collapsed": false
   109     }
   110    },
   111    {
   112     "cell_type": "markdown",
   113     "id": "888ad4ee",
   114     "metadata": {},
   115     "source": [
   116      "### List ETLs\n",
   117      "Once initialized, we can verify the ETLs are running:"
   118     ]
   119    },
   120    {
   121     "cell_type": "code",
   122     "execution_count": null,
   123     "id": "4f001731",
   124     "metadata": {},
   125     "outputs": [],
   126     "source": [
   127      "client.cluster().list_running_etls()"
   128     ]
   129    },
   130    {
   131     "cell_type": "markdown",
   132     "id": "2a0d8e79",
   133     "metadata": {},
   134     "source": [
   135      "### View ETLs"
   136     ]
   137    },
   138    {
   139     "cell_type": "code",
   140     "execution_count": null,
   141     "id": "030e8611",
   142     "metadata": {},
   143     "outputs": [],
   144     "source": [
   145      "md5_code_etl.view()"
   146     ]
   147    },
   148    {
   149     "cell_type": "code",
   150     "execution_count": null,
   151     "id": "80903c9e",
   152     "metadata": {},
   153     "outputs": [],
   154     "source": [
   155      "md5_spec_etl.view()"
   156     ]
   157    },
   158    {
   159     "cell_type": "markdown",
   160     "id": "f7813f98",
   161     "metadata": {},
   162     "source": [
   163      "## Get an object with ETL transformation applied"
   164     ]
   165    },
   166    {
   167     "cell_type": "markdown",
   168     "source": [
   169      "### First, create some objects to transform"
   170     ],
   171     "metadata": {
   172      "collapsed": false
   173     }
   174    },
   175    {
   176     "cell_type": "code",
   177     "execution_count": null,
   178     "id": "aeb15852",
   179     "metadata": {},
   180     "outputs": [],
   181     "source": [
   182      "import random\n",
   183      "import string\n",
   184      "import tempfile\n",
   185      "\n",
   186      "\n",
   187      "def create_and_put_object(\n",
   188      "    client: Client,\n",
   189      "    bck_name: str,\n",
   190      "    obj_name: str,\n",
   191      "    provider: str = \"ais\",\n",
   192      "    obj_size: int = 0,\n",
   193      "):\n",
            "    \"\"\"Create a small object with random text content and upload it to the given bucket.\"\"\"\n",
   194      "    obj_size = obj_size if obj_size else random.randrange(10, 20)\n",
   195      "    obj_body = \"\".join(random.choices(string.ascii_letters, k=obj_size))\n",
   196      "    content = obj_body.encode(\"utf-8\")\n",
   197      "    with tempfile.NamedTemporaryFile() as file:\n",
   198      "        file.write(content)\n",
   199      "        file.flush()\n",
   200      "        client.bucket(bck_name, provider=provider).object(obj_name).put_file(file.name)\n",
   201      "    return content"
   202     ]
   203    },
   204    {
   205     "cell_type": "code",
   206     "execution_count": null,
   207     "id": "56256969",
   208     "metadata": {},
   209     "outputs": [],
   210     "source": [
   211      "content = create_and_put_object(\n",
   212      "    client=client, bck_name=BUCKET_NAME, obj_name=\"object-demo.jpg\"\n",
   213      ")"
   214     ]
   215    },
   216    {
   217     "cell_type": "markdown",
   218     "source": [
   219      "### Get a single object with the code-based ETL transformation"
   220     ],
   221     "metadata": {
   222      "collapsed": false
   223     }
   224    },
   225    {
   226     "cell_type": "code",
   227     "execution_count": null,
   228     "outputs": [],
   229     "source": [
   230      "client.bucket(BUCKET_NAME).object(\"object-demo.jpg\").get(\n",
   231      "    etl_name=md5_code_etl.name\n",
   232      ").read_all()"
   233     ],
   234     "metadata": {
   235      "collapsed": false
   236     }
   237    },
   238    {
   239     "cell_type": "markdown",
   240     "source": [
   241      "### Get a single object with the spec-based ETL transformation"
   242     ],
   243     "metadata": {
   244      "collapsed": false
   245     }
   246    },
   247    {
   248     "cell_type": "code",
   249     "execution_count": null,
   250     "outputs": [],
   251     "source": [
   252      "client.bucket(BUCKET_NAME).object(\"object-demo.jpg\").get(\n",
   253      "    etl_name=md5_spec_etl.name\n",
   254      ").read_all()"
   255     ],
   256     "metadata": {
   257      "collapsed": false
   258     }
   259    },
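          {
           "cell_type": "markdown",
           "source": [
            "As a sanity check, the output of the code-based ETL should equal an MD5 digest computed locally over the original `content`; the provided MD5 spec template is expected to return the same digest, so it is fetched alongside for comparison."
           ],
           "metadata": {
            "collapsed": false
           }
          },
          {
           "cell_type": "code",
           "execution_count": null,
           "outputs": [],
           "source": [
            "# Compare the ETL outputs against an MD5 digest computed locally\n",
            "local_md5 = hashlib.md5(content).hexdigest().encode()\n",
            "code_etl_out = (\n",
            "    client.bucket(BUCKET_NAME)\n",
            "    .object(\"object-demo.jpg\")\n",
            "    .get(etl_name=md5_code_etl.name)\n",
            "    .read_all()\n",
            ")\n",
            "assert code_etl_out == local_md5\n",
            "spec_etl_out = (\n",
            "    client.bucket(BUCKET_NAME)\n",
            "    .object(\"object-demo.jpg\")\n",
            "    .get(etl_name=md5_spec_etl.name)\n",
            "    .read_all()\n",
            ")\n",
            "# The spec-based transformer is expected to produce the same hex digest\n",
            "local_md5, code_etl_out, spec_etl_out"
           ],
           "metadata": {
            "collapsed": false
           }
          },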
   260    {
   261     "cell_type": "markdown",
   262     "id": "52656fc1",
   263     "metadata": {},
   264     "source": [
   265      "## Transform entire bucket with ETL"
   266     ]
   267    },
   268    {
   269     "cell_type": "code",
   270     "execution_count": null,
   271     "id": "6760478f",
   272     "metadata": {},
   273     "outputs": [],
   274     "source": [
   275      "# Create bucket to store transformed objects\n",
   276      "dest_bucket = client.bucket(\"transform-destination-bucket\").create(exist_ok=True)\n",
   277      "\n",
   278      "# Transform the bucket contents (with on-the-fly renames); transform() runs\n",
            "# asynchronously and returns the ID of the job performing the copy/transform\n",
   279      "transform_job_id = client.bucket(BUCKET_NAME).transform(\n",
   280      "    etl_name=md5_spec_etl.name,\n",
   281      "    to_bck=dest_bucket,\n",
   282      "    prepend=\"transformed-\",\n",
   283      "    ext={\"jpg\": \"txt\"},\n",
   284      ")\n",
            "transform_job_id"
   285     ]
   286    },
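          {
           "cell_type": "markdown",
           "source": [
            "Bucket-to-bucket transforms run asynchronously, so wait for the returned job to finish before listing the destination bucket. This is a minimal sketch, assuming `client.job(job_id=...).wait()` is available in this version of the SDK."
           ],
           "metadata": {
            "collapsed": false
           }
          },
          {
           "cell_type": "code",
           "execution_count": null,
           "outputs": [],
           "source": [
            "# Block until the transform job completes\n",
            "# (assumes Job.wait() is available in this SDK version)\n",
            "client.job(job_id=transform_job_id).wait()"
           ],
           "metadata": {
            "collapsed": false
           }
          },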
   287    {
   288     "cell_type": "code",
   289     "execution_count": null,
   290     "id": "db8ccf1a",
   291     "metadata": {},
   292     "outputs": [],
   293     "source": [
   294      "# List the destination bucket to verify the prepended names and new extensions\n",
   295      "dest_bucket.list_objects().get_entries()"
   296     ]
   297    },
   298    {
   299     "cell_type": "markdown",
   300     "id": "a1a2e8ae",
   301     "metadata": {},
   302     "source": [
   303      "### Stop ETLs\n",
   304      "If an ETL is stopped, the Kubernetes pods created for it are *stopped* but *not deleted*, and any in-flight transforms are aborted. A stopped ETL can be resumed with the `start()` method:\n"
   305     ]
   306    },
   307    {
   308     "cell_type": "code",
   309     "execution_count": null,
   310     "id": "b7ab064f",
   311     "metadata": {},
   312     "outputs": [],
   313     "source": [
   314      "md5_code_etl.stop()\n",
   315      "md5_spec_etl.stop()\n",
   316      "client.cluster().list_running_etls()"
   317     ]
   318    },
   319    {
   320     "cell_type": "markdown",
   321     "id": "9beb3d0f",
   322     "metadata": {},
   323     "source": [
   324      "### Restart Stopped ETLs"
   325     ]
   326    },
   327    {
   328     "cell_type": "code",
   329     "execution_count": null,
   330     "id": "cea3c373",
   331     "metadata": {},
   332     "outputs": [],
   333     "source": [
   334      "md5_code_etl.start()\n",
   335      "md5_spec_etl.start()\n",
   336      "client.cluster().list_running_etls()"
   337     ]
   338    },
   339    {
   340     "cell_type": "markdown",
   341     "id": "e1fb0a93",
   342     "metadata": {},
   343     "source": [
   344      "### Stop & Delete ETLs\n",
   345      "Once we are completely finished with the ETLs, we free their resources by stopping them with `stop` and then deleting them with `delete`.\n",
   346      "Deleting an ETL removes all of its Kubernetes pods as well as its stored specification. Consequently, a deleted ETL cannot be started again and must be re-initialized."
   347     ]
   348    },
   349    {
   350     "cell_type": "code",
   351     "execution_count": null,
   352     "id": "bc33c20e",
   353     "metadata": {},
   354     "outputs": [],
   355     "source": [
   356      "md5_code_etl.stop()\n",
   357      "md5_spec_etl.stop()\n",
   358      "\n",
   359      "md5_code_etl.delete()\n",
   360      "md5_spec_etl.delete()"
   361     ]
   362    },
   363    {
   364     "cell_type": "markdown",
   365     "id": "7aaf1c52",
   366     "metadata": {},
   367     "source": [
   368      "### Starting a Deleted ETL Raises an Exception"
   369     ]
   370    },
   371    {
   372     "cell_type": "code",
   373     "execution_count": null,
   374     "id": "cf2a938a",
   375     "metadata": {},
   376     "outputs": [],
   377     "source": [
   378      "md5_code_etl.start()"
   379     ]
   380    },
   381    {
   382     "cell_type": "code",
   383     "execution_count": null,
   384     "id": "02fa415c",
   385     "metadata": {},
   386     "outputs": [],
   387     "source": [
   388      "md5_spec_etl.start()"
   389     ]
   390    },
   391    {
   392     "cell_type": "markdown",
   393     "id": "278ecb98",
   394     "metadata": {},
   395     "source": [
   396      "### Initialize an XOR+Checksum ETL with streaming data"
   397     ]
   398    },
   399    {
   400     "cell_type": "code",
   401     "execution_count": null,
   402     "id": "97214ac4",
   403     "metadata": {},
   404     "outputs": [],
   405     "source": [
   406      "content = create_and_put_object(\n",
   407      "    client=client, bck_name=BUCKET_NAME, obj_name=\"object-xor-demo.jpg\", obj_size=256\n",
   408      ")"
   409     ]
   410    },
   411    {
   412     "cell_type": "code",
   413     "execution_count": null,
   414     "id": "92cce61e",
   415     "metadata": {},
   416     "outputs": [],
   417     "source": [
   418      "def transform(reader, writer):\n",
   419      "    checksum = hashlib.md5()\n",
   420      "    key = b\"AISTORE\"\n",
            "    # XOR each incoming chunk with the (cycled) key and track an MD5 of the output\n",
   421      "    for b in reader:\n",
   422      "        out = bytes([_a ^ _b for _a, _b in zip(b, cycle(key))])\n",
   423      "        writer.write(out)\n",
   424      "        checksum.update(out)\n",
            "    # Append the checksum of the transformed payload as a 32-character hex suffix\n",
   425      "    writer.write(checksum.hexdigest().encode())\n",
   426      "\n",
   427      "\n",
   428      "xor_stream_etl = client.etl(\"xor-md5-stream\")\n",
   429      "xor_stream_etl.init_code(\n",
   430      "    transform=transform,\n",
   431      "    chunk_size=32,\n",
   432      ")"
   433     ]
   434    },
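          {
           "cell_type": "markdown",
           "source": [
            "The same `transform(reader, writer)` function can be exercised locally before relying on the deployed ETL: feed it the original `content` in 32-byte chunks (mirroring `chunk_size=32`) and collect the output in an in-memory buffer. This is only a local sketch; inside the ETL pod, the runtime supplies its own reader and writer."
           ],
           "metadata": {
            "collapsed": false
           }
          },
          {
           "cell_type": "code",
           "execution_count": null,
           "outputs": [],
           "source": [
            "import io\n",
            "\n",
            "# Simulate the streaming transform locally, chunking the input the way chunk_size=32 would\n",
            "chunks = [content[i : i + 32] for i in range(0, len(content), 32)]\n",
            "local_buf = io.BytesIO()\n",
            "transform(chunks, local_buf)\n",
            "local_result = local_buf.getvalue()\n",
            "# Output layout: XOR-ed payload followed by a 32-character MD5 hex digest\n",
            "len(local_result), local_result[-32:]"
           ],
           "metadata": {
            "collapsed": false
           }
          },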
   435    {
   436     "cell_type": "markdown",
   437     "source": [
   438      "### Get object with XOR+Checksum ETL and verify checksum"
   439     ],
   440     "metadata": {
   441      "collapsed": false
   442     }
   443    },
   444    {
   445     "cell_type": "code",
   446     "execution_count": null,
   447     "outputs": [],
   448     "source": [
   449      "xor_obj = (\n",
   450      "    client.bucket(BUCKET_NAME)\n",
   451      "    .object(\"object-xor-demo.jpg\")\n",
   452      "    .get(etl_name=xor_stream_etl.name)\n",
   453      "    .read_all()\n",
   454      ")\n",
   455      "data, checksum = xor_obj[:-32], xor_obj[-32:]\n",
   456      "computed_checksum = hashlib.md5(data).hexdigest().encode()\n",
   457      "computed_checksum == checksum"
   458     ],
   459     "metadata": {
   460      "collapsed": false
   461     }
   462    },
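          {
           "cell_type": "markdown",
           "source": [
            "XOR with the same key is its own inverse, so applying the key to the transformed payload should recover the original `content`. Note that `transform()` restarts `cycle(key)` for every chunk it receives, so the inverse below restarts the key stream every 32 bytes as well (assuming the runtime delivered fixed 32-byte chunks, as requested via `chunk_size=32`)."
           ],
           "metadata": {
            "collapsed": false
           }
          },
          {
           "cell_type": "code",
           "execution_count": null,
           "outputs": [],
           "source": [
            "# Undo the XOR chunk by chunk (the key stream restarts at every 32-byte chunk boundary)\n",
            "key = b\"AISTORE\"\n",
            "recovered = b\"\".join(\n",
            "    bytes([_a ^ _b for _a, _b in zip(data[i : i + 32], cycle(key))])\n",
            "    for i in range(0, len(data), 32)\n",
            ")\n",
            "recovered == content"
           ],
           "metadata": {
            "collapsed": false
           }
          },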
   463    {
   464     "cell_type": "code",
   465     "execution_count": null,
   466     "id": "23ac67da",
   467     "metadata": {},
   468     "outputs": [],
   469     "source": [
   470      "xor_stream_etl.stop()\n",
   471      "xor_stream_etl.delete()"
   472     ]
   473    },
   474    {
   475     "cell_type": "markdown",
   476     "source": [
   477      "### Clean up buckets"
   478     ],
   479     "metadata": {
   480      "collapsed": false
   481     }
   482    },
   483    {
   484     "cell_type": "code",
   485     "execution_count": null,
   486     "outputs": [],
   487     "source": [
            "# Note: this deletes every bucket in the cluster; intended only for a disposable demo deployment\n",
   488      "for bucket in client.cluster().list_buckets():\n",
   489      "    client.bucket(bucket.name).delete()"
   490     ],
   491     "metadata": {
   492      "collapsed": false
   493     }
   494    }
   495   ],
   496   "metadata": {
   497    "kernelspec": {
   498     "display_name": "Python 3 (ipykernel)",
   499     "language": "python",
   500     "name": "python3"
   501    },
   502    "language_info": {
   503     "codemirror_mode": {
   504      "name": "ipython",
   505      "version": 3
   506     },
   507     "file_extension": ".py",
   508     "mimetype": "text/x-python",
   509     "name": "python",
   510     "nbconvert_exporter": "python",
   511     "pygments_lexer": "ipython3",
   512     "version": "3.10.6 (main, Nov 14 2022, 16:10:14) [GCC 11.3.0]"
   513    },
   514    "vscode": {
   515     "interpreter": {
   516      "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
   517     }
   518    }
   519   },
   520   "nbformat": 4,
   521   "nbformat_minor": 5
   522  }