github.com/kubeflow/training-operator@v1.7.0/sdk/python/examples/create-pytorchjob-from-func.ipynb (about)

     1  {
     2   "cells": [
     3    {
     4     "cell_type": "markdown",
     5     "id": "90d43b56-97e5-45e2-8e67-4488ed31d2df",
     6     "metadata": {
     7      "tags": []
     8     },
     9     "source": [
    10      "# Run PyTorchJob From Function\n",
    11      "\n",
     12      "In this Notebook we are going to create a [Kubeflow PyTorchJob](https://www.kubeflow.org/docs/components/training/pytorch/).\n",
     13      "\n",
     14      "The PyTorchJob will run distributed training using the [DistributedDataParallel strategy](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html)."
    15     ]
    16    },
    17    {
    18     "cell_type": "markdown",
    19     "id": "a8bb6564-fde3-4c28-841c-012122643dd9",
    20     "metadata": {
    21      "tags": []
    22     },
    23     "source": [
    24      "## Install Kubeflow Python SDKs\n",
    25      "\n",
     26      "You need to install PyTorch packages and the Kubeflow Training SDK to run this Notebook."
    27     ]
    28    },
    29    {
    30     "cell_type": "code",
    31     "execution_count": null,
    32     "id": "d49f072e-2221-48bb-9f6d-561713d1a45c",
    33     "metadata": {},
    34     "outputs": [],
    35     "source": [
    36      "!pip install torch==1.12.1\n",
    37      "!pip install torchvision==0.13.1\n",
    38      "\n",
    39      "# TODO (andreyvelich): Change to release version when SDK with the new APIs is published.\n",
    40      "!pip install git+https://github.com/kubeflow/training-operator.git#subdirectory=sdk/python"
    41     ]
    42    },
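          {
           "cell_type": "markdown",
           "id": "added-note-sdk-install",
           "metadata": {},
           "source": [
            "The command above installs the SDK straight from the `sdk/python` subdirectory of the training-operator repository because the new APIs are not released yet. Once a release containing them is published, you should be able to install it with `pip install kubeflow-training` instead, pinned to the version that matches your Training Operator deployment."
           ]
          },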
    43    {
    44     "cell_type": "markdown",
    45     "id": "e9331a05-9127-4b3a-8077-31157e267827",
    46     "metadata": {},
    47     "source": [
    48      "## Create Train Script for CNN Model\n",
    49      "\n",
     50      "This is a simple **Convolutional Neural Network (CNN)** model for recognizing different pictures of clothing using the [Fashion MNIST Dataset](https://github.com/zalandoresearch/fashion-mnist)."
    51     ]
    52    },
    53    {
    54     "cell_type": "code",
    55     "execution_count": 2,
    56     "id": "69f21f33-5c64-452c-90c4-977fc0dadb3b",
    57     "metadata": {
    58      "tags": []
    59     },
    60     "outputs": [],
    61     "source": [
    62      "def train_pytorch_model():\n",
    63      "    import logging\n",
    64      "    import os\n",
    65      "    from torchvision import transforms, datasets\n",
    66      "    import torch\n",
    67      "    from torch import nn\n",
    68      "    import torch.nn.functional as F\n",
    69      "    import torch.distributed as dist\n",
    70      "\n",
    71      "    logging.basicConfig(\n",
    72      "        format=\"%(asctime)s %(levelname)-8s %(message)s\",\n",
    73      "        datefmt=\"%Y-%m-%dT%H:%M:%SZ\",\n",
    74      "        level=logging.DEBUG,\n",
    75      "    )\n",
    76      "\n",
    77      "    # Create PyTorch CNN Model.\n",
    78      "    class Net(nn.Module):\n",
    79      "        def __init__(self):\n",
    80      "            super(Net, self).__init__()\n",
    81      "            self.conv1 = nn.Conv2d(1, 20, 5, 1)\n",
    82      "            self.conv2 = nn.Conv2d(20, 50, 5, 1)\n",
    83      "            self.fc1 = nn.Linear(4 * 4 * 50, 500)\n",
    84      "            self.fc2 = nn.Linear(500, 10)\n",
    85      "\n",
    86      "        def forward(self, x):\n",
    87      "            x = F.relu(self.conv1(x))\n",
    88      "            x = F.max_pool2d(x, 2, 2)\n",
    89      "            x = F.relu(self.conv2(x))\n",
    90      "            x = F.max_pool2d(x, 2, 2)\n",
    91      "            x = x.view(-1, 4 * 4 * 50)\n",
    92      "            x = F.relu(self.fc1(x))\n",
    93      "            x = self.fc2(x)\n",
    94      "            return F.log_softmax(x, dim=1)\n",
    95      "\n",
    96      "    # Get dist parameters.\n",
     97      "    # Kubeflow Training Operator automatically sets the appropriate RANK and WORLD_SIZE based on the configuration.\n",
    98      "    RANK = int(os.environ[\"RANK\"])\n",
    99      "    WORLD_SIZE = int(os.environ[\"WORLD_SIZE\"])\n",
   100      "    \n",
   101      "    model = Net()\n",
   102      "    # Attach model to DistributedDataParallel strategy.\n",
   103      "    dist.init_process_group(backend=\"gloo\", rank=RANK, world_size=WORLD_SIZE)\n",
   104      "    Distributor = nn.parallel.DistributedDataParallel\n",
   105      "    model = Distributor(model)\n",
   106      "\n",
   107      "    # Split batch size for each worker.\n",
   108      "    batch_size = int(128 / WORLD_SIZE)\n",
   109      "\n",
   110      "    # Get Fashion MNIST DataSet.\n",
   111      "    train_loader = torch.utils.data.DataLoader(\n",
   112      "        datasets.FashionMNIST(\n",
   113      "            \"./data\",\n",
   114      "            train=True,\n",
   115      "            download=True,\n",
   116      "            transform=transforms.Compose([transforms.ToTensor()]),\n",
   117      "        ),\n",
   118      "        batch_size=batch_size,\n",
   119      "    )\n",
   120      "\n",
   121      "    # Start Training.\n",
   122      "    logging.info(f\"Start training for RANK: {RANK}. WORLD_SIZE: {WORLD_SIZE}\")\n",
   123      "    for epoch in range(1):\n",
   124      "        model.train()\n",
   125      "        optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)\n",
   126      "\n",
   127      "        for batch_idx, (data, target) in enumerate(train_loader):\n",
   128      "            optimizer.zero_grad()\n",
   129      "            output = model(data)\n",
   130      "            loss = F.nll_loss(output, target)\n",
   131      "            loss.backward()\n",
   132      "            optimizer.step()\n",
   133      "            if batch_idx % 10 == 0:\n",
   134      "                logging.info(\n",
   135      "                    \"Train Epoch: {} [{}/{} ({:.0f}%)]\\tloss={:.4f}\".format(\n",
   136      "                        epoch,\n",
   137      "                        batch_idx * len(data),\n",
   138      "                        len(train_loader.dataset),\n",
   139      "                        100.0 * batch_idx / len(train_loader),\n",
   140      "                        loss.item(),\n",
   141      "                    )\n",
   142      "                )"
   143     ]
   144    },
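          {
           "cell_type": "markdown",
           "id": "added-note-cnn-shapes",
           "metadata": {},
           "source": [
            "A quick sanity check on the shapes used above: Fashion MNIST images are 28x28 with a single channel. `conv1` (5x5 kernel, no padding) produces 24x24 feature maps, max pooling halves them to 12x12; `conv2` produces 8x8 maps, pooled to 4x4 with 50 channels. That is why `fc1` expects `4 * 4 * 50 = 800` input features and the tensor is flattened with `x.view(-1, 4 * 4 * 50)` before the fully connected layers.\n",
            "\n",
            "The global batch size of 128 is split evenly across replicas via `batch_size = int(128 / WORLD_SIZE)`, so each DDP replica sees 128 samples per step when running locally (`WORLD_SIZE=1`) and 32 per step with 1 master and 3 workers."
           ]
          },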
   145    {
   146     "cell_type": "markdown",
   147     "id": "8cfe8739-1f94-476a-80e3-dd6e3237d9ed",
   148     "metadata": {
   149      "execution": {
   150       "iopub.execute_input": "2022-09-01T19:32:37.813779Z",
   151       "iopub.status.busy": "2022-09-01T19:32:37.812759Z",
   152       "iopub.status.idle": "2022-09-01T19:32:37.827050Z",
   153       "shell.execute_reply": "2022-09-01T19:32:37.825186Z",
   154       "shell.execute_reply.started": "2022-09-01T19:32:37.813690Z"
   155      }
   156     },
   157     "source": [
   158      "## Run Training Locally in the Notebook\n",
   159      "\n",
    160      "We are going to download the Fashion MNIST Dataset and start local training."
   161     ]
   162    },
   163    {
   164     "cell_type": "code",
   165     "execution_count": 3,
   166     "id": "9e2c6fd8-d0ba-4bc6-ac90-d4cf09751ace",
   167     "metadata": {
   168      "tags": []
   169     },
   170     "outputs": [
   171      {
   172       "name": "stderr",
   173       "output_type": "stream",
   174       "text": [
   175        "2022-09-12T18:21:28Z INFO     Added key: store_based_barrier_key:1 to store for rank: 0\n"
   176       ]
   177      },
   178      {
   179       "name": "stdout",
   180       "output_type": "stream",
   181       "text": [
   182        "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz\n",
   183        "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz\n"
   184       ]
   185      },
   186      {
   187       "data": {
   188        "application/vnd.jupyter.widget-view+json": {
   189         "model_id": "851b228ae0324915882f834224abe134",
   190         "version_major": 2,
   191         "version_minor": 0
   192        },
   193        "text/plain": [
   194         "  0%|          | 0/26421880 [00:00<?, ?it/s]"
   195        ]
   196       },
   197       "metadata": {},
   198       "output_type": "display_data"
   199      },
   200      {
   201       "name": "stdout",
   202       "output_type": "stream",
   203       "text": [
   204        "Extracting ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw\n",
   205        "\n",
   206        "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz\n",
   207        "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz\n"
   208       ]
   209      },
   210      {
   211       "data": {
   212        "application/vnd.jupyter.widget-view+json": {
   213         "model_id": "c8dde30f1c2544f69c4f51331e0156c5",
   214         "version_major": 2,
   215         "version_minor": 0
   216        },
   217        "text/plain": [
   218         "  0%|          | 0/29515 [00:00<?, ?it/s]"
   219        ]
   220       },
   221       "metadata": {},
   222       "output_type": "display_data"
   223      },
   224      {
   225       "name": "stdout",
   226       "output_type": "stream",
   227       "text": [
   228        "Extracting ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw\n",
   229        "\n",
   230        "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz\n",
   231        "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz\n"
   232       ]
   233      },
   234      {
   235       "data": {
   236        "application/vnd.jupyter.widget-view+json": {
   237         "model_id": "04cb10c56f73404d997b1b31221f5b10",
   238         "version_major": 2,
   239         "version_minor": 0
   240        },
   241        "text/plain": [
   242         "  0%|          | 0/4422102 [00:00<?, ?it/s]"
   243        ]
   244       },
   245       "metadata": {},
   246       "output_type": "display_data"
   247      },
   248      {
   249       "name": "stdout",
   250       "output_type": "stream",
   251       "text": [
   252        "Extracting ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw\n",
   253        "\n",
   254        "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz\n",
   255        "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz\n"
   256       ]
   257      },
   258      {
   259       "data": {
   260        "application/vnd.jupyter.widget-view+json": {
   261         "model_id": "dd74e22f50034e889c4b3f9e7fff3f0c",
   262         "version_major": 2,
   263         "version_minor": 0
   264        },
   265        "text/plain": [
   266         "  0%|          | 0/5148 [00:00<?, ?it/s]"
   267        ]
   268       },
   269       "metadata": {},
   270       "output_type": "display_data"
   271      },
   272      {
   273       "name": "stdout",
   274       "output_type": "stream",
   275       "text": [
   276        "Extracting ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw\n",
   277        "\n",
   278        "Processing...\n"
   279       ]
   280      },
   281      {
   282       "name": "stderr",
   283       "output_type": "stream",
   284       "text": [
   285        "/opt/conda/lib/python3.8/site-packages/torchvision/datasets/mnist.py:502: UserWarning: The given NumPy array is not writeable, and PyTorch does not support non-writeable tensors. This means you can write to the underlying (supposedly non-writeable) NumPy array using the tensor. You may want to copy the array to protect its data or make it writeable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at  /pytorch/torch/csrc/utils/tensor_numpy.cpp:143.)\n",
   286        "  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)\n",
   287        "2022-09-12T18:31:05Z INFO     Start training for RANK: 0. WORLD_SIZE: 1\n"
   288       ]
   289      },
   290      {
   291       "name": "stdout",
   292       "output_type": "stream",
   293       "text": [
   294        "Done!\n"
   295       ]
   296      },
   297      {
   298       "name": "stderr",
   299       "output_type": "stream",
   300       "text": [
   301        "2022-09-12T18:31:05Z INFO     Train Epoch: 0 [0/60000 (0%)]\tloss=2.3061\n",
   302        "2022-09-12T18:31:05Z INFO     Reducer buckets have been rebuilt in this iteration.\n",
   303        "2022-09-12T18:31:06Z INFO     Train Epoch: 0 [1280/60000 (2%)]\tloss=2.2979\n",
   304        "2022-09-12T18:31:07Z INFO     Train Epoch: 0 [2560/60000 (4%)]\tloss=2.2926\n",
   305        "2022-09-12T18:31:08Z INFO     Train Epoch: 0 [3840/60000 (6%)]\tloss=2.2796\n",
   306        "2022-09-12T18:31:10Z INFO     Train Epoch: 0 [5120/60000 (9%)]\tloss=2.2838\n",
   307        "2022-09-12T18:31:11Z INFO     Train Epoch: 0 [6400/60000 (11%)]\tloss=2.2751\n",
   308        "2022-09-12T18:31:12Z INFO     Train Epoch: 0 [7680/60000 (13%)]\tloss=2.2683\n",
   309        "2022-09-12T18:31:13Z INFO     Train Epoch: 0 [8960/60000 (15%)]\tloss=2.2443\n",
   310        "2022-09-12T18:31:15Z INFO     Train Epoch: 0 [10240/60000 (17%)]\tloss=2.2341\n",
   311        "2022-09-12T18:31:16Z INFO     Train Epoch: 0 [11520/60000 (19%)]\tloss=2.1962\n",
   312        "2022-09-12T18:31:17Z INFO     Train Epoch: 0 [12800/60000 (21%)]\tloss=2.1701\n",
   313        "2022-09-12T18:31:18Z INFO     Train Epoch: 0 [14080/60000 (23%)]\tloss=2.1368\n",
   314        "2022-09-12T18:31:20Z INFO     Train Epoch: 0 [15360/60000 (26%)]\tloss=2.0717\n",
   315        "2022-09-12T18:31:21Z INFO     Train Epoch: 0 [16640/60000 (28%)]\tloss=1.9831\n",
   316        "2022-09-12T18:31:22Z INFO     Train Epoch: 0 [17920/60000 (30%)]\tloss=1.8490\n",
   317        "2022-09-12T18:31:24Z INFO     Train Epoch: 0 [19200/60000 (32%)]\tloss=1.6720\n",
   318        "2022-09-12T18:31:25Z INFO     Train Epoch: 0 [20480/60000 (34%)]\tloss=1.4354\n",
   319        "2022-09-12T18:31:26Z INFO     Train Epoch: 0 [21760/60000 (36%)]\tloss=1.3926\n",
   320        "2022-09-12T18:31:28Z INFO     Train Epoch: 0 [23040/60000 (38%)]\tloss=1.2361\n",
   321        "2022-09-12T18:31:29Z INFO     Train Epoch: 0 [24320/60000 (41%)]\tloss=1.1674\n",
   322        "2022-09-12T18:31:30Z INFO     Train Epoch: 0 [25600/60000 (43%)]\tloss=0.9845\n",
   323        "2022-09-12T18:31:32Z INFO     Train Epoch: 0 [26880/60000 (45%)]\tloss=0.9887\n",
   324        "2022-09-12T18:31:33Z INFO     Train Epoch: 0 [28160/60000 (47%)]\tloss=1.0034\n",
   325        "2022-09-12T18:31:34Z INFO     Train Epoch: 0 [29440/60000 (49%)]\tloss=1.1126\n",
   326        "2022-09-12T18:31:35Z INFO     Train Epoch: 0 [30720/60000 (51%)]\tloss=0.9854\n",
   327        "2022-09-12T18:31:37Z INFO     Train Epoch: 0 [32000/60000 (53%)]\tloss=0.9148\n",
   328        "2022-09-12T18:31:38Z INFO     Train Epoch: 0 [33280/60000 (55%)]\tloss=0.8559\n",
   329        "2022-09-12T18:31:39Z INFO     Train Epoch: 0 [34560/60000 (58%)]\tloss=0.9737\n",
   330        "2022-09-12T18:31:41Z INFO     Train Epoch: 0 [35840/60000 (60%)]\tloss=0.7636\n",
   331        "2022-09-12T18:31:42Z INFO     Train Epoch: 0 [37120/60000 (62%)]\tloss=0.7537\n",
   332        "2022-09-12T18:31:43Z INFO     Train Epoch: 0 [38400/60000 (64%)]\tloss=0.7180\n",
   333        "2022-09-12T18:31:45Z INFO     Train Epoch: 0 [39680/60000 (66%)]\tloss=0.8250\n",
   334        "2022-09-12T18:31:46Z INFO     Train Epoch: 0 [40960/60000 (68%)]\tloss=0.8221\n",
   335        "2022-09-12T18:31:47Z INFO     Train Epoch: 0 [42240/60000 (70%)]\tloss=0.8605\n",
   336        "2022-09-12T18:31:49Z INFO     Train Epoch: 0 [43520/60000 (72%)]\tloss=0.7450\n",
   337        "2022-09-12T18:31:50Z INFO     Train Epoch: 0 [44800/60000 (75%)]\tloss=0.8031\n",
   338        "2022-09-12T18:31:51Z INFO     Train Epoch: 0 [46080/60000 (77%)]\tloss=0.8090\n",
   339        "2022-09-12T18:31:53Z INFO     Train Epoch: 0 [47360/60000 (79%)]\tloss=0.7897\n",
   340        "2022-09-12T18:31:54Z INFO     Train Epoch: 0 [48640/60000 (81%)]\tloss=0.8838\n",
   341        "2022-09-12T18:31:55Z INFO     Train Epoch: 0 [49920/60000 (83%)]\tloss=0.7967\n",
   342        "2022-09-12T18:31:57Z INFO     Train Epoch: 0 [51200/60000 (85%)]\tloss=0.7554\n",
   343        "2022-09-12T18:31:58Z INFO     Train Epoch: 0 [52480/60000 (87%)]\tloss=0.8402\n",
   344        "2022-09-12T18:31:59Z INFO     Train Epoch: 0 [53760/60000 (90%)]\tloss=0.7859\n",
   345        "2022-09-12T18:32:00Z INFO     Train Epoch: 0 [55040/60000 (92%)]\tloss=0.6342\n",
   346        "2022-09-12T18:32:02Z INFO     Train Epoch: 0 [56320/60000 (94%)]\tloss=0.6881\n",
   347        "2022-09-12T18:32:04Z INFO     Train Epoch: 0 [57600/60000 (96%)]\tloss=0.7722\n",
   348        "2022-09-12T18:32:05Z INFO     Train Epoch: 0 [58880/60000 (98%)]\tloss=0.7504\n"
   349       ]
   350      }
   351     ],
   352     "source": [
   353      "# Set dist env variables to run the above training locally on the Notebook.\n",
   354      "import os\n",
   355      "os.environ[\"RANK\"] = \"0\"\n",
   356      "os.environ[\"WORLD_SIZE\"] = \"1\"\n",
   357      "os.environ[\"MASTER_ADDR\"] = \"localhost\"\n",
   358      "os.environ[\"MASTER_PORT\"] = \"1234\"\n",
   359      "\n",
   360      "# Train Model locally in the Notebook.\n",
   361      "train_pytorch_model()"
   362     ]
   363    },
   364    {
   365     "cell_type": "markdown",
   366     "id": "5aae47e3-be31-468e-8f38-89e1e2f1c764",
   367     "metadata": {
   368      "tags": []
   369     },
   370     "source": [
    371      "## Start Distributed Training with PyTorchJob\n",
    372      "\n",
    373      "Before creating a PyTorchJob, you have to create a `TrainingClient()`. It uses the [Kubernetes Python client](https://github.com/kubernetes-client/python) to communicate with the Kubernetes API server. You can set the path and context for [the kubeconfig file](https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/). The default location for the kubeconfig is `~/.kube/config`.\n",
    374      "\n",
    375      "Kubeflow Training Operator automatically sets the appropriate env variables (`MASTER_PORT`, `MASTER_ADDR`, `WORLD_SIZE`, `RANK`) for each PyTorchJob container."
   376     ]
   377    },
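          {
           "cell_type": "markdown",
           "id": "added-note-training-client-config",
           "metadata": {},
           "source": [
            "If your kubeconfig is not in the default location, or you want to target a specific context or namespace, you can pass them to `TrainingClient` explicitly. The next cell is only a sketch: the exact keyword argument names depend on the SDK version you installed, so check `help(TrainingClient)` before relying on them. The cell after it simply uses the defaults."
           ]
          },
          {
           "cell_type": "code",
           "execution_count": null,
           "id": "added-example-training-client-config",
           "metadata": {},
           "outputs": [],
           "source": [
            "from kubeflow.training import TrainingClient\n",
            "\n",
            "# Sketch: point TrainingClient at an explicit kubeconfig, context and namespace.\n",
            "# The kwarg names below are assumptions about the constructor; verify with help(TrainingClient).\n",
            "training_client = TrainingClient(\n",
            "    config_file=\"~/.kube/config\",  # path to the kubeconfig file\n",
            "    context=\"my-cluster-context\",  # hypothetical context name\n",
            "    namespace=\"kubeflow-user-example-com\",  # namespace used by this Notebook\n",
            ")"
           ]
          },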
   378    {
   379     "cell_type": "code",
   380     "execution_count": 5,
   381     "id": "eb1acd34-ebcf-409b-8bb3-0225cee37110",
   382     "metadata": {
   383      "tags": []
   384     },
   385     "outputs": [
   386      {
   387       "name": "stderr",
   388       "output_type": "stream",
   389       "text": [
   390        "PyTorchJob kubeflow-user-example-com/train-pytorch has been created\n"
   391       ]
   392      }
   393     ],
   394     "source": [
   395      "from kubeflow.training import TrainingClient\n",
   396      "\n",
   397      "# Start PyTorchJob Training.\n",
   398      "pytorchjob_name = \"train-pytorch\"\n",
   399      "training_client = TrainingClient()\n",
   400      "\n",
   401      "training_client.create_pytorchjob_from_func(\n",
   402      "    name=pytorchjob_name,\n",
   403      "    func=train_pytorch_model,\n",
   404      "    num_worker_replicas=3, # How many PyTorch Workers will be run.\n",
   405      ")"
   406     ]
   407    },
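          {
           "cell_type": "markdown",
           "id": "added-note-create-from-func",
           "metadata": {},
           "source": [
            "`create_pytorchjob_from_func` packages `train_pytorch_model` and runs it in every replica (1 master plus `num_worker_replicas` workers). If the function needs a specific container image or extra pip packages, this pre-release SDK exposes additional arguments for that; the sketch below assumes the `base_image` and `packages_to_install` keyword names, so confirm them against your installed SDK before use."
           ]
          },
          {
           "cell_type": "code",
           "execution_count": null,
           "id": "added-example-create-from-func",
           "metadata": {},
           "outputs": [],
           "source": [
            "# Sketch only: customize the runtime image and dependencies for the job.\n",
            "# base_image and packages_to_install are assumed kwarg names; check your SDK version.\n",
            "training_client.create_pytorchjob_from_func(\n",
            "    name=\"train-pytorch-custom\",  # hypothetical job name\n",
            "    func=train_pytorch_model,\n",
            "    base_image=\"docker.io/pytorch/pytorch:1.12.1-cuda11.3-cudnn8-runtime\",  # assumed\n",
            "    packages_to_install=[\"torchvision==0.13.1\"],  # assumed\n",
            "    num_worker_replicas=3,\n",
            ")"
           ]
          },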
   408    {
   409     "cell_type": "markdown",
   410     "id": "e44c3ad7-62c4-4b58-b52a-15fd8746b772",
   411     "metadata": {},
   412     "source": [
   413      "### Check PyTorchJob Status\n",
   414      "\n",
    415      "Use the `TrainingClient` APIs to get information about the created PyTorchJob."
   416     ]
   417    },
   418    {
   419     "cell_type": "code",
   420     "execution_count": 18,
   421     "id": "4141f6c2-c38f-4972-b68a-35d150ef7485",
   422     "metadata": {
   423      "tags": []
   424     },
   425     "outputs": [
   426      {
   427       "name": "stdout",
   428       "output_type": "stream",
   429       "text": [
   430        "PyTorchJob Status: True\n"
   431       ]
   432      }
   433     ],
   434     "source": [
   435      "print(f\"PyTorchJob Status: {training_client.is_job_running(name=pytorchjob_name, job_kind='PyTorchJob')}\")"
   436     ]
   437    },
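          {
           "cell_type": "markdown",
           "id": "added-note-poll-status",
           "metadata": {},
           "source": [
            "`is_job_running` only reports the current state, so a small polling loop is a simple way to block the Notebook until training finishes. This is a minimal sketch built only on the call shown above; a newer SDK may offer a dedicated wait/condition API instead."
           ]
          },
          {
           "cell_type": "code",
           "execution_count": null,
           "id": "added-example-poll-status",
           "metadata": {},
           "outputs": [],
           "source": [
            "import time\n",
            "\n",
            "# Poll the PyTorchJob status until it stops running (minimal sketch).\n",
            "while training_client.is_job_running(name=pytorchjob_name, job_kind=\"PyTorchJob\"):\n",
            "    print(\"PyTorchJob is still running...\")\n",
            "    time.sleep(30)\n",
            "\n",
            "print(\"PyTorchJob has stopped running.\")"
           ]
          },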
   438    {
   439     "cell_type": "markdown",
   440     "id": "42e10587-7ac2-45bf-9c4f-d418e1585974",
   441     "metadata": {},
   442     "source": [
   443      "### Get PyTorchJob Pod Names"
   444     ]
   445    },
   446    {
   447     "cell_type": "code",
   448     "execution_count": 19,
   449     "id": "49b53308-a19b-45e8-942f-4333e727ee48",
   450     "metadata": {},
   451     "outputs": [
   452      {
   453       "data": {
   454        "text/plain": [
   455         "['train-pytorch-master-0',\n",
   456         " 'train-pytorch-worker-0',\n",
   457         " 'train-pytorch-worker-1',\n",
   458         " 'train-pytorch-worker-2']"
   459        ]
   460       },
   461       "execution_count": 19,
   462       "metadata": {},
   463       "output_type": "execute_result"
   464      }
   465     ],
   466     "source": [
   467      "training_client.get_job_pod_names(pytorchjob_name)"
   468     ]
   469    },
   470    {
   471     "cell_type": "markdown",
   472     "id": "b91d332d-487c-4a95-937d-26ffb6199cda",
   473     "metadata": {
   474      "execution": {
   475       "iopub.status.busy": "2022-09-01T20:10:25.759950Z",
   476       "iopub.status.idle": "2022-09-01T20:10:25.760581Z",
   477       "shell.execute_reply": "2022-09-01T20:10:25.760353Z",
   478       "shell.execute_reply.started": "2022-09-01T20:10:25.760328Z"
   479      },
   480      "tags": []
   481     },
   482     "source": [
   483      "### Get PyTorchJob Training Logs"
   484     ]
   485    },
   486    {
   487     "cell_type": "code",
   488     "execution_count": 27,
   489     "id": "5232d542-d4bf-4c51-8b11-ad0534fb0b9d",
   490     "metadata": {
   491      "tags": []
   492     },
   493     "outputs": [
   494      {
   495       "name": "stderr",
   496       "output_type": "stream",
   497       "text": [
   498        "The logs of pod train-pytorch-master-0:\n",
   499        " 2023-01-12T18:55:33Z INFO     Added key: store_based_barrier_key:1 to store for rank: 0\n",
   500        "2023-01-12T18:55:33Z INFO     Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 4 nodes.\n",
   501        "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz\n",
   502        "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz\n",
   503        "100%|██████████| 26421880/26421880 [00:02<00:00, 12562567.98it/s]\n",
   504        "Extracting ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw\n",
   505        "\n",
   506        "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz\n",
   507        "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz\n",
   508        "Extracting ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw\n",
   509        "\n",
   510        "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz\n",
   511        "100%|██████████| 29515/29515 [00:00<00:00, 211170.82it/s]\n",
   512        "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz\n",
   513        "100%|██████████| 4422102/4422102 [00:00<00:00, 4511582.77it/s]\n",
   514        "Extracting ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw\n",
   515        "\n",
   516        "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz\n",
   517        "Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz\n",
   518        "100%|██████████| 5148/5148 [00:00<00:00, 23675742.32it/s]\n",
   519        "Extracting ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw\n",
   520        "\n",
   521        "2023-01-12T18:55:39Z INFO     Start training for RANK: 0. WORLD_SIZE: 4\n",
   522        "2023-01-12T18:55:40Z INFO     Train Epoch: 0 [0/60000 (0%)]\tloss=2.3033\n",
   523        "2023-01-12T18:55:40Z INFO     Reducer buckets have been rebuilt in this iteration.\n",
   524        "2023-01-12T18:55:42Z INFO     Train Epoch: 0 [320/60000 (1%)]\tloss=2.3035\n",
   525        "2023-01-12T18:55:43Z INFO     Train Epoch: 0 [640/60000 (1%)]\tloss=2.2942\n",
   526        "2023-01-12T18:55:43Z INFO     Train Epoch: 0 [960/60000 (2%)]\tloss=2.2920\n",
   527        "2023-01-12T18:55:44Z INFO     Train Epoch: 0 [1280/60000 (2%)]\tloss=2.2875\n",
   528        "2023-01-12T18:55:45Z INFO     Train Epoch: 0 [1600/60000 (3%)]\tloss=2.2658\n",
   529        "2023-01-12T18:55:46Z INFO     Train Epoch: 0 [1920/60000 (3%)]\tloss=2.2676\n",
   530        "2023-01-12T18:55:46Z INFO     Train Epoch: 0 [2240/60000 (4%)]\tloss=2.2092\n",
   531        "2023-01-12T18:55:47Z INFO     Train Epoch: 0 [2560/60000 (4%)]\tloss=2.2292\n",
   532        "2023-01-12T18:55:47Z INFO     Train Epoch: 0 [2880/60000 (5%)]\tloss=2.2402\n",
   533        "2023-01-12T18:55:48Z INFO     Train Epoch: 0 [3200/60000 (5%)]\tloss=2.1984\n",
   534        "2023-01-12T18:55:48Z INFO     Train Epoch: 0 [3520/60000 (6%)]\tloss=2.1415\n",
   535        "2023-01-12T18:55:49Z INFO     Train Epoch: 0 [3840/60000 (6%)]\tloss=2.0092\n",
   536        "2023-01-12T18:55:49Z INFO     Train Epoch: 0 [4160/60000 (7%)]\tloss=1.8847\n",
   537        "2023-01-12T18:55:50Z INFO     Train Epoch: 0 [4480/60000 (7%)]\tloss=1.8625\n",
   538        "2023-01-12T18:55:51Z INFO     Train Epoch: 0 [4800/60000 (8%)]\tloss=1.5723\n",
   539        "2023-01-12T18:55:51Z INFO     Train Epoch: 0 [5120/60000 (9%)]\tloss=1.4135\n",
   540        "2023-01-12T18:55:52Z INFO     Train Epoch: 0 [5440/60000 (9%)]\tloss=1.3640\n",
   541        "2023-01-12T18:55:52Z INFO     Train Epoch: 0 [5760/60000 (10%)]\tloss=1.3703\n",
   542        "2023-01-12T18:55:53Z INFO     Train Epoch: 0 [6080/60000 (10%)]\tloss=1.1940\n",
   543        "2023-01-12T18:55:53Z INFO     Train Epoch: 0 [6400/60000 (11%)]\tloss=1.1059\n",
   544        "2023-01-12T18:55:54Z INFO     Train Epoch: 0 [6720/60000 (11%)]\tloss=1.2499\n",
   545        "2023-01-12T18:55:54Z INFO     Train Epoch: 0 [7040/60000 (12%)]\tloss=0.9975\n",
   546        "2023-01-12T18:55:55Z INFO     Train Epoch: 0 [7360/60000 (12%)]\tloss=1.0447\n",
   547        "2023-01-12T18:55:56Z INFO     Train Epoch: 0 [7680/60000 (13%)]\tloss=1.0539\n",
   548        "2023-01-12T18:55:56Z INFO     Train Epoch: 0 [8000/60000 (13%)]\tloss=1.2946\n",
   549        "2023-01-12T18:55:57Z INFO     Train Epoch: 0 [8320/60000 (14%)]\tloss=1.0458\n",
   550        "2023-01-12T18:55:57Z INFO     Train Epoch: 0 [8640/60000 (14%)]\tloss=1.1081\n",
   551        "2023-01-12T18:55:58Z INFO     Train Epoch: 0 [8960/60000 (15%)]\tloss=1.2158\n",
   552        "2023-01-12T18:56:01Z INFO     Train Epoch: 0 [9280/60000 (15%)]\tloss=0.6873\n",
   553        "2023-01-12T18:56:01Z INFO     Train Epoch: 0 [9600/60000 (16%)]\tloss=1.3140\n",
   554        "2023-01-12T18:56:02Z INFO     Train Epoch: 0 [9920/60000 (17%)]\tloss=0.9072\n",
   555        "2023-01-12T18:56:02Z INFO     Train Epoch: 0 [10240/60000 (17%)]\tloss=1.1416\n",
   556        "2023-01-12T18:56:03Z INFO     Train Epoch: 0 [10560/60000 (18%)]\tloss=1.2440\n",
   557        "2023-01-12T18:56:04Z INFO     Train Epoch: 0 [10880/60000 (18%)]\tloss=0.9684\n",
   558        "2023-01-12T18:56:04Z INFO     Train Epoch: 0 [11200/60000 (19%)]\tloss=0.7044\n",
   559        "2023-01-12T18:56:05Z INFO     Train Epoch: 0 [11520/60000 (19%)]\tloss=0.9956\n",
   560        "2023-01-12T18:56:05Z INFO     Train Epoch: 0 [11840/60000 (20%)]\tloss=1.1197\n",
   561        "2023-01-12T18:56:06Z INFO     Train Epoch: 0 [12160/60000 (20%)]\tloss=0.9295\n",
   562        "2023-01-12T18:56:06Z INFO     Train Epoch: 0 [12480/60000 (21%)]\tloss=0.7795\n",
   563        "2023-01-12T18:56:07Z INFO     Train Epoch: 0 [12800/60000 (21%)]\tloss=0.8194\n",
   564        "2023-01-12T18:56:07Z INFO     Train Epoch: 0 [13120/60000 (22%)]\tloss=1.1227\n",
   565        "2023-01-12T18:56:08Z INFO     Train Epoch: 0 [13440/60000 (22%)]\tloss=0.9001\n",
   566        "2023-01-12T18:56:08Z INFO     Train Epoch: 0 [13760/60000 (23%)]\tloss=0.9062\n",
   567        "2023-01-12T18:56:09Z INFO     Train Epoch: 0 [14080/60000 (23%)]\tloss=0.9513\n",
   568        "2023-01-12T18:56:10Z INFO     Train Epoch: 0 [14400/60000 (24%)]\tloss=0.8561\n",
   569        "2023-01-12T18:56:11Z INFO     Train Epoch: 0 [14720/60000 (25%)]\tloss=0.7293\n",
   570        "2023-01-12T18:56:12Z INFO     Train Epoch: 0 [15040/60000 (25%)]\tloss=0.8429\n",
   571        "2023-01-12T18:56:12Z INFO     Train Epoch: 0 [15360/60000 (26%)]\tloss=0.9922\n",
   572        "2023-01-12T18:56:13Z INFO     Train Epoch: 0 [15680/60000 (26%)]\tloss=0.7432\n",
   573        "2023-01-12T18:56:15Z INFO     Train Epoch: 0 [16000/60000 (27%)]\tloss=1.0907\n",
   574        "2023-01-12T18:56:16Z INFO     Train Epoch: 0 [16320/60000 (27%)]\tloss=0.5217\n",
   575        "2023-01-12T18:56:16Z INFO     Train Epoch: 0 [16640/60000 (28%)]\tloss=0.9695\n",
   576        "2023-01-12T18:56:17Z INFO     Train Epoch: 0 [16960/60000 (28%)]\tloss=0.7314\n",
   577        "2023-01-12T18:56:17Z INFO     Train Epoch: 0 [17280/60000 (29%)]\tloss=0.8013\n",
   578        "2023-01-12T18:56:18Z INFO     Train Epoch: 0 [17600/60000 (29%)]\tloss=0.6232\n",
   579        "2023-01-12T18:56:18Z INFO     Train Epoch: 0 [17920/60000 (30%)]\tloss=0.6004\n",
   580        "2023-01-12T18:56:19Z INFO     Train Epoch: 0 [18240/60000 (30%)]\tloss=1.1647\n",
   581        "2023-01-12T18:56:19Z INFO     Train Epoch: 0 [18560/60000 (31%)]\tloss=1.1845\n",
   582        "2023-01-12T18:56:20Z INFO     Train Epoch: 0 [18880/60000 (31%)]\tloss=0.7494\n",
   583        "2023-01-12T18:56:21Z INFO     Train Epoch: 0 [19200/60000 (32%)]\tloss=0.6017\n",
   584        "2023-01-12T18:56:21Z INFO     Train Epoch: 0 [19520/60000 (33%)]\tloss=0.8297\n",
   585        "2023-01-12T18:56:22Z INFO     Train Epoch: 0 [19840/60000 (33%)]\tloss=0.8827\n",
   586        "2023-01-12T18:56:22Z INFO     Train Epoch: 0 [20160/60000 (34%)]\tloss=1.1165\n",
   587        "2023-01-12T18:56:23Z INFO     Train Epoch: 0 [20480/60000 (34%)]\tloss=0.5660\n",
   588        "2023-01-12T18:56:23Z INFO     Train Epoch: 0 [20800/60000 (35%)]\tloss=0.9627\n",
   589        "2023-01-12T18:56:24Z INFO     Train Epoch: 0 [21120/60000 (35%)]\tloss=0.4962\n",
   590        "2023-01-12T18:56:24Z INFO     Train Epoch: 0 [21440/60000 (36%)]\tloss=1.0196\n",
   591        "2023-01-12T18:56:25Z INFO     Train Epoch: 0 [21760/60000 (36%)]\tloss=0.7316\n",
   592        "2023-01-12T18:56:25Z INFO     Train Epoch: 0 [22080/60000 (37%)]\tloss=0.7878\n",
   593        "2023-01-12T18:56:26Z INFO     Train Epoch: 0 [22400/60000 (37%)]\tloss=0.5671\n",
   594        "2023-01-12T18:56:27Z INFO     Train Epoch: 0 [22720/60000 (38%)]\tloss=0.6081\n",
   595        "2023-01-12T18:56:27Z INFO     Train Epoch: 0 [23040/60000 (38%)]\tloss=1.0035\n",
   596        "2023-01-12T18:56:28Z INFO     Train Epoch: 0 [23360/60000 (39%)]\tloss=0.5702\n",
   597        "2023-01-12T18:56:30Z INFO     Train Epoch: 0 [23680/60000 (39%)]\tloss=0.7771\n",
   598        "2023-01-12T18:56:31Z INFO     Train Epoch: 0 [24000/60000 (40%)]\tloss=0.9109\n",
   599        "2023-01-12T18:56:32Z INFO     Train Epoch: 0 [24320/60000 (41%)]\tloss=0.8138\n",
   600        "2023-01-12T18:56:32Z INFO     Train Epoch: 0 [24640/60000 (41%)]\tloss=0.7430\n",
   601        "2023-01-12T18:56:33Z INFO     Train Epoch: 0 [24960/60000 (42%)]\tloss=0.7815\n",
   602        "2023-01-12T18:56:33Z INFO     Train Epoch: 0 [25280/60000 (42%)]\tloss=0.5246\n",
   603        "2023-01-12T18:56:34Z INFO     Train Epoch: 0 [25600/60000 (43%)]\tloss=0.7377\n",
   604        "2023-01-12T18:56:34Z INFO     Train Epoch: 0 [25920/60000 (43%)]\tloss=0.6146\n",
   605        "2023-01-12T18:56:35Z INFO     Train Epoch: 0 [26240/60000 (44%)]\tloss=0.9728\n",
   606        "2023-01-12T18:56:35Z INFO     Train Epoch: 0 [26560/60000 (44%)]\tloss=0.7355\n",
   607        "2023-01-12T18:56:36Z INFO     Train Epoch: 0 [26880/60000 (45%)]\tloss=0.6064\n",
   608        "2023-01-12T18:56:36Z INFO     Train Epoch: 0 [27200/60000 (45%)]\tloss=1.0344\n",
   609        "2023-01-12T18:56:37Z INFO     Train Epoch: 0 [27520/60000 (46%)]\tloss=0.4730\n",
   610        "2023-01-12T18:56:38Z INFO     Train Epoch: 0 [27840/60000 (46%)]\tloss=0.7260\n",
   611        "2023-01-12T18:56:38Z INFO     Train Epoch: 0 [28160/60000 (47%)]\tloss=0.8061\n",
   612        "2023-01-12T18:56:39Z INFO     Train Epoch: 0 [28480/60000 (47%)]\tloss=0.8537\n",
   613        "2023-01-12T18:56:39Z INFO     Train Epoch: 0 [28800/60000 (48%)]\tloss=1.0247\n",
   614        "2023-01-12T18:56:40Z INFO     Train Epoch: 0 [29120/60000 (49%)]\tloss=0.6724\n",
   615        "2023-01-12T18:56:41Z INFO     Train Epoch: 0 [29440/60000 (49%)]\tloss=0.9595\n",
   616        "2023-01-12T18:56:43Z INFO     Train Epoch: 0 [29760/60000 (50%)]\tloss=0.7610\n",
   617        "2023-01-12T18:56:44Z INFO     Train Epoch: 0 [30080/60000 (50%)]\tloss=0.9843\n",
   618        "2023-01-12T18:56:45Z INFO     Train Epoch: 0 [30400/60000 (51%)]\tloss=0.6334\n",
   619        "2023-01-12T18:56:45Z INFO     Train Epoch: 0 [30720/60000 (51%)]\tloss=0.6374\n",
   620        "2023-01-12T18:56:46Z INFO     Train Epoch: 0 [31040/60000 (52%)]\tloss=0.5124\n",
   621        "2023-01-12T18:56:46Z INFO     Train Epoch: 0 [31360/60000 (52%)]\tloss=0.5240\n",
   622        "2023-01-12T18:56:47Z INFO     Train Epoch: 0 [31680/60000 (53%)]\tloss=0.6984\n",
   623        "2023-01-12T18:56:47Z INFO     Train Epoch: 0 [32000/60000 (53%)]\tloss=0.8143\n",
   624        "2023-01-12T18:56:48Z INFO     Train Epoch: 0 [32320/60000 (54%)]\tloss=0.6173\n",
   625        "2023-01-12T18:56:49Z INFO     Train Epoch: 0 [32640/60000 (54%)]\tloss=0.6989\n",
   626        "2023-01-12T18:56:49Z INFO     Train Epoch: 0 [32960/60000 (55%)]\tloss=0.6109\n",
   627        "2023-01-12T18:56:50Z INFO     Train Epoch: 0 [33280/60000 (55%)]\tloss=0.5810\n",
   628        "2023-01-12T18:56:50Z INFO     Train Epoch: 0 [33600/60000 (56%)]\tloss=0.5392\n",
   629        "2023-01-12T18:56:51Z INFO     Train Epoch: 0 [33920/60000 (57%)]\tloss=0.4317\n",
   630        "2023-01-12T18:56:51Z INFO     Train Epoch: 0 [34240/60000 (57%)]\tloss=0.4624\n",
   631        "2023-01-12T18:56:52Z INFO     Train Epoch: 0 [34560/60000 (58%)]\tloss=0.3868\n",
   632        "2023-01-12T18:56:52Z INFO     Train Epoch: 0 [34880/60000 (58%)]\tloss=0.6871\n",
   633        "2023-01-12T18:56:53Z INFO     Train Epoch: 0 [35200/60000 (59%)]\tloss=0.5277\n",
   634        "2023-01-12T18:56:54Z INFO     Train Epoch: 0 [35520/60000 (59%)]\tloss=0.5487\n",
   635        "2023-01-12T18:56:54Z INFO     Train Epoch: 0 [35840/60000 (60%)]\tloss=0.5509\n",
   636        "2023-01-12T18:56:55Z INFO     Train Epoch: 0 [36160/60000 (60%)]\tloss=0.7043\n",
   637        "2023-01-12T18:56:55Z INFO     Train Epoch: 0 [36480/60000 (61%)]\tloss=0.7568\n",
   638        "2023-01-12T18:56:56Z INFO     Train Epoch: 0 [36800/60000 (61%)]\tloss=0.6199\n",
   639        "2023-01-12T18:56:56Z INFO     Train Epoch: 0 [37120/60000 (62%)]\tloss=0.7296\n",
   640        "2023-01-12T18:56:57Z INFO     Train Epoch: 0 [37440/60000 (62%)]\tloss=0.5492\n",
   641        "2023-01-12T18:56:58Z INFO     Train Epoch: 0 [37760/60000 (63%)]\tloss=0.4943\n",
   642        "2023-01-12T18:56:59Z INFO     Train Epoch: 0 [38080/60000 (63%)]\tloss=0.8262\n",
   643        "2023-01-12T18:57:01Z INFO     Train Epoch: 0 [38400/60000 (64%)]\tloss=0.6767\n",
   644        "2023-01-12T18:57:02Z INFO     Train Epoch: 0 [38720/60000 (65%)]\tloss=0.6093\n",
   645        "2023-01-12T18:57:02Z INFO     Train Epoch: 0 [39040/60000 (65%)]\tloss=0.5222\n",
   646        "2023-01-12T18:57:03Z INFO     Train Epoch: 0 [39360/60000 (66%)]\tloss=0.4399\n",
   647        "2023-01-12T18:57:03Z INFO     Train Epoch: 0 [39680/60000 (66%)]\tloss=0.6005\n",
   648        "2023-01-12T18:57:04Z INFO     Train Epoch: 0 [40000/60000 (67%)]\tloss=0.5421\n",
   649        "2023-01-12T18:57:04Z INFO     Train Epoch: 0 [40320/60000 (67%)]\tloss=0.4670\n",
   650        "2023-01-12T18:57:05Z INFO     Train Epoch: 0 [40640/60000 (68%)]\tloss=0.2799\n",
   651        "2023-01-12T18:57:06Z INFO     Train Epoch: 0 [40960/60000 (68%)]\tloss=0.5594\n",
   652        "2023-01-12T18:57:06Z INFO     Train Epoch: 0 [41280/60000 (69%)]\tloss=0.7234\n",
   653        "2023-01-12T18:57:07Z INFO     Train Epoch: 0 [41600/60000 (69%)]\tloss=0.8179\n",
   654        "2023-01-12T18:57:08Z INFO     Train Epoch: 0 [41920/60000 (70%)]\tloss=0.5361\n",
   655        "2023-01-12T18:57:08Z INFO     Train Epoch: 0 [42240/60000 (70%)]\tloss=0.6700\n",
   656        "2023-01-12T18:57:09Z INFO     Train Epoch: 0 [42560/60000 (71%)]\tloss=0.4328\n",
   657        "2023-01-12T18:57:09Z INFO     Train Epoch: 0 [42880/60000 (71%)]\tloss=0.7155\n",
   658        "2023-01-12T18:57:10Z INFO     Train Epoch: 0 [43200/60000 (72%)]\tloss=0.6536\n",
   659        "2023-01-12T18:57:11Z INFO     Train Epoch: 0 [43520/60000 (73%)]\tloss=0.4034\n",
   660        "2023-01-12T18:57:12Z INFO     Train Epoch: 0 [43840/60000 (73%)]\tloss=0.6295\n",
   661        "2023-01-12T18:57:13Z INFO     Train Epoch: 0 [44160/60000 (74%)]\tloss=0.6419\n",
   662        "2023-01-12T18:57:15Z INFO     Train Epoch: 0 [44480/60000 (74%)]\tloss=0.4257\n",
   663        "2023-01-12T18:57:15Z INFO     Train Epoch: 0 [44800/60000 (75%)]\tloss=0.6005\n",
   664        "2023-01-12T18:57:16Z INFO     Train Epoch: 0 [45120/60000 (75%)]\tloss=0.5280\n",
   665        "2023-01-12T18:57:17Z INFO     Train Epoch: 0 [45440/60000 (76%)]\tloss=0.7624\n",
   666        "2023-01-12T18:57:17Z INFO     Train Epoch: 0 [45760/60000 (76%)]\tloss=0.4500\n",
   667        "2023-01-12T18:57:18Z INFO     Train Epoch: 0 [46080/60000 (77%)]\tloss=0.6136\n",
   668        "2023-01-12T18:57:18Z INFO     Train Epoch: 0 [46400/60000 (77%)]\tloss=0.4631\n",
   669        "2023-01-12T18:57:19Z INFO     Train Epoch: 0 [46720/60000 (78%)]\tloss=0.6543\n",
   670        "2023-01-12T18:57:19Z INFO     Train Epoch: 0 [47040/60000 (78%)]\tloss=0.3783\n",
   671        "2023-01-12T18:57:20Z INFO     Train Epoch: 0 [47360/60000 (79%)]\tloss=0.6068\n",
   672        "2023-01-12T18:57:20Z INFO     Train Epoch: 0 [47680/60000 (79%)]\tloss=0.4288\n",
   673        "2023-01-12T18:57:21Z INFO     Train Epoch: 0 [48000/60000 (80%)]\tloss=0.5632\n",
   674        "2023-01-12T18:57:22Z INFO     Train Epoch: 0 [48320/60000 (81%)]\tloss=0.5509\n",
   675        "2023-01-12T18:57:22Z INFO     Train Epoch: 0 [48640/60000 (81%)]\tloss=0.7985\n",
   676        "2023-01-12T18:57:23Z INFO     Train Epoch: 0 [48960/60000 (82%)]\tloss=0.5953\n",
   677        "2023-01-12T18:57:23Z INFO     Train Epoch: 0 [49280/60000 (82%)]\tloss=0.6759\n",
   678        "2023-01-12T18:57:24Z INFO     Train Epoch: 0 [49600/60000 (83%)]\tloss=0.3233\n",
   679        "2023-01-12T18:57:24Z INFO     Train Epoch: 0 [49920/60000 (83%)]\tloss=0.3583\n",
   680        "2023-01-12T18:57:25Z INFO     Train Epoch: 0 [50240/60000 (84%)]\tloss=0.5348\n",
   681        "2023-01-12T18:57:25Z INFO     Train Epoch: 0 [50560/60000 (84%)]\tloss=0.8532\n",
   682        "2023-01-12T18:57:26Z INFO     Train Epoch: 0 [50880/60000 (85%)]\tloss=0.4251\n",
   683        "2023-01-12T18:57:27Z INFO     Train Epoch: 0 [51200/60000 (85%)]\tloss=0.4953\n",
   684        "2023-01-12T18:57:27Z INFO     Train Epoch: 0 [51520/60000 (86%)]\tloss=0.5538\n",
   685        "2023-01-12T18:57:28Z INFO     Train Epoch: 0 [51840/60000 (86%)]\tloss=0.7728\n",
   686        "2023-01-12T18:57:29Z INFO     Train Epoch: 0 [52160/60000 (87%)]\tloss=0.4604\n",
   687        "2023-01-12T18:57:31Z INFO     Train Epoch: 0 [52480/60000 (87%)]\tloss=0.8828\n",
   688        "2023-01-12T18:57:32Z INFO     Train Epoch: 0 [52800/60000 (88%)]\tloss=0.5369\n",
   689        "2023-01-12T18:57:32Z INFO     Train Epoch: 0 [53120/60000 (89%)]\tloss=0.7731\n",
   690        "2023-01-12T18:57:33Z INFO     Train Epoch: 0 [53440/60000 (89%)]\tloss=0.6234\n",
   691        "2023-01-12T18:57:33Z INFO     Train Epoch: 0 [53760/60000 (90%)]\tloss=0.5501\n",
   692        "2023-01-12T18:57:34Z INFO     Train Epoch: 0 [54080/60000 (90%)]\tloss=0.7707\n",
   693        "2023-01-12T18:57:34Z INFO     Train Epoch: 0 [54400/60000 (91%)]\tloss=0.7441\n",
   694        "2023-01-12T18:57:35Z INFO     Train Epoch: 0 [54720/60000 (91%)]\tloss=0.5040\n",
   695        "2023-01-12T18:57:36Z INFO     Train Epoch: 0 [55040/60000 (92%)]\tloss=0.4233\n",
   696        "2023-01-12T18:57:36Z INFO     Train Epoch: 0 [55360/60000 (92%)]\tloss=0.4983\n",
   697        "2023-01-12T18:57:37Z INFO     Train Epoch: 0 [55680/60000 (93%)]\tloss=0.5547\n",
   698        "2023-01-12T18:57:37Z INFO     Train Epoch: 0 [56000/60000 (93%)]\tloss=0.7808\n",
   699        "2023-01-12T18:57:38Z INFO     Train Epoch: 0 [56320/60000 (94%)]\tloss=0.5937\n",
   700        "2023-01-12T18:57:38Z INFO     Train Epoch: 0 [56640/60000 (94%)]\tloss=0.3243\n",
   701        "2023-01-12T18:57:39Z INFO     Train Epoch: 0 [56960/60000 (95%)]\tloss=0.7926\n",
   702        "2023-01-12T18:57:39Z INFO     Train Epoch: 0 [57280/60000 (95%)]\tloss=0.5203\n",
   703        "2023-01-12T18:57:40Z INFO     Train Epoch: 0 [57600/60000 (96%)]\tloss=0.5806\n",
   704        "2023-01-12T18:57:41Z INFO     Train Epoch: 0 [57920/60000 (97%)]\tloss=0.2864\n",
   705        "2023-01-12T18:57:42Z INFO     Train Epoch: 0 [58240/60000 (97%)]\tloss=0.4806\n",
   706        "2023-01-12T18:57:43Z INFO     Train Epoch: 0 [58560/60000 (98%)]\tloss=0.5448\n",
   707        "2023-01-12T18:57:44Z INFO     Train Epoch: 0 [58880/60000 (98%)]\tloss=0.7353\n",
   708        "2023-01-12T18:57:45Z INFO     Train Epoch: 0 [59200/60000 (99%)]\tloss=0.3771\n",
   709        "2023-01-12T18:57:45Z INFO     Train Epoch: 0 [59520/60000 (99%)]\tloss=0.5527\n",
   710        "2023-01-12T18:57:46Z INFO     Train Epoch: 0 [59840/60000 (100%)]\tloss=0.5935\n",
   711        "\n"
   712       ]
   713      }
   714     ],
   715     "source": [
   716      "training_client.get_job_logs(pytorchjob_name, container=\"pytorch\")"
   717     ]
   718    },
   719    {
   720     "cell_type": "markdown",
   721     "id": "17b0ca43-1936-4708-b03b-3ab9ac2bbdea",
   722     "metadata": {},
   723     "source": [
   724      "## Delete PyTorchJob\n",
   725      "\n",
    726      "When the PyTorchJob is finished, you can delete the resource."
   727     ]
   728    },
   729    {
   730     "cell_type": "code",
   731     "execution_count": 28,
   732     "id": "32ae88fd-5b5d-4ba1-a560-9a35c5ac17de",
   733     "metadata": {
   734      "tags": []
   735     },
   736     "outputs": [
   737      {
   738       "name": "stderr",
   739       "output_type": "stream",
   740       "text": [
   741        "PyTorchJob kubeflow-user-example-com/train-pytorch has been deleted\n"
   742       ]
   743      }
   744     ],
   745     "source": [
   746      "training_client.delete_pytorchjob(pytorchjob_name)"
   747     ]
    748    }
   757   ],
   758   "metadata": {
   759    "kernelspec": {
   760     "display_name": "Python 3 (ipykernel)",
   761     "language": "python",
   762     "name": "python3"
   763    },
   764    "language_info": {
   765     "codemirror_mode": {
   766      "name": "ipython",
   767      "version": 3
   768     },
   769     "file_extension": ".py",
   770     "mimetype": "text/x-python",
   771     "name": "python",
   772     "nbconvert_exporter": "python",
   773     "pygments_lexer": "ipython3",
   774     "version": "3.8.10"
   775    }
   776   },
   777   "nbformat": 4,
   778   "nbformat_minor": 5
   779  }