github.com/kubeflow/training-operator@v1.7.0/sdk/python/examples/kubeflow-pytorchjob-sdk.ipynb

github.com/kubeflow/training-operator@v1.7.0/sdk/python/examples/kubeflow-pytorchjob-sdk.ipynb (about)

     1  {
     2   "cells": [
     3    {
     4     "cell_type": "markdown",
     5     "metadata": {
     6      "pycharm": {
     7       "name": "#%% md\n"
     8      }
     9     },
    10     "source": [
    11      "# Sample for Kubeflow PyTorchJob SDK"
    12     ]
    13    },
    14    {
    15     "cell_type": "markdown",
    16     "metadata": {
    17      "pycharm": {
    18       "name": "#%% md\n"
    19      }
    20     },
    21     "source": [
    22      "This is a sample for Kubeflow Training SDK `kubeflow-training`.\n",
    23      "\n",
    24      "This notebook shows how to use the Kubeflow Training SDK to create, get, wait for, check, and delete a PyTorchJob."
    25     ]
    26    },
    27    {
    28     "cell_type": "markdown",
    29     "metadata": {
    30      "tags": []
    31     },
    32     "source": [
    33      "## Install Kubeflow Training Python SDKs\n",
    34      "\n",
    35      "You need to install Kubeflow Training SDK to run this Notebook."
    36     ]
    37    },
    38    {
    39     "cell_type": "code",
    40     "execution_count": null,
    41     "metadata": {},
    42     "outputs": [],
    43     "source": [
    44      "# TODO (andreyvelich): Change to release version when SDK with the new APIs is published.\n",
    45      "%pip install git+https://github.com/kubeflow/training-operator.git#subdirectory=sdk/python"
    46     ]
    47    },
    48    {
    49     "cell_type": "code",
    50     "execution_count": 6,
    51     "metadata": {
    52      "pycharm": {
    53       "name": "#%%\n"
    54      }
    55     },
    56     "outputs": [],
    57     "source": [
    58      "from kubernetes.client import V1PodTemplateSpec\n",
    59      "from kubernetes.client import V1ObjectMeta\n",
    60      "from kubernetes.client import V1PodSpec\n",
    61      "from kubernetes.client import V1Container\n",
    62      "\n",
    63      "from kubeflow.training import V1ReplicaSpec\n",
    64      "from kubeflow.training import KubeflowOrgV1PyTorchJob\n",
    65      "from kubeflow.training import KubeflowOrgV1PyTorchJobSpec\n",
    66      "from kubeflow.training import V1RunPolicy\n",
    67      "from kubeflow.training import TrainingClient"
    68     ]
    69    },
    70    {
    71     "cell_type": "markdown",
    72     "metadata": {
    73      "pycharm": {
    74       "name": "#%% md\n"
    75      }
    76     },
    77     "source": [
    78      "## Define PyTorchJob"
    79     ]
    80    },
    81    {
    82     "cell_type": "markdown",
    83     "metadata": {
    84      "pycharm": {
    85       "name": "#%% md\n"
    86      }
    87     },
    88     "source": [
    89      "The demo creates a PyTorchJob with one Master and one Worker replica to run the MNIST sample."
    90     ]
    91    },
    92    {
    93     "cell_type": "code",
    94     "execution_count": 37,
    95     "metadata": {
    96      "pycharm": {
    97       "name": "#%%\n"
    98      }
    99     },
   100     "outputs": [],
   101     "source": [
   102      "# Name and namespace of the PyTorchJob, plus the name of its training container.\n",
   103      "name = \"pytorch-dist-mnist-gloo\"\n",
   104      "namespace = \"kubeflow-user-example-com\"\n",
   105      "container_name = \"pytorch\"\n",
   106      "\n",
   107      "# Training container, defined once and reused in the Pod template below.\n",
   108      "container = V1Container(\n",
   109      "    name=container_name,\n",
   110      "    image=\"gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0\",\n",
   111      "    args=[\"--backend\", \"gloo\"],\n",
   112      ")\n",
   113      "\n",
   114      "# Replica spec shared by the Master and the Worker (one replica each).\n",
   115      "replica_spec = V1ReplicaSpec(\n",
   116      "    replicas=1,\n",
   117      "    restart_policy=\"OnFailure\",\n",
   118      "    template=V1PodTemplateSpec(\n",
   119      "        metadata=V1ObjectMeta(\n",
   120      "            name=name,\n",
   121      "            namespace=namespace,\n",
   122      "            # Opt the training Pods out of Istio sidecar injection.\n",
   123      "            annotations={\n",
   124      "                \"sidecar.istio.io/inject\": \"false\"\n",
   125      "            }\n",
   126      "        ),\n",
   127      "        spec=V1PodSpec(\n",
   128      "            containers=[container]\n",
   129      "        )\n",
   130      "    )\n",
   131      ")\n",
   132      "\n",
   133      "# PyTorchJob with a Master and a Worker built from the same replica spec.\n",
   134      "pytorchjob = KubeflowOrgV1PyTorchJob(\n",
   135      "    api_version=\"kubeflow.org/v1\",\n",
   136      "    kind=\"PyTorchJob\",\n",
   137      "    metadata=V1ObjectMeta(name=name, namespace=namespace),\n",
   138      "    spec=KubeflowOrgV1PyTorchJobSpec(\n",
   139      "        # NOTE(review): presumably keeps the Pods after the job ends so logs stay readable - confirm.\n",
   140      "        run_policy=V1RunPolicy(clean_pod_policy=\"None\"),\n",
   141      "        pytorch_replica_specs={\n",
   142      "            \"Master\": replica_spec,\n",
   143      "            \"Worker\": replica_spec\n",
   144      "        },\n",
   145      "    ),\n",
   146      ")"
   147     ]
   148    },
   149    {
   150     "cell_type": "markdown",
   151     "metadata": {
   152      "pycharm": {
   153       "name": "#%% md\n"
   154      }
   155     },
   156     "source": [
   157      "## Create PyTorchJob\n",
   158      "\n",
   159      "You have to create a Training Client to deploy your PyTorchJob in your cluster."
   160     ]
   161    },
   162    {
   163     "cell_type": "code",
   164     "execution_count": 38,
   165     "metadata": {
   166      "pycharm": {
   167       "name": "#%%\n"
   168      }
   169     },
   170     "outputs": [
   171      {
   172       "name": "stderr",
   173       "output_type": "stream",
   174       "text": [
   175        "PyTorchJob kubeflow-user-example-com/pytorch-dist-mnist-gloo has been created\n"
   176       ]
   177      }
   178     ],
   179     "source": [
   180      "training_client = TrainingClient()\n",
   181      "training_client.create_pytorchjob(pytorchjob, namespace=namespace)"
   182     ]
   183    },
   184    {
   185     "cell_type": "markdown",
   186     "metadata": {
   187      "pycharm": {
   188       "name": "#%% md\n"
   189      }
   190     },
   191     "source": [
   192      "## Get the Created PyTorchJob\n",
   193      "\n",
   194      "You can verify the name of the created PyTorchJob."
   195     ]
   196    },
   197    {
   198     "cell_type": "code",
   199     "execution_count": 39,
   200     "metadata": {
   201      "pycharm": {
   202       "name": "#%%\n"
   203      }
   204     },
   205     "outputs": [
   206      {
   207       "data": {
   208        "text/plain": [
   209         "'pytorch-dist-mnist-gloo'"
   210        ]
   211       },
   212       "execution_count": 39,
   213       "metadata": {},
   214       "output_type": "execute_result"
   215      }
   216     ],
   217     "source": [
   218      "training_client.get_pytorchjob(name, namespace=namespace).metadata.name"
   219     ]
   220    },
   221    {
   222     "cell_type": "markdown",
   223     "metadata": {
   224      "pycharm": {
   225       "name": "#%% md\n"
   226      }
   227     },
   228     "source": [
   229      "## Get the PyTorchJob Conditions"
   230     ]
   231    },
   232    {
   233     "cell_type": "code",
   234     "execution_count": 40,
   235     "metadata": {
   236      "pycharm": {
   237       "name": "#%%\n"
   238      }
   239     },
   240     "outputs": [
   241      {
   242       "data": {
   243        "text/plain": [
   244         "[{'last_transition_time': datetime.datetime(2023, 1, 12, 18, 30, 13, tzinfo=tzlocal()),\n",
   245         "  'last_update_time': datetime.datetime(2023, 1, 12, 18, 30, 13, tzinfo=tzlocal()),\n",
   246         "  'message': 'PyTorchJob pytorch-dist-mnist-gloo is created.',\n",
   247         "  'reason': 'PyTorchJobCreated',\n",
   248         "  'status': 'True',\n",
   249         "  'type': 'Created'},\n",
   250         " {'last_transition_time': datetime.datetime(2023, 1, 12, 18, 30, 18, tzinfo=tzlocal()),\n",
   251         "  'last_update_time': datetime.datetime(2023, 1, 12, 18, 30, 18, tzinfo=tzlocal()),\n",
   252         "  'message': 'PyTorchJob pytorch-dist-mnist-gloo is running.',\n",
   253         "  'reason': 'JobRunning',\n",
   254         "  'status': 'True',\n",
   255         "  'type': 'Running'}]"
   256        ]
   257       },
   258       "execution_count": 40,
   259       "metadata": {},
   260       "output_type": "execute_result"
   261      }
   262     ],
   263     "source": [
   264      "training_client.get_job_conditions(name=name, namespace=namespace, job_kind=\"PyTorchJob\")"
   265     ]
   266    },
   267    {
   268     "cell_type": "markdown",
   269     "metadata": {
   270      "pycharm": {
   271       "name": "#%% md\n"
   272      }
   273     },
   274     "source": [
   275      "## Wait Until PyTorchJob Finishes"
   276     ]
   277    },
   278    {
   279     "cell_type": "code",
   280     "execution_count": 41,
   281     "metadata": {
   282      "pycharm": {
   283       "name": "#%%\n"
   284      }
   285     },
   286     "outputs": [
   287      {
   288       "name": "stdout",
   289       "output_type": "stream",
   290       "text": [
   291        "pytorch-dist-mnist-gloo        Running              2023-01-12 18:30:18+00:00\n",
   292        "pytorch-dist-mnist-gloo        Running              2023-01-12 18:30:18+00:00\n",
   293        "pytorch-dist-mnist-gloo        Running              2023-01-12 18:30:18+00:00\n",
   294        "pytorch-dist-mnist-gloo        Succeeded            2023-01-12 18:36:48+00:00\n",
   295        "Succeeded number of replicas: 1\n"
   296       ]
   297      }
   298     ],
   299     "source": [
   300      "# Bind the result to a new name so the `pytorchjob` spec defined above stays reusable.\n",
   301      "pytorchjob_finished = training_client.wait_for_job_conditions(name=name, namespace=namespace, job_kind=\"PyTorchJob\")\n",
   302      "print(f\"Succeeded number of replicas: {pytorchjob_finished.status.replica_statuses['Master'].succeeded}\")"
   303     ]
   304    },
   305    {
   306     "cell_type": "markdown",
   307     "metadata": {
   308      "pycharm": {
   309       "name": "#%% md\n"
   310      }
   311     },
   312     "source": [
   313      "## Verify if PyTorchJob is Succeeded"
   314     ]
   315    },
   316    {
   317     "cell_type": "code",
   318     "execution_count": 42,
   319     "metadata": {
   320      "pycharm": {
   321       "name": "#%%\n"
   322      }
   323     },
   324     "outputs": [
   325      {
   326       "data": {
   327        "text/plain": [
   328         "True"
   329        ]
   330       },
   331       "execution_count": 42,
   332       "metadata": {},
   333       "output_type": "execute_result"
   334      }
   335     ],
   336     "source": [
   337      "training_client.is_job_succeeded(name=name, namespace=namespace, job_kind=\"PyTorchJob\")"
   338     ]
   339    },
   340    {
   341     "cell_type": "markdown",
   342     "metadata": {
   343      "pycharm": {
   344       "name": "#%% md\n"
   345      }
   346     },
   347     "source": [
   348      "## Get the PyTorchJob Training Logs"
   349     ]
   350    },
   351    {
   352     "cell_type": "code",
   353     "execution_count": 43,
   354     "metadata": {
   355      "pycharm": {
   356       "name": "#%%\n"
   357      }
   358     },
   359     "outputs": [
   360      {
   361       "name": "stderr",
   362       "output_type": "stream",
   363       "text": [
   364        "The logs of pod pytorch-dist-mnist-gloo-master-0:\n",
   365        " Using distributed PyTorch with gloo backend\n",
   366        "Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz\n",
   367        "Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz\n",
   368        "Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz\n",
   369        "Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz\n",
   370        "Processing...\n",
   371        "Done!\n",
   372        "Train Epoch: 1 [0/60000 (0%)]\tloss=2.3000\n",
   373        "Train Epoch: 1 [640/60000 (1%)]\tloss=2.2135\n",
   374        "Train Epoch: 1 [1280/60000 (2%)]\tloss=2.1704\n",
   375        "Train Epoch: 1 [1920/60000 (3%)]\tloss=2.0766\n",
   376        "Train Epoch: 1 [2560/60000 (4%)]\tloss=1.8679\n",
   377        "Train Epoch: 1 [3200/60000 (5%)]\tloss=1.4135\n",
   378        "Train Epoch: 1 [3840/60000 (6%)]\tloss=1.0003\n",
   379        "Train Epoch: 1 [4480/60000 (7%)]\tloss=0.7762\n",
   380        "Train Epoch: 1 [5120/60000 (9%)]\tloss=0.4598\n",
   381        "Train Epoch: 1 [5760/60000 (10%)]\tloss=0.4860\n",
   382        "Train Epoch: 1 [6400/60000 (11%)]\tloss=0.4389\n",
   383        "Train Epoch: 1 [7040/60000 (12%)]\tloss=0.4084\n",
   384        "Train Epoch: 1 [7680/60000 (13%)]\tloss=0.4602\n",
   385        "Train Epoch: 1 [8320/60000 (14%)]\tloss=0.4289\n",
   386        "Train Epoch: 1 [8960/60000 (15%)]\tloss=0.3990\n",
   387        "Train Epoch: 1 [9600/60000 (16%)]\tloss=0.3852\n",
   388        "Train Epoch: 1 [10240/60000 (17%)]\tloss=0.2984\n",
   389        "Train Epoch: 1 [10880/60000 (18%)]\tloss=0.5029\n",
   390        "Train Epoch: 1 [11520/60000 (19%)]\tloss=0.5236\n",
   391        "Train Epoch: 1 [12160/60000 (20%)]\tloss=0.3378\n",
   392        "Train Epoch: 1 [12800/60000 (21%)]\tloss=0.3674\n",
   393        "Train Epoch: 1 [13440/60000 (22%)]\tloss=0.4508\n",
   394        "Train Epoch: 1 [14080/60000 (23%)]\tloss=0.3034\n",
   395        "Train Epoch: 1 [14720/60000 (25%)]\tloss=0.3574\n",
   396        "Train Epoch: 1 [15360/60000 (26%)]\tloss=0.3313\n",
   397        "Train Epoch: 1 [16000/60000 (27%)]\tloss=0.4405\n",
   398        "Train Epoch: 1 [16640/60000 (28%)]\tloss=0.3642\n",
   399        "Train Epoch: 1 [17280/60000 (29%)]\tloss=0.3172\n",
   400        "Train Epoch: 1 [17920/60000 (30%)]\tloss=0.2016\n",
   401        "Train Epoch: 1 [18560/60000 (31%)]\tloss=0.4978\n",
   402        "Train Epoch: 1 [19200/60000 (32%)]\tloss=0.3254\n",
   403        "Train Epoch: 1 [19840/60000 (33%)]\tloss=0.1191\n",
   404        "Train Epoch: 1 [20480/60000 (34%)]\tloss=0.1905\n",
   405        "Train Epoch: 1 [21120/60000 (35%)]\tloss=0.1408\n",
   406        "Train Epoch: 1 [21760/60000 (36%)]\tloss=0.3150\n",
   407        "Train Epoch: 1 [22400/60000 (37%)]\tloss=0.1506\n",
   408        "Train Epoch: 1 [23040/60000 (38%)]\tloss=0.2899\n",
   409        "Train Epoch: 1 [23680/60000 (39%)]\tloss=0.4676\n",
   410        "Train Epoch: 1 [24320/60000 (41%)]\tloss=0.2157\n",
   411        "Train Epoch: 1 [24960/60000 (42%)]\tloss=0.1520\n",
   412        "Train Epoch: 1 [25600/60000 (43%)]\tloss=0.2244\n",
   413        "Train Epoch: 1 [26240/60000 (44%)]\tloss=0.2632\n",
   414        "Train Epoch: 1 [26880/60000 (45%)]\tloss=0.2335\n",
   415        "Train Epoch: 1 [27520/60000 (46%)]\tloss=0.2619\n",
   416        "Train Epoch: 1 [28160/60000 (47%)]\tloss=0.2126\n",
   417        "Train Epoch: 1 [28800/60000 (48%)]\tloss=0.1324\n",
   418        "Train Epoch: 1 [29440/60000 (49%)]\tloss=0.2795\n",
   419        "Train Epoch: 1 [30080/60000 (50%)]\tloss=0.0951\n",
   420        "Train Epoch: 1 [30720/60000 (51%)]\tloss=0.1284\n",
   421        "Train Epoch: 1 [31360/60000 (52%)]\tloss=0.2461\n",
   422        "Train Epoch: 1 [32000/60000 (53%)]\tloss=0.3394\n",
   423        "Train Epoch: 1 [32640/60000 (54%)]\tloss=0.1517\n",
   424        "Train Epoch: 1 [33280/60000 (55%)]\tloss=0.0916\n",
   425        "Train Epoch: 1 [33920/60000 (57%)]\tloss=0.1449\n",
   426        "Train Epoch: 1 [34560/60000 (58%)]\tloss=0.1978\n",
   427        "Train Epoch: 1 [35200/60000 (59%)]\tloss=0.2189\n",
   428        "Train Epoch: 1 [35840/60000 (60%)]\tloss=0.0637\n",
   429        "Train Epoch: 1 [36480/60000 (61%)]\tloss=0.1368\n",
   430        "Train Epoch: 1 [37120/60000 (62%)]\tloss=0.1153\n",
   431        "Train Epoch: 1 [37760/60000 (63%)]\tloss=0.2358\n",
   432        "Train Epoch: 1 [38400/60000 (64%)]\tloss=0.0631\n",
   433        "Train Epoch: 1 [39040/60000 (65%)]\tloss=0.1063\n",
   434        "Train Epoch: 1 [39680/60000 (66%)]\tloss=0.1602\n",
   435        "Train Epoch: 1 [40320/60000 (67%)]\tloss=0.1098\n",
   436        "Train Epoch: 1 [40960/60000 (68%)]\tloss=0.1781\n",
   437        "Train Epoch: 1 [41600/60000 (69%)]\tloss=0.2297\n",
   438        "Train Epoch: 1 [42240/60000 (70%)]\tloss=0.0735\n",
   439        "Train Epoch: 1 [42880/60000 (71%)]\tloss=0.1562\n",
   440        "Train Epoch: 1 [43520/60000 (72%)]\tloss=0.2771\n",
   441        "Train Epoch: 1 [44160/60000 (74%)]\tloss=0.1429\n",
   442        "Train Epoch: 1 [44800/60000 (75%)]\tloss=0.1172\n",
   443        "Train Epoch: 1 [45440/60000 (76%)]\tloss=0.1202\n",
   444        "Train Epoch: 1 [46080/60000 (77%)]\tloss=0.0767\n",
   445        "Train Epoch: 1 [46720/60000 (78%)]\tloss=0.1938\n",
   446        "Train Epoch: 1 [47360/60000 (79%)]\tloss=0.0699\n",
   447        "Train Epoch: 1 [48000/60000 (80%)]\tloss=0.2114\n",
   448        "Train Epoch: 1 [48640/60000 (81%)]\tloss=0.1373\n",
   449        "Train Epoch: 1 [49280/60000 (82%)]\tloss=0.0934\n",
   450        "Train Epoch: 1 [49920/60000 (83%)]\tloss=0.1075\n",
   451        "Train Epoch: 1 [50560/60000 (84%)]\tloss=0.1185\n",
   452        "Train Epoch: 1 [51200/60000 (85%)]\tloss=0.1457\n",
   453        "Train Epoch: 1 [51840/60000 (86%)]\tloss=0.0694\n",
   454        "Train Epoch: 1 [52480/60000 (87%)]\tloss=0.0242\n",
   455        "Train Epoch: 1 [53120/60000 (88%)]\tloss=0.2635\n",
   456        "Train Epoch: 1 [53760/60000 (90%)]\tloss=0.0922\n",
   457        "Train Epoch: 1 [54400/60000 (91%)]\tloss=0.1287\n",
   458        "Train Epoch: 1 [55040/60000 (92%)]\tloss=0.1908\n",
   459        "Train Epoch: 1 [55680/60000 (93%)]\tloss=0.0350\n",
   460        "Train Epoch: 1 [56320/60000 (94%)]\tloss=0.0359\n",
   461        "Train Epoch: 1 [56960/60000 (95%)]\tloss=0.0762\n",
   462        "Train Epoch: 1 [57600/60000 (96%)]\tloss=0.1173\n",
   463        "Train Epoch: 1 [58240/60000 (97%)]\tloss=0.1948\n",
   464        "Train Epoch: 1 [58880/60000 (98%)]\tloss=0.2035\n",
   465        "Train Epoch: 1 [59520/60000 (99%)]\tloss=0.0639\n",
   466        "\n",
   467        "accuracy=0.9665\n",
   468        "\n",
   469        "\n"
   470       ]
   471      }
   472     ],
   473     "source": [
   474      "training_client.get_job_logs(name=name, namespace=namespace, container=container_name)"
   475     ]
   476    },
   477    {
   478     "cell_type": "markdown",
   479     "metadata": {
   480      "pycharm": {
   481       "name": "#%% md\n"
   482      }
   483     },
   484     "source": [
   485      "## Delete the PyTorchJob"
   486     ]
   487    },
   488    {
   489     "cell_type": "code",
   490     "execution_count": 44,
   491     "metadata": {
   492      "pycharm": {
   493       "name": "#%%\n"
   494      }
   495     },
   496     "outputs": [
   497      {
   498       "name": "stderr",
   499       "output_type": "stream",
   500       "text": [
   501        "PyTorchJob kubeflow-user-example-com/pytorch-dist-mnist-gloo has been deleted\n"
   502       ]
   503      }
   504     ],
   505     "source": [
   506      "training_client.delete_pytorchjob(name, namespace=namespace)"
   507     ]
   508    },
   509    {
   510     "cell_type": "code",
   511     "execution_count": null,
   512     "metadata": {},
   513     "outputs": [],
   514     "source": []
   515    }
   516   ],
   517   "metadata": {
   518    "kernelspec": {
   519     "display_name": "Python 3 (ipykernel)",
   520     "language": "python",
   521     "name": "python3"
   522    },
   523    "language_info": {
   524     "codemirror_mode": {
   525      "name": "ipython",
   526      "version": 3
   527     },
   528     "file_extension": ".py",
   529     "mimetype": "text/x-python",
   530     "name": "python",
   531     "nbconvert_exporter": "python",
   532     "pygments_lexer": "ipython3",
   533     "version": "3.8.10"
   534    }
   535   },
   536   "nbformat": 4,
   537   "nbformat_minor": 4
   538  }