github.com/kubeflow/training-operator@v1.7.0/sdk/python/test/e2e/test_e2e_mxjob.py (about)

     1  # Copyright 2021 kubeflow.org.
     2  #
     3  # Licensed under the Apache License, Version 2.0 (the "License");
     4  # you may not use this file except in compliance with the License.
     5  # You may obtain a copy of the License at
     6  #
     7  #    http://www.apache.org/licenses/LICENSE-2.0
     8  #
     9  # Unless required by applicable law or agreed to in writing, software
    10  # distributed under the License is distributed on an "AS IS" BASIS,
    11  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  # See the License for the specific language governing permissions and
    13  # limitations under the License.
    14  
    15  import os
    16  import logging
    17  import pytest
    18  from typing import Tuple
    19  
    20  from kubernetes.client import V1PodTemplateSpec
    21  from kubernetes.client import V1ObjectMeta
    22  from kubernetes.client import V1PodSpec
    23  from kubernetes.client import V1Container
    24  from kubernetes.client import V1ContainerPort
    25  from kubernetes.client import V1ResourceRequirements
    26  
    27  from kubeflow.training import TrainingClient
    28  from kubeflow.training import KubeflowOrgV1ReplicaSpec
    29  from kubeflow.training import KubeflowOrgV1MXJob
    30  from kubeflow.training import KubeflowOrgV1MXJobSpec
    31  from kubeflow.training import KubeflowOrgV1RunPolicy
    32  from kubeflow.training import KubeflowOrgV1SchedulingPolicy
    33  from kubeflow.training.constants import constants
    34  
    35  from test.e2e.utils import verify_job_e2e, verify_unschedulable_job_e2e, get_pod_spec_scheduler_name
    36  from test.e2e.constants import TEST_GANG_SCHEDULER_NAME_ENV_KEY
    37  from test.e2e.constants import GANG_SCHEDULERS, NONE_GANG_SCHEDULERS
    38  
# Log bare messages (no level/timestamp prefix) and set the root logger to
# INFO so the logging.info() calls in the tests below are not filtered out.
logging.basicConfig(format="%(message)s")
logging.getLogger().setLevel(logging.INFO)

# Client shared by every test in this module; instantiated once at import time.
TRAINING_CLIENT = TrainingClient()
# Name given to the MXJob by generate_mxjob() and used to verify/patch/delete it.
JOB_NAME = "mxjob-mnist-ci-test"
# Name of every container built by generate_containers(); also passed to verify_job_e2e().
CONTAINER_NAME = "mxnet"
# Gang scheduler under test, read from the environment (None when the variable
# is unset). It drives the skipif conditions on the two tests in this module.
GANG_SCHEDULER_NAME = os.getenv(TEST_GANG_SCHEDULER_NAME_ENV_KEY)
    46  
    47  
    48  @pytest.mark.skipif(
    49      GANG_SCHEDULER_NAME in NONE_GANG_SCHEDULERS, reason="For gang-scheduling",
    50  )
    51  def test_sdk_e2e_with_gang_scheduling(job_namespace):
    52      worker_container, server_container, scheduler_container = generate_containers()
    53  
    54      worker = KubeflowOrgV1ReplicaSpec(
    55          replicas=1,
    56          restart_policy="Never",
    57          template=V1PodTemplateSpec(
    58              metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}),
    59              spec=V1PodSpec(
    60                  containers=[worker_container],
    61                  scheduler_name=get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME),
    62              )
    63          ),
    64      )
    65  
    66      server = KubeflowOrgV1ReplicaSpec(
    67          replicas=1,
    68          restart_policy="Never",
    69          template=V1PodTemplateSpec(
    70              metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}),
    71              spec=V1PodSpec(
    72                  containers=[server_container],
    73                  scheduler_name=get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME),
    74              )
    75          ),
    76      )
    77  
    78      scheduler = KubeflowOrgV1ReplicaSpec(
    79          replicas=1,
    80          restart_policy="Never",
    81          template=V1PodTemplateSpec(
    82              metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}),
    83              spec=V1PodSpec(
    84                  containers=[scheduler_container],
    85                  scheduler_name=get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME),
    86              )
    87          ),
    88      )
    89  
    90      unschedulable_mxjob = generate_mxjob(scheduler, server, worker, KubeflowOrgV1SchedulingPolicy(min_available=10), job_namespace)
    91      schedulable_mxjob = generate_mxjob(scheduler, server, worker, KubeflowOrgV1SchedulingPolicy(min_available=3), job_namespace)
    92  
    93      TRAINING_CLIENT.create_mxjob(unschedulable_mxjob, job_namespace)
    94      logging.info(f"List of created {constants.MXJOB_KIND}s")
    95      logging.info(TRAINING_CLIENT.list_mxjobs(job_namespace))
    96  
    97      verify_unschedulable_job_e2e(
    98          TRAINING_CLIENT,
    99          JOB_NAME,
   100          job_namespace,
   101          constants.MXJOB_KIND,
   102      )
   103  
   104      TRAINING_CLIENT.patch_mxjob(schedulable_mxjob, JOB_NAME, job_namespace)
   105      logging.info(f"List of patched {constants.MXJOB_KIND}s")
   106      logging.info(TRAINING_CLIENT.list_mxjobs(job_namespace))
   107  
   108      verify_job_e2e(
   109          TRAINING_CLIENT,
   110          JOB_NAME,
   111          job_namespace,
   112          constants.MXJOB_KIND,
   113          CONTAINER_NAME,
   114      )
   115  
   116      TRAINING_CLIENT.delete_mxjob(JOB_NAME, job_namespace)
   117  
   118  
   119  @pytest.mark.skipif(
   120      GANG_SCHEDULER_NAME in GANG_SCHEDULERS, reason="For plain scheduling",
   121  )
   122  def test_sdk_e2e(job_namespace):
   123      worker_container, server_container, scheduler_container = generate_containers()
   124  
   125      worker = KubeflowOrgV1ReplicaSpec(
   126          replicas=1,
   127          restart_policy="Never",
   128          template=V1PodTemplateSpec(metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}),
   129                                     spec=V1PodSpec(containers=[worker_container])),
   130      )
   131  
   132      server = KubeflowOrgV1ReplicaSpec(
   133          replicas=1,
   134          restart_policy="Never",
   135          template=V1PodTemplateSpec(metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}),
   136                                     spec=V1PodSpec(containers=[server_container])),
   137      )
   138  
   139      scheduler = KubeflowOrgV1ReplicaSpec(
   140          replicas=1,
   141          restart_policy="Never",
   142          template=V1PodTemplateSpec(metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}),
   143                                     spec=V1PodSpec(containers=[scheduler_container])),
   144      )
   145  
   146      mxjob = generate_mxjob(scheduler, server, worker, job_namespace=job_namespace)
   147  
   148      TRAINING_CLIENT.create_mxjob(mxjob, job_namespace)
   149      logging.info(f"List of created {constants.MXJOB_KIND}s")
   150      logging.info(TRAINING_CLIENT.list_mxjobs(job_namespace))
   151  
   152      verify_job_e2e(
   153          TRAINING_CLIENT,
   154          JOB_NAME,
   155          job_namespace,
   156          constants.MXJOB_KIND,
   157          CONTAINER_NAME,
   158      )
   159  
   160      TRAINING_CLIENT.delete_mxjob(JOB_NAME, job_namespace)
   161  
   162  
   163  def generate_mxjob(
   164      scheduler: KubeflowOrgV1ReplicaSpec,
   165      server: KubeflowOrgV1ReplicaSpec,
   166      worker: KubeflowOrgV1ReplicaSpec,
   167      scheduling_policy: KubeflowOrgV1SchedulingPolicy = None,
   168      job_namespace: str = "default",
   169  ) -> KubeflowOrgV1MXJob:
   170      return KubeflowOrgV1MXJob(
   171          api_version="kubeflow.org/v1",
   172          kind="MXJob",
   173          metadata=V1ObjectMeta(name=JOB_NAME, namespace=job_namespace),
   174          spec=KubeflowOrgV1MXJobSpec(
   175              job_mode="MXTrain",
   176              run_policy=KubeflowOrgV1RunPolicy(
   177                  clean_pod_policy="None",
   178                  scheduling_policy=scheduling_policy,
   179              ),
   180              mx_replica_specs={
   181                  "Scheduler": scheduler,
   182                  "Server": server,
   183                  "Worker": worker,
   184              },
   185          ),
   186      )
   187  
   188  
   189  def generate_containers() -> Tuple[V1Container, V1Container, V1Container]:
   190      worker_container = V1Container(
   191          name=CONTAINER_NAME,
   192          # TODO (tenzen-y): Replace the below image with the kubeflow hosted image
   193          image="docker.io/johnugeorge/mxnet:1.9.1_cpu_py3",
   194          command=["/usr/local/bin/python3"],
   195          args=[
   196              "incubator-mxnet/example/image-classification/train_mnist.py",
   197              "--num-epochs",
   198              "1",
   199              "--num-examples",
   200              "1000",
   201              "--kv-store",
   202              "dist_sync",
   203          ],
   204          ports=[V1ContainerPort(container_port=9991, name="mxjob-port")],
   205          resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.25"}),
   206      )
   207  
   208      server_container = V1Container(
   209          name=CONTAINER_NAME,
   210          # TODO (tenzen-y): Replace the below image with the kubeflow hosted image
   211          image="docker.io/johnugeorge/mxnet:1.9.1_cpu_py3",
   212          ports=[V1ContainerPort(container_port=9991, name="mxjob-port")],
   213          resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.25"}),
   214      )
   215  
   216      scheduler_container = V1Container(
   217          name=CONTAINER_NAME,
   218          # TODO (tenzen-y): Replace the below image with the kubeflow hosted image
   219          image="docker.io/johnugeorge/mxnet:1.9.1_cpu_py3",
   220          ports=[V1ContainerPort(container_port=9991, name="mxjob-port")],
   221          resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.25"}),
   222      )
   223  
   224      return worker_container, server_container, scheduler_container