github.com/kubeflow/training-operator@v1.7.0/sdk/python/test/e2e/test_e2e_mpijob.py (about)

     1  # Copyright 2021 kubeflow.org.
     2  #
     3  # Licensed under the Apache License, Version 2.0 (the "License");
     4  # you may not use this file except in compliance with the License.
     5  # You may obtain a copy of the License at
     6  #
     7  #    http://www.apache.org/licenses/LICENSE-2.0
     8  #
     9  # Unless required by applicable law or agreed to in writing, software
    10  # distributed under the License is distributed on an "AS IS" BASIS,
    11  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  # See the License for the specific language governing permissions and
    13  # limitations under the License.
    14  
    15  import os
    16  import logging
    17  import pytest
    18  from typing import Tuple
    19  
    20  from kubernetes.client import V1PodTemplateSpec
    21  from kubernetes.client import V1ObjectMeta
    22  from kubernetes.client import V1PodSpec
    23  from kubernetes.client import V1Container
    24  from kubernetes.client import V1ResourceRequirements
    25  
    26  from kubeflow.training import TrainingClient
    27  from kubeflow.training import KubeflowOrgV1ReplicaSpec
    28  from kubeflow.training import KubeflowOrgV1MPIJob
    29  from kubeflow.training import KubeflowOrgV1MPIJobSpec
    30  from kubeflow.training import KubeflowOrgV1RunPolicy
    31  from kubeflow.training import KubeflowOrgV1SchedulingPolicy
    32  from kubeflow.training.constants import constants
    33  
    34  from test.e2e.utils import verify_job_e2e, verify_unschedulable_job_e2e, get_pod_spec_scheduler_name
    35  from test.e2e.constants import TEST_GANG_SCHEDULER_NAME_ENV_KEY
    36  from test.e2e.constants import GANG_SCHEDULERS, NONE_GANG_SCHEDULERS
    37  
    38  logging.basicConfig(format="%(message)s")
    39  logging.getLogger().setLevel(logging.INFO)
    40  
    41  TRAINING_CLIENT = TrainingClient()
    42  JOB_NAME = "mpijob-mxnet-ci-test"
    43  CONTAINER_NAME = "mpi"
    44  GANG_SCHEDULER_NAME = os.getenv(TEST_GANG_SCHEDULER_NAME_ENV_KEY)
    45  
    46  
    47  @pytest.mark.skipif(
    48      GANG_SCHEDULER_NAME in NONE_GANG_SCHEDULERS, reason="For gang-scheduling",
    49  )
    50  def test_sdk_e2e_with_gang_scheduling(job_namespace):
    51      launcher_container, worker_container = generate_containers()
    52  
    53      launcher = KubeflowOrgV1ReplicaSpec(
    54          replicas=1,
    55          restart_policy="Never",
    56          template=V1PodTemplateSpec(
    57              metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}),
    58              spec=V1PodSpec(
    59                  containers=[launcher_container],
    60                  scheduler_name=get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME),
    61              )
    62          ),
    63      )
    64  
    65      worker = KubeflowOrgV1ReplicaSpec(
    66          replicas=1,
    67          restart_policy="Never",
    68          template=V1PodTemplateSpec(
    69              metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}),
    70              spec=V1PodSpec(
    71                  containers=[worker_container],
    72                  scheduler_name=get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME),
    73              )
    74          ),
    75      )
    76  
    77      mpijob = generate_mpijob(launcher, worker, KubeflowOrgV1SchedulingPolicy(min_available=10), job_namespace)
    78      patched_mpijob = generate_mpijob(launcher, worker, KubeflowOrgV1SchedulingPolicy(min_available=2), job_namespace)
    79  
    80      TRAINING_CLIENT.create_mpijob(mpijob, job_namespace)
    81      logging.info(f"List of created {constants.MPIJOB_KIND}s")
    82      logging.info(TRAINING_CLIENT.list_mpijobs(job_namespace))
    83  
    84      verify_unschedulable_job_e2e(
    85          TRAINING_CLIENT,
    86          JOB_NAME,
    87          job_namespace,
    88          constants.MPIJOB_KIND,
    89      )
    90  
    91      TRAINING_CLIENT.patch_mpijob(patched_mpijob, JOB_NAME, job_namespace)
    92      logging.info(f"List of patched {constants.MPIJOB_KIND}s")
    93      logging.info(TRAINING_CLIENT.list_mpijobs(job_namespace))
    94  
    95      verify_job_e2e(
    96          TRAINING_CLIENT,
    97          JOB_NAME,
    98          job_namespace,
    99          constants.MPIJOB_KIND,
   100          CONTAINER_NAME,
   101      )
   102  
   103      TRAINING_CLIENT.delete_mpijob(JOB_NAME, job_namespace)
   104  
   105  
   106  @pytest.mark.skipif(
   107      GANG_SCHEDULER_NAME in GANG_SCHEDULERS, reason="For plain scheduling",
   108  )
   109  def test_sdk_e2e(job_namespace):
   110      launcher_container, worker_container = generate_containers()
   111  
   112      launcher = KubeflowOrgV1ReplicaSpec(
   113          replicas=1,
   114          restart_policy="Never",
   115          template=V1PodTemplateSpec(metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}),
   116                                     spec=V1PodSpec(containers=[launcher_container])),
   117      )
   118  
   119      worker = KubeflowOrgV1ReplicaSpec(
   120          replicas=1,
   121          restart_policy="Never",
   122          template=V1PodTemplateSpec(metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}),
   123                                     spec=V1PodSpec(containers=[worker_container])),
   124      )
   125  
   126      mpijob = generate_mpijob(launcher, worker, job_namespace=job_namespace)
   127  
   128      TRAINING_CLIENT.create_mpijob(mpijob, job_namespace)
   129      logging.info(f"List of created {constants.MPIJOB_KIND}s")
   130      logging.info(TRAINING_CLIENT.list_mpijobs(job_namespace))
   131  
   132      verify_job_e2e(
   133          TRAINING_CLIENT,
   134          JOB_NAME,
   135          job_namespace,
   136          constants.MPIJOB_KIND,
   137          CONTAINER_NAME,
   138      )
   139  
   140      TRAINING_CLIENT.delete_mpijob(JOB_NAME, job_namespace)
   141  
   142  
   143  def generate_mpijob(
   144      launcher: KubeflowOrgV1ReplicaSpec,
   145      worker: KubeflowOrgV1ReplicaSpec,
   146      scheduling_policy: KubeflowOrgV1SchedulingPolicy = None,
   147      job_namespace: str = "default",
   148  ) -> KubeflowOrgV1MPIJob:
   149      return KubeflowOrgV1MPIJob(
   150          api_version="kubeflow.org/v1",
   151          kind="MPIJob",
   152          metadata=V1ObjectMeta(name=JOB_NAME, namespace=job_namespace),
   153          spec=KubeflowOrgV1MPIJobSpec(
   154              slots_per_worker=1,
   155              run_policy=KubeflowOrgV1RunPolicy(
   156                  clean_pod_policy="None",
   157                  scheduling_policy=scheduling_policy,
   158              ),
   159              mpi_replica_specs={"Launcher": launcher, "Worker": worker},
   160          ),
   161      )
   162  
   163  
   164  def generate_containers() -> Tuple[V1Container, V1Container]:
   165      launcher_container = V1Container(
   166          name=CONTAINER_NAME,
   167          image="horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.5.0-py3.7-cpu",
   168          command=["mpirun"],
   169          args=[
   170              "-np",
   171              "1",
   172              "--allow-run-as-root",
   173              "-bind-to",
   174              "none",
   175              "-map-by",
   176              "slot",
   177              "-x",
   178              "LD_LIBRARY_PATH",
   179              "-x",
   180              "PATH",
   181              "-mca",
   182              "pml",
   183              "ob1",
   184              "-mca",
   185              "btl",
   186              "^openib",
   187              # "python", "/examples/tensorflow2_mnist.py"]
   188              "python",
   189              "/examples/pytorch_mnist.py",
   190              "--epochs",
   191              "1",
   192          ],
   193          resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
   194      )
   195  
   196      worker_container = V1Container(
   197          name="mpi",
   198          image="horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.5.0-py3.7-cpu",
   199          resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
   200      )
   201  
   202      return launcher_container, worker_container