github.com/kubeflow/training-operator@v1.7.0/sdk/python/test/e2e/test_e2e_pytorchjob.py

# Copyright 2021 kubeflow.org.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import logging
from typing import Optional
import pytest

from kubernetes.client import V1PodTemplateSpec
from kubernetes.client import V1ObjectMeta
from kubernetes.client import V1PodSpec
from kubernetes.client import V1Container
from kubernetes.client import V1ResourceRequirements

from kubeflow.training import TrainingClient
from kubeflow.training import KubeflowOrgV1ReplicaSpec
from kubeflow.training import KubeflowOrgV1PyTorchJob
from kubeflow.training import KubeflowOrgV1PyTorchJobSpec
from kubeflow.training import KubeflowOrgV1RunPolicy
from kubeflow.training import KubeflowOrgV1SchedulingPolicy
from kubeflow.training.constants import constants

from test.e2e.utils import verify_job_e2e, verify_unschedulable_job_e2e, get_pod_spec_scheduler_name
from test.e2e.constants import TEST_GANG_SCHEDULER_NAME_ENV_KEY
from test.e2e.constants import GANG_SCHEDULERS, NONE_GANG_SCHEDULERS

logging.basicConfig(format="%(message)s")
logging.getLogger().setLevel(logging.INFO)

TRAINING_CLIENT = TrainingClient()
JOB_NAME = "pytorchjob-mnist-ci-test"
CONTAINER_NAME = "pytorch"
# The gang scheduler under test is selected through an environment variable.
GANG_SCHEDULER_NAME = os.getenv(TEST_GANG_SCHEDULER_NAME_ENV_KEY)


@pytest.mark.skipif(
    GANG_SCHEDULER_NAME in NONE_GANG_SCHEDULERS, reason="For gang-scheduling",
)
def test_sdk_e2e_with_gang_scheduling(job_namespace):
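    """End-to-end check of gang scheduling: a PyTorchJob whose scheduling
    policy demands more pods than the job creates must stay pending, and
    patching it with a satisfiable policy must let it run to completion.

    The ``job_namespace`` fixture is provided by the surrounding e2e suite.
    """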
    container = generate_container()

    master = KubeflowOrgV1ReplicaSpec(
        replicas=1,
        restart_policy="OnFailure",
        template=V1PodTemplateSpec(
            metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}),
            spec=V1PodSpec(
                scheduler_name=get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME),
                containers=[container],
            )
        ),
    )

    worker = KubeflowOrgV1ReplicaSpec(
        replicas=1,
        restart_policy="OnFailure",
        template=V1PodTemplateSpec(
            metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}),
            spec=V1PodSpec(
                scheduler_name=get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME),
                containers=[container],
            )
        ),
    )

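    # min_available=10 can never be satisfied by the two replicas above, so
    # the first job must stay pending; min_available=2 matches master + worker
    # exactly, so the patched job should be schedulable.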
    unschedulable_pytorchjob = generate_pytorchjob(
        master, worker, KubeflowOrgV1SchedulingPolicy(min_available=10), job_namespace
    )
    schedulable_pytorchjob = generate_pytorchjob(
        master, worker, KubeflowOrgV1SchedulingPolicy(min_available=2), job_namespace
    )

    TRAINING_CLIENT.create_pytorchjob(unschedulable_pytorchjob, job_namespace)
    logging.info(f"List of created {constants.PYTORCHJOB_KIND}s")
    logging.info(TRAINING_CLIENT.list_pytorchjobs(job_namespace))

    verify_unschedulable_job_e2e(
        TRAINING_CLIENT,
        JOB_NAME,
        job_namespace,
        constants.PYTORCHJOB_KIND,
    )

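    # Lower min_available to a satisfiable value and wait for the job to run.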
    TRAINING_CLIENT.patch_pytorchjob(schedulable_pytorchjob, JOB_NAME, job_namespace)
    logging.info(f"List of patched {constants.PYTORCHJOB_KIND}s")
    logging.info(TRAINING_CLIENT.list_pytorchjobs(job_namespace))

    verify_job_e2e(
        TRAINING_CLIENT,
        JOB_NAME,
        job_namespace,
        constants.PYTORCHJOB_KIND,
        CONTAINER_NAME,
        timeout=900,
    )

    TRAINING_CLIENT.delete_pytorchjob(JOB_NAME, job_namespace)


@pytest.mark.skipif(
    GANG_SCHEDULER_NAME in GANG_SCHEDULERS, reason="For plain scheduling",
)
def test_sdk_e2e(job_namespace):
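    """End-to-end check of the plain (non-gang) scheduling path: create a
    Master/Worker PyTorchJob, wait for it to succeed, then delete it.

    The ``job_namespace`` fixture is provided by the surrounding e2e suite.
    """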
    container = generate_container()

    master = KubeflowOrgV1ReplicaSpec(
        replicas=1,
        restart_policy="OnFailure",
        template=V1PodTemplateSpec(
            metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}),
            spec=V1PodSpec(containers=[container]),
        ),
    )

    worker = KubeflowOrgV1ReplicaSpec(
        replicas=1,
        restart_policy="OnFailure",
        template=V1PodTemplateSpec(
            metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}),
            spec=V1PodSpec(containers=[container]),
        ),
    )

    pytorchjob = generate_pytorchjob(master, worker, job_namespace=job_namespace)

    TRAINING_CLIENT.create_pytorchjob(pytorchjob, job_namespace)
    logging.info(f"List of created {constants.PYTORCHJOB_KIND}s")
    logging.info(TRAINING_CLIENT.list_pytorchjobs(job_namespace))

    verify_job_e2e(
        TRAINING_CLIENT,
        JOB_NAME,
        job_namespace,
        constants.PYTORCHJOB_KIND,
        CONTAINER_NAME,
        timeout=900,
    )

    TRAINING_CLIENT.delete_pytorchjob(JOB_NAME, job_namespace)


def generate_pytorchjob(
    master: KubeflowOrgV1ReplicaSpec,
    worker: KubeflowOrgV1ReplicaSpec,
    scheduling_policy: Optional[KubeflowOrgV1SchedulingPolicy] = None,
    job_namespace: str = "default",
) -> KubeflowOrgV1PyTorchJob:
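    """Assemble a PyTorchJob named ``JOB_NAME`` from the given Master and
    Worker replica specs, with an optional gang-scheduling policy."""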
    return KubeflowOrgV1PyTorchJob(
        api_version="kubeflow.org/v1",
        kind="PyTorchJob",
        metadata=V1ObjectMeta(name=JOB_NAME, namespace=job_namespace),
        spec=KubeflowOrgV1PyTorchJobSpec(
            run_policy=KubeflowOrgV1RunPolicy(
                clean_pod_policy="None",
                scheduling_policy=scheduling_policy,
            ),
            pytorch_replica_specs={"Master": master, "Worker": worker},
        ),
    )


def generate_container() -> V1Container:
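    """Build the training container: the distributed MNIST example image,
    run with the gloo backend under modest CPU/memory limits."""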
    return V1Container(
        name=CONTAINER_NAME,
        image="gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0",
        args=["--backend", "gloo"],
        resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
    )