github.com/kubeflow/training-operator@v1.7.0/sdk/python/test/e2e/test_e2e_paddlejob.py (about)

     1  # Copyright 2022 kubeflow.org.
     2  #
     3  # Licensed under the Apache License, Version 2.0 (the "License");
     4  # you may not use this file except in compliance with the License.
     5  # You may obtain a copy of the License at
     6  #
     7  #    http://www.apache.org/licenses/LICENSE-2.0
     8  #
     9  # Unless required by applicable law or agreed to in writing, software
    10  # distributed under the License is distributed on an "AS IS" BASIS,
    11  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  # See the License for the specific language governing permissions and
    13  # limitations under the License.
    14  
    15  import os
    16  import logging
    17  import pytest
    18  
    19  from kubernetes.client import V1PodTemplateSpec
    20  from kubernetes.client import V1ObjectMeta
    21  from kubernetes.client import V1PodSpec
    22  from kubernetes.client import V1Container
    23  from kubernetes.client import V1ResourceRequirements
    24  
    25  from kubeflow.training import TrainingClient
    26  from kubeflow.training import KubeflowOrgV1ReplicaSpec
    27  from kubeflow.training import KubeflowOrgV1PaddleJob
    28  from kubeflow.training import KubeflowOrgV1PaddleJobSpec
    29  from kubeflow.training import KubeflowOrgV1RunPolicy
    30  from kubeflow.training import KubeflowOrgV1SchedulingPolicy
    31  from kubeflow.training.constants import constants
    32  
    33  from test.e2e.utils import verify_job_e2e, verify_unschedulable_job_e2e, get_pod_spec_scheduler_name
    34  from test.e2e.constants import TEST_GANG_SCHEDULER_NAME_ENV_KEY
    35  from test.e2e.constants import GANG_SCHEDULERS, NONE_GANG_SCHEDULERS
    36  
    37  logging.basicConfig(format="%(message)s")
    38  logging.getLogger().setLevel(logging.INFO)
    39  
    40  TRAINING_CLIENT = TrainingClient()
    41  JOB_NAME = "paddlejob-cpu-ci-test"
    42  CONTAINER_NAME = "paddle"
    43  GANG_SCHEDULER_NAME = os.getenv(TEST_GANG_SCHEDULER_NAME_ENV_KEY)
    44  
    45  
    46  @pytest.mark.skipif(
    47      GANG_SCHEDULER_NAME in NONE_GANG_SCHEDULERS, reason="For gang-scheduling",
    48  )
    49  def test_sdk_e2e_with_gang_scheduling(job_namespace):
    50      container = generate_container()
    51  
    52      worker = KubeflowOrgV1ReplicaSpec(
    53          replicas=2,
    54          restart_policy="OnFailure",
    55          template=V1PodTemplateSpec(
    56              metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}),
    57              spec=V1PodSpec(
    58                  scheduler_name=get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME),
    59                  containers=[container],
    60              )
    61          ),
    62      )
    63  
    64      unschedulable_paddlejob = generate_paddlejob(worker, KubeflowOrgV1SchedulingPolicy(min_available=10), job_namespace)
    65      schedulable_paddlejob = generate_paddlejob(worker, KubeflowOrgV1SchedulingPolicy(min_available=2), job_namespace)
    66  
    67      TRAINING_CLIENT.create_paddlejob(unschedulable_paddlejob, job_namespace)
    68      logging.info(f"List of created {constants.PADDLEJOB_KIND}s")
    69      logging.info(TRAINING_CLIENT.list_paddlejobs(job_namespace))
    70  
    71      verify_unschedulable_job_e2e(
    72          TRAINING_CLIENT,
    73          JOB_NAME,
    74          job_namespace,
    75          constants.PADDLEJOB_KIND,
    76      )
    77  
    78      TRAINING_CLIENT.patch_paddlejob(schedulable_paddlejob, JOB_NAME, job_namespace)
    79      logging.info(f"List of patched {constants.PADDLEJOB_KIND}s")
    80      logging.info(TRAINING_CLIENT.list_paddlejobs(job_namespace))
    81  
    82      verify_job_e2e(
    83          TRAINING_CLIENT,
    84          JOB_NAME,
    85          job_namespace,
    86          constants.PADDLEJOB_KIND,
    87          CONTAINER_NAME,
    88      )
    89  
    90      TRAINING_CLIENT.delete_paddlejob(JOB_NAME, job_namespace)
    91  
    92  
    93  @pytest.mark.skipif(
    94      GANG_SCHEDULER_NAME in GANG_SCHEDULERS, reason="For plain scheduling",
    95  )
    96  def test_sdk_e2e(job_namespace):
    97      container = generate_container()
    98  
    99      worker = KubeflowOrgV1ReplicaSpec(
   100          replicas=2,
   101          restart_policy="OnFailure",
   102          template=V1PodTemplateSpec(metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}),
   103                                     spec=V1PodSpec(containers=[container])),
   104      )
   105  
   106      paddlejob = generate_paddlejob(worker, job_namespace=job_namespace)
   107  
   108      TRAINING_CLIENT.create_paddlejob(paddlejob, job_namespace)
   109      logging.info(f"List of created {constants.PADDLEJOB_KIND}s")
   110      logging.info(TRAINING_CLIENT.list_paddlejobs(job_namespace))
   111  
   112      verify_job_e2e(
   113          TRAINING_CLIENT,
   114          JOB_NAME,
   115          job_namespace,
   116          constants.PADDLEJOB_KIND,
   117          CONTAINER_NAME,
   118      )
   119  
   120      TRAINING_CLIENT.delete_paddlejob(JOB_NAME, job_namespace)
   121  
   122  
   123  def generate_paddlejob(
   124      worker: KubeflowOrgV1ReplicaSpec,
   125      scheduling_policy: KubeflowOrgV1SchedulingPolicy = None,
   126      job_namespace: str = "default",
   127  ) -> KubeflowOrgV1PaddleJob:
   128      return KubeflowOrgV1PaddleJob(
   129          api_version="kubeflow.org/v1",
   130          kind="PaddleJob",
   131          metadata=V1ObjectMeta(name=JOB_NAME, namespace=job_namespace),
   132          spec=KubeflowOrgV1PaddleJobSpec(
   133              run_policy=KubeflowOrgV1RunPolicy(
   134                  scheduling_policy=scheduling_policy,
   135                  clean_pod_policy="None",
   136              ),
   137              paddle_replica_specs={"Worker": worker},
   138          ),
   139      )
   140  
   141  
   142  def generate_container() -> V1Container:
   143      return V1Container(
   144          name=CONTAINER_NAME,
   145          image="docker.io/paddlepaddle/paddle:2.4.0rc0-cpu",
   146          command=["python"],
   147          args=["-m", "paddle.distributed.launch", "run_check"],
   148          resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
   149      )