github.com/kubeflow/training-operator@v1.7.0/sdk/python/test/e2e/test_e2e_xgboostjob.py (about)

     1  # Copyright 2021 kubeflow.org.
     2  #
     3  # Licensed under the Apache License, Version 2.0 (the "License");
     4  # you may not use this file except in compliance with the License.
     5  # You may obtain a copy of the License at
     6  #
     7  #    http://www.apache.org/licenses/LICENSE-2.0
     8  #
     9  # Unless required by applicable law or agreed to in writing, software
    10  # distributed under the License is distributed on an "AS IS" BASIS,
    11  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  # See the License for the specific language governing permissions and
    13  # limitations under the License.
    14  
    15  import os
    16  import logging
    17  import pytest
    18  
    19  from kubernetes.client import V1PodTemplateSpec
    20  from kubernetes.client import V1ObjectMeta
    21  from kubernetes.client import V1PodSpec
    22  from kubernetes.client import V1Container
    23  from kubernetes.client import V1ResourceRequirements
    24  
    25  from kubeflow.training import TrainingClient
    26  from kubeflow.training import KubeflowOrgV1ReplicaSpec
    27  from kubeflow.training import KubeflowOrgV1XGBoostJob
    28  from kubeflow.training import KubeflowOrgV1XGBoostJobSpec
    29  from kubeflow.training import KubeflowOrgV1RunPolicy
    30  from kubeflow.training import KubeflowOrgV1SchedulingPolicy
    31  from kubeflow.training.constants import constants
    32  
    33  from test.e2e.utils import verify_job_e2e, verify_unschedulable_job_e2e, get_pod_spec_scheduler_name
    34  from test.e2e.constants import TEST_GANG_SCHEDULER_NAME_ENV_KEY
    35  from test.e2e.constants import GANG_SCHEDULERS, NONE_GANG_SCHEDULERS
    36  
    37  logging.basicConfig(format="%(message)s")
    38  logging.getLogger().setLevel(logging.INFO)
    39  
    40  TRAINING_CLIENT = TrainingClient()
    41  JOB_NAME = "xgboostjob-iris-ci-test"
    42  CONTAINER_NAME = "xgboost"
    43  GANG_SCHEDULER_NAME = os.getenv(TEST_GANG_SCHEDULER_NAME_ENV_KEY)
    44  
    45  
    46  @pytest.mark.skipif(
    47      GANG_SCHEDULER_NAME in NONE_GANG_SCHEDULERS, reason="For gang-scheduling",
    48  )
    49  def test_sdk_e2e_with_gang_scheduling(job_namespace):
    50      container = generate_container()
    51  
    52      master = KubeflowOrgV1ReplicaSpec(
    53          replicas=1,
    54          restart_policy="OnFailure",
    55          template=V1PodTemplateSpec(
    56              metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}),
    57              spec=V1PodSpec(
    58                  containers=[container],
    59                  scheduler_name=get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME),
    60              )
    61          ),
    62      )
    63  
    64      worker = KubeflowOrgV1ReplicaSpec(
    65          replicas=1,
    66          restart_policy="OnFailure",
    67          template=V1PodTemplateSpec(
    68              metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}),
    69              spec=V1PodSpec(
    70                  containers=[container],
    71                  scheduler_name=get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME),
    72              )
    73          ),
    74      )
    75  
    76      unschedulable_xgboostjob = generate_xgboostjob(master, worker, KubeflowOrgV1SchedulingPolicy(min_available=10), job_namespace)
    77      schedulable_xgboostjob = generate_xgboostjob(master, worker, KubeflowOrgV1SchedulingPolicy(min_available=2), job_namespace)
    78  
    79      TRAINING_CLIENT.create_xgboostjob(unschedulable_xgboostjob, job_namespace)
    80      logging.info(f"List of created {constants.XGBOOSTJOB_KIND}s")
    81      logging.info(TRAINING_CLIENT.list_xgboostjobs(job_namespace))
    82  
    83      verify_unschedulable_job_e2e(
    84          TRAINING_CLIENT,
    85          JOB_NAME,
    86          job_namespace,
    87          constants.XGBOOSTJOB_KIND,
    88      )
    89  
    90      TRAINING_CLIENT.patch_xgboostjob(schedulable_xgboostjob, JOB_NAME, job_namespace)
    91      logging.info(f"List of patched {constants.XGBOOSTJOB_KIND}s")
    92      logging.info(TRAINING_CLIENT.list_xgboostjobs(job_namespace))
    93  
    94      verify_job_e2e(
    95          TRAINING_CLIENT,
    96          JOB_NAME,
    97          job_namespace,
    98          constants.XGBOOSTJOB_KIND,
    99          CONTAINER_NAME,
   100      )
   101  
   102      TRAINING_CLIENT.delete_xgboostjob(JOB_NAME, job_namespace)
   103  
   104  
   105  @pytest.mark.skipif(
   106      GANG_SCHEDULER_NAME in GANG_SCHEDULERS, reason="For plain scheduling",
   107  )
   108  def test_sdk_e2e(job_namespace):
   109      container = generate_container()
   110  
   111      master = KubeflowOrgV1ReplicaSpec(
   112          replicas=1,
   113          restart_policy="OnFailure",
   114          template=V1PodTemplateSpec(metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}),
   115                                     spec=V1PodSpec(containers=[container])),
   116      )
   117  
   118      worker = KubeflowOrgV1ReplicaSpec(
   119          replicas=1,
   120          restart_policy="OnFailure",
   121          template=V1PodTemplateSpec(metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}),
   122                                     spec=V1PodSpec(containers=[container])),
   123      )
   124  
   125      xgboostjob = generate_xgboostjob(master, worker, job_namespace=job_namespace)
   126  
   127      TRAINING_CLIENT.create_xgboostjob(xgboostjob, job_namespace)
   128      logging.info(f"List of created {constants.XGBOOSTJOB_KIND}s")
   129      logging.info(TRAINING_CLIENT.list_xgboostjobs(job_namespace))
   130  
   131      verify_job_e2e(
   132          TRAINING_CLIENT,
   133          JOB_NAME,
   134          job_namespace,
   135          constants.XGBOOSTJOB_KIND,
   136          CONTAINER_NAME,
   137      )
   138  
   139      TRAINING_CLIENT.delete_xgboostjob(JOB_NAME, job_namespace)
   140  
   141  
   142  def generate_xgboostjob(
   143      master: KubeflowOrgV1ReplicaSpec,
   144      worker: KubeflowOrgV1ReplicaSpec,
   145      scheduling_policy: KubeflowOrgV1SchedulingPolicy = None,
   146      job_namespace: str = "default",
   147  ) -> KubeflowOrgV1XGBoostJob:
   148      return KubeflowOrgV1XGBoostJob(
   149          api_version="kubeflow.org/v1",
   150          kind="XGBoostJob",
   151          metadata=V1ObjectMeta(name=JOB_NAME, namespace=job_namespace),
   152          spec=KubeflowOrgV1XGBoostJobSpec(
   153              run_policy=KubeflowOrgV1RunPolicy(
   154                  clean_pod_policy="None",
   155                  scheduling_policy=scheduling_policy,
   156              ),
   157              xgb_replica_specs={"Master": master, "Worker": worker},
   158          ),
   159      )
   160  
   161  
   162  def generate_container() -> V1Container:
   163      return V1Container(
   164          name=CONTAINER_NAME,
   165          image="docker.io/merlintang/xgboost-dist-iris:1.1",
   166          args=[
   167              "--job_type=Train",
   168              "--xgboost_parameter=objective:multi:softprob,num_class:3",
   169              "--n_estimators=10",
   170              "--learning_rate=0.1",
   171              "--model_path=/tmp/xgboost-model",
   172              "--model_storage_type=local",
   173          ],
   174          resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
   175      )