# Source: github.com/kubeflow/training-operator@v1.7.0/sdk/python/test/e2e/test_e2e_xgboostjob.py
# Copyright 2021 kubeflow.org.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""End-to-end tests that run an XGBoostJob through the Training SDK client."""

import os
import logging
from typing import Optional

import pytest

from kubernetes.client import V1PodTemplateSpec
from kubernetes.client import V1ObjectMeta
from kubernetes.client import V1PodSpec
from kubernetes.client import V1Container
from kubernetes.client import V1ResourceRequirements

from kubeflow.training import TrainingClient
from kubeflow.training import KubeflowOrgV1ReplicaSpec
from kubeflow.training import KubeflowOrgV1XGBoostJob
from kubeflow.training import KubeflowOrgV1XGBoostJobSpec
from kubeflow.training import KubeflowOrgV1RunPolicy
from kubeflow.training import KubeflowOrgV1SchedulingPolicy
from kubeflow.training.constants import constants

from test.e2e.utils import (
    verify_job_e2e,
    verify_unschedulable_job_e2e,
    get_pod_spec_scheduler_name,
)
from test.e2e.constants import TEST_GANG_SCHEDULER_NAME_ENV_KEY
from test.e2e.constants import GANG_SCHEDULERS, NONE_GANG_SCHEDULERS

logging.basicConfig(format="%(message)s")
logging.getLogger().setLevel(logging.INFO)

TRAINING_CLIENT = TrainingClient()
JOB_NAME = "xgboostjob-iris-ci-test"
CONTAINER_NAME = "xgboost"
# Name of the gang scheduler under test; None when the environment variable
# is not set.
GANG_SCHEDULER_NAME = os.getenv(TEST_GANG_SCHEDULER_NAME_ENV_KEY)


def _generate_replica_spec(
    container: V1Container,
    scheduler_name: Optional[str] = None,
) -> KubeflowOrgV1ReplicaSpec:
    """Build the single-replica spec shared by the Master and Worker roles.

    Args:
        container: Container placed in the replica's pod template.
        scheduler_name: Value for the pod spec's ``scheduler_name``; ``None``
            leaves the field unset.

    Returns:
        A replica spec with one replica, an ``OnFailure`` restart policy and
        the Istio sidecar-injection annotation set to ``"false"``.
    """
    return KubeflowOrgV1ReplicaSpec(
        replicas=1,
        restart_policy="OnFailure",
        template=V1PodTemplateSpec(
            metadata=V1ObjectMeta(
                annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}
            ),
            spec=V1PodSpec(
                containers=[container],
                scheduler_name=scheduler_name,
            ),
        ),
    )


@pytest.mark.skipif(
    GANG_SCHEDULER_NAME in NONE_GANG_SCHEDULERS,
    reason="For gang-scheduling",
)
def test_sdk_e2e_with_gang_scheduling(job_namespace):
    """Check an XGBoostJob under a gang scheduler, before and after a patch.

    The job is first created with ``min_available=10`` (the job itself only
    defines two replicas) and checked with ``verify_unschedulable_job_e2e``.
    It is then patched to ``min_available=2`` and checked with
    ``verify_job_e2e``.

    Args:
        job_namespace: Namespace the job is created in (supplied by the test
            harness; presumably a pytest fixture defined outside this file).
    """
    container = generate_container()

    master = _generate_replica_spec(
        container, get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME)
    )
    worker = _generate_replica_spec(
        container, get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME)
    )

    unschedulable_xgboostjob = generate_xgboostjob(
        master,
        worker,
        KubeflowOrgV1SchedulingPolicy(min_available=10),
        job_namespace,
    )
    schedulable_xgboostjob = generate_xgboostjob(
        master,
        worker,
        KubeflowOrgV1SchedulingPolicy(min_available=2),
        job_namespace,
    )

    TRAINING_CLIENT.create_xgboostjob(unschedulable_xgboostjob, job_namespace)
    try:
        logging.info(f"List of created {constants.XGBOOSTJOB_KIND}s")
        logging.info(TRAINING_CLIENT.list_xgboostjobs(job_namespace))

        verify_unschedulable_job_e2e(
            TRAINING_CLIENT,
            JOB_NAME,
            job_namespace,
            constants.XGBOOSTJOB_KIND,
        )

        TRAINING_CLIENT.patch_xgboostjob(
            schedulable_xgboostjob, JOB_NAME, job_namespace
        )
        logging.info(f"List of patched {constants.XGBOOSTJOB_KIND}s")
        logging.info(TRAINING_CLIENT.list_xgboostjobs(job_namespace))

        verify_job_e2e(
            TRAINING_CLIENT,
            JOB_NAME,
            job_namespace,
            constants.XGBOOSTJOB_KIND,
            CONTAINER_NAME,
        )
    finally:
        # JOB_NAME is fixed, so a job left behind by a failed verification
        # would clash with the next test that creates it. Always clean up.
        TRAINING_CLIENT.delete_xgboostjob(JOB_NAME, job_namespace)


@pytest.mark.skipif(
    GANG_SCHEDULER_NAME in GANG_SCHEDULERS,
    reason="For plain scheduling",
)
def test_sdk_e2e(job_namespace):
    """Create an XGBoostJob without a scheduling policy and verify it.

    Args:
        job_namespace: Namespace the job is created in (supplied by the test
            harness; presumably a pytest fixture defined outside this file).
    """
    container = generate_container()

    master = _generate_replica_spec(container)
    worker = _generate_replica_spec(container)

    xgboostjob = generate_xgboostjob(master, worker, job_namespace=job_namespace)

    TRAINING_CLIENT.create_xgboostjob(xgboostjob, job_namespace)
    try:
        logging.info(f"List of created {constants.XGBOOSTJOB_KIND}s")
        logging.info(TRAINING_CLIENT.list_xgboostjobs(job_namespace))

        verify_job_e2e(
            TRAINING_CLIENT,
            JOB_NAME,
            job_namespace,
            constants.XGBOOSTJOB_KIND,
            CONTAINER_NAME,
        )
    finally:
        # Remove the fixed-name job even when verification fails so it does
        # not linger in the namespace for later tests.
        TRAINING_CLIENT.delete_xgboostjob(JOB_NAME, job_namespace)


def generate_xgboostjob(
    master: KubeflowOrgV1ReplicaSpec,
    worker: KubeflowOrgV1ReplicaSpec,
    scheduling_policy: Optional[KubeflowOrgV1SchedulingPolicy] = None,
    job_namespace: str = "default",
) -> KubeflowOrgV1XGBoostJob:
    """Assemble the XGBoostJob object used by the tests in this module.

    Args:
        master: Replica spec stored under the ``"Master"`` key.
        worker: Replica spec stored under the ``"Worker"`` key.
        scheduling_policy: Scheduling policy placed in the run policy; ``None``
            leaves it unset.
        job_namespace: Namespace written into the job metadata.

    Returns:
        An XGBoostJob named ``JOB_NAME`` whose run policy uses the ``"None"``
        clean-pod policy.
    """
    return KubeflowOrgV1XGBoostJob(
        api_version="kubeflow.org/v1",
        kind="XGBoostJob",
        metadata=V1ObjectMeta(name=JOB_NAME, namespace=job_namespace),
        spec=KubeflowOrgV1XGBoostJobSpec(
            run_policy=KubeflowOrgV1RunPolicy(
                clean_pod_policy="None",
                scheduling_policy=scheduling_policy,
            ),
            xgb_replica_specs={"Master": master, "Worker": worker},
        ),
    )


def generate_container() -> V1Container:
    """Return the container definition used by every replica in these tests.

    Returns:
        A container named ``CONTAINER_NAME`` that runs the
        ``xgboost-dist-iris`` image with training arguments and
        memory/CPU limits.
    """
    return V1Container(
        name=CONTAINER_NAME,
        image="docker.io/merlintang/xgboost-dist-iris:1.1",
        args=[
            "--job_type=Train",
            "--xgboost_parameter=objective:multi:softprob,num_class:3",
            "--n_estimators=10",
            "--learning_rate=0.1",
            "--model_path=/tmp/xgboost-model",
            "--model_storage_type=local",
        ],
        resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
    )