# Origin: github.com/kubeflow/training-operator@v1.7.0/sdk/python/test/e2e/test_e2e_mxjob.py

# Copyright 2021 kubeflow.org.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""End-to-end SDK tests that create, verify and delete an MXJob.

Two mutually exclusive scenarios are selected through the environment
variable named by ``TEST_GANG_SCHEDULER_NAME_ENV_KEY``: one that exercises
a gang scheduler and one that uses plain scheduling.
"""

import os
import logging
import pytest
from typing import Optional, Tuple

from kubernetes.client import V1PodTemplateSpec
from kubernetes.client import V1ObjectMeta
from kubernetes.client import V1PodSpec
from kubernetes.client import V1Container
from kubernetes.client import V1ContainerPort
from kubernetes.client import V1ResourceRequirements

from kubeflow.training import TrainingClient
from kubeflow.training import KubeflowOrgV1ReplicaSpec
from kubeflow.training import KubeflowOrgV1MXJob
from kubeflow.training import KubeflowOrgV1MXJobSpec
from kubeflow.training import KubeflowOrgV1RunPolicy
from kubeflow.training import KubeflowOrgV1SchedulingPolicy
from kubeflow.training.constants import constants

from test.e2e.utils import verify_job_e2e, verify_unschedulable_job_e2e, get_pod_spec_scheduler_name
from test.e2e.constants import TEST_GANG_SCHEDULER_NAME_ENV_KEY
from test.e2e.constants import GANG_SCHEDULERS, NONE_GANG_SCHEDULERS

logging.basicConfig(format="%(message)s")
logging.getLogger().setLevel(logging.INFO)

TRAINING_CLIENT = TrainingClient()
JOB_NAME = "mxjob-mnist-ci-test"
CONTAINER_NAME = "mxnet"
GANG_SCHEDULER_NAME = os.getenv(TEST_GANG_SCHEDULER_NAME_ENV_KEY)

# TODO (tenzen-y): Replace the below image with the kubeflow hosted image
_MXNET_IMAGE = "docker.io/johnugeorge/mxnet:1.9.1_cpu_py3"


@pytest.mark.skipif(
    GANG_SCHEDULER_NAME in NONE_GANG_SCHEDULERS, reason="For gang-scheduling",
)
def test_sdk_e2e_with_gang_scheduling(job_namespace):
    """Run the MXJob through an unschedulable-then-schedulable cycle.

    The job is first created with ``min_available=10`` and checked with
    ``verify_unschedulable_job_e2e``; it is then patched to
    ``min_available=3`` (the job defines three single-replica roles) and
    checked with ``verify_job_e2e``.

    Args:
        job_namespace: Namespace the job is created in; supplied by a
            pytest fixture defined outside this file.
    """
    worker_container, server_container, scheduler_container = generate_containers()
    scheduler_name = get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME)

    worker = _replica_spec(worker_container, scheduler_name=scheduler_name)
    server = _replica_spec(server_container, scheduler_name=scheduler_name)
    scheduler = _replica_spec(scheduler_container, scheduler_name=scheduler_name)

    unschedulable_mxjob = generate_mxjob(
        scheduler, server, worker, KubeflowOrgV1SchedulingPolicy(min_available=10), job_namespace
    )
    schedulable_mxjob = generate_mxjob(
        scheduler, server, worker, KubeflowOrgV1SchedulingPolicy(min_available=3), job_namespace
    )

    TRAINING_CLIENT.create_mxjob(unschedulable_mxjob, job_namespace)
    # Everything after a successful create runs under try/finally so that a
    # failed verification does not leave the job behind under the fixed
    # JOB_NAME, where it could collide with a later create in this namespace.
    try:
        logging.info(f"List of created {constants.MXJOB_KIND}s")
        logging.info(TRAINING_CLIENT.list_mxjobs(job_namespace))

        verify_unschedulable_job_e2e(
            TRAINING_CLIENT,
            JOB_NAME,
            job_namespace,
            constants.MXJOB_KIND,
        )

        TRAINING_CLIENT.patch_mxjob(schedulable_mxjob, JOB_NAME, job_namespace)
        logging.info(f"List of patched {constants.MXJOB_KIND}s")
        logging.info(TRAINING_CLIENT.list_mxjobs(job_namespace))

        verify_job_e2e(
            TRAINING_CLIENT,
            JOB_NAME,
            job_namespace,
            constants.MXJOB_KIND,
            CONTAINER_NAME,
        )
    finally:
        TRAINING_CLIENT.delete_mxjob(JOB_NAME, job_namespace)


@pytest.mark.skipif(
    GANG_SCHEDULER_NAME in GANG_SCHEDULERS, reason="For plain scheduling",
)
def test_sdk_e2e(job_namespace):
    """Create the MXJob without a scheduling policy and verify it.

    Args:
        job_namespace: Namespace the job is created in; supplied by a
            pytest fixture defined outside this file.
    """
    worker_container, server_container, scheduler_container = generate_containers()

    worker = _replica_spec(worker_container)
    server = _replica_spec(server_container)
    scheduler = _replica_spec(scheduler_container)

    mxjob = generate_mxjob(scheduler, server, worker, job_namespace=job_namespace)

    TRAINING_CLIENT.create_mxjob(mxjob, job_namespace)
    # Always delete the job once it has been created, even when the
    # verification below raises, so the fixed JOB_NAME is free again.
    try:
        logging.info(f"List of created {constants.MXJOB_KIND}s")
        logging.info(TRAINING_CLIENT.list_mxjobs(job_namespace))

        verify_job_e2e(
            TRAINING_CLIENT,
            JOB_NAME,
            job_namespace,
            constants.MXJOB_KIND,
            CONTAINER_NAME,
        )
    finally:
        TRAINING_CLIENT.delete_mxjob(JOB_NAME, job_namespace)


def _replica_spec(container: V1Container, **pod_spec_kwargs) -> KubeflowOrgV1ReplicaSpec:
    """Wrap one container in the single-replica spec shared by every role.

    Args:
        container: The only container placed in the pod.
        **pod_spec_kwargs: Extra keyword arguments forwarded unchanged to
            ``V1PodSpec`` (the gang-scheduling test passes
            ``scheduler_name``).

    Returns:
        A replica spec with ``replicas=1``, ``restart_policy="Never"`` and
        the Istio sidecar-injection annotation set to ``"false"``.
    """
    return KubeflowOrgV1ReplicaSpec(
        replicas=1,
        restart_policy="Never",
        template=V1PodTemplateSpec(
            metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}),
            spec=V1PodSpec(containers=[container], **pod_spec_kwargs),
        ),
    )


def generate_mxjob(
    scheduler: KubeflowOrgV1ReplicaSpec,
    server: KubeflowOrgV1ReplicaSpec,
    worker: KubeflowOrgV1ReplicaSpec,
    scheduling_policy: Optional[KubeflowOrgV1SchedulingPolicy] = None,
    job_namespace: str = "default",
) -> KubeflowOrgV1MXJob:
    """Assemble the MXJob object named ``JOB_NAME``.

    Args:
        scheduler: Replica spec stored under the ``"Scheduler"`` key.
        server: Replica spec stored under the ``"Server"`` key.
        worker: Replica spec stored under the ``"Worker"`` key.
        scheduling_policy: Optional policy placed in the run policy; passed
            through as ``None`` when omitted.
        job_namespace: Namespace written into the job metadata.

    Returns:
        The MXJob in ``"MXTrain"`` mode with ``clean_pod_policy="None"``.
    """
    return KubeflowOrgV1MXJob(
        api_version="kubeflow.org/v1",
        kind="MXJob",
        metadata=V1ObjectMeta(name=JOB_NAME, namespace=job_namespace),
        spec=KubeflowOrgV1MXJobSpec(
            job_mode="MXTrain",
            run_policy=KubeflowOrgV1RunPolicy(
                clean_pod_policy="None",
                scheduling_policy=scheduling_policy,
            ),
            mx_replica_specs={
                "Scheduler": scheduler,
                "Server": server,
                "Worker": worker,
            },
        ),
    )


def _mxnet_container(**container_kwargs) -> V1Container:
    """Build a container with the settings common to all three roles.

    Args:
        **container_kwargs: Extra keyword arguments forwarded unchanged to
            ``V1Container`` (the worker passes ``command`` and ``args``).

    Returns:
        A container named ``CONTAINER_NAME`` running the MXNet image, with
        port 9991 exposed as ``"mxjob-port"`` and memory/CPU limits set.
        Port and resource objects are created fresh on every call.
    """
    return V1Container(
        name=CONTAINER_NAME,
        image=_MXNET_IMAGE,
        ports=[V1ContainerPort(container_port=9991, name="mxjob-port")],
        resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.25"}),
        **container_kwargs,
    )


def generate_containers() -> Tuple[V1Container, V1Container, V1Container]:
    """Build the worker, server and scheduler containers.

    Only the worker is given an explicit command: it runs the MNIST
    training example for one epoch over 1000 examples with the
    ``dist_sync`` kv-store. The server and scheduler containers set no
    command or args.

    Returns:
        ``(worker_container, server_container, scheduler_container)``.
    """
    worker_container = _mxnet_container(
        command=["/usr/local/bin/python3"],
        args=[
            "incubator-mxnet/example/image-classification/train_mnist.py",
            "--num-epochs",
            "1",
            "--num-examples",
            "1000",
            "--kv-store",
            "dist_sync",
        ],
    )
    server_container = _mxnet_container()
    scheduler_container = _mxnet_container()

    return worker_container, server_container, scheduler_container