# Copyright 2021 kubeflow.org.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""End-to-end SDK tests for MPIJob.

Each test builds an MPIJob with one launcher and one worker replica, creates
it through ``TrainingClient``, checks it with the shared e2e helpers and
deletes it again. Which test runs depends on the gang scheduler named in the
environment variable ``TEST_GANG_SCHEDULER_NAME_ENV_KEY``.
"""

import os
import logging
import pytest
from typing import Optional
from typing import Tuple

from kubernetes.client import V1PodTemplateSpec
from kubernetes.client import V1ObjectMeta
from kubernetes.client import V1PodSpec
from kubernetes.client import V1Container
from kubernetes.client import V1ResourceRequirements

from kubeflow.training import TrainingClient
from kubeflow.training import KubeflowOrgV1ReplicaSpec
from kubeflow.training import KubeflowOrgV1MPIJob
from kubeflow.training import KubeflowOrgV1MPIJobSpec
from kubeflow.training import KubeflowOrgV1RunPolicy
from kubeflow.training import KubeflowOrgV1SchedulingPolicy
from kubeflow.training.constants import constants

from test.e2e.utils import verify_job_e2e, verify_unschedulable_job_e2e, get_pod_spec_scheduler_name
from test.e2e.constants import TEST_GANG_SCHEDULER_NAME_ENV_KEY
from test.e2e.constants import GANG_SCHEDULERS, NONE_GANG_SCHEDULERS

logging.basicConfig(format="%(message)s")
logging.getLogger().setLevel(logging.INFO)

TRAINING_CLIENT = TrainingClient()
JOB_NAME = "mpijob-mxnet-ci-test"
CONTAINER_NAME = "mpi"
GANG_SCHEDULER_NAME = os.getenv(TEST_GANG_SCHEDULER_NAME_ENV_KEY)


def _build_replica_spec(
    container: V1Container,
    scheduler_name: Optional[str] = None,
) -> KubeflowOrgV1ReplicaSpec:
    """Return a single-replica, never-restarting spec wrapping ``container``.

    Args:
        container: The only container of the replica's pod.
        scheduler_name: Value for the pod spec's ``scheduler_name``; ``None``
            leaves the field unset.

    Returns:
        A replica spec whose pod template disables Istio sidecar injection
        through the ``constants.ISTIO_SIDECAR_INJECTION`` annotation.
    """
    return KubeflowOrgV1ReplicaSpec(
        replicas=1,
        restart_policy="Never",
        template=V1PodTemplateSpec(
            # A fresh metadata object per spec so replicas never share state.
            metadata=V1ObjectMeta(annotations={constants.ISTIO_SIDECAR_INJECTION: "false"}),
            spec=V1PodSpec(
                containers=[container],
                scheduler_name=scheduler_name,
            ),
        ),
    )


@pytest.mark.skipif(
    GANG_SCHEDULER_NAME in NONE_GANG_SCHEDULERS,
    reason="For gang-scheduling",
)
def test_sdk_e2e_with_gang_scheduling(job_namespace):
    """Create an MPIJob under a gang scheduler, patch it, verify and delete it.

    The job is first created with ``min_available=10`` although it defines
    only two replicas, and is checked with ``verify_unschedulable_job_e2e``.
    It is then patched to ``min_available=2`` and checked with
    ``verify_job_e2e``.

    Args:
        job_namespace: Namespace in which the MPIJob is created
            (presumably supplied by a pytest fixture -- confirm in conftest).
    """
    launcher_container, worker_container = generate_containers()

    launcher = _build_replica_spec(
        launcher_container,
        get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME),
    )
    worker = _build_replica_spec(
        worker_container,
        get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME),
    )

    mpijob = generate_mpijob(launcher, worker, KubeflowOrgV1SchedulingPolicy(min_available=10), job_namespace)
    patched_mpijob = generate_mpijob(launcher, worker, KubeflowOrgV1SchedulingPolicy(min_available=2), job_namespace)

    TRAINING_CLIENT.create_mpijob(mpijob, job_namespace)
    # From here on the job exists in the cluster: always delete it, even when
    # a verification step raises, so a failed run does not leave a job with
    # this fixed name behind.
    try:
        logging.info(f"List of created {constants.MPIJOB_KIND}s")
        logging.info(TRAINING_CLIENT.list_mpijobs(job_namespace))

        verify_unschedulable_job_e2e(
            TRAINING_CLIENT,
            JOB_NAME,
            job_namespace,
            constants.MPIJOB_KIND,
        )

        TRAINING_CLIENT.patch_mpijob(patched_mpijob, JOB_NAME, job_namespace)
        logging.info(f"List of patched {constants.MPIJOB_KIND}s")
        logging.info(TRAINING_CLIENT.list_mpijobs(job_namespace))

        verify_job_e2e(
            TRAINING_CLIENT,
            JOB_NAME,
            job_namespace,
            constants.MPIJOB_KIND,
            CONTAINER_NAME,
        )
    finally:
        TRAINING_CLIENT.delete_mpijob(JOB_NAME, job_namespace)


@pytest.mark.skipif(
    GANG_SCHEDULER_NAME in GANG_SCHEDULERS,
    reason="For plain scheduling",
)
def test_sdk_e2e(job_namespace):
    """Create an MPIJob without a scheduling policy, verify it and delete it.

    Args:
        job_namespace: Namespace in which the MPIJob is created
            (presumably supplied by a pytest fixture -- confirm in conftest).
    """
    launcher_container, worker_container = generate_containers()

    launcher = _build_replica_spec(launcher_container)
    worker = _build_replica_spec(worker_container)

    mpijob = generate_mpijob(launcher, worker, job_namespace=job_namespace)

    TRAINING_CLIENT.create_mpijob(mpijob, job_namespace)
    # Always delete the created job, even when verification fails.
    try:
        logging.info(f"List of created {constants.MPIJOB_KIND}s")
        logging.info(TRAINING_CLIENT.list_mpijobs(job_namespace))

        verify_job_e2e(
            TRAINING_CLIENT,
            JOB_NAME,
            job_namespace,
            constants.MPIJOB_KIND,
            CONTAINER_NAME,
        )
    finally:
        TRAINING_CLIENT.delete_mpijob(JOB_NAME, job_namespace)


def generate_mpijob(
    launcher: KubeflowOrgV1ReplicaSpec,
    worker: KubeflowOrgV1ReplicaSpec,
    scheduling_policy: Optional[KubeflowOrgV1SchedulingPolicy] = None,
    job_namespace: str = "default",
) -> KubeflowOrgV1MPIJob:
    """Assemble the MPIJob object named ``JOB_NAME``.

    Args:
        launcher: Replica spec stored under the ``"Launcher"`` key.
        worker: Replica spec stored under the ``"Worker"`` key.
        scheduling_policy: Scheduling policy placed in the run policy;
            ``None`` leaves it unset.
        job_namespace: Namespace written into the job metadata.

    Returns:
        A ``kubeflow.org/v1`` MPIJob with one slot per worker and
        ``clean_pod_policy="None"``.
    """
    return KubeflowOrgV1MPIJob(
        api_version="kubeflow.org/v1",
        kind="MPIJob",
        metadata=V1ObjectMeta(name=JOB_NAME, namespace=job_namespace),
        spec=KubeflowOrgV1MPIJobSpec(
            slots_per_worker=1,
            run_policy=KubeflowOrgV1RunPolicy(
                clean_pod_policy="None",
                scheduling_policy=scheduling_policy,
            ),
            mpi_replica_specs={"Launcher": launcher, "Worker": worker},
        ),
    )


def generate_containers() -> Tuple[V1Container, V1Container]:
    """Build the launcher and worker containers used by the tests.

    Both containers are named ``CONTAINER_NAME``, use the same Horovod CPU
    image and the same resource limits. Only the launcher carries a command:
    an ``mpirun`` invocation of ``/examples/pytorch_mnist.py`` for one epoch.

    Returns:
        A ``(launcher_container, worker_container)`` tuple.
    """
    image = "horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.5.0-py3.7-cpu"

    launcher_container = V1Container(
        name=CONTAINER_NAME,
        image=image,
        command=["mpirun"],
        args=[
            "-np",
            "1",
            "--allow-run-as-root",
            "-bind-to",
            "none",
            "-map-by",
            "slot",
            "-x",
            "LD_LIBRARY_PATH",
            "-x",
            "PATH",
            "-mca",
            "pml",
            "ob1",
            "-mca",
            "btl",
            "^openib",
            # "python", "/examples/tensorflow2_mnist.py"]
            "python",
            "/examples/pytorch_mnist.py",
            "--epochs",
            "1",
        ],
        resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
    )

    worker_container = V1Container(
        # Same name as the launcher so both match the name given to verify_job_e2e.
        name=CONTAINER_NAME,
        image=image,
        resources=V1ResourceRequirements(limits={"memory": "1Gi", "cpu": "0.4"}),
    )

    return launcher_container, worker_container