---
# BytePS distributed-training example: MXNet ImageNet benchmark run via the
# Kubeflow training-operator MXJob CRD.
# Topology: 1 scheduler + 2 parameter servers + 2 workers (8 GPUs each).
apiVersion: "kubeflow.org/v1"
kind: "MXJob"
metadata:
  name: "byteps-mxnet-job"
spec:
  jobMode: MXTrain
  runPolicy:
    # Delete only pods still Running when the job completes; finished pods
    # are kept so their logs remain inspectable.
    cleanPodPolicy: Running
  mxReplicaSpecs:
    Scheduler:
      replicas: 1
      restartPolicy: Never
      template:
        spec:
          containers:
            - name: mxnet
              image: bytepsimage/mxnet
              # NOTE(review): bpslaunch takes no args for the scheduler/server
              # roles — presumably it reads its role from DMLC_* env vars
              # injected by the operator; confirm against the operator docs.
              command: ["bpslaunch"]
    Server:
      replicas: 2
      restartPolicy: Never
      template:
        spec:
          containers:
            - name: mxnet
              image: bytepsimage/mxnet
              command: ["bpslaunch"]
    Worker:
      replicas: 2
      restartPolicy: Never
      template:
        spec:
          containers:
            - name: mxnet
              image: bytepsimage/mxnet
              command: ["bpslaunch"]
              # Training script shipped inside the bytepsimage/mxnet image;
              # --benchmark 1 uses synthetic data (no dataset mount needed).
              args:
                - "python3"
                - "/usr/local/byteps/example/mxnet/train_imagenet_byteps.py"
                - "--benchmark"
                - "1"
                - "--batch-size=32"
              volumeMounts:
                # Memory-backed /dev/shm — training needs more shared memory
                # than the container runtime's small default.
                - mountPath: /dev/shm
                  name: dshm
              resources:
                limits:
                  nvidia.com/gpu: 8
          volumes:
            - name: dshm
              emptyDir:
                medium: Memory