# Source: github.com/kubeflow/training-operator@v1.7.0/examples/mxnet/train/mx_job_dist_gpu_v1.yaml
#
# MXJob in MXTrain mode with one Scheduler, one Server and one Worker replica.
# All three replica types use the same image; only the Worker sets an explicit
# command/args and a GPU limit.
---
apiVersion: kubeflow.org/v1
kind: MXJob
metadata:
  name: mxnet-job
spec:
  jobMode: MXTrain
  mxReplicaSpecs:
    # Scheduler and Server set no command/args, so the image defaults apply.
    Scheduler:
      replicas: 1
      restartPolicy: Never
      template:
        spec:
          containers:
            - name: mxnet
              image: mxjob/mxnet:gpu
    Server:
      replicas: 1
      restartPolicy: Never
      template:
        spec:
          containers:
            - name: mxnet
              image: mxjob/mxnet:gpu
    Worker:
      replicas: 1
      restartPolicy: Never
      template:
        spec:
          containers:
            - name: mxnet
              image: mxjob/mxnet:gpu
              command:
                - "python"
              # Container args must be strings, so numeric flag values stay
              # quoted ("10", "2", "0") to avoid implicit integer typing.
              args:
                - "/incubator-mxnet/example/image-classification/train_mnist.py"
                - "--num-epochs"
                - "10"
                - "--num-layers"
                - "2"
                - "--kv-store"
                - "dist_device_sync"
                # NOTE(review): "0" is presumably the GPU device index used by
                # the training script — confirm against train_mnist.py.
                - "--gpus"
                - "0"
              resources:
                limits:
                  # One NVIDIA GPU for the worker container.
                  nvidia.com/gpu: 1