---
# Origin: github.com/kubeflow/training-operator@v1.7.0/examples/mxnet/mxjob_dist_v1.yaml
#
# MXJob "mxnet-job" in MXTrain mode with three replica groups -- Scheduler,
# Server and Worker -- each running a single replica that is never restarted.
# Every container is named "mxnet", uses the mxjob/mxnet:gpu image and
# declares port 9991 under the name "mxjob-port".
apiVersion: kubeflow.org/v1
kind: MXJob
metadata:
  name: mxnet-job
spec:
  jobMode: MXTrain
  mxReplicaSpecs:

    # Scheduler replica: no command override, so the image's own entrypoint
    # is what runs.
    Scheduler:
      replicas: 1
      restartPolicy: Never
      template:
        spec:
          containers:
            - name: mxnet
              image: mxjob/mxnet:gpu
              ports:
                - name: mxjob-port
                  containerPort: 9991

    # Server replica: same container definition as the scheduler.
    Server:
      replicas: 1
      restartPolicy: Never
      template:
        spec:
          containers:
            - name: mxnet
              image: mxjob/mxnet:gpu
              ports:
                - name: mxjob-port
                  containerPort: 9991

    # Worker replica: runs the MNIST training script explicitly and is the
    # only group that sets a GPU limit.
    Worker:
      replicas: 1
      restartPolicy: Never
      template:
        spec:
          containers:
            - name: mxnet
              image: mxjob/mxnet:gpu
              command:
                - python
              # Numeric-looking values stay quoted so they are passed as
              # strings, exactly as in the original flow-style list.
              args:
                - '/incubator-mxnet/example/image-classification/train_mnist.py'
                - '--num-epochs'
                - '10'
                - '--num-layers'
                - '2'
                - '--kv-store'
                - 'dist_device_sync'
                - '--gpus'
                - '0'
              resources:
                limits:
                  nvidia.com/gpu: 1
              ports:
                - name: mxjob-port
                  containerPort: 9991