# Source: github.com/kubeflow/training-operator@v1.7.0/examples/pytorch/mnist/v1/pytorch_job_mnist_mpi.yaml
#
# PyTorchJob that runs the distributed MNIST example with the MPI backend.
# It declares one Master replica and one Worker replica; both run the same
# container image with the same arguments and each requests one NVIDIA GPU.
---
apiVersion: "kubeflow.org/v1"
kind: "PyTorchJob"
metadata:
  name: "pytorch-dist-mnist-mpi"
spec:
  pytorchReplicaSpecs:
    Master:
      replicas: 1
      restartPolicy: OnFailure
      template:
        metadata:
          annotations:
            # Annotation values must be strings, so "false" stays quoted.
            sidecar.istio.io/inject: "false"
        spec:
          containers:
            - name: pytorch
              # <your_project> is a placeholder; substitute the registry
              # project that hosts your build of the image.
              image: gcr.io/<your_project>/pytorch_dist_mnist:mpi
              args: ["--backend", "mpi"]
              # To run on CPU only, comment out the resources section below.
              resources:
                limits:
                  nvidia.com/gpu: 1
    Worker:
      replicas: 1
      restartPolicy: OnFailure
      template:
        metadata:
          annotations:
            # Annotation values must be strings, so "false" stays quoted.
            sidecar.istio.io/inject: "false"
        spec:
          containers:
            - name: pytorch
              # <your_project> is a placeholder; substitute the registry
              # project that hosts your build of the image.
              image: gcr.io/<your_project>/pytorch_dist_mnist:mpi
              args: ["--backend", "mpi"]
              # To run on CPU only, comment out the resources section below.
              resources:
                limits:
                  nvidia.com/gpu: 1