# Source: github.com/kubeflow/training-operator@v1.7.0/examples/pytorch/mnist/v1/pytorch_job_mnist_nccl.yaml
#
# PyTorchJob that runs the distributed MNIST example with the NCCL backend:
# one Master replica and one Worker replica, each limited to a single GPU.
---
apiVersion: "kubeflow.org/v1"
kind: "PyTorchJob"
metadata:
  name: "pytorch-dist-mnist-nccl"
spec:
  pytorchReplicaSpecs:
    Master:
      replicas: 1
      restartPolicy: OnFailure
      template:
        metadata:
          annotations:
            # Must stay a quoted string: annotation values are strings, and
            # an unquoted false would be parsed as a YAML boolean.
            sidecar.istio.io/inject: "false"
        spec:
          containers:
            - name: pytorch
              # Placeholder image: replace <your_project> with the registry
              # project that hosts the pytorch_dist_mnist image.
              image: gcr.io/<your_project>/pytorch_dist_mnist:latest
              # Passed to the container as command-line arguments.
              args: ["--backend", "nccl"]
              resources:
                limits:
                  nvidia.com/gpu: 1
    Worker:
      replicas: 1
      restartPolicy: OnFailure
      template:
        metadata:
          annotations:
            # Must stay a quoted string (see the Master template).
            sidecar.istio.io/inject: "false"
        spec:
          containers:
            - name: pytorch
              # Same placeholder image and arguments as the Master replica.
              image: gcr.io/<your_project>/pytorch_dist_mnist:latest
              args: ["--backend", "nccl"]
              resources:
                limits:
                  nvidia.com/gpu: 1