github.com/kubeflow/training-operator@v1.7.0/examples/pytorch/mnist/v1/pytorch_job_mnist_nccl.yaml (about)

     1  apiVersion: "kubeflow.org/v1"  # API group/version that serves the PyTorchJob kind
     2  kind: "PyTorchJob"  # distributed MNIST example: one Master and one Worker replica
     3  metadata:
     4    name: "pytorch-dist-mnist-nccl"
     5  spec:
     6    pytorchReplicaSpecs:  # one entry per replica role (Master, Worker)
     7      Master:
     8        replicas: 1
     9        restartPolicy: OnFailure
    10        template:
    11          metadata:
    12            annotations:
    13              sidecar.istio.io/inject: "false"  # keep quoted: annotation values must be strings
    14          spec:
    15            containers:
    16              - name: pytorch
    17                image: gcr.io/<your_project>/pytorch_dist_mnist:latest  # replace <your_project>
    18                args: ["--backend", "nccl"]  # presumably read by the training script in the image
    19                resources:
    20                  limits:
    21                    nvidia.com/gpu: 1  # GPU limit for this container
    22      Worker:  # same pod template as Master; keep the two in sync when editing
    23        replicas: 1
    24        restartPolicy: OnFailure
    25        template:
    26          metadata:
    27            annotations:
    28              sidecar.istio.io/inject: "false"  # keep quoted: annotation values must be strings
    29          spec:
    30            containers:
    31              - name: pytorch
    32                image: gcr.io/<your_project>/pytorch_dist_mnist:latest  # replace <your_project>
    33                args: ["--backend", "nccl"]  # must match the Master's backend argument
    34                resources:
    35                  limits:
    36                    nvidia.com/gpu: 1  # GPU limit for this container