# Source: github.com/kubeflow/training-operator@v1.7.0/examples/pytorch/mnist/v1/pytorch_job_mnist_gloo.yaml

     1  apiVersion: "kubeflow.org/v1"
     2  kind: "PyTorchJob"
     3  metadata:
     4    name: "pytorch-dist-mnist-gloo"
     5  spec:
     6    pytorchReplicaSpecs:
     7      Master:
     8        replicas: 1
     9        restartPolicy: OnFailure
    10        template:
    11          metadata:
    12            annotations:
    13              sidecar.istio.io/inject: "false"
    14          spec:
    15            containers:
    16              - name: pytorch
    17                image: gcr.io/<your_project>/pytorch_dist_mnist:latest
    18                args: ["--backend", "gloo"]
    19                # Comment out the below resources to use the CPU.
    20                resources: 
    21                  limits:
    22                    nvidia.com/gpu: 1
    23      Worker:
    24        replicas: 1
    25        restartPolicy: OnFailure
    26        template:
    27          metadata:
    28            annotations:
    29              sidecar.istio.io/inject: "false"
    30          spec:
    31            containers: 
    32              - name: pytorch
    33                image: gcr.io/<your_project>/pytorch_dist_mnist:latest
    34                args: ["--backend", "gloo"]
    35                # Comment out the below resources to use the CPU.
    36                resources: 
    37                  limits:
    38                    nvidia.com/gpu: 1