# Source: github.com/kubeflow/training-operator@v1.7.0/examples/pytorch/elastic/echo/echo.yaml

     1  apiVersion: "kubeflow.org/v1"
     2  kind: PyTorchJob
     3  metadata:
     4    name: elastic-example-echo
     5  spec:
     6    elasticPolicy:
     7      rdzvBackend: c10d
     8      minReplicas: 1
     9      maxReplicas: 2
    10      maxRestarts: 100
    11    pytorchReplicaSpecs:
    12      Worker:
    13        replicas: 2
    14        template:
    15          spec:
    16            containers:
    17              - name: pytorch
    18                image: kubeflow/pytorch-elastic-example-echo:latest
    19                imagePullPolicy: IfNotPresent
    20                env:
    21                - name: LOGLEVEL
    22                  value: DEBUG
    23                command:
    24                  - python
    25                  - -m
    26                  - torch.distributed.run
    27                  - --rdzv_backend=c10d
    28                  - ./echo.py
    29