---
# Source: github.com/kubeflow/training-operator@v1.7.0/examples/pytorch/elastic/echo/echo.yaml
#
# PyTorchJob "elastic-example-echo": declares an elastic policy and a single
# Worker replica group whose container launches ./echo.py through
# torch.distributed.run.
apiVersion: "kubeflow.org/v1"
kind: PyTorchJob
metadata:
  name: elastic-example-echo
spec:
  # Elastic settings for the job: c10d rendezvous backend, replica bounds of
  # 1..2, and a restart budget of 100.
  elasticPolicy:
    rdzvBackend: c10d
    minReplicas: 1
    maxReplicas: 2
    maxRestarts: 100
  pytorchReplicaSpecs:
    Worker:
      # Requested replica count; equal to elasticPolicy.maxReplicas above.
      replicas: 2
      template:
        spec:
          containers:
            - name: pytorch
              # NOTE(review): with IfNotPresent, a node that already has an
              # image cached under the "latest" tag will not pull it again;
              # pin a versioned tag if reproducible runs matter.
              image: kubeflow/pytorch-elastic-example-echo:latest
              imagePullPolicy: IfNotPresent
              env:
                - name: LOGLEVEL
                  value: DEBUG
              # Equivalent to:
              #   python -m torch.distributed.run --rdzv_backend=c10d ./echo.py
              # The --rdzv_backend flag repeats the backend that is also set
              # in elasticPolicy.rdzvBackend.
              command:
                - python
                - -m
                - torch.distributed.run
                - --rdzv_backend=c10d
                - ./echo.py