---
# Source: github.com/kubeflow/training-operator@v1.7.0/examples/pytorch/elastic/imagenet/imagenet.yaml
#
# Elastic PyTorchJob example: launches /workspace/examples/imagenet.py through
# `python -m torch.distributed.run`, training resnet18 for one epoch on the
# dataset directory /workspace/data/tiny-imagenet-200.
apiVersion: "kubeflow.org/v1"
kind: PyTorchJob
metadata:
  name: elastic-example-imagenet
spec:
  # Elastic settings: c10d rendezvous backend, between 1 and 3 replicas,
  # and up to 100 restarts.
  elasticPolicy:
    rdzvBackend: c10d
    minReplicas: 1
    maxReplicas: 3
    maxRestarts: 100
    # Resource metric targeting 80% average CPU utilization.
    # NOTE(review): presumably consumed as a HorizontalPodAutoscaler metric
    # spec by the operator, measured against the container's CPU request
    # below -- confirm against the training-operator API reference.
    metrics:
      - type: Resource
        resource:
          name: cpu
          target:
            type: Utilization
            averageUtilization: 80
  pytorchReplicaSpecs:
    Worker:
      # Initial worker count; lies inside the [minReplicas, maxReplicas] range.
      replicas: 2
      restartPolicy: OnFailure
      template:
        spec:
          containers:
            - name: pytorch
              # NOTE(review): a mutable `latest` tag combined with
              # IfNotPresent can reuse a stale image already cached on a
              # node -- consider pinning a version tag or digest.
              image: kubeflow/pytorch-elastic-example-imagenet:latest
              imagePullPolicy: IfNotPresent
              resources:
                requests:
                  cpu: 4
              env:
                - name: LOGLEVEL
                  value: "DEBUG"
              # No rendezvous/nnodes flags are passed to torch.distributed.run
              # here. NOTE(review): presumably the operator injects them via
              # environment variables -- confirm.
              command:
                - python
                - "-m"
                - torch.distributed.run
                - /workspace/examples/imagenet.py
                - "--arch=resnet18"
                - "--epochs=1"
                - "--batch-size=32"
                - "--workers=0"
                - "/workspace/data/tiny-imagenet-200"