# Source: github.com/kubeflow/training-operator@v1.7.0/examples/pytorch/elastic/imagenet/imagenet.yaml

     1  apiVersion: "kubeflow.org/v1"
     2  kind: PyTorchJob
     3  metadata:
     4    name: elastic-example-imagenet
     5  spec:
     6    elasticPolicy:
     7      rdzvBackend: c10d
     8      minReplicas: 1
     9      maxReplicas: 3
    10      maxRestarts: 100
    11      metrics:
    12        - type: Resource
    13          resource:
    14            name: cpu
    15            target:
    16              type: Utilization
    17              averageUtilization: 80
    18    pytorchReplicaSpecs:
    19      Worker:
    20        replicas: 2
    21        restartPolicy: OnFailure
    22        template:
    23          spec:
    24            containers:
    25              - name: pytorch
    26                image: kubeflow/pytorch-elastic-example-imagenet:latest
    27                imagePullPolicy: IfNotPresent
    28                resources:
    29                  requests:
    30                    cpu: 4
    31                env:
    32                - name: LOGLEVEL
    33                  value: DEBUG
    34                command:
    35                  - python
    36                  - -m
    37                  - torch.distributed.run
    38                  - /workspace/examples/imagenet.py
    39                  - "--arch=resnet18"
    40                  - "--epochs=1"
    41                  - "--batch-size=32"
    42                  - "--workers=0"
    43                  - "/workspace/data/tiny-imagenet-200"