github.com/kubeflow/training-operator@v1.7.0/examples/mpi/tensorflow-mnist-elastic.yaml (about)

     1  apiVersion: kubeflow.org/v1  # API group/version of the custom resource declared below
     2  kind: MPIJob  # job made of a Launcher and a Worker replica group (see mpiReplicaSpecs)
     3  metadata:
     4    name: tensorflow-mnist-elastic
     5  spec:
     6    slotsPerWorker: 1  # NOTE(review): presumably the slot count advertised per worker pod — confirm against the MPIJob spec
     7    cleanPodPolicy: Running  # NOTE(review): presumably cleans up pods still running when the job ends — confirm
     8    mpiReplicaSpecs:  # one entry per replica role: Launcher and Worker
     9      Launcher:
    10        replicas: 1  # a single launcher
    11        template:
    12          spec:
    13            containers:
    14            - image: horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.5.0-py3.7-cpu  # same image the Worker uses
    15              name: mpi-launcher
    16              command:  # container entrypoint; the args list below is appended to it
    17              - horovodrun
    18              args:
    19              - -np  # value 2 equals Worker replicas (2) x slotsPerWorker (1)
    20              - "2"  # quoted so YAML keeps it a string rather than an integer
    21              - --min-np  # lower bound handed to horovodrun
    22              - "1"
    23              - --max-np  # upper bound handed to horovodrun; 3 exceeds the 2 workers declared in this file
    24              - "3"
    25              - --host-discovery-script
    26              - /etc/mpi/discover_hosts.sh  # not defined in this manifest; presumably supplied by the operator — TODO confirm
    27              - python  # remaining args: the python command line passed to horovodrun
    28              - /examples/elastic/tensorflow2_mnist_elastic.py
    29              resources:
    30                limits:  # only limits are set; no requests are given
    31                  cpu: 1
    32                  memory: 2Gi
    33      Worker:
    34        replicas: 2  # two workers
    35        template:
    36          spec:
    37            containers:  # the worker container sets no command or args
    38            - image: horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.5.0-py3.7-cpu
    39              name: mpi-worker
    40              resources:
    41                limits:  # double the launcher's limits (2 CPU / 4Gi vs 1 CPU / 2Gi); no requests are given
    42                  cpu: 2
    43                  memory: 4Gi