# Source: github.com/kubeflow/training-operator@v1.7.0/examples/mpi/tensorflow-mnist.yaml
#
# MPIJob that runs /examples/tensorflow2_mnist.py from the Horovod CPU image
# under mpirun, using one Launcher pod and two Worker pods.
---
apiVersion: kubeflow.org/v1
kind: MPIJob
metadata:
  name: tensorflow-mnist
spec:
  # Slots per worker pod. With 2 Worker replicas below this yields the
  # 2 processes requested by `-np "2"`; keep the three values in sync.
  slotsPerWorker: 1
  runPolicy:
    # NOTE(review): presumably only pods still running when the job finishes
    # are cleaned up -- confirm against the training-operator RunPolicy docs.
    cleanPodPolicy: Running
  mpiReplicaSpecs:
    # The Launcher pod is the one that invokes mpirun.
    Launcher:
      replicas: 1
      template:
        spec:
          containers:
          - image: horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.5.0-py3.7-cpu
            name: mpi
            command:
            - mpirun
            args:
            # Process count; quoted because container args must be strings.
            - -np
            - "2"
            # Permit mpirun to be started by the root user.
            - --allow-run-as-root
            # No CPU binding; place processes slot by slot.
            - -bind-to
            - none
            - -map-by
            - slot
            # Export these launcher environment variables to every process.
            - -x
            - LD_LIBRARY_PATH
            - -x
            - PATH
            # Use the ob1 point-to-point layer and exclude the openib
            # transport (a leading ^ means "all except").
            - -mca
            - pml
            - ob1
            - -mca
            - btl
            - ^openib
            # Program that mpirun starts in each process.
            - python
            - /examples/tensorflow2_mnist.py
            resources:
              limits:
                cpu: 1
                memory: 2Gi
    # Worker pods use the same image as the Launcher and set no command here.
    Worker:
      replicas: 2
      template:
        spec:
          containers:
          - image: horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.5.0-py3.7-cpu
            name: mpi
            resources:
              limits:
                cpu: 2
                memory: 4Gi