# Source: github.com/kubeflow/training-operator@v1.7.0/examples/mpi/tensorflow-mnist-elastic.yaml
#
# MPIJob example: a single launcher pod invokes `horovodrun`, and two worker
# pods are declared alongside it. Launcher and workers use the same Horovod
# CPU image.
---
apiVersion: kubeflow.org/v1
kind: MPIJob
metadata:
  name: tensorflow-mnist-elastic
spec:
  slotsPerWorker: 1
  # NOTE(review): presumably only still-running pods are cleaned up once the
  # job ends; confirm against the MPIJob API reference.
  cleanPodPolicy: Running
  mpiReplicaSpecs:
    Launcher:
      replicas: 1
      template:
        spec:
          containers:
            - image: horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.5.0-py3.7-cpu
              name: mpi-launcher
              command:
                - horovodrun
              # Numeric arguments are quoted so YAML keeps them as strings.
              # NOTE(review): -np (2) equals Worker replicas x slotsPerWorker
              # (2 x 1) below; presumably these must be kept in sync -
              # confirm. --min-np / --max-np look like the elastic lower and
              # upper process bounds - verify against the horovodrun docs.
              args:
                - -np
                - "2"
                - --min-np
                - "1"
                - --max-np
                - "3"
                # NOTE(review): this script is not defined in this manifest;
                # presumably it is mounted into the launcher by the operator.
                - --host-discovery-script
                - /etc/mpi/discover_hosts.sh
                # NOTE(review): the training script path is assumed to exist
                # inside the image - confirm.
                - python
                - /examples/elastic/tensorflow2_mnist_elastic.py
              resources:
                limits:
                  cpu: 1
                  memory: 2Gi
    Worker:
      replicas: 2
      template:
        spec:
          containers:
            - image: horovod/horovod:0.20.0-tf2.3.0-torch1.6.0-mxnet1.5.0-py3.7-cpu
              name: mpi-worker
              # Workers get twice the launcher's CPU and memory limits.
              resources:
                limits:
                  cpu: 2
                  memory: 4Gi