# Source: volcano.sh/volcano@v1.9.0/example/kubecon-2019-china/horovod-sample/lm-horovod-tf-mnist-v0.5.yaml
#
# Volcano Job that runs a Horovod TensorFlow MNIST training script
# (tensorflow_mnist_lm.py) via mpiexec: one "master" pod launches the run
# across three "worker" pods that each run an SSH daemon.
---
apiVersion: batch.volcano.sh/v1alpha1
kind: Job
metadata:
  name: lm-horovod-job
  labels:
    volcano.sh/job-type: Horovod
spec:
  # Equals the total replica count of the tasks below (1 master + 3 workers).
  minAvailable: 4
  schedulerName: volcano
  # NOTE(review): the master script reads /etc/volcano/worker.host and reaches
  # the workers over SSH; these plugins are presumably what provide the host
  # file and the SSH setup -- confirm against the Volcano job-plugin docs.
  plugins:
    ssh: []
    svc: []
  # If any pod is evicted (killed), restart the whole job.
  policies:
    - event: PodEvicted
      action: RestartJob
  tasks:
    - replicas: 1
      name: master
      # The job is marked complete once this task completes.
      policies:
        - event: TaskCompleted
          action: CompleteJob
      template:
        spec:
          containers:
            # Script steps:
            #   1. Build a comma-separated host list from the worker host file.
            #   2. Start sshd (no -D here, so it is not kept in the foreground).
            #   3. Run the training script with mpiexec; "-np 3" matches the
            #      3 replicas of the worker task -- keep the two in sync.
            - command:
                - /bin/sh
                - -c
                - |
                  WORKER_HOST=`cat /etc/volcano/worker.host | tr "\n" ","`;
                  mkdir -p /var/run/sshd; /usr/sbin/sshd;
                  mpiexec --allow-run-as-root --host ${WORKER_HOST} -np 3 python tensorflow_mnist_lm.py;
              image: volcanosh/horovod-tf-mnist:0.5
              name: master
              ports:
                # SSH port.
                - containerPort: 22
                  name: job-port
              # Requests equal limits for both cpu and memory.
              resources:
                requests:
                  cpu: "500m"
                  memory: "1024Mi"
                limits:
                  cpu: "500m"
                  memory: "1024Mi"
          restartPolicy: OnFailure
          imagePullSecrets:
            - name: default-secret
    - replicas: 3
      name: worker
      template:
        spec:
          containers:
            # Workers only run sshd; "-D" keeps it in the foreground as the
            # container's main process.
            - command:
                - /bin/sh
                - -c
                - |
                  mkdir -p /var/run/sshd; /usr/sbin/sshd -D;
              image: volcanosh/horovod-tf-mnist:0.5
              name: worker
              ports:
                # SSH port.
                - containerPort: 22
                  name: job-port
              # Requests equal limits for both cpu and memory.
              resources:
                requests:
                  cpu: "1000m"
                  memory: "2048Mi"
                limits:
                  cpu: "1000m"
                  memory: "2048Mi"
          restartPolicy: OnFailure
          imagePullSecrets:
            - name: default-secret
---