---
# Source: volcano.sh/volcano@v1.9.0/example/kubecon-2019-china/gang/mpi-example.yaml
#
# Volcano Job that runs an MPI "hello world" with one master task
# (mpimaster, 1 replica) and one worker task (mpiworker, 3 replicas).
apiVersion: batch.volcano.sh/v1alpha1
kind: Job
metadata:
  name: lm-mpi-job
  labels:
    # Set the job type according to business needs.
    volcano.sh/job-type: MPI
spec:
  # Minimum number of pods the job needs (must not exceed the total number
  # of replicas). Here 4 = 1 mpimaster + 3 mpiworker, i.e. every pod.
  minAvailable: 4
  schedulerName: volcano
  plugins:
    # Provides password-less SSH authentication between the pods.
    ssh: []
    # Provides the network information the job needs to run
    # (hosts files, headless service, etc.).
    svc: []
  # If any pod is evicted, restart the whole job.
  policies:
    - event: PodEvicted
      action: RestartJob
  tasks:
    - replicas: 1
      name: mpimaster
      # When mpiexec finishes, consider the whole MPI job complete.
      policies:
        - event: TaskCompleted
          action: CompleteJob
      template:
        spec:
          # Volcano places its information under the /etc/volcano directory.
          containers:
            # The script below reads the worker host list from
            # /etc/volcano/mpiworker.host and joins it with commas, starts
            # sshd, then launches mpiexec. "-np 3" equals the "replicas: 3"
            # of the mpiworker task; keep the two values in sync.
            - command:
                - /bin/sh
                - -c
                - |
                  MPI_HOST=`cat /etc/volcano/mpiworker.host | tr "\n" ","`;
                  mkdir -p /var/run/sshd; /usr/sbin/sshd;
                  mpiexec --allow-run-as-root --host ${MPI_HOST} -np 3 mpi_hello_world;
              image: volcanosh/example-mpi:0.0.1
              name: mpimaster
              ports:
                - containerPort: 22
                  name: mpijob-port
              workingDir: /home
              resources:
                requests:
                  cpu: "500m"
                limits:
                  cpu: "500m"
          restartPolicy: OnFailure
          imagePullSecrets:
            - name: default-secret
    - replicas: 3
      name: mpiworker
      template:
        spec:
          containers:
            # Workers only run sshd. NOTE(review): unlike the master, sshd is
            # started with -D here, presumably so it stays in the foreground
            # and keeps the container alive; confirm against sshd(8).
            - command:
                - /bin/sh
                - -c
                - |
                  mkdir -p /var/run/sshd; /usr/sbin/sshd -D;
              image: volcanosh/example-mpi:0.0.1
              name: mpiworker
              ports:
                - containerPort: 22
                  name: mpijob-port
              workingDir: /home
              resources:
                requests:
                  cpu: "1000m"
                limits:
                  cpu: "1000m"
          restartPolicy: OnFailure
          imagePullSecrets:
            - name: default-secret