# Source: volcano.sh/volcano@v1.9.0/example/kubecon-2019-china/horovod-sample/lm-horovod-tf-mnist-v0.5.yaml

     1  apiVersion: batch.volcano.sh/v1alpha1
     2  kind: Job
     3  metadata:
     4    name: lm-horovod-job
     5    labels:
     6      "volcano.sh/job-type": Horovod
     7  spec:
     8    minAvailable: 4
     9    schedulerName: volcano
    10    plugins:
    11      ssh: []
    12      svc: []
    13    # 如果有pod被 杀死,重启整个作业
    14    policies:
    15      - event: PodEvicted
    16        action: RestartJob
    17    tasks:
    18      - replicas: 1
    19        name: master
    20        policies:
    21          - event: TaskCompleted
    22            action: CompleteJob
    23        template:
    24          spec:
    25            containers:
    26              - command:
    27                  - /bin/sh
    28                  - -c
    29                  - |
    30                    WORKER_HOST=`cat /etc/volcano/worker.host | tr "\n" ","`;
    31                    mkdir -p /var/run/sshd; /usr/sbin/sshd;
    32                    mpiexec --allow-run-as-root --host ${WORKER_HOST} -np 3 python tensorflow_mnist_lm.py;
    33                image: volcanosh/horovod-tf-mnist:0.5
    34                name: master
    35                ports:
    36                  - containerPort: 22
    37                    name: job-port
    38                resources:
    39                  requests:
    40                    cpu: "500m"
    41                    memory: "1024Mi"
    42                  limits:
    43                    cpu: "500m"
    44                    memory: "1024Mi"
    45            restartPolicy: OnFailure
    46            imagePullSecrets:
    47              - name: default-secret
    48      - replicas: 3
    49        name: worker
    50        template:
    51          spec:
    52            containers:
    53              - command:
    54                  - /bin/sh
    55                  - -c
    56                  - |
    57                    mkdir -p /var/run/sshd; /usr/sbin/sshd -D;
    58                image: volcanosh/horovod-tf-mnist:0.5
    59                name: worker
    60                ports:
    61                  - containerPort: 22
    62                    name: job-port
    63                resources:
    64                  requests:
    65                    cpu: "1000m"
    66                    memory: "2048Mi"
    67                  limits:
    68                    cpu: "1000m"
    69                    memory: "2048Mi"
    70            restartPolicy: OnFailure
    71            imagePullSecrets:
    72              - name: default-secret
    73  ---