volcano.sh/volcano@v1.9.0/example/kubecon-2019-china/tf-sample/tf-example.yaml

volcano.sh/volcano@v1.9.0/example/kubecon-2019-china/tf-sample/tf-example.yaml (about)

     1  ################################################
     2  #                                              #
     3  #    Demo for running TF tasks on Volcano      #
     4  #                                              #
     5  ################################################
     6  #
     7  # This YAML demonstrates how to run a TF task via a Volcano Job;
     8  # the sample program comes from the TF benchmarks repository
     9  # (https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks)
    10  # The equivalent command when running locally:
    11  #
    12  #   python tf_cnn_benchmarks.py --num_gpus=1 --batch_size=32 --model=resnet50 --variable_update=parameter_server
    13  #   --local_parameter_device=cpu --device=cpu --data_format=NHWC
    14  #
    15  # The output from ps or worker pod can be used to identify whether the TF cluster
    16  # has been correctly configured:
    17  #
    18  #    (log from worker pod....)
    19  #    2019-04-23 11:10:25.554248: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215]
    20  #    Initialize GrpcChannelCache for job ps -> {0 -> tensorflow-benchmark-ps-0.tensorflow-benchmark:2222}
    21  #    2019-04-23 11:10:25.554308: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215]
    22  #    Initialize GrpcChannelCache for job worker -> {0 -> localhost:2222}
    23  #
    24  #    (log from ps pod....)
    25  #    2019-04-23 11:10:25.552827: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215]
    26  #    Initialize GrpcChannelCache for job ps -> {0 -> localhost:2222}
    27  #    2019-04-23 11:10:25.552861: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215]
    28  #    Initialize GrpcChannelCache for job worker -> {0 -> tensorflow-benchmark-worker-0.tensorflow-benchmark:2222}
    29  #
    30  # **NOTE**: This example may take about an hour to finish. When running multiple jobs, please ensure that enough
    31  # resources are guaranteed for each of the worker pods.
    32  
    33  apiVersion: batch.volcano.sh/v1alpha1  # Volcano Job that runs the TF CNN benchmark as one ps task plus one worker task
    34  kind: Job
    35  metadata:
    36    name: tensorflow-benchmark  # this name appears inside the pod host names shown in the sample logs above
    37    labels:
    38      "volcano.sh/job-type": "Tensorflow"
    39  spec:
    40    minAvailable: 3  # equals the total replicas declared under tasks (1 ps + 2 worker)
    41    schedulerName: volcano
    42    plugins:
    43      env: []  # presumably supplies the task-index variable the task commands read -- confirm against the Volcano env plugin docs
    44      svc: []  # presumably supplies the /etc/volcano/*.host files the task commands read -- confirm against the Volcano svc plugin docs
    45    policies:
    46      - event: PodEvicted  # job-level policy: a PodEvicted event triggers the RestartJob action
    47        action: RestartJob
    48    tasks:
    49      - replicas: 1  # parameter-server task: a single replica
    50        name: ps
    51        template:
    52          spec:
    53            imagePullSecrets:
    54              - name: default-secret
    55            containers:
    56              - command:  # builds comma-joined host:2222 lists from /etc/volcano/*.host, then starts the benchmark as job "ps"
    57                  - sh
    58                  - -c
    59                  - |
    60                    PS_HOST=$(sed 's/$/:2222/' /etc/volcano/ps.host | tr "\n" ",")
    61                    WORKER_HOST=$(sed 's/$/:2222/' /etc/volcano/worker.host | tr "\n" ",")
    62                    python tf_cnn_benchmarks.py --batch_size=32 --model=resnet50 --variable_update=parameter_server --flush_stdout=true --num_gpus=1 --local_parameter_device=cpu --device=cpu --data_format=NHWC --job_name=ps --task_index="${VC_TASK_INDEX}" --ps_hosts="${PS_HOST}" --worker_hosts="${WORKER_HOST}"
    63                image: volcanosh/example-tf:0.0.1
    64                name: tensorflow
    65                ports:
    66                  - containerPort: 2222  # must match the :2222 suffix appended to every host in the command above
    67                    name: tfjob-port
    68                resources:
    69                  requests:
    70                    cpu: "1000m"
    71                    memory: "2048Mi"
    72                  limits:  # identical to the requests
    73                    cpu: "1000m"
    74                    memory: "2048Mi"
    75                workingDir: /opt/tf-benchmarks/scripts/tf_cnn_benchmarks  # the command invokes tf_cnn_benchmarks.py by relative path
    76            restartPolicy: OnFailure
    77      - replicas: 2  # worker task: two replicas
    78        name: worker
    79        policies:
    80          - event: TaskCompleted  # task-level policy: a TaskCompleted event on this task triggers the CompleteJob action
    81            action: CompleteJob
    82        template:
    83          spec:
    84            imagePullSecrets:
    85              - name: default-secret
    86            containers:
    87              - command:  # builds comma-joined host:2222 lists from /etc/volcano/*.host, then starts the benchmark as job "worker"
    88                  - sh
    89                  - -c
    90                  - |
    91                    PS_HOST=$(sed 's/$/:2222/' /etc/volcano/ps.host | tr "\n" ",")
    92                    WORKER_HOST=$(sed 's/$/:2222/' /etc/volcano/worker.host | tr "\n" ",")
    93                    python tf_cnn_benchmarks.py --batch_size=32 --model=resnet50 --variable_update=parameter_server --flush_stdout=true --num_gpus=1 --local_parameter_device=cpu --device=cpu --data_format=NHWC --job_name=worker --task_index="${VC_TASK_INDEX}" --ps_hosts="${PS_HOST}" --worker_hosts="${WORKER_HOST}"
    94                image: volcanosh/example-tf:0.0.1
    95                name: tensorflow
    96                ports:
    97                  - containerPort: 2222  # must match the :2222 suffix appended to every host in the command above
    98                    name: tfjob-port
    99                resources:
   100                  requests:
   101                    cpu: "2000m"
   102                    memory: "2048Mi"  # lower than the 4096Mi limit below, unlike the ps task where requests equal limits
   103                  limits:
   104                    cpu: "2000m"
   105                    memory: "4096Mi"
   106                workingDir: /opt/tf-benchmarks/scripts/tf_cnn_benchmarks  # the command invokes tf_cnn_benchmarks.py by relative path
   107            restartPolicy: OnFailure