volcano.sh/volcano@v1.9.0/example/integrations/tensorflow/benchmark/tf-example.yaml (about)

     1  ################################################
     2  #                                              #
     3  #    Demo for running TF tasks on Volcano      #
     4  #                                              #
     5  ################################################
     6  #
     7  # This YAML demonstrates how to run a TF task as a Volcano Job;
     8  # the sample program comes from the TF benchmarks
     9  # (https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks)
    10  # The equivalent command when running locally:
    11  #
    12  #   python tf_cnn_benchmarks.py --num_gpus=1 --batch_size=32 --model=resnet50 --variable_update=parameter_server
    13  #   --local_parameter_device=cpu --device=cpu --data_format=NHWC
    14  #
    15  # The output of the ps or worker pods can be used to verify that the TF cluster
    16  # has been configured correctly:
    17  #
    18  #    (log from worker pod....)
    19  #    2019-04-23 11:10:25.554248: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215]
    20  #    Initialize GrpcChannelCache for job ps -> {0 -> tensorflow-benchmark-ps-0.tensorflow-benchmark:2222}
    21  #    2019-04-23 11:10:25.554308: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215]
    22  #    Initialize GrpcChannelCache for job worker -> {0 -> localhost:2222}
    23  #
    24  #    (log from ps pod....)
    25  #    2019-04-23 11:10:25.552827: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215]
    26  #    Initialize GrpcChannelCache for job ps -> {0 -> localhost:2222}
    27  #    2019-04-23 11:10:25.552861: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215]
    28  #    Initialize GrpcChannelCache for job worker -> {0 -> tensorflow-benchmark-worker-0.tensorflow-benchmark:2222}
    29  #
    30  # **NOTES**: This example may take about an hour to finish. When running multiple jobs, please ensure that enough
    31  # resources are guaranteed for each of the worker pods.
    32  
    33  apiVersion: batch.volcano.sh/v1alpha1
    34  kind: Job
    35  metadata:
    36    name: tensorflow-benchmark
    37  spec:
    38    # Gang scheduling: no pod starts until all 3 (1 ps + 2 workers) can be placed.
    39    minAvailable: 3
    40    schedulerName: volcano
    41    plugins:
    42      # env injects the pod's index within its task as VC_TASK_INDEX.
    43      env: []
    44      # svc publishes each task's pod hostnames in /etc/volcano/<task>.host.
    45      svc: []
    46    policies:
    47      # Restart the whole job when any of its pods is evicted.
    48      - event: PodEvicted
    49        action: RestartJob
    50    tasks:
    51      # Parameter-server task.
    52      - replicas: 1
    53        name: ps
    54        template:
    55          spec:
    56            imagePullSecrets:
    57              - name: default-secret
    58            containers:
    59              - command:
    60                  - sh
    61                  - -c
    62                  # Turn each host file into a comma-separated host:2222 list, then
    63                  # exec the benchmark so python receives the pod's signals directly.
    64                  # VC_TASK_INDEX is used instead of the deprecated VK_TASK_INDEX.
    65                  - |
    66                    PS_HOST=$(sed 's/$/:2222/' /etc/volcano/ps.host | tr "\n" ",")
    67                    WORKER_HOST=$(sed 's/$/:2222/' /etc/volcano/worker.host | tr "\n" ",")
    68                    exec python tf_cnn_benchmarks.py \
    69                      --batch_size=32 --model=resnet50 --variable_update=parameter_server \
    70                      --flush_stdout=true --num_gpus=1 --local_parameter_device=cpu \
    71                      --device=cpu --data_format=NHWC \
    72                      --job_name=ps --task_index="${VC_TASK_INDEX}" \
    73                      --ps_hosts="${PS_HOST}" --worker_hosts="${WORKER_HOST}"
    74                image: volcanosh/example-tf:0.0.1
    75                name: tensorflow
    76                ports:
    77                  # Must match the :2222 suffix appended to every host above.
    78                  - containerPort: 2222
    79                    name: tfjob-port
    80                resources: {}
    81                workingDir: /opt/tf-benchmarks/scripts/tf_cnn_benchmarks
    82            restartPolicy: OnFailure
    83      # Worker task.
    84      - replicas: 2
    85        name: worker
    86        policies:
    87          # The job is complete once the worker task completes.
    88          - event: TaskCompleted
    89            action: CompleteJob
    90        template:
    91          spec:
    92            imagePullSecrets:
    93              - name: default-secret
    94            containers:
    95              - command:
    96                  - sh
    97                  - -c
    98                  # Same startup script as the ps task, but joining as a worker.
    99                  - |
   100                    PS_HOST=$(sed 's/$/:2222/' /etc/volcano/ps.host | tr "\n" ",")
   101                    WORKER_HOST=$(sed 's/$/:2222/' /etc/volcano/worker.host | tr "\n" ",")
   102                    exec python tf_cnn_benchmarks.py \
   103                      --batch_size=32 --model=resnet50 --variable_update=parameter_server \
   104                      --flush_stdout=true --num_gpus=1 --local_parameter_device=cpu \
   105                      --device=cpu --data_format=NHWC \
   106                      --job_name=worker --task_index="${VC_TASK_INDEX}" \
   107                      --ps_hosts="${PS_HOST}" --worker_hosts="${WORKER_HOST}"
   108                image: volcanosh/example-tf:0.0.1
   109                name: tensorflow
   110                ports:
   111                  # Must match the :2222 suffix appended to every host above.
   112                  - containerPort: 2222
   113                    name: tfjob-port
   114                resources: {}
   115                workingDir: /opt/tf-benchmarks/scripts/tf_cnn_benchmarks
   116            restartPolicy: OnFailure