# Source: volcano.sh/volcano@v1.9.0/example/integrations/tensorflow/benchmark/tf-example.yaml
################################################
#                                              #
#    Demo for running TF tasks on Volcano      #
#                                              #
################################################
#
# This YAML demonstrates how to run a TF task via a Volcano Job.
# The sample program is taken from the TF benchmarks
# (https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks)
# The equivalent command when running locally:
#
#   python tf_cnn_benchmarks.py --num_gpus=1 --batch_size=32 --model=resnet50 --variable_update=parameter_server
#     --local_parameter_device=cpu --device=cpu --data_format=NHWC
#
# The output from the ps or worker pod can be used to verify that the TF cluster
# has been configured correctly:
#
# (log from worker pod....)
# 2019-04-23 11:10:25.554248: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215]
#   Initialize GrpcChannelCache for job ps -> {0 -> tensorflow-benchmark-ps-0.tensorflow-benchmark:2222}
# 2019-04-23 11:10:25.554308: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215]
#   Initialize GrpcChannelCache for job worker -> {0 -> localhost:2222}
#
# (log from ps pod....)
# 2019-04-23 11:10:25.552827: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215]
#   Initialize GrpcChannelCache for job ps -> {0 -> localhost:2222}
# 2019-04-23 11:10:25.552861: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215]
#   Initialize GrpcChannelCache for job worker -> {0 -> tensorflow-benchmark-worker-0.tensorflow-benchmark:2222}
#
# **NOTES**: This example may take about an hour to finish. When running multiple jobs, please ensure that
#            enough resources are guaranteed for each of the worker pods.

# Volcano Job that runs tf_cnn_benchmarks as one "ps" task and two "worker" tasks.
kind: Job
apiVersion: batch.volcano.sh/v1alpha1
metadata:
  name: tensorflow-benchmark
spec:
  schedulerName: volcano
  # Equals the total replica count of the tasks below (1 ps + 2 worker).
  minAvailable: 3
  # Job plugins enabled with no extra arguments.
  # NOTE(review): the container scripts read /etc/volcano/*.host and
  # ${VK_TASK_INDEX}; presumably these are supplied by the svc and env
  # plugins respectively — confirm against the Volcano plugin docs.
  plugins:
    svc: []
    env: []
  # Job-level policy: a PodEvicted event triggers the RestartJob action.
  policies:
    - action: RestartJob
      event: PodEvicted
  tasks:
    # ---- parameter-server task -------------------------------------------
    - name: ps
      replicas: 1
      template:
        spec:
          restartPolicy: OnFailure
          imagePullSecrets:
            - name: default-secret
          containers:
            - name: tensorflow
              image: volcanosh/example-tf:0.0.1
              workingDir: /opt/tf-benchmarks/scripts/tf_cnn_benchmarks
              # The script builds comma-separated host lists by appending
              # ":2222" to every line of the ps/worker host files, then starts
              # the benchmark with --job_name=ps.
              command:
                - sh
                - -c
                - |
                  PS_HOST=`cat /etc/volcano/ps.host | sed 's/$/&:2222/g' | tr "\n" ","`;
                  WORKER_HOST=`cat /etc/volcano/worker.host | sed 's/$/&:2222/g' | tr "\n" ","`;
                  python tf_cnn_benchmarks.py --batch_size=32 --model=resnet50 --variable_update=parameter_server --flush_stdout=true --num_gpus=1 --local_parameter_device=cpu --device=cpu --data_format=NHWC --job_name=ps --task_index=${VK_TASK_INDEX} --ps_hosts=${PS_HOST} --worker_hosts=${WORKER_HOST}
              ports:
                # Same port number the script appends to each host name.
                - name: tfjob-port
                  containerPort: 2222
              # No resource requests or limits are declared.
              resources: {}
    # ---- worker task -----------------------------------------------------
    - name: worker
      replicas: 2
      # Task-level policy: a TaskCompleted event triggers the CompleteJob action.
      policies:
        - action: CompleteJob
          event: TaskCompleted
      template:
        spec:
          restartPolicy: OnFailure
          imagePullSecrets:
            - name: default-secret
          containers:
            - name: tensorflow
              image: volcanosh/example-tf:0.0.1
              workingDir: /opt/tf-benchmarks/scripts/tf_cnn_benchmarks
              # Identical to the ps script except for --job_name=worker.
              command:
                - sh
                - -c
                - |
                  PS_HOST=`cat /etc/volcano/ps.host | sed 's/$/&:2222/g' | tr "\n" ","`;
                  WORKER_HOST=`cat /etc/volcano/worker.host | sed 's/$/&:2222/g' | tr "\n" ","`;
                  python tf_cnn_benchmarks.py --batch_size=32 --model=resnet50 --variable_update=parameter_server --flush_stdout=true --num_gpus=1 --local_parameter_device=cpu --device=cpu --data_format=NHWC --job_name=worker --task_index=${VK_TASK_INDEX} --ps_hosts=${PS_HOST} --worker_hosts=${WORKER_HOST}
              ports:
                # Same port number the script appends to each host name.
                - name: tfjob-port
                  containerPort: 2222
              # No resource requests or limits are declared.
              resources: {}