# Source: volcano.sh/volcano@v1.9.0/example/kubecon-2019-china/tf-sample/tf-example.yaml
################################################
#                                              #
#    Demo for running TF tasks on Volcano      #
#                                              #
################################################
#
# This YAML demonstrates how to run a TF task via a Volcano Job.
# The sample program comes from the TF benchmarks
# (https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks)
# The equivalent command when running locally:
#
#   python tf_cnn_benchmarks.py --num_gpus=1 --batch_size=32 --model=resnet50 --variable_update=parameter_server
#   --local_parameter_device=cpu --device=cpu --data_format=NHWC
#
# The output from the ps or worker pod can be used to verify whether the TF cluster
# has been configured correctly:
#
# (log from worker pod....)
# 2019-04-23 11:10:25.554248: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215]
#   Initialize GrpcChannelCache for job ps -> {0 -> tensorflow-benchmark-ps-0.tensorflow-benchmark:2222}
# 2019-04-23 11:10:25.554308: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215]
#   Initialize GrpcChannelCache for job worker -> {0 -> localhost:2222}
#
# (log from ps pod....)
# 2019-04-23 11:10:25.552827: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215]
#   Initialize GrpcChannelCache for job ps -> {0 -> localhost:2222}
# 2019-04-23 11:10:25.552861: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215]
#   Initialize GrpcChannelCache for job worker -> {0 -> tensorflow-benchmark-worker-0.tensorflow-benchmark:2222}
#
# **NOTES**: This example may take about an hour to finish. When running multiple jobs, please ensure that enough
#            resources are guaranteed for each of the worker pods.

# Volcano Job that runs tf_cnn_benchmarks.py (resnet50, parameter_server
# variable update, CPU device) as two tasks: one "ps" replica and two
# "worker" replicas. Both tasks use the same image and working directory and
# differ only in --job_name and in their CPU/memory resources.
apiVersion: batch.volcano.sh/v1alpha1
kind: Job
metadata:
  name: tensorflow-benchmark
  labels:
    "volcano.sh/job-type": "Tensorflow"
spec:
  schedulerName: volcano
  # Equals the total replica count of the tasks below (1 ps + 2 worker).
  minAvailable: 3
  # The container commands below read /etc/volcano/ps.host,
  # /etc/volcano/worker.host and ${VK_TASK_INDEX}; these are expected to be
  # supplied by the svc and env job plugins respectively
  # (NOTE(review): confirm against the Volcano version in use).
  plugins:
    env: []
    svc: []
  # Job-level policy: restart the whole job when a pod is evicted.
  policies:
    - event: PodEvicted
      action: RestartJob
  tasks:
    # ---- Parameter server task ------------------------------------------
    - name: ps
      replicas: 1
      template:
        spec:
          restartPolicy: OnFailure
          imagePullSecrets:
            - name: default-secret
          containers:
            - name: tensorflow
              image: volcanosh/example-tf:0.0.1
              workingDir: /opt/tf-benchmarks/scripts/tf_cnn_benchmarks
              # Builds comma-separated "host:2222" lists from the host files
              # (sed appends ":2222" to every line, tr turns newlines into
              # commas) and starts the benchmark with --job_name=ps.
              command:
                - sh
                - -c
                - |
                  PS_HOST=`cat /etc/volcano/ps.host | sed 's/$/&:2222/g' | tr "\n" ","`;
                  WORKER_HOST=`cat /etc/volcano/worker.host | sed 's/$/&:2222/g' | tr "\n" ","`;
                  python tf_cnn_benchmarks.py --batch_size=32 --model=resnet50 --variable_update=parameter_server --flush_stdout=true --num_gpus=1 --local_parameter_device=cpu --device=cpu --data_format=NHWC --job_name=ps --task_index=${VK_TASK_INDEX} --ps_hosts=${PS_HOST} --worker_hosts=${WORKER_HOST}
              ports:
                # Same port number that the command appends to every host name.
                - name: tfjob-port
                  containerPort: 2222
              resources:
                limits:
                  cpu: "1000m"
                  memory: "2048Mi"
                requests:
                  cpu: "1000m"
                  memory: "2048Mi"
    # ---- Worker task ------------------------------------------------------
    - name: worker
      replicas: 2
      # Task-level policy: mark the whole job complete once this task completes.
      policies:
        - event: TaskCompleted
          action: CompleteJob
      template:
        spec:
          restartPolicy: OnFailure
          imagePullSecrets:
            - name: default-secret
          containers:
            - name: tensorflow
              image: volcanosh/example-tf:0.0.1
              workingDir: /opt/tf-benchmarks/scripts/tf_cnn_benchmarks
              # Same host-list construction as the ps task; only --job_name
              # differs (worker instead of ps).
              command:
                - sh
                - -c
                - |
                  PS_HOST=`cat /etc/volcano/ps.host | sed 's/$/&:2222/g' | tr "\n" ","`;
                  WORKER_HOST=`cat /etc/volcano/worker.host | sed 's/$/&:2222/g' | tr "\n" ","`;
                  python tf_cnn_benchmarks.py --batch_size=32 --model=resnet50 --variable_update=parameter_server --flush_stdout=true --num_gpus=1 --local_parameter_device=cpu --device=cpu --data_format=NHWC --job_name=worker --task_index=${VK_TASK_INDEX} --ps_hosts=${PS_HOST} --worker_hosts=${WORKER_HOST}
              ports:
                - name: tfjob-port
                  containerPort: 2222
              resources:
                # Memory limit (4096Mi) is deliberately kept higher than the
                # request (2048Mi), exactly as in the original manifest.
                limits:
                  cpu: "2000m"
                  memory: "4096Mi"
                requests:
                  cpu: "2000m"
                  memory: "2048Mi"