# Source: volcano.sh/volcano@v1.9.0/example/integrations/paddlepaddle/ctr-paddlepaddle-on-volcano.yaml
#
# Volcano Job "ctr-volcano".
#
# Declares two tasks that run the same image and command
# (volcanosh/edlctr:v1, `paddle_k8s start_fluid`) and differ in their
# role-related environment variables and labels:
#   - pserver: 2 replicas, PADDLE_TRAINING_ROLE / TRAINING_ROLE = PSERVER
#   - trainer: 2 replicas, PADDLE_TRAINING_ROLE / TRAINING_ROLE = TRAINER
# Both tasks mount the host directory /home/work/ at /mnt/seqdata.
---
apiVersion: batch.volcano.sh/v1alpha1
kind: Job
metadata:
  name: ctr-volcano
spec:
  # Equals the total number of pods declared below (2 pserver + 2 trainer).
  minAvailable: 4
  schedulerName: volcano
  # Job-level policies: restart the whole job when any pod is evicted or fails.
  policies:
    - event: PodEvicted
      action: RestartJob
    - event: PodFailed
      action: RestartJob
  tasks:
    # ---------------------------------------------------------------- pserver
    - replicas: 2
      name: pserver
      template:
        metadata:
          labels:
            paddle-job-pserver: fluid-ctr
        spec:
          imagePullSecrets:
            - name: default-secret
          volumes:
            - hostPath:
                path: /home/work/
                type: ""
              name: seqdata
          containers:
            - image: volcanosh/edlctr:v1
              command:
                - paddle_k8s
                - start_fluid
              imagePullPolicy: IfNotPresent
              name: pserver
              volumeMounts:
                - mountPath: /mnt/seqdata
                  name: seqdata
              resources:
                limits:
                  cpu: 10
                  memory: 30Gi
                  ephemeral-storage: 10Gi
                requests:
                  cpu: 1
                  # NOTE(review): "100M" is a decimal unit (10^6 bytes) while
                  # the limits use binary "Gi"; confirm "100Mi" was not meant.
                  memory: 100M
                  ephemeral-storage: 1Gi
              env:
                - name: GLOG_v
                  value: "0"
                - name: GLOG_logtostderr
                  value: "1"
                - name: TOPOLOGY
                  value: ""
                - name: TRAINER_PACKAGE
                  value: /workspace
                # Downward-API fields: values are filled in from the pod itself.
                - name: NAMESPACE
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: metadata.namespace
                - name: POD_IP
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: status.podIP
                - name: POD_NAME
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: metadata.name
                - name: PADDLE_CURRENT_IP
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: status.podIP
                # Same string as the paddle-job* label values in this file.
                - name: PADDLE_JOB_NAME
                  value: fluid-ctr
                - name: PADDLE_IS_LOCAL
                  value: "0"
                # NOTE(review): both counts match the `replicas` of the
                # corresponding task; presumably they must be kept in sync
                # when scaling - confirm against the image's entrypoint.
                - name: PADDLE_TRAINERS_NUM
                  value: "2"
                - name: PADDLE_PSERVERS_NUM
                  value: "2"
                - name: FLAGS_rpc_deadline
                  value: "36000000"
                - name: ENTRY
                  value: 'cd /workspace/ctr && python train.py --is_local 0 --cloud_train 1'
                - name: PADDLE_PORT
                  value: "30236"
                - name: LD_LIBRARY_PATH
                  value: /usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind
                - name: PADDLE_TRAINING_ROLE
                  value: PSERVER
                - name: TRAINING_ROLE
                  value: PSERVER
          restartPolicy: OnFailure
    # ---------------------------------------------------------------- trainer
    - replicas: 2
      # Task-level policy: the job is marked complete once this task completes.
      policies:
        - event: TaskCompleted
          action: CompleteJob
      name: trainer
      template:
        metadata:
          labels:
            paddle-job: fluid-ctr
        spec:
          imagePullSecrets:
            - name: default-secret
          volumes:
            - hostPath:
                path: /home/work/
                type: ""
              name: seqdata
          containers:
            - image: volcanosh/edlctr:v1
              command:
                - paddle_k8s
                - start_fluid
              imagePullPolicy: IfNotPresent
              name: trainer
              volumeMounts:
                - mountPath: /mnt/seqdata
                  name: seqdata
              resources:
                limits:
                  cpu: 10
                  memory: 30Gi
                  ephemeral-storage: 10Gi
                requests:
                  cpu: 1
                  # NOTE(review): decimal "100M" next to binary "Gi" limits;
                  # confirm "100Mi" was not meant.
                  memory: 100M
                  # NOTE(review): request equals the limit here (10Gi) but is
                  # 1Gi on the pserver task; confirm this is intentional.
                  ephemeral-storage: 10Gi
              env:
                - name: GLOG_v
                  value: "0"
                - name: GLOG_logtostderr
                  value: "1"
                # Written explicitly as an empty string, matching the pserver
                # task; an env entry without `value` defaults to "".
                - name: TOPOLOGY
                  value: ""
                - name: TRAINER_PACKAGE
                  value: /workspace
                - name: CPU_NUM
                  value: "2"
                # Downward-API fields: values are filled in from the pod itself.
                - name: NAMESPACE
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: metadata.namespace
                - name: POD_IP
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: status.podIP
                - name: POD_NAME
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: metadata.name
                - name: PADDLE_CURRENT_IP
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: status.podIP
                - name: PADDLE_JOB_NAME
                  value: fluid-ctr
                - name: PADDLE_IS_LOCAL
                  value: "0"
                - name: FLAGS_rpc_deadline
                  value: "36000000"
                - name: PADDLE_PORT
                  value: "30236"
                - name: PADDLE_PSERVERS_NUM
                  value: "2"
                - name: PADDLE_TRAINERS_NUM
                  value: "2"
                - name: PADDLE_TRAINING_ROLE
                  value: TRAINER
                - name: TRAINING_ROLE
                  value: TRAINER
                - name: LD_LIBRARY_PATH
                  value: /usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind
                - name: ENTRY
                  value: 'cd /workspace/ctr && python train.py --is_local 0 --cloud_train 1'
          restartPolicy: OnFailure