volcano.sh/volcano@v1.9.0/example/integrations/paddlepaddle/ctr-paddlepaddle-on-volcano.yaml (about)

     1  apiVersion: batch.volcano.sh/v1alpha1
     2  kind: Job
        # Volcano batch Job named "ctr-volcano". Its pods come from the entries
        # under `tasks` (a "pserver" task and a "trainer" task).
     3  metadata:
     4    name: ctr-volcano
     5  spec:
          # Equals the sum of the task replicas declared below (2 pserver + 2 trainer).
     6    minAvailable: 4
          # Pods of this Job are assigned to the scheduler named "volcano".
     7    schedulerName: volcano
          # Job-level lifecycle policies: both the PodEvicted and the PodFailed
          # event are mapped to the RestartJob action.
     8    policies:
     9    - event: PodEvicted
    10      action: RestartJob
    11    - event: PodFailed
    12      action: RestartJob
          # Task list; each entry carries its own replica count and pod template.
    13    tasks:
    14    - replicas: 2
            # Task "pserver": two replicas of the pod template below. Each pod runs a
            # single container from volcanosh/edlctr:v1 with the role variables
            # PADDLE_TRAINING_ROLE / TRAINING_ROLE set to PSERVER.
    15      name: pserver
    16      template:
    17        metadata:
    18          labels:
    19            paddle-job-pserver: fluid-ctr
    20        spec:
    21          imagePullSecrets:
    22          - name: default-secret
                # Host directory /home/work/ is exposed as volume "seqdata"; the
                # container mounts it at /mnt/seqdata (see volumeMounts).
    23          volumes:
    24          - hostPath:
    25              path: /home/work/
    26              type: ""
    27            name: seqdata
    28          containers:
    29          - image: volcanosh/edlctr:v1
                  # NOTE(review): paddle_k8s is presumably a launcher shipped in the
                  # image, with start_fluid as its sub-command — confirm.
    30            command:
    31            - paddle_k8s
    32            - start_fluid
    33            imagePullPolicy: IfNotPresent
    34            name: pserver
    35            volumeMounts:
    36            - mountPath: /mnt/seqdata
    37              name: seqdata
                  # Limits sit well above requests (cpu 10 vs 1, memory 30Gi vs 100M,
                  # ephemeral-storage 10Gi vs 1Gi).
                  # NOTE(review): the memory request uses the decimal suffix "M" while
                  # the limit uses the binary suffix "Gi" — confirm this is intended.
    38            resources:
    39              limits:
    40                cpu: 10
    41                memory: 30Gi
    42                ephemeral-storage: 10Gi
    43              requests:
    44                cpu: 1
    45                memory: 100M
    46                ephemeral-storage: 1Gi
                  # Container environment. Numeric-looking values are quoted so they
                  # stay strings.
    47            env:
                  # NOTE(review): the GLOG_* variables presumably configure glog
                  # logging inside the container — confirm.
    48            - name: GLOG_v
    49              value: "0"
    50            - name: GLOG_logtostderr
    51              value: "1"
    52            - name: TOPOLOGY
    53              value: ""
    54            - name: TRAINER_PACKAGE
    55              value: /workspace
                  # The next four variables are filled from the pod's own fields via
                  # fieldRef; POD_IP and PADDLE_CURRENT_IP both read status.podIP.
    56            - name: NAMESPACE
    57              valueFrom:
    58                fieldRef:
    59                  apiVersion: v1
    60                  fieldPath: metadata.namespace
    61            - name: POD_IP
    62              valueFrom:
    63                fieldRef:
    64                  apiVersion: v1
    65                  fieldPath: status.podIP
    66            - name: POD_NAME
    67              valueFrom:
    68                fieldRef:
    69                  apiVersion: v1
    70                  fieldPath: metadata.name
    71            - name: PADDLE_CURRENT_IP
    72              valueFrom:
    73                fieldRef:
    74                  apiVersion: v1
    75                  fieldPath: status.podIP
    76            - name: PADDLE_JOB_NAME
    77              value: fluid-ctr
    78            - name: PADDLE_IS_LOCAL
    79              value: "0"
                  # Both counts are "2", matching the replicas of the trainer and
                  # pserver tasks in this Job.
    80            - name: PADDLE_TRAINERS_NUM
    81              value: "2"
    82            - name: PADDLE_PSERVERS_NUM
    83              value: "2"
    84            - name: FLAGS_rpc_deadline
    85              value: "36000000"
                  # NOTE(review): ENTRY holds a shell command line; presumably it is
                  # executed by the paddle_k8s launcher — confirm.
    86            - name: ENTRY
    87              value: cd /workspace/ctr && python train.py --is_local 0 --cloud_train 1
    88            - name: PADDLE_PORT
    89              value: "30236"
    90            - name: LD_LIBRARY_PATH
    91              value: /usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind
                  # Role selectors: both variables carry the same value, PSERVER.
    92            - name: PADDLE_TRAINING_ROLE
    93              value: PSERVER
    94            - name: TRAINING_ROLE
    95              value: PSERVER
                # Pod-level restart policy for the container above.
    96          restartPolicy: OnFailure
    97    - replicas: 2
            # Task "trainer": two replicas of the pod template below. Each pod runs a
            # single container from volcanosh/edlctr:v1 with the role variables
            # PADDLE_TRAINING_ROLE / TRAINING_ROLE set to TRAINER.
            # Task-level policy: the TaskCompleted event is mapped to the CompleteJob
            # action.
    98      policies:
    99      - event: TaskCompleted
   100        action: CompleteJob
   101      name: trainer
   102      template:
   103        metadata:
   104          labels:
   105            paddle-job: fluid-ctr
   106        spec:
   107          imagePullSecrets:
   108          - name: default-secret
                # Host directory /home/work/ is exposed as volume "seqdata"; the
                # container mounts it at /mnt/seqdata (see volumeMounts).
   109          volumes:
   110          - hostPath:
   111              path: /home/work/
   112              type: ""
   113            name: seqdata
   114          containers:
   115          - image: volcanosh/edlctr:v1
                  # NOTE(review): paddle_k8s is presumably a launcher shipped in the
                  # image, with start_fluid as its sub-command — confirm.
   116            command:
   117            - paddle_k8s
   118            - start_fluid
   119            imagePullPolicy: IfNotPresent
   120            name: trainer
   121            volumeMounts:
   122            - mountPath: /mnt/seqdata
   123              name: seqdata
                  # NOTE(review): the ephemeral-storage request equals its limit (10Gi)
                  # here, whereas the pserver task requests 1Gi — confirm intentional.
                  # NOTE(review): the memory request uses the decimal suffix "M" while
                  # the limit uses the binary suffix "Gi" — confirm this is intended.
   124            resources:
   125              limits:
   126                cpu: 10
   127                memory: 30Gi
   128                ephemeral-storage: 10Gi
   129              requests:
   130                cpu: 1
   131                memory: 100M
   132                ephemeral-storage: 10Gi
                  # Container environment. Numeric-looking values are quoted so they
                  # stay strings. Relative to the pserver task this list adds CPU_NUM
                  # and sets both role variables to TRAINER.
   133            env:
   134            - name: GLOG_v
   135              value: "0"
   136            - name: GLOG_logtostderr
   137              value: "1"
                  # Explicit empty string, matching the pserver task's TOPOLOGY entry
                  # instead of leaving the value field out.
   138            - name: TOPOLOGY
   139              value: ""
   140            - name: TRAINER_PACKAGE
   141              value: /workspace
   142            - name: CPU_NUM
   143              value: "2"
                  # The next four variables are filled from the pod's own fields via
                  # fieldRef; POD_IP and PADDLE_CURRENT_IP both read status.podIP.
   144            - name: NAMESPACE
   145              valueFrom:
   146                fieldRef:
   147                  apiVersion: v1
   148                  fieldPath: metadata.namespace
   149            - name: POD_IP
   150              valueFrom:
   151                fieldRef:
   152                  apiVersion: v1
   153                  fieldPath: status.podIP
   154            - name: POD_NAME
   155              valueFrom:
   156                fieldRef:
   157                  apiVersion: v1
   158                  fieldPath: metadata.name
   159            - name: PADDLE_CURRENT_IP
   160              valueFrom:
   161                fieldRef:
   162                  apiVersion: v1
   163                  fieldPath: status.podIP
   164            - name: PADDLE_JOB_NAME
   165              value: fluid-ctr
   166            - name: PADDLE_IS_LOCAL
   167              value: "0"
   168            - name: FLAGS_rpc_deadline
   169              value: "36000000"
   170            - name: PADDLE_PORT
   171              value: "30236"
                  # Both counts are "2", matching the replicas of the pserver and
                  # trainer tasks in this Job.
   172            - name: PADDLE_PSERVERS_NUM
   173              value: "2"
   174            - name: PADDLE_TRAINERS_NUM
   175              value: "2"
                  # Role selectors: both variables carry the same value, TRAINER.
   176            - name: PADDLE_TRAINING_ROLE
   177              value: TRAINER
   178            - name: TRAINING_ROLE
   179              value: TRAINER
   180            - name: LD_LIBRARY_PATH
   181              value: /usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind
                  # NOTE(review): ENTRY holds a shell command line; presumably it is
                  # executed by the paddle_k8s launcher — confirm.
   182            - name: ENTRY
   183              value: cd /workspace/ctr && python train.py --is_local 0 --cloud_train 1
                # Pod-level restart policy for the container above.
   184          restartPolicy: OnFailure