# Source: github.com/kubeflow/training-operator@v1.7.0/examples/paddlepaddle/simple-gpu.yaml
#
# PaddleJob that starts two GPU Worker replicas, each running
# `python -m paddle.distributed.launch run_check`.
---
apiVersion: "kubeflow.org/v1"
kind: PaddleJob
metadata:
  name: paddle-simple-gpu
  namespace: kubeflow
spec:
  paddleReplicaSpecs:
    Worker:
      replicas: 2
      restartPolicy: OnFailure
      template:
        spec:
          containers:
            - name: paddle
              image: registry.baidubce.com/paddlepaddle/paddle:2.4.0rc0-gpu-cuda11.2-cudnn8.1-trt8.0
              command:
                - python
              # Quoted so the leading dash is never mistaken for a
              # sequence indicator.
              args:
                - "-m"
                - paddle.distributed.launch
                - "run_check"
              # NOTE(review): presumably the port the operator uses for
              # worker rendezvous -- confirm against the PaddleJob docs.
              ports:
                - containerPort: 37777
                  name: master
              imagePullPolicy: Always
              # Each Worker container is limited to 2 NVIDIA GPUs.
              resources:
                limits:
                  nvidia.com/gpu: 2
              # Mounts the memory-backed "dshm" volume over /dev/shm.
              # NOTE(review): presumably to enlarge shared memory for
              # multi-process communication -- confirm.
              volumeMounts:
                - mountPath: /dev/shm
                  name: dshm
          volumes:
            # emptyDir with medium "Memory"; no sizeLimit is set here.
            - name: dshm
              emptyDir:
                medium: Memory