github.com/kubeflow/training-operator@v1.7.0/examples/paddlepaddle/simple-gpu.yaml (about)

     1  apiVersion: "kubeflow.org/v1"
     2  kind: PaddleJob  # a PaddleJob resource in the kubeflow.org/v1 API group
     3  metadata:
     4    name: paddle-simple-gpu
     5    namespace: kubeflow
     6  spec:
     7    paddleReplicaSpecs:
     8      Worker:  # the only replica type declared in this example
     9        replicas: 2  # two Worker replicas are requested
    10        restartPolicy: OnFailure
    11        template:  # pod template shared by every Worker replica
    12          spec:
    13            containers:
    14              - name: paddle
    15                image: registry.baidubce.com/paddlepaddle/paddle:2.4.0rc0-gpu-cuda11.2-cudnn8.1-trt8.0
    16                command:  # command + args form: python -m paddle.distributed.launch run_check
    17                  - python
    18                args:
    19                  - "-m"
    20                  - paddle.distributed.launch
    21                  - "run_check"
    22                ports:  # NOTE(review): presumably the port the replicas use to reach each other - confirm against the operator's defaults
    23                  - containerPort: 37777
    24                    name: master
    25                imagePullPolicy: Always
    26                resources:
    27                  limits:  # per-container limit; with replicas: 2 the job can use up to 4 GPUs in total
    28                    nvidia.com/gpu: 2
    29                volumeMounts:  # mounts the dshm volume declared under volumes at /dev/shm
    30                  - mountPath: /dev/shm
    31                    name: dshm
    32            volumes:
    33              - name: dshm
    34                emptyDir:
    35                  medium: Memory  # NOTE(review): presumably RAM-backed so /dev/shm is not capped at the runtime default - confirm
    36