# Source: github.com/kubeflow/training-operator@v1.7.0/examples/mxnet/train/byteps_dist_gpu_v1.yaml

     1  apiVersion: "kubeflow.org/v1"
     2  kind: "MXJob"
     3  metadata:
     4    name: "byteps-mxnet-job"
     5  spec:
     6    jobMode: MXTrain
     7    runPolicy:
     8      cleanPodPolicy: Running
     9    mxReplicaSpecs:
    10      Scheduler:
    11        replicas: 1
    12        restartPolicy: Never
    13        template:
    14          spec:
    15            containers:
    16              - name: mxnet
    17                image: bytepsimage/mxnet
    18                command: ["bpslaunch"]
    19      Server:
    20        replicas: 2
    21        restartPolicy: Never
    22        template:
    23          spec:
    24            containers:
    25              - name: mxnet
    26                image: bytepsimage/mxnet
    27                command: ["bpslaunch"]
    28      Worker:
    29        replicas: 2
    30        restartPolicy: Never
    31        template:
    32          spec:
    33            containers:
    34              - name: mxnet
    35                image: bytepsimage/mxnet
    36                command: ["bpslaunch"]
    37                args: ["python3", "/usr/local/byteps/example/mxnet/train_imagenet_byteps.py", "--benchmark", "1", "--batch-size=32"]
    38                volumeMounts:
    39                - mountPath: /dev/shm
    40                  name: dshm
    41                resources:
    42                  limits:
    43                    nvidia.com/gpu: 8
    44            volumes:
    45            - name: dshm
    46              emptyDir: 
    47                medium: Memory