github.com/kubeflow/training-operator@v1.7.0/examples/mxnet/train/mx_job_dist_gpu_v1.yaml (about)

     1  ---
     2  # MXJob "mxnet-job": MXTrain mode with one Scheduler, one Server and one
     3  # Worker replica, all on the mxjob/mxnet:gpu image. Only the Worker sets an
     4  # explicit command and a GPU limit.
     5  apiVersion: kubeflow.org/v1
     6  kind: MXJob
     7  metadata:
     8    name: mxnet-job
     9  spec:
    10    jobMode: MXTrain
    11    mxReplicaSpecs:
    12      # No command/args here: the container runs whatever the image defines.
    13      Scheduler:
    14        replicas: 1
    15        restartPolicy: Never
    16        template:
    17          spec:
    18            containers:
    19              - name: mxnet
    20                image: mxjob/mxnet:gpu
    21      # Same shape as Scheduler: image only, no command/args.
    22      Server:
    23        replicas: 1
    24        restartPolicy: Never
    25        template:
    26          spec:
    27            containers:
    28              - name: mxnet
    29                image: mxjob/mxnet:gpu
    30      # The only replica type with an explicit command, args and a GPU limit.
    31      Worker:
    32        replicas: 1
    33        restartPolicy: Never
    34        template:
    35          spec:
    36            containers:
    37              - name: mxnet
    38                image: mxjob/mxnet:gpu
    39                command:
    40                  - python
    41                # One argv element per entry; numeric values stay quoted so
    42                # they are passed as strings.
    43                args:
    44                  - /incubator-mxnet/example/image-classification/train_mnist.py
    45                  - --num-epochs
    46                  - '10'
    47                  - --num-layers
    48                  - '2'
    49                  - --kv-store
    50                  - dist_device_sync
    51                  - --gpus
    52                  - '0'
    53                resources:
    54                  limits:
    55                    nvidia.com/gpu: 1