github.com/kubeflow/training-operator@v1.7.0/examples/mxnet/mxjob_dist_v1.yaml (about)

     1  apiVersion: "kubeflow.org/v1"  # Kubeflow training-operator API group/version
     2  kind: "MXJob"  # distributed MXNet job made of the Scheduler, Server and Worker roles below
     3  metadata:
     4    name: "mxnet-job"
     5  spec:
     6    jobMode: MXTrain  # NOTE(review): presumably selects a training run - confirm against the MXJob API
     7    mxReplicaSpecs:
     8      Scheduler:  # no command/args set for this role; presumably the image default runs - confirm
     9        replicas: 1
    10        restartPolicy: Never  # pod is not restarted by this policy once it exits
    11        template:
    12          spec:
    13            containers:
    14              - name: mxnet
    15                image: mxjob/mxnet:gpu
    16                ports:
    17                  - containerPort: 9991
    18                    name: mxjob-port  # same port number and name on all three roles
    19      Server:  # no command/args set for this role; presumably the image default runs - confirm
    20        replicas: 1
    21        restartPolicy: Never
    22        template:
    23          spec:
    24            containers:
    25              - name: mxnet
    26                image: mxjob/mxnet:gpu
    27                ports:
    28                  - containerPort: 9991
    29                    name: mxjob-port
    30      Worker:  # the only role with an explicit command and a GPU limit
    31        replicas: 1
    32        restartPolicy: Never
    33        template:
    34          spec:
    35            containers:
    36              - name: mxnet
    37                image: mxjob/mxnet:gpu
    38                command: ["python"]  # script path and flags follow in args; numeric values there are quoted so they stay strings
    39                args: ["/incubator-mxnet/example/image-classification/train_mnist.py","--num-epochs","10","--num-layers","2","--kv-store","dist_device_sync","--gpus","0"]
    40                resources:
    41                  limits:
    42                    nvidia.com/gpu: 1  # one GPU for the worker; args pass --gpus 0
    43                ports:
    44                  - containerPort: 9991
    45                    name: mxjob-port