github.com/kubeflow/training-operator@v1.7.0/examples/xgboost/lightgbm-dist/README.md (about)

     1  ### Distributed LightGBM training job
     2  
     3  This folder contains the Dockerfile and Python scripts needed to run distributed LightGBM training using the XGBoost operator.
     4  The code is based on this [example](https://github.com/microsoft/LightGBM/tree/master/examples/parallel_learning) in the official GitHub repository of the library.
     5  
     6  
     7  **Build image**
     8  The default image name and tag are `kubeflow/lightgbm-dist-py-test` and `1.0`, respectively.
     9  
    10  ```shell
    11  docker build -f Dockerfile -t kubeflow/lightgbm-dist-py-test:1.0 ./
    12  ```
    13  
    14  **Start the training**
    15  
    16  ```
    17  kubectl create -f xgboostjob_v1_lightgbm_dist_training.yaml
    18  ```
    19  
    20  **Look at the job status**
    21  ```
    22   kubectl get -o yaml XGBoostJob/lightgbm-dist-train-test
    23   ```
    24  Here is sample output captured after the job has completed successfully. The output looks like this:
    25  
    26  ```
    27  apiVersion: xgboostjob.kubeflow.org/v1
    28  kind: XGBoostJob
    29  metadata:
    30    annotations:
    31      kubectl.kubernetes.io/last-applied-configuration: |
    32        {"apiVersion":"xgboostjob.kubeflow.org/v1","kind":"XGBoostJob","metadata":{"annotations":{},"name":"lightgbm-dist-train-test","namespace":"default"},"spec":{"xgbReplicaSpecs":{"Master":{"replicas":1,"restartPolicy":"Never","template":{"apiVersion":"v1","kind":"Pod","spec":{"containers":[{"args":["--job_type=Train","--boosting_type=gbdt","--objective=binary","--metric=binary_logloss,auc","--metric_freq=1","--is_training_metric=true","--max_bin=255","--data=data/binary.train","--valid_data=data/binary.test","--num_trees=100","--learning_rate=01","--num_leaves=63","--tree_learner=feature","--feature_fraction=0.8","--bagging_freq=5","--bagging_fraction=0.8","--min_data_in_leaf=50","--min_sum_hessian_in_leaf=50","--is_enable_sparse=true","--use_two_round_loading=false","--is_save_binary_file=false"],"image":"kubeflow/lightgbm-dist-py-test:1.0","imagePullPolicy":"Never","name":"xgboostjob","ports":[{"containerPort":9991,"name":"xgboostjob-port"}]}]}}},"Worker":{"replicas":2,"restartPolicy":"ExitCode","template":{"apiVersion":"v1","kind":"Pod","spec":{"containers":[{"args":["--job_type=Train","--boosting_type=gbdt","--objective=binary","--metric=binary_logloss,auc","--metric_freq=1","--is_training_metric=true","--max_bin=255","--data=data/binary.train","--valid_data=data/binary.test","--num_trees=100","--learning_rate=01","--num_leaves=63","--tree_learner=feature","--feature_fraction=0.8","--bagging_freq=5","--bagging_fraction=0.8","--min_data_in_leaf=50","--min_sum_hessian_in_leaf=50","--is_enable_sparse=true","--use_two_round_loading=false","--is_save_binary_file=false"],"image":"kubeflow/lightgbm-dist-py-test:1.0","imagePullPolicy":"Never","name":"xgboostjob","ports":[{"containerPort":9991,"name":"xgboostjob-port"}]}]}}}}}}
    33    creationTimestamp: "2020-10-14T15:31:23Z"
    34    generation: 7
    35    managedFields:
    36    - apiVersion: xgboostjob.kubeflow.org/v1
    37      fieldsType: FieldsV1
    38      fieldsV1:
    39        f:metadata:
    40          f:annotations:
    41            .: {}
    42            f:kubectl.kubernetes.io/last-applied-configuration: {}
    43        f:spec:
    44          .: {}
    45          f:xgbReplicaSpecs:
    46            .: {}
    47            f:Master:
    48              .: {}
    49              f:replicas: {}
    50              f:restartPolicy: {}
    51              f:template:
    52                .: {}
    53                f:spec: {}
    54            f:Worker:
    55              .: {}
    56              f:replicas: {}
    57              f:restartPolicy: {}
    58              f:template:
    59                .: {}
    60                f:spec: {}
    61      manager: kubectl-client-side-apply
    62      operation: Update
    63      time: "2020-10-14T15:31:23Z"
    64    - apiVersion: xgboostjob.kubeflow.org/v1
    65      fieldsType: FieldsV1
    66      fieldsV1:
    67        f:spec:
    68          f:RunPolicy:
    69            .: {}
    70            f:cleanPodPolicy: {}
    71          f:xgbReplicaSpecs:
    72            f:Master:
    73              f:template:
    74                f:metadata:
    75                  .: {}
    76                  f:creationTimestamp: {}
    77                f:spec:
    78                  f:containers: {}
    79            f:Worker:
    80              f:template:
    81                f:metadata:
    82                  .: {}
    83                  f:creationTimestamp: {}
    84                f:spec:
    85                  f:containers: {}
    86        f:status:
    87          .: {}
    88          f:completionTime: {}
    89          f:conditions: {}
    90          f:replicaStatuses:
    91            .: {}
    92            f:Master:
    93              .: {}
    94              f:succeeded: {}
    95            f:Worker:
    96              .: {}
    97              f:succeeded: {}
    98      manager: main
    99      operation: Update
   100      time: "2020-10-14T15:34:44Z"
   101    name: lightgbm-dist-train-test
   102    namespace: default
   103    resourceVersion: "38923"
   104    selfLink: /apis/xgboostjob.kubeflow.org/v1/namespaces/default/xgboostjobs/lightgbm-dist-train-test
   105    uid: b2b887d0-445b-498b-8852-26c8edc98dc7
   106  spec:
   107    RunPolicy:
   108      cleanPodPolicy: None
   109    xgbReplicaSpecs:
   110      Master:
   111        replicas: 1
   112        restartPolicy: Never
   113        template:
   114          metadata:
   115            creationTimestamp: null
   116          spec:
   117            containers:
   118            - args:
   119              - --job_type=Train
   120              - --boosting_type=gbdt
   121              - --objective=binary
   122              - --metric=binary_logloss,auc
   123              - --metric_freq=1
   124              - --is_training_metric=true
   125              - --max_bin=255
   126              - --data=data/binary.train
   127              - --valid_data=data/binary.test
   128              - --num_trees=100
   129              - --learning_rate=01
   130              - --num_leaves=63
   131              - --tree_learner=feature
   132              - --feature_fraction=0.8
   133              - --bagging_freq=5
   134              - --bagging_fraction=0.8
   135              - --min_data_in_leaf=50
   136              - --min_sum_hessian_in_leaf=50
   137              - --is_enable_sparse=true
   138              - --use_two_round_loading=false
   139              - --is_save_binary_file=false
   140              image: kubeflow/lightgbm-dist-py-test:1.0
   141              imagePullPolicy: Never
   142              name: xgboostjob
   143              ports:
   144              - containerPort: 9991
   145                name: xgboostjob-port
   146              resources: {}
   147      Worker:
   148        replicas: 2
   149        restartPolicy: ExitCode
   150        template:
   151          metadata:
   152            creationTimestamp: null
   153          spec:
   154            containers:
   155            - args:
   156              - --job_type=Train
   157              - --boosting_type=gbdt
   158              - --objective=binary
   159              - --metric=binary_logloss,auc
   160              - --metric_freq=1
   161              - --is_training_metric=true
   162              - --max_bin=255
   163              - --data=data/binary.train
   164              - --valid_data=data/binary.test
   165              - --num_trees=100
   166              - --learning_rate=01
   167              - --num_leaves=63
   168              - --tree_learner=feature
   169              - --feature_fraction=0.8
   170              - --bagging_freq=5
   171              - --bagging_fraction=0.8
   172              - --min_data_in_leaf=50
   173              - --min_sum_hessian_in_leaf=50
   174              - --is_enable_sparse=true
   175              - --use_two_round_loading=false
   176              - --is_save_binary_file=false
   177              image: kubeflow/lightgbm-dist-py-test:1.0
   178              imagePullPolicy: Never
   179              name: xgboostjob
   180              ports:
   181              - containerPort: 9991
   182                name: xgboostjob-port
   183              resources: {}
   184  status:
   185    completionTime: "2020-10-14T15:34:44Z"
   186    conditions:
   187    - lastTransitionTime: "2020-10-14T15:31:23Z"
   188      lastUpdateTime: "2020-10-14T15:31:23Z"
   189      message: xgboostJob lightgbm-dist-train-test is created.
   190      reason: XGBoostJobCreated
   191      status: "True"
   192      type: Created
   193    - lastTransitionTime: "2020-10-14T15:31:23Z"
   194      lastUpdateTime: "2020-10-14T15:31:23Z"
   195      message: XGBoostJob lightgbm-dist-train-test is running.
   196      reason: XGBoostJobRunning
   197      status: "False"
   198      type: Running
   199    - lastTransitionTime: "2020-10-14T15:34:44Z"
   200      lastUpdateTime: "2020-10-14T15:34:44Z"
   201      message: XGBoostJob lightgbm-dist-train-test is successfully completed.
   202      reason: XGBoostJobSucceeded
   203      status: "True"
   204      type: Succeeded
   205    replicaStatuses:
   206      Master:
   207        succeeded: 1
   208      Worker:
   209        succeeded: 2
   210  ```