github.com/kubeflow/training-operator@v1.7.0/examples/xgboost/lightgbm-dist/README.md (about) 1 ### Distributed LightGBM Job Training 2 3 This folder contains a Dockerfile and Python scripts to run distributed LightGBM training using the XGBoost operator. 4 The code is based on this [example](https://github.com/microsoft/LightGBM/tree/master/examples/parallel_learning) in the official GitHub repository of the library. 5 6 7 **Build image** 8 The default image name and tag are `kubeflow/lightgbm-dist-py-test` and `1.0` respectively. 9 10 ```shell 11 docker build -f Dockerfile -t kubeflow/lightgbm-dist-py-test:1.0 ./ 12 ``` 13 14 **Start the training** 15 16 ``` 17 kubectl create -f xgboostjob_v1_lightgbm_dist_training.yaml 18 ``` 19 20 **Look at the job status** 21 ``` 22 kubectl get -o yaml XGBoostJob/lightgbm-dist-train-test 23 ``` 24 Here is sample output after the job has completed. The output looks like this: 25 26 ``` 27 apiVersion: xgboostjob.kubeflow.org/v1 28 kind: XGBoostJob 29 metadata: 30 annotations: 31 kubectl.kubernetes.io/last-applied-configuration: | 32 
{"apiVersion":"xgboostjob.kubeflow.org/v1","kind":"XGBoostJob","metadata":{"annotations":{},"name":"lightgbm-dist-train-test","namespace":"default"},"spec":{"xgbReplicaSpecs":{"Master":{"replicas":1,"restartPolicy":"Never","template":{"apiVersion":"v1","kind":"Pod","spec":{"containers":[{"args":["--job_type=Train","--boosting_type=gbdt","--objective=binary","--metric=binary_logloss,auc","--metric_freq=1","--is_training_metric=true","--max_bin=255","--data=data/binary.train","--valid_data=data/binary.test","--num_trees=100","--learning_rate=01","--num_leaves=63","--tree_learner=feature","--feature_fraction=0.8","--bagging_freq=5","--bagging_fraction=0.8","--min_data_in_leaf=50","--min_sum_hessian_in_leaf=50","--is_enable_sparse=true","--use_two_round_loading=false","--is_save_binary_file=false"],"image":"kubeflow/lightgbm-dist-py-test:1.0","imagePullPolicy":"Never","name":"xgboostjob","ports":[{"containerPort":9991,"name":"xgboostjob-port"}]}]}}},"Worker":{"replicas":2,"restartPolicy":"ExitCode","template":{"apiVersion":"v1","kind":"Pod","spec":{"containers":[{"args":["--job_type=Train","--boosting_type=gbdt","--objective=binary","--metric=binary_logloss,auc","--metric_freq=1","--is_training_metric=true","--max_bin=255","--data=data/binary.train","--valid_data=data/binary.test","--num_trees=100","--learning_rate=01","--num_leaves=63","--tree_learner=feature","--feature_fraction=0.8","--bagging_freq=5","--bagging_fraction=0.8","--min_data_in_leaf=50","--min_sum_hessian_in_leaf=50","--is_enable_sparse=true","--use_two_round_loading=false","--is_save_binary_file=false"],"image":"kubeflow/lightgbm-dist-py-test:1.0","imagePullPolicy":"Never","name":"xgboostjob","ports":[{"containerPort":9991,"name":"xgboostjob-port"}]}]}}}}}} 33 creationTimestamp: "2020-10-14T15:31:23Z" 34 generation: 7 35 managedFields: 36 - apiVersion: xgboostjob.kubeflow.org/v1 37 fieldsType: FieldsV1 38 fieldsV1: 39 f:metadata: 40 f:annotations: 41 .: {} 42 
f:kubectl.kubernetes.io/last-applied-configuration: {} 43 f:spec: 44 .: {} 45 f:xgbReplicaSpecs: 46 .: {} 47 f:Master: 48 .: {} 49 f:replicas: {} 50 f:restartPolicy: {} 51 f:template: 52 .: {} 53 f:spec: {} 54 f:Worker: 55 .: {} 56 f:replicas: {} 57 f:restartPolicy: {} 58 f:template: 59 .: {} 60 f:spec: {} 61 manager: kubectl-client-side-apply 62 operation: Update 63 time: "2020-10-14T15:31:23Z" 64 - apiVersion: xgboostjob.kubeflow.org/v1 65 fieldsType: FieldsV1 66 fieldsV1: 67 f:spec: 68 f:RunPolicy: 69 .: {} 70 f:cleanPodPolicy: {} 71 f:xgbReplicaSpecs: 72 f:Master: 73 f:template: 74 f:metadata: 75 .: {} 76 f:creationTimestamp: {} 77 f:spec: 78 f:containers: {} 79 f:Worker: 80 f:template: 81 f:metadata: 82 .: {} 83 f:creationTimestamp: {} 84 f:spec: 85 f:containers: {} 86 f:status: 87 .: {} 88 f:completionTime: {} 89 f:conditions: {} 90 f:replicaStatuses: 91 .: {} 92 f:Master: 93 .: {} 94 f:succeeded: {} 95 f:Worker: 96 .: {} 97 f:succeeded: {} 98 manager: main 99 operation: Update 100 time: "2020-10-14T15:34:44Z" 101 name: lightgbm-dist-train-test 102 namespace: default 103 resourceVersion: "38923" 104 selfLink: /apis/xgboostjob.kubeflow.org/v1/namespaces/default/xgboostjobs/lightgbm-dist-train-test 105 uid: b2b887d0-445b-498b-8852-26c8edc98dc7 106 spec: 107 RunPolicy: 108 cleanPodPolicy: None 109 xgbReplicaSpecs: 110 Master: 111 replicas: 1 112 restartPolicy: Never 113 template: 114 metadata: 115 creationTimestamp: null 116 spec: 117 containers: 118 - args: 119 - --job_type=Train 120 - --boosting_type=gbdt 121 - --objective=binary 122 - --metric=binary_logloss,auc 123 - --metric_freq=1 124 - --is_training_metric=true 125 - --max_bin=255 126 - --data=data/binary.train 127 - --valid_data=data/binary.test 128 - --num_trees=100 129 - --learning_rate=01 130 - --num_leaves=63 131 - --tree_learner=feature 132 - --feature_fraction=0.8 133 - --bagging_freq=5 134 - --bagging_fraction=0.8 135 - --min_data_in_leaf=50 136 - --min_sum_hessian_in_leaf=50 137 - 
--is_enable_sparse=true 138 - --use_two_round_loading=false 139 - --is_save_binary_file=false 140 image: kubeflow/lightgbm-dist-py-test:1.0 141 imagePullPolicy: Never 142 name: xgboostjob 143 ports: 144 - containerPort: 9991 145 name: xgboostjob-port 146 resources: {} 147 Worker: 148 replicas: 2 149 restartPolicy: ExitCode 150 template: 151 metadata: 152 creationTimestamp: null 153 spec: 154 containers: 155 - args: 156 - --job_type=Train 157 - --boosting_type=gbdt 158 - --objective=binary 159 - --metric=binary_logloss,auc 160 - --metric_freq=1 161 - --is_training_metric=true 162 - --max_bin=255 163 - --data=data/binary.train 164 - --valid_data=data/binary.test 165 - --num_trees=100 166 - --learning_rate=01 167 - --num_leaves=63 168 - --tree_learner=feature 169 - --feature_fraction=0.8 170 - --bagging_freq=5 171 - --bagging_fraction=0.8 172 - --min_data_in_leaf=50 173 - --min_sum_hessian_in_leaf=50 174 - --is_enable_sparse=true 175 - --use_two_round_loading=false 176 - --is_save_binary_file=false 177 image: kubeflow/lightgbm-dist-py-test:1.0 178 imagePullPolicy: Never 179 name: xgboostjob 180 ports: 181 - containerPort: 9991 182 name: xgboostjob-port 183 resources: {} 184 status: 185 completionTime: "2020-10-14T15:34:44Z" 186 conditions: 187 - lastTransitionTime: "2020-10-14T15:31:23Z" 188 lastUpdateTime: "2020-10-14T15:31:23Z" 189 message: xgboostJob lightgbm-dist-train-test is created. 190 reason: XGBoostJobCreated 191 status: "True" 192 type: Created 193 - lastTransitionTime: "2020-10-14T15:31:23Z" 194 lastUpdateTime: "2020-10-14T15:31:23Z" 195 message: XGBoostJob lightgbm-dist-train-test is running. 196 reason: XGBoostJobRunning 197 status: "False" 198 type: Running 199 - lastTransitionTime: "2020-10-14T15:34:44Z" 200 lastUpdateTime: "2020-10-14T15:34:44Z" 201 message: XGBoostJob lightgbm-dist-train-test is successfully completed. 
202 reason: XGBoostJobSucceeded 203 status: "True" 204 type: Succeeded 205 replicaStatuses: 206 Master: 207 succeeded: 1 208 Worker: 209 succeeded: 2 210 ```