github.com/kubeflow/training-operator@v1.7.0/examples/xgboost/xgboost-dist/utils.py

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import logging
import os
import tempfile

import joblib
import oss2
import pandas as pd
import xgboost as xgb
from sklearn import datasets

logger = logging.getLogger(__name__)


def extract_xgbooost_cluster_env():
    """
    Extract the cluster env from the pod.
    :return: the related cluster env used to build the Rabit tracker
    """

    logger.info("starting to extract system env")

    # These variables are injected into every pod by the training operator;
    # index them directly so a missing variable fails fast with a KeyError
    # instead of crashing later on int("{}").
    master_addr = os.environ["MASTER_ADDR"]
    master_port = int(os.environ["MASTER_PORT"])
    rank = int(os.environ["RANK"])
    world_size = int(os.environ["WORLD_SIZE"])

    logger.info("extracted the Rabit env from the cluster: %s, port: %d, "
                "rank: %d, world_size: %d",
                master_addr, master_port, rank, world_size)

    return master_addr, master_port, rank, world_size
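
# The sketch below is not part of the original file: it illustrates how the
# values returned by extract_xgbooost_cluster_env() are typically fed into
# XGBoost's legacy Rabit tracker before distributed training starts. It
# assumes an XGBoost version that still ships the xgb.rabit module (removed
# in XGBoost 2.0); the function name _example_init_rabit is hypothetical.
def _example_init_rabit():
    master_addr, master_port, rank, world_size = \
        extract_xgbooost_cluster_env()
    rabit_args = [
        ("DMLC_NUM_WORKER=%d" % world_size).encode(),
        ("DMLC_TRACKER_URI=%s" % master_addr).encode(),
        ("DMLC_TRACKER_PORT=%d" % master_port).encode(),
        ("DMLC_TASK_ID=%d" % rank).encode(),
    ]
    # rabit.init() wires this worker into the collective; subsequent
    # xgb.train() calls in this process then run distributed.
    xgb.rabit.init(rabit_args)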
def read_train_data(rank, num_workers, path):
    """
    Read the training file partition based on the rank of the worker.
    We use the sklearn iris data for demonstration; you can extend this to
    read distributed data sources such as HDFS, Hive, etc.
    :param rank: the id of each worker
    :param num_workers: total number of workers in this cluster
    :param path: the input file name or the place to read the data
    :return: XGBoost DMatrix
    """
    iris = datasets.load_iris()
    x = iris.data
    y = iris.target

    start, end = get_range_data(len(x), rank, num_workers)
    x = x[start:end, :]
    y = y[start:end]

    x = pd.DataFrame(x)
    y = pd.DataFrame(y)
    dtrain = xgb.DMatrix(data=x, label=y)

    logger.info("Read data from the iris data source with range %d to %d",
                start, end)

    return dtrain


def read_predict_data(rank, num_workers, path):
    """
    Read the prediction file partition based on the rank of the worker.
    We use the sklearn iris data for demonstration; you can extend this to
    read distributed data sources such as HDFS, Hive, etc.
    :param rank: the id of each worker
    :param num_workers: total number of workers in this cluster
    :param path: the input file name or the place to read the data
    :return: XGBoost DMatrix and the true labels
    """
    iris = datasets.load_iris()
    x = iris.data
    y = iris.target

    start, end = get_range_data(len(x), rank, num_workers)
    x = x[start:end, :]
    y = y[start:end]
    x = pd.DataFrame(x)
    y = pd.DataFrame(y)

    logger.info("Read data from the iris data source with range %d to %d",
                start, end)

    predict = xgb.DMatrix(x, label=y)

    return predict, y


def get_range_data(num_row, rank, num_workers):
    """
    Compute the data range based on the input data size and the worker id.
    :param num_row: total number of rows in the dataset
    :param rank: the worker id
    :param num_workers: total number of workers
    :return: begin and end of this worker's slice of the input matrix
    """
    num_per_partition = num_row // num_workers

    x_start = rank * num_per_partition
    x_end = (rank + 1) * num_per_partition

    # Give the remainder rows to the last worker so no data is silently
    # dropped when num_row is not evenly divisible by num_workers.
    if rank == num_workers - 1:
        x_end = num_row

    return x_start, x_end


def dump_model(model, type, model_path, args):
    """
    Dump the trained model to local or remote storage.
    You can update this function to store the model in other remote places.
    :param model: the trained XGBoost booster
    :param type: model storage type ("local" or "oss")
    :param model_path: place to store the model
    :param args: configuration for model storage
    :return: True if the dump process succeeds
    """
    if model is None:
        raise Exception("fail to get the trained XGBoost model")

    if type == "local":
        joblib.dump(model, model_path)
        logger.info("Dumped model to local path %s", model_path)

    elif type == "oss":
        oss_param = parse_parameters(args.oss_param, ",", ":")
        if oss_param is None:
            raise Exception("Please configure the oss parameter to store the model")

        oss_param['path'] = args.model_path
        dump_model_to_oss(oss_param, model)
        logger.info("Dumped model to oss path %s", args.model_path)

    return True


def read_model(type, model_path, args):
    """
    Read the model from physical storage.
    :param type: "oss" or "local"
    :param model_path: place the model is stored
    :param args: configuration to read the model
    :return: XGBoost model
    """

    if type == "local":
        model = joblib.load(model_path)
        logger.info("Read model from local path %s", model_path)

    elif type == "oss":
        oss_param = parse_parameters(args.oss_param, ",", ":")
        if oss_param is None:
            raise Exception("Please configure oss to read the model")

        oss_param['path'] = args.model_path

        model = read_model_from_oss(oss_param)
        logger.info("Read model from oss path %s", args.model_path)

    return model
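
# The sketch below is not part of the original file: it round-trips a tiny
# booster through dump_model()/read_model() using the "local" storage type,
# which needs no oss configuration (args may be None). The training
# parameters and the /tmp path are illustrative placeholders.
def _example_local_model_roundtrip():
    # Single-worker setup: rank 0 sees the whole iris dataset.
    dtrain = read_train_data(rank=0, num_workers=1, path=None)
    booster = xgb.train({"objective": "multi:softmax", "num_class": 3},
                        dtrain, num_boost_round=2)
    dump_model(booster, "local", "/tmp/iris-model.joblib", args=None)
    # joblib restores the very same Booster object that was stored above.
    return read_model("local", "/tmp/iris-model.joblib", args=None)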
def dump_model_to_oss(oss_parameters, booster):
    """
    Dump the model to a remote OSS bucket.
    :param oss_parameters: oss configuration
    :param booster: XGBoost model
    :return: True if the store procedure succeeds
    """
    model_fname = os.path.join(tempfile.mkdtemp(), 'model')
    text_model_fname = os.path.join(tempfile.mkdtemp(), 'model.text')
    feature_importance = os.path.join(tempfile.mkdtemp(),
                                      'feature_importance.json')

    oss_path = oss_parameters['path']
    logger.info('---- export model ----')
    booster.save_model(model_fname)
    booster.dump_model(text_model_fname)  # dump the model in text format
    fscore_dict = booster.get_fscore()
    with open(feature_importance, 'w') as file:
        file.write(json.dumps(fscore_dict))
    logger.info('---- chief dumped model successfully!')

    if os.path.exists(model_fname):
        logger.info('---- Upload Model start...')

        while oss_path[-1] == '/':
            oss_path = oss_path[:-1]

        upload_oss(oss_parameters, model_fname, oss_path)
        aux_path = oss_path + '_dir/'
        upload_oss(oss_parameters, model_fname, aux_path)
        upload_oss(oss_parameters, text_model_fname, aux_path)
        upload_oss(oss_parameters, feature_importance, aux_path)
    else:
        raise Exception("fail to generate the model file")

    return True


def upload_oss(kw, local_file, oss_path):
    """
    Helper function to upload a model file to oss.
    :param kw: OSS parameters
    :param local_file: local path of the model file
    :param oss_path: remote OSS destination
    :return: None; raises ValueError if the upload fails
    """
    if oss_path[-1] == '/':
        oss_path = '%s%s' % (oss_path, os.path.basename(local_file))

    auth = oss2.Auth(kw['access_id'], kw['access_key'])
    bucket = kw['access_bucket']
    bkt = oss2.Bucket(auth=auth, endpoint=kw['endpoint'], bucket_name=bucket)

    try:
        bkt.put_object_from_file(key=oss_path, filename=local_file)
        logger.info("uploaded %s to %s successfully!",
                    os.path.abspath(local_file), oss_path)
    except Exception:
        raise ValueError('upload %s to %s failed' %
                         (os.path.abspath(local_file), oss_path))


def read_model_from_oss(kw):
    """
    Helper function to read a model from oss.
    :param kw: OSS parameters
    :return: XGBoost booster model
    """
    auth = oss2.Auth(kw['access_id'], kw['access_key'])
    bucket = kw['access_bucket']
    bkt = oss2.Bucket(auth=auth, endpoint=kw['endpoint'], bucket_name=bucket)
    oss_path = kw["path"]

    temp_model_fname = os.path.join(tempfile.mkdtemp(), 'local_model')
    try:
        bkt.get_object_to_file(key=oss_path, filename=temp_model_fname)
        logger.info("loaded model from oss %s successfully", oss_path)
    except Exception as e:
        logger.error("fail to load model: %s", e)
        raise Exception("fail to load model from oss %s" % oss_path)

    bst = xgb.Booster({'nthread': 2})  # init model

    bst.load_model(temp_model_fname)

    return bst


def parse_parameters(input, splitter_between, splitter_in):
    """
    Helper function to parse the input parameter string.
    :param input: the configuration string of key-value pairs
    :param splitter_between: the splitter between configs in the input string
    :param splitter_in: the splitter inside a config in the input string
    :return: key-value pair configuration dict
    """

    kv_pairs = input.split(splitter_between)

    confs = {}

    for kv in kv_pairs:
        conf = kv.split(splitter_in)
        key = conf[0].strip(" ")
        if key == "objective" or key == "endpoint":
            # These values contain the inner splitter themselves
            # (e.g. "http://..." or "binary:logistic"), so re-join the
            # two fields around it.
            value = conf[1].strip("'") + ":" + conf[2].strip("'")
        else:
            value = conf[1]

        confs[key] = value
    return confs
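
# The sketch below is not part of the original file: it shows the flat string
# format parse_parameters() expects for the oss_param argument; the
# credentials and endpoint are placeholders. Note how the endpoint value
# survives the colon split thanks to the special-case re-join above.
def _example_parse_oss_param():
    raw = ("access_id:AKID,access_key:SECRET,access_bucket:my-bucket,"
           "endpoint:http://oss-cn-hangzhou.aliyuncs.com")
    oss_param = parse_parameters(raw, ",", ":")
    # -> {'access_id': 'AKID', 'access_key': 'SECRET',
    #     'access_bucket': 'my-bucket',
    #     'endpoint': 'http://oss-cn-hangzhou.aliyuncs.com'}
    return oss_param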