github.com/kubeflow/training-operator@v1.7.0/examples/xgboost/xgboost-dist/utils.py

github.com/kubeflow/training-operator@v1.7.0/examples/xgboost/xgboost-dist/utils.py (about)

     1  # Licensed under the Apache License, Version 2.0 (the "License");
     2  # you may not use this file except in compliance with the License.
     3  # You may obtain a copy of the License at
     4  #
     5  #     http://www.apache.org/licenses/LICENSE-2.0
     6  #
     7  # Unless required by applicable law or agreed to in writing, software
     8  # distributed under the License is distributed on an "AS IS" BASIS,
     9  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    10  # See the License for the specific language governing permissions and
    11  # limitations under the License.
    12  
    13  import logging
    14  import joblib
    15  import xgboost as xgb
    16  import os
    17  import tempfile
    18  import oss2
    19  import json
    20  import pandas as pd
    21  
    22  from sklearn import datasets
    23  
    24  logger = logging.getLogger(__name__)
    25  
    26  
    27  def extract_xgbooost_cluster_env():
    28      """
    29      Extract the cluster env from pod
    30      :return: the related cluster env to build rabit
    31      """
    32  
    33      logger.info("starting to extract system env")
    34  
    35      master_addr = os.environ.get("MASTER_ADDR", "{}")
    36      master_port = int(os.environ.get("MASTER_PORT", "{}"))
    37      rank = int(os.environ.get("RANK", "{}"))
    38      world_size = int(os.environ.get("WORLD_SIZE", "{}"))
    39  
    40      logger.info("extract the Rabit env from cluster :"
    41                  " %s, port: %d, rank: %d, word_size: %d ",
    42                  master_addr, master_port, rank, world_size)
    43  
    44      return master_addr, master_port, rank, world_size
    45  
    46  
    47  def read_train_data(rank, num_workers, path):
    48      """
    49      Read file based on the rank of worker.
    50      We use the sklearn.iris data for demonstration
    51      You can extend this to read distributed data source like HDFS, HIVE etc
    52      :param rank: the id of each worker
    53      :param num_workers: total number of workers in this cluster
    54      :param path: the input file name or the place to read the data
    55      :return: XGBoost Dmatrix
    56      """
    57      iris = datasets.load_iris()
    58      x = iris.data
    59      y = iris.target
    60  
    61      start, end = get_range_data(len(x), rank, num_workers)
    62      x = x[start:end, :]
    63      y = y[start:end]
    64  
    65      x = pd.DataFrame(x)
    66      y = pd.DataFrame(y)
    67      dtrain = xgb.DMatrix(data=x, label=y)
    68  
    69      logging.info("Read data from IRIS data source with range from %d to %d",
    70                   start, end)
    71  
    72      return dtrain
    73  
    74  
    75  def read_predict_data(rank, num_workers, path):
    76      """
    77      Read file based on the rank of worker.
    78      We use the sklearn.iris data for demonstration
    79      You can extend this to read distributed data source like HDFS, HIVE etc
    80      :param rank: the id of each worker
    81      :param num_workers: total number of workers in this cluster
    82      :param path: the input file name or the place to read the data
    83      :return: XGBoost Dmatrix, and real value
    84      """
    85      iris = datasets.load_iris()
    86      x = iris.data
    87      y = iris.target
    88  
    89      start, end = get_range_data(len(x), rank, num_workers)
    90      x = x[start:end, :]
    91      y = y[start:end]
    92      x = pd.DataFrame(x)
    93      y = pd.DataFrame(y)
    94  
    95      logging.info("Read data from IRIS datasource with range from %d to %d",
    96                   start, end)
    97  
    98      predict = xgb.DMatrix(x, label=y)
    99  
   100      return predict, y
   101  
   102  
   103  def get_range_data(num_row, rank, num_workers):
   104      """
   105      compute the data range based on the input data size and worker id
   106      :param num_row: total number of dataset
   107      :param rank: the worker id
   108      :param num_workers: total number of workers
   109      :return: begin and end range of input matrix
   110      """
   111      num_per_partition = int(num_row/num_workers)
   112  
   113      x_start = rank * num_per_partition
   114      x_end = (rank + 1) * num_per_partition
   115  
   116      if x_end > num_row:
   117          x_end = num_row
   118  
   119      return x_start, x_end
   120  
   121  
   122  def dump_model(model, type, model_path, args):
   123      """
   124      dump the trained model into local place
   125      you can update this function to store the model into a remote place
   126      :param model: the xgboost trained booster
   127      :param type: model storage type
   128      :param model_path: place to store model
   129      :param args: configuration for model storage
   130      :return: True if the dump process success
   131      """
   132      if model is None:
   133          raise Exception("fail to get the XGBoost train model")
   134      else:
   135          if type == "local":
   136              joblib.dump(model, model_path)
   137              logging.info("Dump model into local place %s", model_path)
   138  
   139          elif type == "oss":
   140              oss_param = parse_parameters(args.oss_param, ",", ":")
   141              if oss_param is None:
   142                  raise Exception("Please config oss parameter to store model")
   143  
   144              oss_param['path'] = args.model_path            
   145              dump_model_to_oss(oss_param, model)
   146              logging.info("Dump model into oss place %s", args.model_path)
   147  
   148      return True
   149  
   150  
   151  def read_model(type, model_path, args):
   152      """
   153      read model from physical storage
   154      :param type: oss or local
   155      :param model_path: place to store the model
   156      :param args: configuration to read model
   157      :return: XGBoost model
   158      """
   159  
   160      if type == "local":
   161          model = joblib.load(model_path)
   162          logging.info("Read model from local place %s", model_path)
   163  
   164      elif type == "oss":
   165          oss_param = parse_parameters(args.oss_param, ",", ":")
   166          if oss_param is None:
   167              raise Exception("Please config oss to read model")
   168              return False
   169  
   170          oss_param['path'] = args.model_path        
   171  
   172          model = read_model_from_oss(oss_param)
   173          logging.info("read model from oss place %s", model_path)
   174  
   175      return model
   176  
   177  
   178  def dump_model_to_oss(oss_parameters, booster):
   179      """
   180      dump the model to remote OSS disk
   181      :param oss_parameters: oss configuration
   182      :param booster: XGBoost model
   183      :return: True if stored procedure is success
   184      """
   185      """export model into oss"""
   186      model_fname = os.path.join(tempfile.mkdtemp(), 'model')
   187      text_model_fname = os.path.join(tempfile.mkdtemp(), 'model.text')
   188      feature_importance = os.path.join(tempfile.mkdtemp(),
   189                                        'feature_importance.json')
   190  
   191      oss_path = oss_parameters['path']
   192      logger.info('---- export model ----')
   193      booster.save_model(model_fname)
   194      booster.dump_model(text_model_fname)  # format output model
   195      fscore_dict = booster.get_fscore()
   196      with open(feature_importance, 'w') as file:
   197          file.write(json.dumps(fscore_dict))
   198          logger.info('---- chief dump model successfully!')
   199  
   200      if os.path.exists(model_fname):
   201          logger.info('---- Upload Model start...')
   202  
   203          while oss_path[-1] == '/':
   204              oss_path = oss_path[:-1]
   205  
   206          upload_oss(oss_parameters, model_fname, oss_path)
   207          aux_path = oss_path + '_dir/'
   208          upload_oss(oss_parameters, model_fname, aux_path)
   209          upload_oss(oss_parameters, text_model_fname, aux_path)
   210          upload_oss(oss_parameters, feature_importance, aux_path)
   211      else:
   212          raise Exception("fail to generate model")
   213          return False
   214  
   215      return True
   216  
   217  
   218  def upload_oss(kw, local_file, oss_path):
   219      """
   220      help function to upload a model to oss
   221      :param kw: OSS parameter
   222      :param local_file: local place of model
   223      :param oss_path: remote place of OSS
   224      :return: True if the procedure is success
   225      """
   226      if oss_path[-1] == '/':
   227          oss_path = '%s%s' % (oss_path, os.path.basename(local_file))
   228  
   229      auth = oss2.Auth(kw['access_id'], kw['access_key'])
   230      bucket = kw['access_bucket']
   231      bkt = oss2.Bucket(auth=auth, endpoint=kw['endpoint'], bucket_name=bucket)
   232  
   233      try:
   234          bkt.put_object_from_file(key=oss_path, filename=local_file)
   235          logger.info("upload %s to %s successfully!" %
   236                      (os.path.abspath(local_file), oss_path))
   237      except Exception():
   238          raise ValueError('upload %s to %s failed' %
   239                           (os.path.abspath(local_file), oss_path))
   240  
   241  
   242  def read_model_from_oss(kw):
   243      """
   244      helper function to read a model from oss
   245      :param kw: OSS parameter
   246      :return: XGBoost booster model
   247      """
   248      auth = oss2.Auth(kw['access_id'], kw['access_key'])
   249      bucket = kw['access_bucket']
   250      bkt = oss2.Bucket(auth=auth, endpoint=kw['endpoint'], bucket_name=bucket)
   251      oss_path = kw["path"]
   252  
   253      temp_model_fname = os.path.join(tempfile.mkdtemp(), 'local_model')
   254      try:
   255          bkt.get_object_to_file(key=oss_path, filename=temp_model_fname)
   256          logger.info("success to load model from oss %s", oss_path)
   257      except Exception as e:
   258          logging.error("fail to load model: " + e)
   259          raise Exception("fail to load model from oss %s", oss_path)
   260  
   261      bst = xgb.Booster({'nthread': 2})  # init model
   262  
   263      bst.load_model(temp_model_fname)
   264  
   265      return bst
   266  
   267  
   268  def parse_parameters(input, splitter_between, splitter_in):
   269      """
   270      helper function parse the input parameter
   271      :param input: the string of configuration like key-value pairs
   272      :param splitter_between: the splitter between config for input string
   273      :param splitter_in: the splitter inside config for input string
   274      :return: key-value pair configuration
   275      """
   276  
   277      ky_pairs = input.split(splitter_between)
   278  
   279      confs = {}
   280  
   281      for kv in ky_pairs:
   282          conf = kv.split(splitter_in)
   283          key = conf[0].strip(" ")
   284          if key == "objective" or key == "endpoint":
   285              value = conf[1].strip("'") + ":" + conf[2].strip("'")       
   286          else:
   287              value = conf[1]
   288  
   289          confs[key] = value
   290      return confs
   291