github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/ml/inference/xgboost_inference_test.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  import os
    19  import shutil
    20  import sys
    21  import tempfile
    22  import unittest
    23  import zipfile
    24  from typing import Any
    25  from typing import Tuple
    26  
    27  try:
    28    import datatable
    29    import numpy
    30    import pandas
    31    import pytest
    32    import scipy
    33    import xgboost
    34  
    35    import apache_beam as beam
    36    from apache_beam.ml.inference import RunInference
    37    from apache_beam.ml.inference.base import KeyedModelHandler
    38    from apache_beam.ml.inference.base import PredictionResult
    39    from apache_beam.ml.inference.xgboost_inference import XGBoostModelHandlerDatatable
    40    from apache_beam.ml.inference.xgboost_inference import XGBoostModelHandlerNumpy
    41    from apache_beam.ml.inference.xgboost_inference import XGBoostModelHandlerPandas
    42    from apache_beam.ml.inference.xgboost_inference import XGBoostModelHandlerSciPy
    43    from apache_beam.testing.test_pipeline import TestPipeline
    44    from apache_beam.testing.util import assert_that
    45    from apache_beam.testing.util import equal_to
    46  except ImportError:
    47    raise unittest.SkipTest('XGBoost dependencies are not installed')
    48  
    49  
    50  def _compare_prediction_result(a: PredictionResult, b: PredictionResult):
    51    if isinstance(a.example, scipy.sparse.csr_matrix) and isinstance(
    52        b.example, scipy.sparse.csr_matrix):
    53      example_equal = numpy.array_equal(a.example.todense(), b.example.todense())
    54  
    55    else:
    56      example_equal = numpy.array_equal(a.example, b.example)
    57    if isinstance(a.inference, dict):
    58      return all(
    59          x == y for x, y in zip(a.inference.values(),
    60                                 b.inference.values())) and example_equal
    61    return a.inference == b.inference and example_equal
    62  
    63  
    64  def _compare_keyed_prediction_result(
    65      a: Tuple[Any, PredictionResult], b: Tuple[Any, PredictionResult]):
    66    a_key, a_val = a
    67    b_key, b_val = b
    68    keys_equal = a_key == b_key
    69    return _compare_prediction_result(a_val, b_val) and keys_equal
    70  
    71  
    72  def predict_fn(self, data):
    73    self.inference_calls += 1
    74    if isinstance(data, pandas.DataFrame):
    75      data = data.to_numpy()
    76    if isinstance(data, datatable.Frame):
    77      data = data.to_numpy()
    78    if isinstance(data, scipy.sparse.csr_matrix):
    79      data = data.toarray()
    80    return sum(sum(array) for array in data)
    81  
    82  
    83  @pytest.fixture(autouse=True)
    84  def predict_patched(monkeypatch):
    85    monkeypatch.setattr(xgboost.XGBClassifier, 'predict', predict_fn)
    86  
    87  
    88  def build_monkeypatched_xgboost_classifier() -> xgboost.XGBClassifier:
    89    model = xgboost.XGBClassifier()
    90    model.inference_calls = 0
    91    model.fit([[0, 0], [0, 1], [1, 0], [1, 1]], [0, 1, 0, 1])
    92    return model
    93  
    94  
    95  @pytest.mark.uses_xgboost
    96  class XGBoostRunInferenceTest(unittest.TestCase):
    97    def setUp(self):
    98      self.tmpdir = tempfile.mkdtemp()
    99  
   100    def tearDown(self):
   101      shutil.rmtree(self.tmpdir)
   102  
   103    def test_predict_output(self):
   104      model = build_monkeypatched_xgboost_classifier()
   105      inference_runner = XGBoostModelHandlerNumpy(xgboost.XGBClassifier, 'unused')
   106      batched_examples = [
   107          numpy.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
   108          numpy.array([[1, 1, 1], [1, 1, 1], [1, 1, 1]])
   109      ]
   110      expected_predictions = [
   111          PredictionResult(numpy.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), 45),
   112          PredictionResult(numpy.array([[1, 1, 1], [1, 1, 1], [1, 1, 1]]), 9),
   113      ]
   114      inferences = inference_runner.run_inference(batched_examples, model)
   115      for actual, expected in zip(inferences, expected_predictions):
   116        self.assertTrue(_compare_prediction_result(actual, expected))
   117  
   118    def test_single_inference_call(self):
   119      model = build_monkeypatched_xgboost_classifier()
   120      inference_runner = XGBoostModelHandlerNumpy(xgboost.XGBClassifier, 'unused')
   121      self.assertEqual(model.inference_calls, 0)
   122      batched_examples = [numpy.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])]
   123      _ = inference_runner.run_inference(batched_examples, model)
   124      self.assertEqual(model.inference_calls, 1)
   125  
   126    def test_multiple_inference_calls(self):
   127      model = build_monkeypatched_xgboost_classifier()
   128      inference_runner = XGBoostModelHandlerNumpy(xgboost.XGBClassifier, 'unused')
   129      self.assertEqual(model.inference_calls, 0)
   130      batched_examples = [
   131          numpy.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
   132          numpy.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
   133          numpy.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
   134          numpy.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
   135          numpy.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
   136      ]
   137      _ = inference_runner.run_inference(batched_examples, model)
   138      self.assertEqual(model.inference_calls, 5)
   139  
   140    def test_num_bytes_numpy(self):
   141      inference_runner = XGBoostModelHandlerNumpy(
   142          model_class=xgboost.XGBClassifier, model_state='unused')
   143      batched_examples_int = [
   144          numpy.array([[1, 1], [2, 2]]),
   145          numpy.array([[2, 4], [6, 8]]),
   146      ]
   147      self.assertEqual(
   148          sys.getsizeof(batched_examples_int[0]) +
   149          sys.getsizeof(batched_examples_int[1]),
   150          inference_runner.get_num_bytes(batched_examples_int))
   151  
   152      batched_examples_float = [
   153          numpy.array([[1.0, 1.0], [2.0, 2.0]]),
   154          numpy.array([[2.0, 4.0], [6.0, 8.0]]),
   155      ]
   156      self.assertEqual(
   157          sys.getsizeof(batched_examples_float[0]) +
   158          sys.getsizeof(batched_examples_float[1]),
   159          inference_runner.get_num_bytes(batched_examples_float))
   160  
   161    def test_num_bytes_pandas(self):
   162      inference_runner = XGBoostModelHandlerPandas(
   163          model_class=xgboost.XGBClassifier, model_state='unused')
   164      batched_examples_int = [
   165          pandas.DataFrame([[1, 1], [2, 2]]),
   166          pandas.DataFrame([[2, 4], [6, 8]]),
   167      ]
   168      self.assertEqual(
   169          batched_examples_int[0].memory_usage(deep=True).sum() +
   170          batched_examples_int[1].memory_usage(deep=True).sum(),
   171          inference_runner.get_num_bytes(batched_examples_int))
   172  
   173      batched_examples_float = [
   174          pandas.DataFrame([[1.0, 1.0], [2.0, 2.0]]),
   175          pandas.DataFrame([[2.0, 4.0], [6.0, 8.0]]),
   176      ]
   177      self.assertEqual(
   178          batched_examples_float[0].memory_usage(deep=True).sum() +
   179          batched_examples_float[1].memory_usage(deep=True).sum(),
   180          inference_runner.get_num_bytes(batched_examples_float))
   181  
   182    def test_num_bytes_datatable(self):
   183      inference_runner = XGBoostModelHandlerDatatable(
   184          model_class=xgboost.XGBClassifier, model_state='unused')
   185      batched_examples_int = [
   186          datatable.Frame([[1, 1], [2, 2]]),
   187          datatable.Frame([[2, 4], [6, 8]]),
   188      ]
   189      self.assertEqual(
   190          sys.getsizeof(batched_examples_int[0]) +
   191          sys.getsizeof(batched_examples_int[1]),
   192          inference_runner.get_num_bytes(batched_examples_int))
   193  
   194      batched_examples_float = [
   195          datatable.Frame([[1.0, 1.0], [2.0, 2.0]]),
   196          datatable.Frame([[2.0, 4.0], [6.0, 8.0]]),
   197      ]
   198      self.assertEqual(
   199          sys.getsizeof(batched_examples_float[0]) +
   200          sys.getsizeof(batched_examples_float[1]),
   201          inference_runner.get_num_bytes(batched_examples_float))
   202  
   203    def test_num_bytes_scipy(self):
   204      inference_runner = XGBoostModelHandlerSciPy(
   205          model_class=xgboost.XGBClassifier, model_state='unused')
   206      batched_examples_int = [
   207          scipy.sparse.csr_matrix([[1, 1], [2, 2]]),
   208          scipy.sparse.csr_matrix([[2, 4], [6, 8]]),
   209      ]
   210      self.assertEqual(
   211          sys.getsizeof(batched_examples_int[0]) +
   212          sys.getsizeof(batched_examples_int[1]),
   213          inference_runner.get_num_bytes(batched_examples_int))
   214  
   215      batched_examples_float = [
   216          scipy.sparse.csr_matrix([[1.0, 1.0], [2.0, 2.0]]),
   217          scipy.sparse.csr_matrix([[2.0, 4.0], [6.0, 8.0]]),
   218      ]
   219      self.assertEqual(
   220          sys.getsizeof(batched_examples_float[0]) +
   221          sys.getsizeof(batched_examples_float[1]),
   222          inference_runner.get_num_bytes(batched_examples_float))
   223  
   224    def test_pipeline_numpy(self):
   225      model = build_monkeypatched_xgboost_classifier()
   226      model_state = self.tmpdir + os.sep + 'model.json'
   227      model.save_model(model_state)
   228  
   229      with TestPipeline() as pipeline:
   230        examples = [
   231            numpy.array([[1, 1], [2, 2]]),
   232            numpy.array([[2, 4], [6, 8]]),
   233        ]
   234  
   235        pcoll = pipeline | 'start' >> beam.Create(examples)
   236        actual = pcoll | RunInference(
   237            XGBoostModelHandlerNumpy(
   238                model_class=xgboost.XGBClassifier, model_state=model_state))
   239        expected = [
   240            PredictionResult(numpy.array([[1, 1], [2, 2]]), 6),
   241            PredictionResult(numpy.array([[2, 4], [6, 8]]), 20)
   242        ]
   243        assert_that(
   244            actual, equal_to(expected, equals_fn=_compare_prediction_result))
   245  
   246    def test_pipeline_numpy_sets_env_vars_correctly(self):
   247      model = build_monkeypatched_xgboost_classifier()
   248      model_state = self.tmpdir + os.sep + 'model.json'
   249      model.save_model(model_state)
   250      os.environ.pop('FOO', None)
   251      self.assertFalse('FOO' in os.environ)
   252  
   253      with TestPipeline() as pipeline:
   254        examples = [
   255            numpy.array([[1, 1], [2, 2]]),
   256            numpy.array([[2, 4], [6, 8]]),
   257        ]
   258        handler_with_vars = XGBoostModelHandlerNumpy(
   259            env_vars={'FOO': 'bar'},
   260            model_class=xgboost.XGBClassifier,
   261            model_state=model_state)
   262        _ = (
   263            pipeline
   264            | 'start' >> beam.Create(examples)
   265            | RunInference(handler_with_vars))
   266        pipeline.run()
   267        self.assertTrue('FOO' in os.environ)
   268        self.assertTrue((os.environ['FOO']) == 'bar')
   269  
   270    def test_pipeline_pandas(self):
   271      model = build_monkeypatched_xgboost_classifier()
   272      model_state = self.tmpdir + os.sep + 'model.json'
   273      model.save_model(model_state)
   274  
   275      with TestPipeline() as pipeline:
   276        examples = [
   277            pandas.DataFrame([[1, 1], [2, 2]]),
   278            pandas.DataFrame([[2, 4], [6, 8]]),
   279        ]
   280  
   281        pcoll = pipeline | 'start' >> beam.Create(examples)
   282        actual = pcoll | RunInference(
   283            XGBoostModelHandlerPandas(
   284                model_class=xgboost.XGBClassifier, model_state=model_state))
   285        expected = [
   286            PredictionResult(pandas.DataFrame([[1, 1], [2, 2]]), 6),
   287            PredictionResult(pandas.DataFrame([[2, 4], [6, 8]]), 20)
   288        ]
   289        assert_that(
   290            actual, equal_to(expected, equals_fn=_compare_prediction_result))
   291        pipeline.run()
   292  
   293    def test_pipeline_pandas_sets_env_vars_correctly(self):
   294      model = build_monkeypatched_xgboost_classifier()
   295      model_state = self.tmpdir + os.sep + 'model.json'
   296      model.save_model(model_state)
   297      os.environ.pop('FOO', None)
   298      self.assertFalse('FOO' in os.environ)
   299  
   300      with TestPipeline() as pipeline:
   301        examples = [
   302            pandas.DataFrame([[1, 1], [2, 2]]),
   303            pandas.DataFrame([[2, 4], [6, 8]]),
   304        ]
   305        handler_with_vars = XGBoostModelHandlerPandas(
   306            env_vars={'FOO': 'bar'},
   307            model_class=xgboost.XGBClassifier,
   308            model_state=model_state)
   309        _ = (
   310            pipeline
   311            | 'start' >> beam.Create(examples)
   312            | RunInference(handler_with_vars))
   313        pipeline.run()
   314        self.assertTrue('FOO' in os.environ)
   315        self.assertTrue((os.environ['FOO']) == 'bar')
   316  
   317    def test_pipeline_datatable(self):
   318      model = build_monkeypatched_xgboost_classifier()
   319      model_state = self.tmpdir + os.sep + 'model.json'
   320      model.save_model(model_state)
   321  
   322      with TestPipeline() as pipeline:
   323        examples = [
   324            datatable.Frame([[1, 1], [2, 2]]),
   325            datatable.Frame([[2, 4], [6, 8]]),
   326        ]
   327  
   328        pcoll = pipeline | 'start' >> beam.Create(examples)
   329        actual = pcoll | RunInference(
   330            XGBoostModelHandlerDatatable(
   331                model_class=xgboost.XGBClassifier, model_state=model_state))
   332        expected = [
   333            PredictionResult(datatable.Frame([[1, 1], [2, 2]]), 6),
   334            PredictionResult(datatable.Frame([[2, 4], [6, 8]]), 20)
   335        ]
   336        assert_that(
   337            actual, equal_to(expected, equals_fn=_compare_prediction_result))
   338  
   339    def test_pipeline_datatable_sets_env_vars_correctly(self):
   340      model = build_monkeypatched_xgboost_classifier()
   341      model_state = self.tmpdir + os.sep + 'model.json'
   342      model.save_model(model_state)
   343      os.environ.pop('FOO', None)
   344      self.assertFalse('FOO' in os.environ)
   345  
   346      with TestPipeline() as pipeline:
   347        examples = [
   348            datatable.Frame([[1, 1], [2, 2]]),
   349            datatable.Frame([[2, 4], [6, 8]]),
   350        ]
   351        handler_with_vars = XGBoostModelHandlerDatatable(
   352            env_vars={'FOO': 'bar'},
   353            model_class=xgboost.XGBClassifier,
   354            model_state=model_state)
   355        _ = (
   356            pipeline
   357            | 'start' >> beam.Create(examples)
   358            | RunInference(handler_with_vars))
   359        pipeline.run()
   360        self.assertTrue('FOO' in os.environ)
   361        self.assertTrue((os.environ['FOO']) == 'bar')
   362  
   363    def test_pipeline_scipy(self):
   364      model = build_monkeypatched_xgboost_classifier()
   365      model_state = self.tmpdir + os.sep + 'model.json'
   366      model.save_model(model_state)
   367  
   368      with TestPipeline() as pipeline:
   369        examples = [
   370            scipy.sparse.csr_matrix(numpy.array([[1, 1], [2, 2]])),
   371            scipy.sparse.csr_matrix(numpy.array([[2, 4], [6, 8]])),
   372        ]
   373  
   374        pcoll = pipeline | 'start' >> beam.Create(examples)
   375        actual = pcoll | RunInference(
   376            XGBoostModelHandlerSciPy(
   377                model_class=xgboost.XGBClassifier, model_state=model_state))
   378        expected = [
   379            PredictionResult(
   380                scipy.sparse.csr_matrix(numpy.array([[1, 1], [2, 2]])), 6),
   381            PredictionResult(
   382                scipy.sparse.csr_matrix(numpy.array([[2, 4], [6, 8]])), 20)
   383        ]
   384        assert_that(
   385            actual, equal_to(expected, equals_fn=_compare_prediction_result))
   386  
   387    def test_pipeline_scipy_sets_env_vars_correctly(self):
   388      model = build_monkeypatched_xgboost_classifier()
   389      model_state = self.tmpdir + os.sep + 'model.json'
   390      model.save_model(model_state)
   391      os.environ.pop('FOO', None)
   392      self.assertFalse('FOO' in os.environ)
   393  
   394      with TestPipeline() as pipeline:
   395        examples = [
   396            scipy.sparse.csr_matrix(numpy.array([[1, 1], [2, 2]])),
   397            scipy.sparse.csr_matrix(numpy.array([[2, 4], [6, 8]])),
   398        ]
   399        handler_with_vars = XGBoostModelHandlerSciPy(
   400            env_vars={'FOO': 'bar'},
   401            model_class=xgboost.XGBClassifier,
   402            model_state=model_state)
   403        _ = (
   404            pipeline
   405            | 'start' >> beam.Create(examples)
   406            | RunInference(handler_with_vars))
   407        pipeline.run()
   408        self.assertTrue('FOO' in os.environ)
   409        self.assertTrue((os.environ['FOO']) == 'bar')
   410  
   411    def test_bad_model_file_raises(self):
   412      model_state = self.tmpdir + os.sep + 'bad_file_name.json'
   413  
   414      with self.assertRaises(RuntimeError):
   415        with TestPipeline() as pipeline:
   416          examples = [
   417              datatable.Frame([[1, 1], [2, 2]]),
   418              datatable.Frame([[2, 4], [6, 8]]),
   419          ]
   420  
   421          pcoll = pipeline | 'start' >> beam.Create(examples)
   422          _ = pcoll | RunInference(
   423              XGBoostModelHandlerNumpy(xgboost.XGBClassifier, model_state))
   424          pipeline.run()
   425  
   426    def test_bad_input_type_raises(self):
   427      model = build_monkeypatched_xgboost_classifier()
   428      model_state = self.tmpdir + os.sep + 'model.json'
   429      model.save_model(model_state)
   430  
   431      archived_model_state = self.tmpdir + os.sep + 'model.zip'
   432  
   433      zip_file = zipfile.ZipFile(archived_model_state, "w", zipfile.ZIP_DEFLATED)
   434      zip_file.write(model_state)
   435      zip_file.close()
   436  
   437      with self.assertRaises(xgboost.core.XGBoostError):
   438        model_handler = XGBoostModelHandlerNumpy(
   439            xgboost.XGBClassifier, model_state=archived_model_state)
   440        model_handler.load_model()
   441  
   442    def test_pipeline_scipy_with_keys(self):
   443      model = build_monkeypatched_xgboost_classifier()
   444      model_state = self.tmpdir + os.sep + 'model.json'
   445      model.save_model(model_state)
   446  
   447      with TestPipeline() as pipeline:
   448        examples = [
   449            ('0', scipy.sparse.csr_matrix([[1, 1], [2, 2]])),
   450            ('1', scipy.sparse.csr_matrix([[2, 4], [6, 8]])),
   451        ]
   452  
   453        pcoll = pipeline | 'start' >> beam.Create(examples)
   454        actual = pcoll | RunInference(
   455            KeyedModelHandler(
   456                XGBoostModelHandlerSciPy(
   457                    model_class=xgboost.XGBClassifier, model_state=model_state)))
   458        expected = [
   459            ('0', PredictionResult(scipy.sparse.csr_matrix([[1, 1], [2, 2]]), 6)),
   460            (
   461                '1',
   462                PredictionResult(scipy.sparse.csr_matrix([[2, 4], [6, 8]]), 20))
   463        ]
   464        assert_that(
   465            actual,
   466            equal_to(expected, equals_fn=_compare_keyed_prediction_result))
   467  
   468    def test_pipeline_numpy_with_keys(self):
   469      model = build_monkeypatched_xgboost_classifier()
   470      model_state = self.tmpdir + os.sep + 'model.json'
   471      model.save_model(model_state)
   472  
   473      with TestPipeline() as pipeline:
   474        examples = [
   475            ('0', numpy.array([[1, 1], [2, 2]])),
   476            ('1', numpy.array([[2, 4], [6, 8]])),
   477        ]
   478  
   479        pcoll = pipeline | 'start' >> beam.Create(examples)
   480        actual = pcoll | RunInference(
   481            KeyedModelHandler(
   482                XGBoostModelHandlerNumpy(
   483                    model_class=xgboost.XGBClassifier, model_state=model_state)))
   484        expected = [('0', PredictionResult(numpy.array([[1, 1], [2, 2]]), 6)),
   485                    ('1', PredictionResult(numpy.array([[2, 4], [6, 8]]), 20))]
   486        assert_that(
   487            actual,
   488            equal_to(expected, equals_fn=_compare_keyed_prediction_result))
   489  
   490    def test_pipeline_pandas_with_keys(self):
   491      model = build_monkeypatched_xgboost_classifier()
   492      model_state = self.tmpdir + os.sep + 'model.json'
   493      model.save_model(model_state)
   494  
   495      with TestPipeline() as pipeline:
   496        examples = [
   497            ('0', pandas.DataFrame([[1, 1], [2, 2]])),
   498            ('1', pandas.DataFrame([[2, 4], [6, 8]])),
   499        ]
   500  
   501        pcoll = pipeline | 'start' >> beam.Create(examples)
   502        actual = pcoll | RunInference(
   503            KeyedModelHandler(
   504                XGBoostModelHandlerPandas(
   505                    model_class=xgboost.XGBClassifier, model_state=model_state)))
   506        expected = [
   507            ('0', PredictionResult(pandas.DataFrame([[1, 1], [2, 2]]), 6)),
   508            ('1', PredictionResult(pandas.DataFrame([[2, 4], [6, 8]]), 20))
   509        ]
   510        assert_that(
   511            actual,
   512            equal_to(expected, equals_fn=_compare_keyed_prediction_result))
   513  
   514    def test_pipeline_datatable_with_keys(self):
   515      model = build_monkeypatched_xgboost_classifier()
   516      model_state = self.tmpdir + os.sep + 'model.json'
   517      model.save_model(model_state)
   518  
   519      with TestPipeline() as pipeline:
   520        examples = [
   521            ('0', datatable.Frame([[1, 1], [2, 2]])),
   522            ('1', datatable.Frame([[2, 4], [6, 8]])),
   523        ]
   524  
   525        pcoll = pipeline | 'start' >> beam.Create(examples)
   526        actual = pcoll | RunInference(
   527            KeyedModelHandler(
   528                XGBoostModelHandlerDatatable(
   529                    model_class=xgboost.XGBClassifier, model_state=model_state)))
   530        expected = [
   531            ('0', PredictionResult(datatable.Frame([[1, 1], [2, 2]]), 6)),
   532            ('1', PredictionResult(datatable.Frame([[2, 4], [6, 8]]), 20))
   533        ]
   534        assert_that(
   535            actual,
   536            equal_to(expected, equals_fn=_compare_keyed_prediction_result))
   537  
   538  
   539  if __name__ == '__main__':
   540    unittest.main()