github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/ml/inference/sklearn_inference_test.py

github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/ml/inference/sklearn_inference_test.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  # pytype: skip-file
    19  import io
    20  import math
    21  import os
    22  import pickle
    23  import platform
    24  import shutil
    25  import sys
    26  import tempfile
    27  import unittest
    28  from typing import Any
    29  from typing import Dict
    30  from typing import Optional
    31  from typing import Sequence
    32  
    33  import joblib
    34  import numpy
    35  import pandas
    36  from sklearn import linear_model
    37  from sklearn import svm
    38  from sklearn.base import BaseEstimator
    39  from sklearn.compose import ColumnTransformer
    40  from sklearn.pipeline import Pipeline
    41  from sklearn.preprocessing import OneHotEncoder
    42  from sklearn.preprocessing import StandardScaler
    43  
    44  import apache_beam as beam
    45  from apache_beam.ml.inference.base import KeyedModelHandler
    46  from apache_beam.ml.inference.base import PredictionResult
    47  from apache_beam.ml.inference.base import RunInference
    48  from apache_beam.ml.inference.sklearn_inference import ModelFileType
    49  from apache_beam.ml.inference.sklearn_inference import SklearnModelHandlerNumpy
    50  from apache_beam.ml.inference.sklearn_inference import SklearnModelHandlerPandas
    51  from apache_beam.ml.inference.sklearn_inference import _default_numpy_inference_fn
    52  from apache_beam.ml.inference.sklearn_inference import _default_pandas_inference_fn
    53  from apache_beam.testing.test_pipeline import TestPipeline
    54  from apache_beam.testing.util import assert_that
    55  from apache_beam.testing.util import equal_to
    56  
    57  
    58  def _compare_prediction_result(a, b):
    59    example_equal = numpy.array_equal(a.example, b.example)
    60    if isinstance(a.inference, dict):
    61      return all(
    62          x == y for x, y in zip(a.inference.values(),
    63                                 b.inference.values())) and example_equal
    64    return a.inference == b.inference and example_equal
    65  
    66  
    67  def _compare_dataframe_predictions(a_in, b_in):
    68    keys_equal = True
    69    if isinstance(a_in, tuple) and not isinstance(a_in, PredictionResult):
    70      a_key, a = a_in
    71      b_key, b = b_in
    72      keys_equal = a_key == b_key
    73    else:
    74      a = a_in
    75      b = b_in
    76    example_equal = pandas.DataFrame.equals(a.example, b.example)
    77    if isinstance(a.inference, dict):
    78      return all(
    79          math.floor(a) == math.floor(b) for a,
    80          b in zip(a.inference.values(), b.inference.values())) and example_equal
    81    inference_equal = math.floor(a.inference) == math.floor(b.inference)
    82    return inference_equal and example_equal and keys_equal
    83  
    84  
    85  class FakeModel:
    86    def __init__(self):
    87      self.total_predict_calls = 0
    88  
    89    def predict(self, input_vector: numpy.ndarray):
    90      self.total_predict_calls += 1
    91      return numpy.sum(input_vector, axis=1)
    92  
    93  
    94  class FakeNumpyModelDictOut:
    95    def __init__(self):
    96      self.total_predict_calls = 0
    97  
    98    def predict(self, input_vector: numpy.ndarray):
    99      self.total_predict_calls += 1
   100      out = numpy.sum(input_vector, axis=1)
   101      return {"out1": out, "out2": out}
   102  
   103  
   104  class FakePandasModelDictOut:
   105    def __init__(self):
   106      self.total_predict_calls = 0
   107  
   108    def predict(self, df: pandas.DataFrame):
   109      self.total_predict_calls += 1
   110      out = df.loc[:, 'number_2']
   111      return {"out1": out, "out2": out}
   112  
   113  
   114  def build_model():
   115    x = [[0, 0], [1, 1]]
   116    y = [0, 1]
   117    model = svm.SVC()
   118    model.fit(x, y)
   119    return model
   120  
   121  
   122  def pandas_dataframe():
   123    csv_string = (
   124        'category_1,number_1,category_2,number_2,label,number_3\n'
   125        'red,4,frog,5,6,7\n'
   126        'blue,3,horse,8,9,10\n'
   127        'red,0,cow,1,2,3\n'
   128        'blue,4,frog,1,1,1\n'
   129        'red,1,horse,4,2,3')
   130    csv_string_io = io.StringIO(csv_string)
   131    return pandas.read_csv(csv_string_io)
   132  
   133  
   134  def build_pandas_pipeline():
   135    """Builds a common type of pandas pipeline with preprocessing."""
   136    categorical_columns = ['category_1', 'category_2']
   137    numerical_columns = ['number_1', 'number_2', 'number_3']
   138  
   139    categorical_transformer = OneHotEncoder(handle_unknown='ignore')
   140    numerical_transformer = StandardScaler()
   141  
   142    preprocessor = ColumnTransformer(
   143        transformers=[
   144            ("numerical", numerical_transformer, numerical_columns),
   145            ("categorical", categorical_transformer, categorical_columns),
   146        ])
   147    pipeline = Pipeline(
   148        steps=[("preprocessor",
   149                preprocessor), ("classifier", linear_model.SGDRegressor())])
   150    data = pandas_dataframe()
   151    labels = data['label']
   152    pipeline.fit(data, labels)
   153    return pipeline
   154  
   155  
   156  def convert_inference_to_floor(prediction_result):
   157    return math.floor(prediction_result.inference)
   158  
   159  
   160  def alternate_numpy_inference_fn(
   161      model: BaseEstimator,
   162      batch: Sequence[numpy.ndarray],
   163      inference_args: Optional[Dict[str, Any]] = None) -> Any:
   164    return [0]
   165  
   166  
   167  def alternate_pandas_inference_fn(
   168      model: BaseEstimator,
   169      batch: Sequence[pandas.DataFrame],
   170      inference_args: Optional[Dict[str, Any]] = None) -> Any:
   171    # vectorize data for better performance
   172    vectorized_batch = pandas.concat(batch, axis=0)
   173    predictions = model.predict(vectorized_batch)
   174    splits = [
   175        vectorized_batch.iloc[[i]] for i in range(vectorized_batch.shape[0])
   176    ]
   177    predictions = predictions - 1
   178    return predictions, splits
   179  
   180  
   181  class SkLearnRunInferenceTest(unittest.TestCase):
   182    def setUp(self):
   183      self.tmpdir = tempfile.mkdtemp()
   184  
   185    def tearDown(self):
   186      shutil.rmtree(self.tmpdir)
   187  
   188    def test_predict_output(self):
   189      fake_model = FakeModel()
   190      inference_runner = SklearnModelHandlerNumpy(model_uri='unused')
   191      batched_examples = [
   192          numpy.array([1, 2, 3]), numpy.array([4, 5, 6]), numpy.array([7, 8, 9])
   193      ]
   194      expected_predictions = [
   195          PredictionResult(numpy.array([1, 2, 3]), 6),
   196          PredictionResult(numpy.array([4, 5, 6]), 15),
   197          PredictionResult(numpy.array([7, 8, 9]), 24)
   198      ]
   199      inferences = inference_runner.run_inference(batched_examples, fake_model)
   200      for actual, expected in zip(inferences, expected_predictions):
   201        self.assertTrue(_compare_prediction_result(actual, expected))
   202  
   203    def test_custom_inference_fn(self):
   204      fake_model = FakeModel()
   205      inference_runner = SklearnModelHandlerNumpy(
   206          model_uri='unused', inference_fn=alternate_numpy_inference_fn)
   207      batched_examples = [
   208          numpy.array([1, 2, 3]), numpy.array([4, 5, 6]), numpy.array([7, 8, 9])
   209      ]
   210      expected_predictions = [
   211          PredictionResult(numpy.array([1, 2, 3]), 0),
   212          PredictionResult(numpy.array([4, 5, 6]), 0),
   213          PredictionResult(numpy.array([7, 8, 9]), 0)
   214      ]
   215      inferences = inference_runner.run_inference(batched_examples, fake_model)
   216      for actual, expected in zip(inferences, expected_predictions):
   217        self.assertTrue(_compare_prediction_result(actual, expected))
   218  
   219    def test_predict_output_dict(self):
   220      fake_model = FakeNumpyModelDictOut()
   221      inference_runner = SklearnModelHandlerNumpy(model_uri='unused')
   222      batched_examples = [
   223          numpy.array([1, 2, 3]), numpy.array([4, 5, 6]), numpy.array([7, 8, 9])
   224      ]
   225      expected_predictions = [
   226          PredictionResult(numpy.array([1, 2, 3]), {
   227              "out1": 6, "out2": 6
   228          }),
   229          PredictionResult(numpy.array([4, 5, 6]), {
   230              "out1": 15, "out2": 15
   231          }),
   232          PredictionResult(numpy.array([7, 8, 9]), {
   233              "out1": 24, "out2": 24
   234          })
   235      ]
   236      inferences = inference_runner.run_inference(batched_examples, fake_model)
   237      for actual, expected in zip(inferences, expected_predictions):
   238        self.assertTrue(_compare_prediction_result(actual, expected))
   239  
   240    def test_data_vectorized(self):
   241      fake_model = FakeModel()
   242      inference_runner = SklearnModelHandlerNumpy(model_uri='unused')
   243      batched_examples = [
   244          numpy.array([1, 2, 3]), numpy.array([4, 5, 6]), numpy.array([7, 8, 9])
   245      ]
   246      # even though there are 3 examples, the data should
   247      # be vectorized and only 1 call should happen.
   248      inference_runner.run_inference(batched_examples, fake_model)
   249      self.assertEqual(1, fake_model.total_predict_calls)
   250  
   251    def test_num_bytes_numpy(self):
   252      inference_runner = SklearnModelHandlerNumpy(model_uri='unused')
   253      batched_examples_int = [
   254          numpy.array([1, 2, 3]), numpy.array([4, 5, 6]), numpy.array([7, 8, 9])
   255      ]
   256      self.assertEqual(
   257          sys.getsizeof(batched_examples_int[0]) * 3,
   258          inference_runner.get_num_bytes(batched_examples_int))
   259  
   260      batched_examples_float = [
   261          numpy.array([1.0, 2.0, 3.0]),
   262          numpy.array([4.1, 5.2, 6.3]),
   263          numpy.array([7.7, 8.8, 9.9])
   264      ]
   265      self.assertEqual(
   266          sys.getsizeof(batched_examples_float[0]) * 3,
   267          inference_runner.get_num_bytes(batched_examples_float))
   268  
   269    def test_pipeline_pickled(self):
   270      temp_file_name = self.tmpdir + os.sep + 'pickled_file'
   271      with open(temp_file_name, 'wb') as file:
   272        pickle.dump(build_model(), file)
   273      with TestPipeline() as pipeline:
   274        examples = [numpy.array([0, 0]), numpy.array([1, 1])]
   275  
   276        pcoll = pipeline | 'start' >> beam.Create(examples)
   277        actual = pcoll | RunInference(
   278            SklearnModelHandlerNumpy(model_uri=temp_file_name))
   279        expected = [
   280            PredictionResult(numpy.array([0, 0]), 0),
   281            PredictionResult(numpy.array([1, 1]), 1)
   282        ]
   283        assert_that(
   284            actual, equal_to(expected, equals_fn=_compare_prediction_result))
   285  
   286    def test_pipeline_pickled_custom_batching(self):
   287      temp_file_name = self.tmpdir + os.sep + 'pickled_file'
   288      with open(temp_file_name, 'wb') as file:
   289        pickle.dump(build_model(), file)
   290  
   291      def batch_validator_numpy_inference_fn(
   292          model: BaseEstimator,
   293          batch: Sequence[numpy.ndarray],
   294          inference_args: Optional[Dict[str, Any]] = None) -> Any:
   295        if len(batch) != 2:
   296          raise Exception(
   297              f'Expected batch of size 2, received batch of size {len(batch)}')
   298        return _default_numpy_inference_fn(model, batch, inference_args)
   299  
   300      with TestPipeline() as pipeline:
   301        examples = [numpy.array([0, 0]), numpy.array([1, 1])]
   302  
   303        pcoll = pipeline | 'start' >> beam.Create(examples)
   304        actual = pcoll | RunInference(
   305            SklearnModelHandlerNumpy(
   306                model_uri=temp_file_name,
   307                inference_fn=batch_validator_numpy_inference_fn,
   308                min_batch_size=2,
   309                max_batch_size=2))
   310        expected = [
   311            PredictionResult(numpy.array([0, 0]), 0),
   312            PredictionResult(numpy.array([1, 1]), 1)
   313        ]
   314        assert_that(
   315            actual, equal_to(expected, equals_fn=_compare_prediction_result))
   316  
   317    def test_pipeline_joblib(self):
   318      temp_file_name = self.tmpdir + os.sep + 'joblib_file'
   319      with open(temp_file_name, 'wb') as file:
   320        joblib.dump(build_model(), file)
   321      with TestPipeline() as pipeline:
   322        examples = [numpy.array([0, 0]), numpy.array([1, 1])]
   323  
   324        pcoll = pipeline | 'start' >> beam.Create(examples)
   325  
   326        actual = pcoll | RunInference(
   327            SklearnModelHandlerNumpy(
   328                model_uri=temp_file_name, model_file_type=ModelFileType.JOBLIB))
   329        expected = [
   330            PredictionResult(numpy.array([0, 0]), 0),
   331            PredictionResult(numpy.array([1, 1]), 1)
   332        ]
   333        assert_that(
   334            actual, equal_to(expected, equals_fn=_compare_prediction_result))
   335  
   336    def test_bad_file_raises(self):
   337      with self.assertRaises(RuntimeError):
   338        with TestPipeline() as pipeline:
   339          examples = [numpy.array([0, 0])]
   340          pcoll = pipeline | 'start' >> beam.Create(examples)
   341          _ = pcoll | RunInference(
   342              SklearnModelHandlerNumpy(model_uri='/var/bad_file_name'))
   343          pipeline.run()
   344  
   345    def test_bad_input_type_raises(self):
   346      with self.assertRaisesRegex(AssertionError,
   347                                  'Unsupported serialization type'):
   348        with tempfile.NamedTemporaryFile(delete=False) as file:
   349          model_handler = SklearnModelHandlerNumpy(
   350              model_uri=file.name, model_file_type=None)
   351          model_handler.load_model()
   352  
   353    def test_env_vars_set_correctly_numpy(self):
   354      temp_file_name = self.tmpdir + os.sep + 'pickled_file'
   355      with open(temp_file_name, 'wb') as file:
   356        pickle.dump(build_model(), file)
   357      handler_with_vars = SklearnModelHandlerNumpy(
   358          env_vars={'FOO': 'bar'}, model_uri=temp_file_name)
   359      os.environ.pop('FOO', None)
   360      self.assertFalse('FOO' in os.environ)
   361      examples = [numpy.array([0, 0]), numpy.array([1, 1])]
   362      with TestPipeline() as pipeline:
   363        _ = (
   364            pipeline
   365            | 'start' >> beam.Create(examples)
   366            | RunInference(handler_with_vars))
   367        pipeline.run()
   368        self.assertTrue('FOO' in os.environ)
   369        self.assertTrue((os.environ['FOO']) == 'bar')
   370  
   371    def test_pipeline_pandas(self):
   372      temp_file_name = self.tmpdir + os.sep + 'pickled_file'
   373      with open(temp_file_name, 'wb') as file:
   374        pickle.dump(build_pandas_pipeline(), file)
   375      with TestPipeline() as pipeline:
   376        dataframe = pandas_dataframe()
   377        splits = [dataframe.loc[[i]] for i in dataframe.index]
   378        pcoll = pipeline | 'start' >> beam.Create(splits)
   379        actual = pcoll | RunInference(
   380            SklearnModelHandlerPandas(model_uri=temp_file_name))
   381  
   382        expected = [
   383            PredictionResult(splits[0], 5),
   384            PredictionResult(splits[1], 8),
   385            PredictionResult(splits[2], 1),
   386            PredictionResult(splits[3], 1),
   387            PredictionResult(splits[4], 2),
   388        ]
   389        assert_that(
   390            actual, equal_to(expected, equals_fn=_compare_dataframe_predictions))
   391  
   392    def test_pipeline_pandas_env_vars_set_correctly(self):
   393      temp_file_name = self.tmpdir + os.sep + 'pickled_file'
   394      with open(temp_file_name, 'wb') as file:
   395        pickle.dump(build_pandas_pipeline(), file)
   396  
   397      handler_with_vars = SklearnModelHandlerPandas(
   398          env_vars={'FOO': 'bar'}, model_uri=temp_file_name)
   399      os.environ.pop('FOO', None)
   400      self.assertFalse('FOO' in os.environ)
   401      with TestPipeline() as pipeline:
   402        dataframe = pandas_dataframe()
   403        splits = [dataframe.loc[[i]] for i in dataframe.index]
   404        _ = (
   405            pipeline
   406            | 'start' >> beam.Create(splits)
   407            | RunInference(handler_with_vars))
   408        pipeline.run()
   409        self.assertTrue('FOO' in os.environ)
   410        self.assertTrue((os.environ['FOO']) == 'bar')
   411  
   412    def test_pipeline_pandas_custom_batching(self):
   413      temp_file_name = self.tmpdir + os.sep + 'pickled_file'
   414      with open(temp_file_name, 'wb') as file:
   415        pickle.dump(build_pandas_pipeline(), file)
   416  
   417      def batch_validator_pandas_inference_fn(
   418          model: BaseEstimator,
   419          batch: Sequence[numpy.ndarray],
   420          inference_args: Optional[Dict[str, Any]] = None) -> Any:
   421        if len(batch) != 5:
   422          raise Exception(
   423              f'Expected batch of size 5, received batch of size {len(batch)}')
   424        return _default_pandas_inference_fn(model, batch, inference_args)
   425  
   426      with TestPipeline() as pipeline:
   427        dataframe = pandas_dataframe()
   428        splits = [dataframe.loc[[i]] for i in dataframe.index]
   429        pcoll = pipeline | 'start' >> beam.Create(splits)
   430        actual = pcoll | RunInference(
   431            SklearnModelHandlerPandas(
   432                model_uri=temp_file_name,
   433                inference_fn=batch_validator_pandas_inference_fn,
   434                min_batch_size=5,
   435                max_batch_size=5))
   436  
   437        expected = [
   438            PredictionResult(splits[0], 5),
   439            PredictionResult(splits[1], 8),
   440            PredictionResult(splits[2], 1),
   441            PredictionResult(splits[3], 1),
   442            PredictionResult(splits[4], 2),
   443        ]
   444        assert_that(
   445            actual, equal_to(expected, equals_fn=_compare_dataframe_predictions))
   446  
   447    def test_pipeline_pandas_custom_inference(self):
   448      temp_file_name = self.tmpdir + os.sep + 'pickled_file'
   449      with open(temp_file_name, 'wb') as file:
   450        pickle.dump(build_pandas_pipeline(), file)
   451      with TestPipeline() as pipeline:
   452        dataframe = pandas_dataframe()
   453        splits = [dataframe.loc[[i]] for i in dataframe.index]
   454        pcoll = pipeline | 'start' >> beam.Create(splits)
   455        actual = pcoll | RunInference(
   456            SklearnModelHandlerPandas(
   457                model_uri=temp_file_name,
   458                inference_fn=alternate_pandas_inference_fn))
   459  
   460        expected = [
   461            PredictionResult(splits[0], 4),
   462            PredictionResult(splits[1], 7),
   463            PredictionResult(splits[2], 0),
   464            PredictionResult(splits[3], 0),
   465            PredictionResult(splits[4], 1),
   466        ]
   467        assert_that(
   468            actual, equal_to(expected, equals_fn=_compare_dataframe_predictions))
   469  
   470    def test_pipeline_pandas_dict_out(self):
   471      temp_file_name = self.tmpdir + os.sep + 'pickled_file'
   472      with open(temp_file_name, 'wb') as file:
   473        pickle.dump(FakePandasModelDictOut(), file)
   474      with TestPipeline() as pipeline:
   475        dataframe = pandas_dataframe()
   476        splits = [dataframe.loc[[i]] for i in dataframe.index]
   477        pcoll = pipeline | 'start' >> beam.Create(splits)
   478        actual = pcoll | RunInference(
   479            SklearnModelHandlerPandas(model_uri=temp_file_name))
   480  
   481        expected = [
   482            PredictionResult(splits[0], {
   483                'out1': 5, 'out2': 5
   484            }),
   485            PredictionResult(splits[1], {
   486                'out1': 8, 'out2': 8
   487            }),
   488            PredictionResult(splits[2], {
   489                'out1': 1, 'out2': 1
   490            }),
   491            PredictionResult(splits[3], {
   492                'out1': 1, 'out2': 1
   493            }),
   494            PredictionResult(splits[4], {
   495                'out1': 4, 'out2': 4
   496            }),
   497        ]
   498        assert_that(
   499            actual, equal_to(expected, equals_fn=_compare_dataframe_predictions))
   500  
   501    @unittest.skipIf(platform.system() == 'Windows', 'BEAM-14359')
   502    def test_pipeline_pandas_joblib(self):
   503      temp_file_name = self.tmpdir + os.sep + 'pickled_file'
   504      with open(temp_file_name, 'wb') as file:
   505        joblib.dump(build_pandas_pipeline(), file)
   506      with TestPipeline() as pipeline:
   507        dataframe = pandas_dataframe()
   508        splits = [dataframe.loc[[i]] for i in dataframe.index]
   509        pcoll = pipeline | 'start' >> beam.Create(splits)
   510        actual = pcoll | RunInference(
   511            SklearnModelHandlerPandas(
   512                model_uri=temp_file_name, model_file_type=ModelFileType.JOBLIB))
   513  
   514        expected = [
   515            PredictionResult(splits[0], 5),
   516            PredictionResult(splits[1], 8),
   517            PredictionResult(splits[2], 1),
   518            PredictionResult(splits[3], 1),
   519            PredictionResult(splits[4], 2),
   520        ]
   521        assert_that(
   522            actual, equal_to(expected, equals_fn=_compare_dataframe_predictions))
   523  
   524    def test_pipeline_pandas_with_keys(self):
   525      temp_file_name = self.tmpdir + os.sep + 'pickled_file'
   526      with open(temp_file_name, 'wb') as file:
   527        pickle.dump(build_pandas_pipeline(), file)
   528      with TestPipeline() as pipeline:
   529        data_frame = pandas_dataframe()
   530        keys = [str(i) for i in range(5)]
   531        splits = [data_frame.loc[[i]] for i in data_frame.index]
   532        keyed_rows = [(key, value) for key, value in zip(keys, splits)]
   533  
   534        pcoll = pipeline | 'start' >> beam.Create(keyed_rows)
   535        actual = pcoll | RunInference(
   536            KeyedModelHandler(
   537                SklearnModelHandlerPandas(model_uri=temp_file_name)))
   538        expected = [
   539            ('0', PredictionResult(splits[0], 5)),
   540            ('1', PredictionResult(splits[1], 8)),
   541            ('2', PredictionResult(splits[2], 1)),
   542            ('3', PredictionResult(splits[3], 1)),
   543            ('4', PredictionResult(splits[4], 2)),
   544        ]
   545        assert_that(
   546            actual, equal_to(expected, equals_fn=_compare_dataframe_predictions))
   547  
   548    def test_infer_too_many_rows_in_dataframe(self):
   549      with self.assertRaisesRegex(
   550          ValueError, r'Only dataframes with single rows are supported'):
   551        data_frame_too_many_rows = pandas_dataframe()
   552        fake_model = FakeModel()
   553        inference_runner = SklearnModelHandlerPandas(model_uri='unused')
   554        inference_runner.run_inference([data_frame_too_many_rows], fake_model)
   555  
   556  
# Run the test suite when this file is executed directly as a script.
if __name__ == '__main__':
  unittest.main()