github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/ml/inference/sklearn_inference_test.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# pytype: skip-file
import io
import math
import os
import pickle
import platform
import shutil
import sys
import tempfile
import unittest
from typing import Any
from typing import Dict
from typing import Optional
from typing import Sequence

import joblib
import numpy
import pandas
from sklearn import linear_model
from sklearn import svm
from sklearn.base import BaseEstimator
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

import apache_beam as beam
from apache_beam.ml.inference.base import KeyedModelHandler
from apache_beam.ml.inference.base import PredictionResult
from apache_beam.ml.inference.base import RunInference
from apache_beam.ml.inference.sklearn_inference import ModelFileType
from apache_beam.ml.inference.sklearn_inference import SklearnModelHandlerNumpy
from apache_beam.ml.inference.sklearn_inference import SklearnModelHandlerPandas
from apache_beam.ml.inference.sklearn_inference import _default_numpy_inference_fn
from apache_beam.ml.inference.sklearn_inference import _default_pandas_inference_fn
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to


def _compare_prediction_result(a, b):
  example_equal = numpy.array_equal(a.example, b.example)
  if isinstance(a.inference, dict):
    return all(
        x == y for x, y in zip(a.inference.values(),
                               b.inference.values())) and example_equal
  return a.inference == b.inference and example_equal


def _compare_dataframe_predictions(a_in, b_in):
  keys_equal = True
  if isinstance(a_in, tuple) and not isinstance(a_in, PredictionResult):
    a_key, a = a_in
    b_key, b = b_in
    keys_equal = a_key == b_key
  else:
    a = a_in
    b = b_in
  example_equal = pandas.DataFrame.equals(a.example, b.example)
  if isinstance(a.inference, dict):
    return all(
        math.floor(a) == math.floor(b) for a,
        b in zip(a.inference.values(), b.inference.values())) and example_equal
  inference_equal = math.floor(a.inference) == math.floor(b.inference)
  return inference_equal and example_equal and keys_equal


class FakeModel:
  def __init__(self):
    self.total_predict_calls = 0

  def predict(self, input_vector: numpy.ndarray):
    self.total_predict_calls += 1
    return numpy.sum(input_vector, axis=1)


class FakeNumpyModelDictOut:
  def __init__(self):
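    # Incremented on each predict() call, mirroring FakeModel's counter.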
    self.total_predict_calls = 0

  def predict(self, input_vector: numpy.ndarray):
    self.total_predict_calls += 1
    out = numpy.sum(input_vector, axis=1)
    return {"out1": out, "out2": out}


class FakePandasModelDictOut:
  def __init__(self):
    self.total_predict_calls = 0

  def predict(self, df: pandas.DataFrame):
    self.total_predict_calls += 1
    out = df.loc[:, 'number_2']
    return {"out1": out, "out2": out}


def build_model():
  x = [[0, 0], [1, 1]]
  y = [0, 1]
  model = svm.SVC()
  model.fit(x, y)
  return model


def pandas_dataframe():
  csv_string = (
      'category_1,number_1,category_2,number_2,label,number_3\n'
      'red,4,frog,5,6,7\n'
      'blue,3,horse,8,9,10\n'
      'red,0,cow,1,2,3\n'
      'blue,4,frog,1,1,1\n'
      'red,1,horse,4,2,3')
  csv_string_io = io.StringIO(csv_string)
  return pandas.read_csv(csv_string_io)


def build_pandas_pipeline():
  """Builds a common type of pandas pipeline with preprocessing."""
  categorical_columns = ['category_1', 'category_2']
  numerical_columns = ['number_1', 'number_2', 'number_3']

  categorical_transformer = OneHotEncoder(handle_unknown='ignore')
  numerical_transformer = StandardScaler()

  preprocessor = ColumnTransformer(
      transformers=[
          ("numerical", numerical_transformer, numerical_columns),
          ("categorical", categorical_transformer, categorical_columns),
      ])
  pipeline = Pipeline(
      steps=[("preprocessor",
              preprocessor), ("classifier", linear_model.SGDRegressor())])
  data = pandas_dataframe()
  labels = data['label']
  pipeline.fit(data, labels)
  return pipeline


def convert_inference_to_floor(prediction_result):
  return math.floor(prediction_result.inference)


def alternate_numpy_inference_fn(
    model: BaseEstimator,
    batch: Sequence[numpy.ndarray],
    inference_args: Optional[Dict[str, Any]] = None) -> Any:
  return [0]


def alternate_pandas_inference_fn(
    model: BaseEstimator,
    batch: Sequence[pandas.DataFrame],
    inference_args: Optional[Dict[str, Any]] = None) -> Any:
  # vectorize data for better performance
  vectorized_batch = pandas.concat(batch, axis=0)
  predictions = model.predict(vectorized_batch)
  splits = [
      vectorized_batch.iloc[[i]] for i in range(vectorized_batch.shape[0])
  ]
  predictions = predictions - 1
  return predictions, splits


class SkLearnRunInferenceTest(unittest.TestCase):
  def setUp(self):
    self.tmpdir = tempfile.mkdtemp()

  def tearDown(self):
    shutil.rmtree(self.tmpdir)

  def test_predict_output(self):
    fake_model = FakeModel()
    inference_runner = SklearnModelHandlerNumpy(model_uri='unused')
    batched_examples = [
        numpy.array([1, 2, 3]), numpy.array([4, 5, 6]), numpy.array([7, 8, 9])
    ]
    expected_predictions = [
        PredictionResult(numpy.array([1, 2, 3]), 6),
        PredictionResult(numpy.array([4, 5, 6]), 15),
        PredictionResult(numpy.array([7, 8, 9]), 24)
    ]
    inferences = inference_runner.run_inference(batched_examples, fake_model)
    for actual, expected in zip(inferences, expected_predictions):
      self.assertTrue(_compare_prediction_result(actual, expected))

  def test_custom_inference_fn(self):
    fake_model = FakeModel()
    inference_runner = SklearnModelHandlerNumpy(
        model_uri='unused', inference_fn=alternate_numpy_inference_fn)
    batched_examples = [
        numpy.array([1, 2, 3]), numpy.array([4, 5, 6]), numpy.array([7, 8, 9])
    ]
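    # alternate_numpy_inference_fn ignores the model and batch and always
    # returns [0], so every expected inference below is 0.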
    expected_predictions = [
        PredictionResult(numpy.array([1, 2, 3]), 0),
        PredictionResult(numpy.array([4, 5, 6]), 0),
        PredictionResult(numpy.array([7, 8, 9]), 0)
    ]
    inferences = inference_runner.run_inference(batched_examples, fake_model)
    for actual, expected in zip(inferences, expected_predictions):
      self.assertTrue(_compare_prediction_result(actual, expected))

  def test_predict_output_dict(self):
    fake_model = FakeNumpyModelDictOut()
    inference_runner = SklearnModelHandlerNumpy(model_uri='unused')
    batched_examples = [
        numpy.array([1, 2, 3]), numpy.array([4, 5, 6]), numpy.array([7, 8, 9])
    ]
    expected_predictions = [
        PredictionResult(numpy.array([1, 2, 3]), {
            "out1": 6, "out2": 6
        }),
        PredictionResult(numpy.array([4, 5, 6]), {
            "out1": 15, "out2": 15
        }),
        PredictionResult(numpy.array([7, 8, 9]), {
            "out1": 24, "out2": 24
        })
    ]
    inferences = inference_runner.run_inference(batched_examples, fake_model)
    for actual, expected in zip(inferences, expected_predictions):
      self.assertTrue(_compare_prediction_result(actual, expected))

  def test_data_vectorized(self):
    fake_model = FakeModel()
    inference_runner = SklearnModelHandlerNumpy(model_uri='unused')
    batched_examples = [
        numpy.array([1, 2, 3]), numpy.array([4, 5, 6]), numpy.array([7, 8, 9])
    ]
    # even though there are 3 examples, the data should
    # be vectorized and only 1 call should happen.
    inference_runner.run_inference(batched_examples, fake_model)
    self.assertEqual(1, fake_model.total_predict_calls)

  def test_num_bytes_numpy(self):
    inference_runner = SklearnModelHandlerNumpy(model_uri='unused')
    batched_examples_int = [
        numpy.array([1, 2, 3]), numpy.array([4, 5, 6]), numpy.array([7, 8, 9])
    ]
    self.assertEqual(
        sys.getsizeof(batched_examples_int[0]) * 3,
        inference_runner.get_num_bytes(batched_examples_int))

    batched_examples_float = [
        numpy.array([1.0, 2.0, 3.0]),
        numpy.array([4.1, 5.2, 6.3]),
        numpy.array([7.7, 8.8, 9.9])
    ]
    self.assertEqual(
        sys.getsizeof(batched_examples_float[0]) * 3,
        inference_runner.get_num_bytes(batched_examples_float))

  def test_pipeline_pickled(self):
    temp_file_name = self.tmpdir + os.sep + 'pickled_file'
    with open(temp_file_name, 'wb') as file:
      pickle.dump(build_model(), file)
    with TestPipeline() as pipeline:
      examples = [numpy.array([0, 0]), numpy.array([1, 1])]

      pcoll = pipeline | 'start' >> beam.Create(examples)
      actual = pcoll | RunInference(
          SklearnModelHandlerNumpy(model_uri=temp_file_name))
      expected = [
          PredictionResult(numpy.array([0, 0]), 0),
          PredictionResult(numpy.array([1, 1]), 1)
      ]
      assert_that(
          actual, equal_to(expected, equals_fn=_compare_prediction_result))

  def test_pipeline_pickled_custom_batching(self):
    temp_file_name = self.tmpdir + os.sep + 'pickled_file'
    with open(temp_file_name, 'wb') as file:
      pickle.dump(build_model(), file)

    def batch_validator_numpy_inference_fn(
        model: BaseEstimator,
        batch: Sequence[numpy.ndarray],
        inference_args: Optional[Dict[str, Any]] = None) -> Any:
      if len(batch) != 2:
        raise Exception(
            f'Expected batch of size 2, received batch of size {len(batch)}')
      return _default_numpy_inference_fn(model, batch, inference_args)

    with TestPipeline() as pipeline:
      examples = [numpy.array([0, 0]), numpy.array([1, 1])]

      pcoll = pipeline | 'start' >> beam.Create(examples)
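      # min_batch_size == max_batch_size == 2 forces both examples into a
      # single batch; batch_validator_numpy_inference_fn raises otherwise.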
      actual = pcoll | RunInference(
          SklearnModelHandlerNumpy(
              model_uri=temp_file_name,
              inference_fn=batch_validator_numpy_inference_fn,
              min_batch_size=2,
              max_batch_size=2))
      expected = [
          PredictionResult(numpy.array([0, 0]), 0),
          PredictionResult(numpy.array([1, 1]), 1)
      ]
      assert_that(
          actual, equal_to(expected, equals_fn=_compare_prediction_result))

  def test_pipeline_joblib(self):
    temp_file_name = self.tmpdir + os.sep + 'joblib_file'
    with open(temp_file_name, 'wb') as file:
      joblib.dump(build_model(), file)
    with TestPipeline() as pipeline:
      examples = [numpy.array([0, 0]), numpy.array([1, 1])]

      pcoll = pipeline | 'start' >> beam.Create(examples)

      actual = pcoll | RunInference(
          SklearnModelHandlerNumpy(
              model_uri=temp_file_name, model_file_type=ModelFileType.JOBLIB))
      expected = [
          PredictionResult(numpy.array([0, 0]), 0),
          PredictionResult(numpy.array([1, 1]), 1)
      ]
      assert_that(
          actual, equal_to(expected, equals_fn=_compare_prediction_result))

  def test_bad_file_raises(self):
    with self.assertRaises(RuntimeError):
      with TestPipeline() as pipeline:
        examples = [numpy.array([0, 0])]
        pcoll = pipeline | 'start' >> beam.Create(examples)
        _ = pcoll | RunInference(
            SklearnModelHandlerNumpy(model_uri='/var/bad_file_name'))
        pipeline.run()

  def test_bad_input_type_raises(self):
    with self.assertRaisesRegex(AssertionError,
                                'Unsupported serialization type'):
      with tempfile.NamedTemporaryFile(delete=False) as file:
        model_handler = SklearnModelHandlerNumpy(
            model_uri=file.name, model_file_type=None)
        model_handler.load_model()

  def test_env_vars_set_correctly_numpy(self):
    temp_file_name = self.tmpdir + os.sep + 'pickled_file'
    with open(temp_file_name, 'wb') as file:
      pickle.dump(build_model(), file)
    handler_with_vars = SklearnModelHandlerNumpy(
        env_vars={'FOO': 'bar'}, model_uri=temp_file_name)
    os.environ.pop('FOO', None)
    self.assertFalse('FOO' in os.environ)
    examples = [numpy.array([0, 0]), numpy.array([1, 1])]
    with TestPipeline() as pipeline:
      _ = (
          pipeline
          | 'start' >> beam.Create(examples)
          | RunInference(handler_with_vars))
      pipeline.run()
    self.assertTrue('FOO' in os.environ)
    self.assertTrue((os.environ['FOO']) == 'bar')

  def test_pipeline_pandas(self):
    temp_file_name = self.tmpdir + os.sep + 'pickled_file'
    with open(temp_file_name, 'wb') as file:
      pickle.dump(build_pandas_pipeline(), file)
    with TestPipeline() as pipeline:
      dataframe = pandas_dataframe()
      splits = [dataframe.loc[[i]] for i in dataframe.index]
      pcoll = pipeline | 'start' >> beam.Create(splits)
      actual = pcoll | RunInference(
          SklearnModelHandlerPandas(model_uri=temp_file_name))

      expected = [
          PredictionResult(splits[0], 5),
          PredictionResult(splits[1], 8),
          PredictionResult(splits[2], 1),
          PredictionResult(splits[3], 1),
          PredictionResult(splits[4], 2),
      ]
      assert_that(
          actual, equal_to(expected, equals_fn=_compare_dataframe_predictions))

  def test_pipeline_pandas_env_vars_set_correctly(self):
    temp_file_name = self.tmpdir + os.sep + 'pickled_file'
    with open(temp_file_name, 'wb') as file:
      pickle.dump(build_pandas_pipeline(), file)

    handler_with_vars = SklearnModelHandlerPandas(
        env_vars={'FOO': 'bar'}, model_uri=temp_file_name)
    os.environ.pop('FOO', None)
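    # FOO must be unset before the run; the handler's env_vars should set it
    # during pipeline execution, which the assertions below verify.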
    self.assertFalse('FOO' in os.environ)
    with TestPipeline() as pipeline:
      dataframe = pandas_dataframe()
      splits = [dataframe.loc[[i]] for i in dataframe.index]
      _ = (
          pipeline
          | 'start' >> beam.Create(splits)
          | RunInference(handler_with_vars))
      pipeline.run()
    self.assertTrue('FOO' in os.environ)
    self.assertTrue((os.environ['FOO']) == 'bar')

  def test_pipeline_pandas_custom_batching(self):
    temp_file_name = self.tmpdir + os.sep + 'pickled_file'
    with open(temp_file_name, 'wb') as file:
      pickle.dump(build_pandas_pipeline(), file)

    def batch_validator_pandas_inference_fn(
        model: BaseEstimator,
        batch: Sequence[pandas.DataFrame],
        inference_args: Optional[Dict[str, Any]] = None) -> Any:
      if len(batch) != 5:
        raise Exception(
            f'Expected batch of size 5, received batch of size {len(batch)}')
      return _default_pandas_inference_fn(model, batch, inference_args)

    with TestPipeline() as pipeline:
      dataframe = pandas_dataframe()
      splits = [dataframe.loc[[i]] for i in dataframe.index]
      pcoll = pipeline | 'start' >> beam.Create(splits)
      actual = pcoll | RunInference(
          SklearnModelHandlerPandas(
              model_uri=temp_file_name,
              inference_fn=batch_validator_pandas_inference_fn,
              min_batch_size=5,
              max_batch_size=5))

      expected = [
          PredictionResult(splits[0], 5),
          PredictionResult(splits[1], 8),
          PredictionResult(splits[2], 1),
          PredictionResult(splits[3], 1),
          PredictionResult(splits[4], 2),
      ]
      assert_that(
          actual, equal_to(expected, equals_fn=_compare_dataframe_predictions))

  def test_pipeline_pandas_custom_inference(self):
    temp_file_name = self.tmpdir + os.sep + 'pickled_file'
    with open(temp_file_name, 'wb') as file:
      pickle.dump(build_pandas_pipeline(), file)
    with TestPipeline() as pipeline:
      dataframe = pandas_dataframe()
      splits = [dataframe.loc[[i]] for i in dataframe.index]
      pcoll = pipeline | 'start' >> beam.Create(splits)
      actual = pcoll | RunInference(
          SklearnModelHandlerPandas(
              model_uri=temp_file_name,
              inference_fn=alternate_pandas_inference_fn))

      expected = [
          PredictionResult(splits[0], 4),
          PredictionResult(splits[1], 7),
          PredictionResult(splits[2], 0),
          PredictionResult(splits[3], 0),
          PredictionResult(splits[4], 1),
      ]
      assert_that(
          actual, equal_to(expected, equals_fn=_compare_dataframe_predictions))

  def test_pipeline_pandas_dict_out(self):
    temp_file_name = self.tmpdir + os.sep + 'pickled_file'
    with open(temp_file_name, 'wb') as file:
      pickle.dump(FakePandasModelDictOut(), file)
    with TestPipeline() as pipeline:
      dataframe = pandas_dataframe()
      splits = [dataframe.loc[[i]] for i in dataframe.index]
      pcoll = pipeline | 'start' >> beam.Create(splits)
      actual = pcoll | RunInference(
          SklearnModelHandlerPandas(model_uri=temp_file_name))

      expected = [
          PredictionResult(splits[0], {
              'out1': 5, 'out2': 5
          }),
          PredictionResult(splits[1], {
              'out1': 8, 'out2': 8
          }),
          PredictionResult(splits[2], {
              'out1': 1, 'out2': 1
          }),
          PredictionResult(splits[3], {
              'out1': 1, 'out2': 1
          }),
          PredictionResult(splits[4], {
              'out1': 4, 'out2': 4
          }),
      ]
      assert_that(
          actual, equal_to(expected, equals_fn=_compare_dataframe_predictions))

  @unittest.skipIf(platform.system() == 'Windows', 'BEAM-14359')
  def test_pipeline_pandas_joblib(self):
    temp_file_name = self.tmpdir + os.sep + 'pickled_file'
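    # Same expectations as test_pipeline_pandas, but the model is serialized
    # with joblib instead of pickle.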
    with open(temp_file_name, 'wb') as file:
      joblib.dump(build_pandas_pipeline(), file)
    with TestPipeline() as pipeline:
      dataframe = pandas_dataframe()
      splits = [dataframe.loc[[i]] for i in dataframe.index]
      pcoll = pipeline | 'start' >> beam.Create(splits)
      actual = pcoll | RunInference(
          SklearnModelHandlerPandas(
              model_uri=temp_file_name, model_file_type=ModelFileType.JOBLIB))

      expected = [
          PredictionResult(splits[0], 5),
          PredictionResult(splits[1], 8),
          PredictionResult(splits[2], 1),
          PredictionResult(splits[3], 1),
          PredictionResult(splits[4], 2),
      ]
      assert_that(
          actual, equal_to(expected, equals_fn=_compare_dataframe_predictions))

  def test_pipeline_pandas_with_keys(self):
    temp_file_name = self.tmpdir + os.sep + 'pickled_file'
    with open(temp_file_name, 'wb') as file:
      pickle.dump(build_pandas_pipeline(), file)
    with TestPipeline() as pipeline:
      data_frame = pandas_dataframe()
      keys = [str(i) for i in range(5)]
      splits = [data_frame.loc[[i]] for i in data_frame.index]
      keyed_rows = [(key, value) for key, value in zip(keys, splits)]

      pcoll = pipeline | 'start' >> beam.Create(keyed_rows)
      actual = pcoll | RunInference(
          KeyedModelHandler(
              SklearnModelHandlerPandas(model_uri=temp_file_name)))
      expected = [
          ('0', PredictionResult(splits[0], 5)),
          ('1', PredictionResult(splits[1], 8)),
          ('2', PredictionResult(splits[2], 1)),
          ('3', PredictionResult(splits[3], 1)),
          ('4', PredictionResult(splits[4], 2)),
      ]
      assert_that(
          actual, equal_to(expected, equals_fn=_compare_dataframe_predictions))

  def test_infer_too_many_rows_in_dataframe(self):
    with self.assertRaisesRegex(
        ValueError, r'Only dataframes with single rows are supported'):
      data_frame_too_many_rows = pandas_dataframe()
      fake_model = FakeModel()
      inference_runner = SklearnModelHandlerPandas(model_uri='unused')
      inference_runner.run_inference([data_frame_too_many_rows], fake_model)


if __name__ == '__main__':
  unittest.main()