#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import shutil
import sys
import tempfile
import unittest
import zipfile
from typing import Any
from typing import Tuple

try:
  import datatable
  import numpy
  import pandas
  import pytest
  import scipy
  import xgboost

  import apache_beam as beam
  from apache_beam.ml.inference import RunInference
  from apache_beam.ml.inference.base import KeyedModelHandler
  from apache_beam.ml.inference.base import PredictionResult
  from apache_beam.ml.inference.xgboost_inference import XGBoostModelHandlerDatatable
  from apache_beam.ml.inference.xgboost_inference import XGBoostModelHandlerNumpy
  from apache_beam.ml.inference.xgboost_inference import XGBoostModelHandlerPandas
  from apache_beam.ml.inference.xgboost_inference import XGBoostModelHandlerSciPy
  from apache_beam.testing.test_pipeline import TestPipeline
  from apache_beam.testing.util import assert_that
  from apache_beam.testing.util import equal_to
except ImportError:
  raise unittest.SkipTest('XGBoost dependencies are not installed')


def _compare_prediction_result(a: PredictionResult, b: PredictionResult):
  """Return True when two PredictionResults have equal example and inference.

  Sparse CSR examples are densified before comparison; every other example
  type is handed to ``numpy.array_equal`` directly. When ``a.inference`` is a
  dict, only the values are compared, pairwise in iteration order (keys are
  not compared).
  """
  if isinstance(a.example, scipy.sparse.csr_matrix) and isinstance(
      b.example, scipy.sparse.csr_matrix):
    example_equal = numpy.array_equal(a.example.todense(), b.example.todense())
  else:
    example_equal = numpy.array_equal(a.example, b.example)
  if isinstance(a.inference, dict):
    return all(
        x == y for x, y in zip(a.inference.values(),
                               b.inference.values())) and example_equal
  return a.inference == b.inference and example_equal


def _compare_keyed_prediction_result(
    a: Tuple[Any, PredictionResult], b: Tuple[Any, PredictionResult]):
  """Return True when two (key, PredictionResult) pairs are equal."""
  a_key, a_val = a
  b_key, b_val = b
  keys_equal = a_key == b_key
  return _compare_prediction_result(a_val, b_val) and keys_equal


def predict_fn(self, data):
  """Replacement for ``XGBClassifier.predict`` used by these tests.

  Increments ``self.inference_calls`` and returns the sum of every entry in
  ``data`` after converting pandas, datatable and SciPy CSR inputs to numpy.
  """
  self.inference_calls += 1
  if isinstance(data, pandas.DataFrame):
    data = data.to_numpy()
  if isinstance(data, datatable.Frame):
    data = data.to_numpy()
  if isinstance(data, scipy.sparse.csr_matrix):
    data = data.toarray()
  return sum(sum(array) for array in data)


@pytest.fixture(autouse=True)
def predict_patched(monkeypatch):
  """Patch ``XGBClassifier.predict`` with ``predict_fn`` for every test."""
  monkeypatch.setattr(xgboost.XGBClassifier, 'predict', predict_fn)


def build_monkeypatched_xgboost_classifier() -> xgboost.XGBClassifier:
  """Return a classifier fitted on a tiny dataset with a zeroed call counter."""
  model = xgboost.XGBClassifier()
  model.inference_calls = 0
  model.fit([[0, 0], [0, 1], [1, 0], [1, 1]], [0, 1, 0, 1])
  return model


@pytest.mark.uses_xgboost
class XGBoostRunInferenceTest(unittest.TestCase):
  """Tests for the XGBoost model handlers (numpy, pandas, datatable, SciPy)."""
  def setUp(self):
    self.tmpdir = tempfile.mkdtemp()

  def tearDown(self):
    shutil.rmtree(self.tmpdir)

  def _save_model(self, file_name='model.json'):
    """Fit the patched classifier, save it under tmpdir, return the path."""
    model = build_monkeypatched_xgboost_classifier()
    model_state = os.path.join(self.tmpdir, file_name)
    model.save_model(model_state)
    return model_state

  def _assert_handler_sets_env_vars(self, handler_class, examples):
    """Run ``examples`` through ``handler_class`` and check FOO=bar is set.

    Args:
      handler_class: the XGBoost model handler class under test.
      examples: the elements fed to ``beam.Create``.
    """
    model_state = self._save_model()
    os.environ.pop('FOO', None)
    # Keep the variable from leaking into tests that run later in this
    # process, whether or not the assertions below succeed.
    self.addCleanup(os.environ.pop, 'FOO', None)
    self.assertNotIn('FOO', os.environ)

    with TestPipeline() as pipeline:
      handler_with_vars = handler_class(
          env_vars={'FOO': 'bar'},
          model_class=xgboost.XGBClassifier,
          model_state=model_state)
      _ = (
          pipeline
          | 'start' >> beam.Create(examples)
          | RunInference(handler_with_vars))
      # Run inside the context so the environment is inspected right after
      # the pipeline has executed.
      pipeline.run()
      self.assertIn('FOO', os.environ)
      self.assertEqual(os.environ['FOO'], 'bar')

  def test_predict_output(self):
    model = build_monkeypatched_xgboost_classifier()
    inference_runner = XGBoostModelHandlerNumpy(xgboost.XGBClassifier, 'unused')
    batched_examples = [
        numpy.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
        numpy.array([[1, 1, 1], [1, 1, 1], [1, 1, 1]])
    ]
    expected_predictions = [
        PredictionResult(numpy.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), 45),
        PredictionResult(numpy.array([[1, 1, 1], [1, 1, 1], [1, 1, 1]]), 9),
    ]
    inferences = list(inference_runner.run_inference(batched_examples, model))
    # zip() stops at the shorter input, so check the count explicitly;
    # otherwise missing results would go unnoticed.
    self.assertEqual(len(inferences), len(expected_predictions))
    for actual, expected in zip(inferences, expected_predictions):
      self.assertTrue(_compare_prediction_result(actual, expected))

  def test_single_inference_call(self):
    model = build_monkeypatched_xgboost_classifier()
    inference_runner = XGBoostModelHandlerNumpy(xgboost.XGBClassifier, 'unused')
    self.assertEqual(model.inference_calls, 0)
    batched_examples = [numpy.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])]
    _ = inference_runner.run_inference(batched_examples, model)
    self.assertEqual(model.inference_calls, 1)

  def test_multiple_inference_calls(self):
    model = build_monkeypatched_xgboost_classifier()
    inference_runner = XGBoostModelHandlerNumpy(xgboost.XGBClassifier, 'unused')
    self.assertEqual(model.inference_calls, 0)
    batched_examples = [
        numpy.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
        numpy.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
        numpy.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
        numpy.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
        numpy.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    ]
    _ = inference_runner.run_inference(batched_examples, model)
    self.assertEqual(model.inference_calls, 5)

  def test_num_bytes_numpy(self):
    inference_runner = XGBoostModelHandlerNumpy(
        model_class=xgboost.XGBClassifier, model_state='unused')
    batched_examples_int = [
        numpy.array([[1, 1], [2, 2]]),
        numpy.array([[2, 4], [6, 8]]),
    ]
    self.assertEqual(
        sys.getsizeof(batched_examples_int[0]) +
        sys.getsizeof(batched_examples_int[1]),
        inference_runner.get_num_bytes(batched_examples_int))

    batched_examples_float = [
        numpy.array([[1.0, 1.0], [2.0, 2.0]]),
        numpy.array([[2.0, 4.0], [6.0, 8.0]]),
    ]
    self.assertEqual(
        sys.getsizeof(batched_examples_float[0]) +
        sys.getsizeof(batched_examples_float[1]),
        inference_runner.get_num_bytes(batched_examples_float))

  def test_num_bytes_pandas(self):
    inference_runner = XGBoostModelHandlerPandas(
        model_class=xgboost.XGBClassifier, model_state='unused')
    batched_examples_int = [
        pandas.DataFrame([[1, 1], [2, 2]]),
        pandas.DataFrame([[2, 4], [6, 8]]),
    ]
    self.assertEqual(
        batched_examples_int[0].memory_usage(deep=True).sum() +
        batched_examples_int[1].memory_usage(deep=True).sum(),
        inference_runner.get_num_bytes(batched_examples_int))

    batched_examples_float = [
        pandas.DataFrame([[1.0, 1.0], [2.0, 2.0]]),
        pandas.DataFrame([[2.0, 4.0], [6.0, 8.0]]),
    ]
    self.assertEqual(
        batched_examples_float[0].memory_usage(deep=True).sum() +
        batched_examples_float[1].memory_usage(deep=True).sum(),
        inference_runner.get_num_bytes(batched_examples_float))

  def test_num_bytes_datatable(self):
    inference_runner = XGBoostModelHandlerDatatable(
        model_class=xgboost.XGBClassifier, model_state='unused')
    batched_examples_int = [
        datatable.Frame([[1, 1], [2, 2]]),
        datatable.Frame([[2, 4], [6, 8]]),
    ]
    self.assertEqual(
        sys.getsizeof(batched_examples_int[0]) +
        sys.getsizeof(batched_examples_int[1]),
        inference_runner.get_num_bytes(batched_examples_int))

    batched_examples_float = [
        datatable.Frame([[1.0, 1.0], [2.0, 2.0]]),
        datatable.Frame([[2.0, 4.0], [6.0, 8.0]]),
    ]
    self.assertEqual(
        sys.getsizeof(batched_examples_float[0]) +
        sys.getsizeof(batched_examples_float[1]),
        inference_runner.get_num_bytes(batched_examples_float))

  def test_num_bytes_scipy(self):
    inference_runner = XGBoostModelHandlerSciPy(
        model_class=xgboost.XGBClassifier, model_state='unused')
    batched_examples_int = [
        scipy.sparse.csr_matrix([[1, 1], [2, 2]]),
        scipy.sparse.csr_matrix([[2, 4], [6, 8]]),
    ]
    self.assertEqual(
        sys.getsizeof(batched_examples_int[0]) +
        sys.getsizeof(batched_examples_int[1]),
        inference_runner.get_num_bytes(batched_examples_int))

    batched_examples_float = [
        scipy.sparse.csr_matrix([[1.0, 1.0], [2.0, 2.0]]),
        scipy.sparse.csr_matrix([[2.0, 4.0], [6.0, 8.0]]),
    ]
    self.assertEqual(
        sys.getsizeof(batched_examples_float[0]) +
        sys.getsizeof(batched_examples_float[1]),
        inference_runner.get_num_bytes(batched_examples_float))

  def test_pipeline_numpy(self):
    model_state = self._save_model()

    with TestPipeline() as pipeline:
      examples = [
          numpy.array([[1, 1], [2, 2]]),
          numpy.array([[2, 4], [6, 8]]),
      ]

      pcoll = pipeline | 'start' >> beam.Create(examples)
      actual = pcoll | RunInference(
          XGBoostModelHandlerNumpy(
              model_class=xgboost.XGBClassifier, model_state=model_state))
      expected = [
          PredictionResult(numpy.array([[1, 1], [2, 2]]), 6),
          PredictionResult(numpy.array([[2, 4], [6, 8]]), 20)
      ]
      assert_that(
          actual, equal_to(expected, equals_fn=_compare_prediction_result))

  def test_pipeline_numpy_sets_env_vars_correctly(self):
    self._assert_handler_sets_env_vars(
        XGBoostModelHandlerNumpy,
        [
            numpy.array([[1, 1], [2, 2]]),
            numpy.array([[2, 4], [6, 8]]),
        ])

  def test_pipeline_pandas(self):
    model_state = self._save_model()

    with TestPipeline() as pipeline:
      examples = [
          pandas.DataFrame([[1, 1], [2, 2]]),
          pandas.DataFrame([[2, 4], [6, 8]]),
      ]

      pcoll = pipeline | 'start' >> beam.Create(examples)
      actual = pcoll | RunInference(
          XGBoostModelHandlerPandas(
              model_class=xgboost.XGBClassifier, model_state=model_state))
      expected = [
          PredictionResult(pandas.DataFrame([[1, 1], [2, 2]]), 6),
          PredictionResult(pandas.DataFrame([[2, 4], [6, 8]]), 20)
      ]
      assert_that(
          actual, equal_to(expected, equals_fn=_compare_prediction_result))

  def test_pipeline_pandas_sets_env_vars_correctly(self):
    self._assert_handler_sets_env_vars(
        XGBoostModelHandlerPandas,
        [
            pandas.DataFrame([[1, 1], [2, 2]]),
            pandas.DataFrame([[2, 4], [6, 8]]),
        ])

  def test_pipeline_datatable(self):
    model_state = self._save_model()

    with TestPipeline() as pipeline:
      examples = [
          datatable.Frame([[1, 1], [2, 2]]),
          datatable.Frame([[2, 4], [6, 8]]),
      ]

      pcoll = pipeline | 'start' >> beam.Create(examples)
      actual = pcoll | RunInference(
          XGBoostModelHandlerDatatable(
              model_class=xgboost.XGBClassifier, model_state=model_state))
      expected = [
          PredictionResult(datatable.Frame([[1, 1], [2, 2]]), 6),
          PredictionResult(datatable.Frame([[2, 4], [6, 8]]), 20)
      ]
      assert_that(
          actual, equal_to(expected, equals_fn=_compare_prediction_result))

  def test_pipeline_datatable_sets_env_vars_correctly(self):
    self._assert_handler_sets_env_vars(
        XGBoostModelHandlerDatatable,
        [
            datatable.Frame([[1, 1], [2, 2]]),
            datatable.Frame([[2, 4], [6, 8]]),
        ])

  def test_pipeline_scipy(self):
    model_state = self._save_model()

    with TestPipeline() as pipeline:
      examples = [
          scipy.sparse.csr_matrix(numpy.array([[1, 1], [2, 2]])),
          scipy.sparse.csr_matrix(numpy.array([[2, 4], [6, 8]])),
      ]

      pcoll = pipeline | 'start' >> beam.Create(examples)
      actual = pcoll | RunInference(
          XGBoostModelHandlerSciPy(
              model_class=xgboost.XGBClassifier, model_state=model_state))
      expected = [
          PredictionResult(
              scipy.sparse.csr_matrix(numpy.array([[1, 1], [2, 2]])), 6),
          PredictionResult(
              scipy.sparse.csr_matrix(numpy.array([[2, 4], [6, 8]])), 20)
      ]
      assert_that(
          actual, equal_to(expected, equals_fn=_compare_prediction_result))

  def test_pipeline_scipy_sets_env_vars_correctly(self):
    self._assert_handler_sets_env_vars(
        XGBoostModelHandlerSciPy,
        [
            scipy.sparse.csr_matrix(numpy.array([[1, 1], [2, 2]])),
            scipy.sparse.csr_matrix(numpy.array([[2, 4], [6, 8]])),
        ])

  def test_bad_model_file_raises(self):
    # The file is never created, so loading the model is expected to fail.
    model_state = os.path.join(self.tmpdir, 'bad_file_name.json')

    with self.assertRaises(RuntimeError):
      with TestPipeline() as pipeline:
        examples = [
            datatable.Frame([[1, 1], [2, 2]]),
            datatable.Frame([[2, 4], [6, 8]]),
        ]

        pcoll = pipeline | 'start' >> beam.Create(examples)
        _ = pcoll | RunInference(
            XGBoostModelHandlerNumpy(xgboost.XGBClassifier, model_state))
        pipeline.run()

  def test_bad_input_type_raises(self):
    model_state = self._save_model()

    archived_model_state = os.path.join(self.tmpdir, 'model.zip')

    # The context manager closes the archive even if write() raises.
    with zipfile.ZipFile(archived_model_state, "w",
                         zipfile.ZIP_DEFLATED) as zip_file:
      zip_file.write(model_state)

    with self.assertRaises(xgboost.core.XGBoostError):
      model_handler = XGBoostModelHandlerNumpy(
          xgboost.XGBClassifier, model_state=archived_model_state)
      model_handler.load_model()

  def test_pipeline_scipy_with_keys(self):
    model_state = self._save_model()

    with TestPipeline() as pipeline:
      examples = [
          ('0', scipy.sparse.csr_matrix([[1, 1], [2, 2]])),
          ('1', scipy.sparse.csr_matrix([[2, 4], [6, 8]])),
      ]

      pcoll = pipeline | 'start' >> beam.Create(examples)
      actual = pcoll | RunInference(
          KeyedModelHandler(
              XGBoostModelHandlerSciPy(
                  model_class=xgboost.XGBClassifier, model_state=model_state)))
      expected = [
          ('0', PredictionResult(scipy.sparse.csr_matrix([[1, 1], [2, 2]]), 6)),
          (
              '1',
              PredictionResult(scipy.sparse.csr_matrix([[2, 4], [6, 8]]), 20))
      ]
      assert_that(
          actual,
          equal_to(expected, equals_fn=_compare_keyed_prediction_result))

  def test_pipeline_numpy_with_keys(self):
    model_state = self._save_model()

    with TestPipeline() as pipeline:
      examples = [
          ('0', numpy.array([[1, 1], [2, 2]])),
          ('1', numpy.array([[2, 4], [6, 8]])),
      ]

      pcoll = pipeline | 'start' >> beam.Create(examples)
      actual = pcoll | RunInference(
          KeyedModelHandler(
              XGBoostModelHandlerNumpy(
                  model_class=xgboost.XGBClassifier, model_state=model_state)))
      expected = [('0', PredictionResult(numpy.array([[1, 1], [2, 2]]), 6)),
                  ('1', PredictionResult(numpy.array([[2, 4], [6, 8]]), 20))]
      assert_that(
          actual,
          equal_to(expected, equals_fn=_compare_keyed_prediction_result))

  def test_pipeline_pandas_with_keys(self):
    model_state = self._save_model()

    with TestPipeline() as pipeline:
      examples = [
          ('0', pandas.DataFrame([[1, 1], [2, 2]])),
          ('1', pandas.DataFrame([[2, 4], [6, 8]])),
      ]

      pcoll = pipeline | 'start' >> beam.Create(examples)
      actual = pcoll | RunInference(
          KeyedModelHandler(
              XGBoostModelHandlerPandas(
                  model_class=xgboost.XGBClassifier, model_state=model_state)))
      expected = [
          ('0', PredictionResult(pandas.DataFrame([[1, 1], [2, 2]]), 6)),
          ('1', PredictionResult(pandas.DataFrame([[2, 4], [6, 8]]), 20))
      ]
      assert_that(
          actual,
          equal_to(expected, equals_fn=_compare_keyed_prediction_result))

  def test_pipeline_datatable_with_keys(self):
    model_state = self._save_model()

    with TestPipeline() as pipeline:
      examples = [
          ('0', datatable.Frame([[1, 1], [2, 2]])),
          ('1', datatable.Frame([[2, 4], [6, 8]])),
      ]

      pcoll = pipeline | 'start' >> beam.Create(examples)
      actual = pcoll | RunInference(
          KeyedModelHandler(
              XGBoostModelHandlerDatatable(
                  model_class=xgboost.XGBClassifier, model_state=model_state)))
      expected = [
          ('0', PredictionResult(datatable.Frame([[1, 1], [2, 2]]), 6)),
          ('1', PredictionResult(datatable.Frame([[2, 4], [6, 8]]), 20))
      ]
      assert_that(
          actual,
          equal_to(expected, equals_fn=_compare_keyed_prediction_result))


if __name__ == '__main__':
  unittest.main()