github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/ml/inference/xgboost_inference_it_test.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Integration tests for RunInference with XGBoost on the Iris dataset.

Each test runs the xgboost_iris_classification example pipeline against a
pre-trained model stored on GCS and compares the predictions it writes to
known-good output.
"""

import logging
import os
import uuid

try:
  import pytest
  import unittest
  import xgboost

  from apache_beam.examples.inference import xgboost_iris_classification
  from apache_beam.io.filesystems import FileSystems
  from apache_beam.testing.test_pipeline import TestPipeline
except ImportError:
  xgboost = None

# Expected output lines, formatted as '<key>,<predicted labels>'. With
# no_split the whole test set is scored as one batch (a single output line
# holding 30 labels); with split each example is its own batch (30 lines).
EXPECTED_OUTPUT_SINGLE_BATCHES = [
    "0,[1 1 1 0 0 0 0 1 2 0 0 2 0 2 1 2 2 2 2 0 0 0 0 2 2 0 2 2 2 1]"
]
EXPECTED_OUTPUT_MULTIPLE_BATCHES = [
    "0,[1]",
    "1,[1]",
    "2,[1]",
    "3,[0]",
    "4,[0]",
    "5,[0]",
    "6,[0]",
    "7,[1]",
    "8,[2]",
    "9,[0]",
    "10,[0]",
    "11,[2]",
    "12,[0]",
    "13,[2]",
    "14,[1]",
    "15,[2]",
    "16,[2]",
    "17,[2]",
    "18,[2]",
    "19,[0]",
    "20,[0]",
    "21,[0]",
    "22,[0]",
    "23,[2]",
    "24,[2]",
    "25,[0]",
    "26,[2]",
    "27,[2]",
    "28,[2]",
    "29,[1]",
]


def process_outputs(filepath):
  """Reads the pipeline's output file and returns its lines as strings."""
  with FileSystems().open(filepath) as f:
    lines = f.readlines()
    lines = [l.decode('utf-8').strip('\n') for l in lines]
    return lines


@unittest.skipIf(
    os.getenv('FORCE_XGBOOST_IT') is None and xgboost is None,
    'Missing dependencies. '
    'Test depends on xgboost and datatable')
@pytest.mark.uses_xgboost
@pytest.mark.it_postcommit
class XGBoostInference(unittest.TestCase):
  """Runs the xgboost_iris_classification example pipeline for each supported
  input type, with and without batch splitting, and verifies the predictions
  written to the output file."""

  def test_iris_classification_numpy_single_batch(self):
    test_pipeline = TestPipeline(is_integration_test=True)
    input_type = 'numpy'
    output_file_dir = '/tmp'
    output_file = '/'.join(
        [output_file_dir, str(uuid.uuid4()), 'numpy_single_batch.txt'])
    model_state_path = 'gs://apache-beam-ml/models/xgboost.iris_classifier.json'
    extra_opts = {
        'input_type': input_type,
        'output': output_file,
        'model_state': model_state_path,
        'no_split': True
    }

    xgboost_iris_classification.run(
        test_pipeline.get_full_options_as_args(**extra_opts),
        save_main_session=False)
    self.assertEqual(FileSystems().exists(output_file), True)

    expected_outputs = EXPECTED_OUTPUT_SINGLE_BATCHES

    predicted_outputs = process_outputs(output_file)
    self.assertEqual(len(expected_outputs), len(predicted_outputs))

    predictions_dict = {}
    for predicted_output in predicted_outputs:
      true_label, prediction = predicted_output.split(',')
      predictions_dict[true_label] = prediction

    for expected_output in expected_outputs:
      true_label, expected_prediction = expected_output.split(',')
      self.assertEqual(predictions_dict[true_label], expected_prediction)

  def test_iris_classification_pandas_single_batch(self):
    test_pipeline = TestPipeline(is_integration_test=True)
    input_type = 'pandas'
    output_file_dir = '/tmp'
    output_file = '/'.join(
        [output_file_dir, str(uuid.uuid4()), 'pandas_single_batch.txt'])
    model_state_path = 'gs://apache-beam-ml/models/xgboost.iris_classifier.json'
    extra_opts = {
        'input_type': input_type,
        'output': output_file,
        'model_state': model_state_path,
        'no_split': True
    }

    xgboost_iris_classification.run(
        test_pipeline.get_full_options_as_args(**extra_opts),
        save_main_session=False)
    self.assertEqual(FileSystems().exists(output_file), True)

    expected_outputs = EXPECTED_OUTPUT_SINGLE_BATCHES

    predicted_outputs = process_outputs(output_file)
    self.assertEqual(len(expected_outputs), len(predicted_outputs))

    predictions_dict = {}
    for predicted_output in predicted_outputs:
      true_label, prediction = predicted_output.split(',')
      predictions_dict[true_label] = prediction

    for expected_output in expected_outputs:
      true_label, expected_prediction = expected_output.split(',')
      self.assertEqual(predictions_dict[true_label], expected_prediction)

  def test_iris_classification_scipy_single_batch(self):
    test_pipeline = TestPipeline(is_integration_test=True)
    input_type = 'scipy'
    output_file_dir = '/tmp'
    output_file = '/'.join(
        [output_file_dir, str(uuid.uuid4()), 'scipy_single_batch.txt'])
    model_state_path = 'gs://apache-beam-ml/models/xgboost.iris_classifier.json'
    extra_opts = {
        'input_type': input_type,
        'output': output_file,
        'model_state': model_state_path,
        'no_split': True
    }

    xgboost_iris_classification.run(
        test_pipeline.get_full_options_as_args(**extra_opts),
        save_main_session=False)
    self.assertEqual(FileSystems().exists(output_file), True)

    expected_outputs = EXPECTED_OUTPUT_SINGLE_BATCHES

    predicted_outputs = process_outputs(output_file)
    self.assertEqual(len(expected_outputs), len(predicted_outputs))

    predictions_dict = {}
    for predicted_output in predicted_outputs:
      true_label, prediction = predicted_output.split(',')
      predictions_dict[true_label] = prediction

    for expected_output in expected_outputs:
      true_label, expected_prediction = expected_output.split(',')
      self.assertEqual(predictions_dict[true_label], expected_prediction)

  def test_iris_classification_datatable_single_batch(self):
    test_pipeline = TestPipeline(is_integration_test=True)
    input_type = 'datatable'
    output_file_dir = '/tmp'
    output_file = '/'.join(
        [output_file_dir, str(uuid.uuid4()), 'datatable_single_batch.txt'])
    model_state_path = 'gs://apache-beam-ml/models/xgboost.iris_classifier.json'
    extra_opts = {
        'input_type': input_type,
        'output': output_file,
        'model_state': model_state_path,
        'no_split': True
    }

    xgboost_iris_classification.run(
        test_pipeline.get_full_options_as_args(**extra_opts),
        save_main_session=False)
    self.assertEqual(FileSystems().exists(output_file), True)

    expected_outputs = EXPECTED_OUTPUT_SINGLE_BATCHES

    predicted_outputs = process_outputs(output_file)
    self.assertEqual(len(expected_outputs), len(predicted_outputs))

    predictions_dict = {}
    for predicted_output in predicted_outputs:
      true_label, prediction = predicted_output.split(',')
      predictions_dict[true_label] = prediction

    for expected_output in expected_outputs:
      true_label, expected_prediction = expected_output.split(',')
      self.assertEqual(predictions_dict[true_label], expected_prediction)

  def test_iris_classification_numpy_multi_batch(self):
    test_pipeline = TestPipeline(is_integration_test=True)
    input_type = 'numpy'
    output_file_dir = '/tmp'
    output_file = '/'.join(
        [output_file_dir, str(uuid.uuid4()), 'numpy_multi_batch.txt'])
    model_state_path = 'gs://apache-beam-ml/models/xgboost.iris_classifier.json'
    extra_opts = {
        'input_type': input_type,
        'output': output_file,
        'model_state': model_state_path,
        'split': True
    }

    xgboost_iris_classification.run(
        test_pipeline.get_full_options_as_args(**extra_opts),
        save_main_session=False)
    self.assertEqual(FileSystems().exists(output_file), True)

    expected_outputs = EXPECTED_OUTPUT_MULTIPLE_BATCHES

    predicted_outputs = process_outputs(output_file)
    self.assertEqual(len(expected_outputs), len(predicted_outputs))

    predictions_dict = {}
    for predicted_output in predicted_outputs:
      true_label, prediction = predicted_output.split(',')
      predictions_dict[true_label] = prediction

    for expected_output in expected_outputs:
      true_label, expected_prediction = expected_output.split(',')
      self.assertEqual(predictions_dict[true_label], expected_prediction)

  def test_iris_classification_pandas_multi_batch(self):
    test_pipeline = TestPipeline(is_integration_test=True)
    input_type = 'pandas'
    output_file_dir = '/tmp'
    output_file = '/'.join(
        [output_file_dir, str(uuid.uuid4()), 'pandas_multi_batch.txt'])
    model_state_path = 'gs://apache-beam-ml/models/xgboost.iris_classifier.json'
    extra_opts = {
        'input_type': input_type,
        'output': output_file,
        'model_state': model_state_path,
        'split': True
    }

    xgboost_iris_classification.run(
        test_pipeline.get_full_options_as_args(**extra_opts),
        save_main_session=False)
    self.assertEqual(FileSystems().exists(output_file), True)

    expected_outputs = EXPECTED_OUTPUT_MULTIPLE_BATCHES

    predicted_outputs = process_outputs(output_file)
    self.assertEqual(len(expected_outputs), len(predicted_outputs))

    predictions_dict = {}
    for predicted_output in predicted_outputs:
      true_label, prediction = predicted_output.split(',')
      predictions_dict[true_label] = prediction

    for expected_output in expected_outputs:
      true_label, expected_prediction = expected_output.split(',')
      self.assertEqual(predictions_dict[true_label], expected_prediction)

  def test_iris_classification_scipy_multi_batch(self):
    test_pipeline = TestPipeline(is_integration_test=True)
    input_type = 'scipy'
    output_file_dir = '/tmp'
    output_file = '/'.join(
        [output_file_dir, str(uuid.uuid4()), 'scipy_multi_batch.txt'])
    model_state_path = 'gs://apache-beam-ml/models/xgboost.iris_classifier.json'
    extra_opts = {
        'input_type': input_type,
        'output': output_file,
        'model_state': model_state_path,
        'split': True
    }

    xgboost_iris_classification.run(
        test_pipeline.get_full_options_as_args(**extra_opts),
        save_main_session=False)
    self.assertEqual(FileSystems().exists(output_file), True)

    expected_outputs = EXPECTED_OUTPUT_MULTIPLE_BATCHES

    predicted_outputs = process_outputs(output_file)
    self.assertEqual(len(expected_outputs), len(predicted_outputs))

    predictions_dict = {}
    for predicted_output in predicted_outputs:
      true_label, prediction = predicted_output.split(',')
      predictions_dict[true_label] = prediction

    for expected_output in expected_outputs:
      true_label, expected_prediction = expected_output.split(',')
      self.assertEqual(predictions_dict[true_label], expected_prediction)

  def test_iris_classification_datatable_multi_batch(self):
    test_pipeline = TestPipeline(is_integration_test=True)
    input_type = 'datatable'
    output_file_dir = '/tmp'
    output_file = '/'.join(
        [output_file_dir, str(uuid.uuid4()), 'datatable_multi_batch.txt'])
    model_state_path = 'gs://apache-beam-ml/models/xgboost.iris_classifier.json'
    extra_opts = {
        'input_type': input_type,
        'output': output_file,
        'model_state': model_state_path,
        'split': True
    }

    xgboost_iris_classification.run(
        test_pipeline.get_full_options_as_args(**extra_opts),
        save_main_session=False)
    self.assertEqual(FileSystems().exists(output_file), True)

    expected_outputs = EXPECTED_OUTPUT_MULTIPLE_BATCHES

    predicted_outputs = process_outputs(output_file)
    self.assertEqual(len(expected_outputs), len(predicted_outputs))

    predictions_dict = {}
    for predicted_output in predicted_outputs:
      true_label, prediction = predicted_output.split(',')
      predictions_dict[true_label] = prediction

    for expected_output in expected_outputs:
      true_label, expected_prediction = expected_output.split(',')
      self.assertEqual(predictions_dict[true_label], expected_prediction)


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.DEBUG)
  unittest.main()
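
A note on what the tests above consume and produce: model_state points at an XGBoost model stored as JSON on GCS, and every output line pairs a batch key with the array of predicted Iris classes for that batch (one 30-element array in the no_split case, thirty single-element arrays in the split case). The sketch below is not part of the test; it uses an illustrative local path and made-up feature rows to show how such a saved classifier can be loaded and queried with plain xgboost, outside Beam:

import numpy
import xgboost

# Assumed local copy of the GCS model file; the path is illustrative.
model = xgboost.XGBClassifier()
model.load_model('xgboost.iris_classifier.json')

# Two made-up Iris measurement rows (sepal/petal length and width).
# predict() returns class indices 0/1/2, the values the expected-output
# strings in the tests are built from.
batch = numpy.array([[5.1, 3.5, 1.4, 0.2], [6.7, 3.0, 5.2, 2.3]])
print(model.predict(batch))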