github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/ml/inference/xgboost_inference_it_test.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  import logging
    18  import os
    19  import uuid
    20  
    21  try:
    22    import pytest
    23    import unittest
    24    import xgboost
    25  
    26    from apache_beam.examples.inference import xgboost_iris_classification
    27    from apache_beam.io.filesystems import FileSystems
    28    from apache_beam.testing.test_pipeline import TestPipeline
    29  except ImportError as e:
    30    xgboost = None
    31  
    32  EXPECTED_OUTPUT_SINGLE_BATCHES = [
    33      "0,[1 1 1 0 0 0 0 1 2 0 0 2 0 2 1 2 2 2 2 0 0 0 0 2 2 0 2 2 2 1]"
    34  ]
    35  EXPECTED_OUTPUT_MULTIPLE_BATCHES = [
    36      "0,[1]",
    37      "1,[1]",
    38      "2,[1]",
    39      "3,[0]",
    40      "4,[0]",
    41      "5,[0]",
    42      "6,[0]",
    43      "7,[1]",
    44      "8,[2]",
    45      "9,[0]",
    46      "10,[0]",
    47      "11,[2]",
    48      "12,[0]",
    49      "13,[2]",
    50      "14,[1]",
    51      "15,[2]",
    52      "16,[2]",
    53      "17,[2]",
    54      "18,[2]",
    55      "19,[0]",
    56      "20,[0]",
    57      "21,[0]",
    58      "22,[0]",
    59      "23,[2]",
    60      "24,[2]",
    61      "25,[0]",
    62      "26,[2]",
    63      "27,[2]",
    64      "28,[2]",
    65      "29,[1]",
    66  ]
    67  
    68  
    69  def process_outputs(filepath):
    70    with FileSystems().open(filepath) as f:
    71      lines = f.readlines()
    72    lines = [l.decode('utf-8').strip('\n') for l in lines]
    73    return lines
    74  
    75  
    76  @unittest.skipIf(
    77      os.getenv('FORCE_XGBOOST_IT') is None and xgboost is None,
    78      'Missing dependencies. '
    79      'Test depends on xgboost and datatable')
    80  @pytest.mark.uses_xgboost
    81  @pytest.mark.it_postcommit
    82  class XGBoostInference(unittest.TestCase):
    83    def test_iris_classification_numpy_single_batch(self):
    84      test_pipeline = TestPipeline(is_integration_test=True)
    85      input_type = 'numpy'
    86      output_file_dir = '/tmp'
    87      output_file = '/'.join(
    88          [output_file_dir, str(uuid.uuid4()), 'numpy_single_batch.txt'])
    89      model_state_path = 'gs://apache-beam-ml/models/xgboost.iris_classifier.json'
    90      extra_opts = {
    91          'input_type': input_type,
    92          'output': output_file,
    93          'model_state': model_state_path,
    94          'no_split': True
    95      }
    96  
    97      xgboost_iris_classification.run(
    98          test_pipeline.get_full_options_as_args(**extra_opts),
    99          save_main_session=False)
   100      self.assertEqual(FileSystems().exists(output_file), True)
   101  
   102      expected_outputs = EXPECTED_OUTPUT_SINGLE_BATCHES
   103  
   104      predicted_outputs = process_outputs(output_file)
   105      self.assertEqual(len(expected_outputs), len(predicted_outputs))
   106  
   107      predictions_dict = {}
   108      for predicted_output in predicted_outputs:
   109        true_label, prediction = predicted_output.split(',')
   110        predictions_dict[true_label] = prediction
   111  
   112      for expected_output in expected_outputs:
   113        true_label, expected_prediction = expected_output.split(',')
   114        self.assertEqual(predictions_dict[true_label], expected_prediction)
   115  
   116    def test_iris_classification_pandas_single_batch(self):
   117      test_pipeline = TestPipeline(is_integration_test=True)
   118      input_type = 'pandas'
   119      output_file_dir = '/tmp'
   120      output_file = '/'.join(
   121          [output_file_dir, str(uuid.uuid4()), 'pandas_single_batch.txt'])
   122      model_state_path = 'gs://apache-beam-ml/models/xgboost.iris_classifier.json'
   123      extra_opts = {
   124          'input_type': input_type,
   125          'output': output_file,
   126          'model_state': model_state_path,
   127          'no_split': True
   128      }
   129  
   130      xgboost_iris_classification.run(
   131          test_pipeline.get_full_options_as_args(**extra_opts),
   132          save_main_session=False)
   133      self.assertEqual(FileSystems().exists(output_file), True)
   134  
   135      expected_outputs = EXPECTED_OUTPUT_SINGLE_BATCHES
   136  
   137      predicted_outputs = process_outputs(output_file)
   138      self.assertEqual(len(expected_outputs), len(predicted_outputs))
   139  
   140      predictions_dict = {}
   141      for predicted_output in predicted_outputs:
   142        true_label, prediction = predicted_output.split(',')
   143        predictions_dict[true_label] = prediction
   144  
   145      for expected_output in expected_outputs:
   146        true_label, expected_prediction = expected_output.split(',')
   147        self.assertEqual(predictions_dict[true_label], expected_prediction)
   148  
   149    def test_iris_classification_scipy_single_batch(self):
   150      test_pipeline = TestPipeline(is_integration_test=True)
   151      input_type = 'scipy'
   152      output_file_dir = '/tmp'
   153      output_file = '/'.join(
   154          [output_file_dir, str(uuid.uuid4()), 'scipy_single_batch.txt'])
   155      model_state_path = 'gs://apache-beam-ml/models/xgboost.iris_classifier.json'
   156      extra_opts = {
   157          'input_type': input_type,
   158          'output': output_file,
   159          'model_state': model_state_path,
   160          'no_split': True
   161      }
   162  
   163      xgboost_iris_classification.run(
   164          test_pipeline.get_full_options_as_args(**extra_opts),
   165          save_main_session=False)
   166      self.assertEqual(FileSystems().exists(output_file), True)
   167  
   168      expected_outputs = EXPECTED_OUTPUT_SINGLE_BATCHES
   169  
   170      predicted_outputs = process_outputs(output_file)
   171      self.assertEqual(len(expected_outputs), len(predicted_outputs))
   172  
   173      predictions_dict = {}
   174      for predicted_output in predicted_outputs:
   175        true_label, prediction = predicted_output.split(',')
   176        predictions_dict[true_label] = prediction
   177  
   178      for expected_output in expected_outputs:
   179        true_label, expected_prediction = expected_output.split(',')
   180        self.assertEqual(predictions_dict[true_label], expected_prediction)
   181  
   182    def test_iris_classification_datatable_single_batch(self):
   183      test_pipeline = TestPipeline(is_integration_test=True)
   184      input_type = 'datatable'
   185      output_file_dir = '/tmp'
   186      output_file = '/'.join(
   187          [output_file_dir, str(uuid.uuid4()), 'datatable_single_batch.txt'])
   188      model_state_path = 'gs://apache-beam-ml/models/xgboost.iris_classifier.json'
   189      extra_opts = {
   190          'input_type': input_type,
   191          'output': output_file,
   192          'model_state': model_state_path,
   193          'no_split': True
   194      }
   195  
   196      xgboost_iris_classification.run(
   197          test_pipeline.get_full_options_as_args(**extra_opts),
   198          save_main_session=False)
   199      self.assertEqual(FileSystems().exists(output_file), True)
   200  
   201      expected_outputs = EXPECTED_OUTPUT_SINGLE_BATCHES
   202  
   203      predicted_outputs = process_outputs(output_file)
   204      self.assertEqual(len(expected_outputs), len(predicted_outputs))
   205  
   206      predictions_dict = {}
   207      for predicted_output in predicted_outputs:
   208        true_label, prediction = predicted_output.split(',')
   209        predictions_dict[true_label] = prediction
   210  
   211      for expected_output in expected_outputs:
   212        true_label, expected_prediction = expected_output.split(',')
   213        self.assertEqual(predictions_dict[true_label], expected_prediction)
   214  
   215    def test_iris_classification_numpy_multi_batch(self):
   216      test_pipeline = TestPipeline(is_integration_test=True)
   217      input_type = 'numpy'
   218      output_file_dir = '/tmp'
   219      output_file = '/'.join(
   220          [output_file_dir, str(uuid.uuid4()), 'numpy_multi_batch.txt'])
   221      model_state_path = 'gs://apache-beam-ml/models/xgboost.iris_classifier.json'
   222      extra_opts = {
   223          'input_type': input_type,
   224          'output': output_file,
   225          'model_state': model_state_path,
   226          'split': True
   227      }
   228  
   229      xgboost_iris_classification.run(
   230          test_pipeline.get_full_options_as_args(**extra_opts),
   231          save_main_session=False)
   232      self.assertEqual(FileSystems().exists(output_file), True)
   233  
   234      expected_outputs = EXPECTED_OUTPUT_MULTIPLE_BATCHES
   235  
   236      predicted_outputs = process_outputs(output_file)
   237      self.assertEqual(len(expected_outputs), len(predicted_outputs))
   238  
   239      predictions_dict = {}
   240      for predicted_output in predicted_outputs:
   241        true_label, prediction = predicted_output.split(',')
   242        predictions_dict[true_label] = prediction
   243  
   244      for expected_output in expected_outputs:
   245        true_label, expected_prediction = expected_output.split(',')
   246        self.assertEqual(predictions_dict[true_label], expected_prediction)
   247  
   248    def test_iris_classification_pandas_multi_batch(self):
   249      test_pipeline = TestPipeline(is_integration_test=True)
   250      input_type = 'pandas'
   251      output_file_dir = '/tmp'
   252      output_file = '/'.join(
   253          [output_file_dir, str(uuid.uuid4()), 'pandas_multi_batch.txt'])
   254      model_state_path = 'gs://apache-beam-ml/models/xgboost.iris_classifier.json'
   255      extra_opts = {
   256          'input_type': input_type,
   257          'output': output_file,
   258          'model_state': model_state_path,
   259          'split': True
   260      }
   261  
   262      xgboost_iris_classification.run(
   263          test_pipeline.get_full_options_as_args(**extra_opts),
   264          save_main_session=False)
   265      self.assertEqual(FileSystems().exists(output_file), True)
   266  
   267      expected_outputs = EXPECTED_OUTPUT_MULTIPLE_BATCHES
   268  
   269      predicted_outputs = process_outputs(output_file)
   270      self.assertEqual(len(expected_outputs), len(predicted_outputs))
   271  
   272      predictions_dict = {}
   273      for predicted_output in predicted_outputs:
   274        true_label, prediction = predicted_output.split(',')
   275        predictions_dict[true_label] = prediction
   276  
   277      for expected_output in expected_outputs:
   278        true_label, expected_prediction = expected_output.split(',')
   279        self.assertEqual(predictions_dict[true_label], expected_prediction)
   280  
   281    def test_iris_classification_scipy_multi_batch(self):
   282      test_pipeline = TestPipeline(is_integration_test=True)
   283      input_type = 'scipy'
   284      output_file_dir = '/tmp'
   285      output_file = '/'.join(
   286          [output_file_dir, str(uuid.uuid4()), 'scipy_multi_batch.txt'])
   287      model_state_path = 'gs://apache-beam-ml/models/xgboost.iris_classifier.json'
   288      extra_opts = {
   289          'input_type': input_type,
   290          'output': output_file,
   291          'model_state': model_state_path,
   292          'split': True
   293      }
   294  
   295      xgboost_iris_classification.run(
   296          test_pipeline.get_full_options_as_args(**extra_opts),
   297          save_main_session=False)
   298      self.assertEqual(FileSystems().exists(output_file), True)
   299  
   300      expected_outputs = EXPECTED_OUTPUT_MULTIPLE_BATCHES
   301  
   302      predicted_outputs = process_outputs(output_file)
   303      self.assertEqual(len(expected_outputs), len(predicted_outputs))
   304  
   305      predictions_dict = {}
   306      for predicted_output in predicted_outputs:
   307        true_label, prediction = predicted_output.split(',')
   308        predictions_dict[true_label] = prediction
   309  
   310      for expected_output in expected_outputs:
   311        true_label, expected_prediction = expected_output.split(',')
   312        self.assertEqual(predictions_dict[true_label], expected_prediction)
   313  
   314    def test_iris_classification_datatable_multi_batch(self):
   315      test_pipeline = TestPipeline(is_integration_test=True)
   316      input_type = 'datatable'
   317      output_file_dir = '/tmp'
   318      output_file = '/'.join(
   319          [output_file_dir, str(uuid.uuid4()), 'datatable_multi_batch.txt'])
   320      model_state_path = 'gs://apache-beam-ml/models/xgboost.iris_classifier.json'
   321      extra_opts = {
   322          'input_type': input_type,
   323          'output': output_file,
   324          'model_state': model_state_path,
   325          'split': True
   326      }
   327  
   328      xgboost_iris_classification.run(
   329          test_pipeline.get_full_options_as_args(**extra_opts),
   330          save_main_session=False)
   331      self.assertEqual(FileSystems().exists(output_file), True)
   332  
   333      expected_outputs = EXPECTED_OUTPUT_MULTIPLE_BATCHES
   334  
   335      predicted_outputs = process_outputs(output_file)
   336      self.assertEqual(len(expected_outputs), len(predicted_outputs))
   337  
   338      predictions_dict = {}
   339      for predicted_output in predicted_outputs:
   340        true_label, prediction = predicted_output.split(',')
   341        predictions_dict[true_label] = prediction
   342  
   343      for expected_output in expected_outputs:
   344        true_label, expected_prediction = expected_output.split(',')
   345        self.assertEqual(predictions_dict[true_label], expected_prediction)
   346  
   347  
   348  if __name__ == '__main__':
   349    logging.getLogger().setLevel(logging.DEBUG)
   350    unittest.main()