github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/complete/tfidf_it_test.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
"""End-to-end test for the TF-IDF example."""
    19  
    20  # pytype: skip-file
    21  
    22  import logging
    23  import re
    24  import unittest
    25  import uuid
    26  
    27  import pytest
    28  
    29  from apache_beam.examples.complete import tfidf
    30  from apache_beam.testing.test_pipeline import TestPipeline
    31  from apache_beam.testing.test_utils import create_file
    32  from apache_beam.testing.test_utils import read_files_from_pattern
    33  
    34  EXPECTED_RESULTS = set([
    35      ('ghi', '1.txt', 0.3662040962227032), ('abc', '1.txt', 0.0),
    36      ('abc', '3.txt', 0.0), ('abc', '2.txt', 0.0),
    37      ('def', '1.txt', 0.13515503603605478), ('def', '2.txt', 0.2027325540540822)
    38  ])
    39  
    40  EXPECTED_LINE_RE = r'\(u?\'([a-z]*)\', \(\'.*([0-9]\.txt)\', (.*)\)\)'
    41  
    42  
    43  class TfIdfIT(unittest.TestCase):
    44    @pytest.mark.examples_postcommit
    45    @pytest.mark.sickbay_flink
    46    def test_basics(self):
    47      test_pipeline = TestPipeline(is_integration_test=True)
    48  
    49      # Setup the files with expected content.
    50      temp_location = test_pipeline.get_option('temp_location')
    51      input_folder = '/'.join([temp_location, str(uuid.uuid4())])
    52      create_file('/'.join([input_folder, '1.txt']), 'abc def ghi')
    53      create_file('/'.join([input_folder, '2.txt']), 'abc def')
    54      create_file('/'.join([input_folder, '3.txt']), 'abc')
    55      output = '/'.join([temp_location, str(uuid.uuid4()), 'result'])
    56  
    57      extra_opts = {'uris': '%s/**' % input_folder, 'output': output}
    58      tfidf.run(
    59          test_pipeline.get_full_options_as_args(**extra_opts),
    60          save_main_session=False)
    61  
    62      # Parse result file and compare.
    63      results = []
    64      lines = read_files_from_pattern('%s*' % output).splitlines()
    65      for line in lines:
    66        match = re.search(EXPECTED_LINE_RE, line)
    67        logging.info('Result line: %s', line)
    68        if match is not None:
    69          results.append((match.group(1), match.group(2), float(match.group(3))))
    70      logging.info('Computed results: %s', set(results))
    71      self.assertEqual(set(results), EXPECTED_RESULTS)
    72  
    73  
    74  if __name__ == '__main__':
    75    logging.getLogger().setLevel(logging.INFO)
    76    unittest.main()