github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/complete/tfidf_test.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """Test for the TF-IDF example."""
    19  
    20  # pytype: skip-file
    21  
    22  import logging
    23  import unittest
    24  
    25  import apache_beam as beam
    26  from apache_beam.examples.complete import tfidf
    27  from apache_beam.testing.test_pipeline import TestPipeline
    28  from apache_beam.testing.util import assert_that
    29  from apache_beam.testing.util import equal_to
    30  
    31  EXPECTED_RESULTS = set([
    32      ('ghi', '1.txt', 0.3662040962227032), ('abc', '1.txt', 0.0),
    33      ('abc', '3.txt', 0.0), ('abc', '2.txt', 0.0),
    34      ('def', '1.txt', 0.13515503603605478), ('def', '2.txt', 0.2027325540540822)
    35  ])
    36  
    37  
    38  class TfIdfTest(unittest.TestCase):
    39    def test_tfidf_transform(self):
    40      with TestPipeline() as p:
    41  
    42        def re_key(word_uri_tfidf):
    43          (word, (uri, tfidf)) = word_uri_tfidf
    44          return (word, uri, tfidf)
    45  
    46        uri_to_line = p | 'create sample' >> beam.Create(
    47            [('1.txt', 'abc def ghi'), ('2.txt', 'abc def'), ('3.txt', 'abc')])
    48        result = (uri_to_line | tfidf.TfIdf() | beam.Map(re_key))
    49        assert_that(result, equal_to(EXPECTED_RESULTS))
    50        # Run the pipeline. Note that the assert_that above adds to the pipeline
    51        # a check that the result PCollection contains expected values.
    52        # To actually trigger the check the pipeline must be run (e.g. by
    53        # exiting the with context).
    54  
    55  
    56  if __name__ == '__main__':
    57    logging.getLogger().setLevel(logging.INFO)
    58    unittest.main()