github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/complete/tfidf.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A TF-IDF workflow (term frequency - inverse document frequency).

For an explanation of the TF-IDF algorithm see the following link:
http://en.wikipedia.org/wiki/Tf-idf
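
Example invocation (the paths here are illustrative):

  python tfidf.py --uris=/path/to/docs/* --output=/path/to/output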
"""

# pytype: skip-file

import argparse
import math
import re

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.io.filesystems import FileSystems
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.pvalue import AsSingleton


def read_documents(pipeline, uris):
  """Reads the documents at the provided uris and returns (uri, line) pairs."""
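  # For example, with uris ['a.txt', 'b.txt'] the result contains elements
  # such as ('a.txt', <a line of a.txt>) and ('b.txt', <a line of b.txt>).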
  pcolls = []
  for uri in uris:
    pcolls.append(
        pipeline
        | 'Read: %s' % uri >> ReadFromText(uri)
        | 'WithKey: %s' % uri >> beam.Map(lambda v, uri: (uri, v), uri))
  return pcolls | 'FlattenReadPColls' >> beam.Flatten()


class TfIdf(beam.PTransform):
  """A transform containing a basic TF-IDF pipeline.

  The input consists of KV objects where the key is the document's URI and
  the value is a piece of the document's content.
  The output is a mapping from terms to scores for each document URI.
  """
  def expand(self, uri_to_content):

    # Compute the total number of documents, and prepare a singleton
    # PCollection to use as side input.
    total_documents = (
        uri_to_content
        | 'GetUris 1' >> beam.Keys()
        | 'GetUniqueUris' >> beam.Distinct()
        | 'CountUris' >> beam.combiners.Count.Globally())

    # Create a collection of pairs mapping a URI to each of the words
    # in the document associated with that URI.

    def split_into_words(uri_line):
      (uri, line) = uri_line
      return [(uri, w.lower()) for w in re.findall(r'[A-Za-z\']+', line)]

    uri_to_words = (
        uri_to_content
        | 'SplitWords' >> beam.FlatMap(split_into_words))
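    # For example, the element ('doc.txt', 'Hello, World!') produces
    # ('doc.txt', 'hello') and ('doc.txt', 'world').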

    # Compute a mapping from each word to the total number of documents
    # in which it appears.
    word_to_doc_count = (
        uri_to_words
        | 'GetUniqueWordsPerDoc' >> beam.Distinct()
        | 'GetWords' >> beam.Values()
        | 'CountDocsPerWord' >> beam.combiners.Count.PerElement())
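    # Each element is a (word, document count) pair, e.g. ('cat', 2) when
    # 'cat' appears in two of the input documents.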

    # Compute a mapping from each URI to the total number of words in the
    # document associated with that URI.
    uri_to_word_total = (
        uri_to_words
        | 'GetUris 2' >> beam.Keys()
        | 'CountWordsInDoc' >> beam.combiners.Count.PerElement())
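    # Each element is a (uri, total word count) pair, e.g. ('doc.txt', 100).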

    # Count, for each (URI, word) pair, the number of occurrences of that word
    # in the document associated with the URI.
    uri_and_word_to_count = (
        uri_to_words
        | 'CountWord-DocPairs' >> beam.combiners.Count.PerElement())

    # Adjust the above mapping from (URI, word) pairs to counts into an
    # isomorphic mapping from URI to (word, count) pairs, to prepare for a
    # join by the URI key.
    def shift_keys(uri_word_count):
      return (uri_word_count[0][0], (uri_word_count[0][1], uri_word_count[1]))
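    # For example, (('doc.txt', 'cat'), 3) becomes ('doc.txt', ('cat', 3)).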

    uri_to_word_and_count = (
        uri_and_word_to_count
        | 'ShiftKeys' >> beam.Map(shift_keys))

    # Perform a CoGroupByKey (a sort of pre-join) on the prepared
    # uri_to_word_total and uri_to_word_and_count collections, tagged by the
    # strings 'word totals' and 'word counts'. This yields a mapping from each
    # URI to a dictionary that maps those tag strings to iterables holding,
    # respectively, the word total for that URI and its (word, count) pairs.
    #
    # A diagram (in which '[]' just means 'iterable'):
    #
    #   URI: {'word totals': [count],  # Total words within this URI's document.
    #         'word counts': [(word, count),  # Counts of specific words
    #                         (word, count),  # within this URI's document.
    #                         ... ]}
    uri_to_word_and_count_and_total = (
        {'word totals': uri_to_word_total, 'word counts': uri_to_word_and_count}
        | 'CoGroupByUri' >> beam.CoGroupByKey())

    # Compute a mapping from each word to a (URI, term frequency) pair for each
    # URI. A word's term frequency for a document is simply the number of times
    # that word occurs in the document divided by the total number of words in
    # the document.

    def compute_term_frequency(uri_count_and_total):
      (uri, count_and_total) = uri_count_and_total
      word_and_count = count_and_total['word counts']
      # The 'word totals' iterable contains exactly one element, which we
      # extract here.
      [word_total] = count_and_total['word totals']
      for word, count in word_and_count:
        yield word, (uri, float(count) / word_total)
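    # For example, a 100-word document in which 'cat' occurs three times
    # yields ('cat', (uri, 0.03)).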

    word_to_uri_and_tf = (
        uri_to_word_and_count_and_total
        | 'ComputeTermFrequencies' >> beam.FlatMap(compute_term_frequency))

    # Compute a mapping from each word to its document frequency.
    # A word's document frequency in a corpus is the number of
    # documents in which the word appears divided by the total
    # number of documents in the corpus.
    #
    # This calculation uses a side input, a pipeline-computed auxiliary value
    # presented to each invocation of our Map callable. The second argument to
    # the function (called total; note that the first argument is a tuple)
    # receives the value we listed after the callable in Map(). Additional side
    # inputs (and ordinary Python values, too) can be provided to MapFns and
    # DoFns in this way.
    def div_word_count_by_total(word_count, total):
      (word, count) = word_count
      return (word, float(count) / total)
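    # For example, with 10 documents in total and 'cat' appearing in 4 of
    # them, this yields ('cat', 0.4).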

    word_to_df = (
        word_to_doc_count
        | 'ComputeDocFrequencies' >> beam.Map(
            div_word_count_by_total, AsSingleton(total_documents)))

    # Join the term frequency and document frequency collections,
    # each keyed on the word.
    word_to_uri_and_tf_and_df = (
        {'tf': word_to_uri_and_tf, 'df': word_to_df}
        | 'CoGroupWordsByTf-df' >> beam.CoGroupByKey())

    # Compute a mapping from each word to a (URI, TF-IDF) score for each URI.
    # There are a variety of definitions of TF-IDF
    # ("term frequency - inverse document frequency") score; here we use a
    # basic version that multiplies the term frequency by the logarithm of
    # the inverse document frequency.

    def compute_tf_idf(word_tf_and_df):
      (word, tf_and_df) = word_tf_and_df
      # The 'df' iterable contains exactly one element, which we extract here.
      [docf] = tf_and_df['df']
      for uri, tf in tf_and_df['tf']:
        yield word, (uri, tf * math.log(1 / docf))
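    # For example, tf = 0.03 and docf = 0.4 give a score of
    # 0.03 * math.log(1 / 0.4), roughly 0.0275.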

    word_to_uri_and_tfidf = (
        word_to_uri_and_tf_and_df
        | 'ComputeTf-idf' >> beam.FlatMap(compute_tf_idf))

    return word_to_uri_and_tfidf


def run(argv=None, save_main_session=True):
  """Main entry point; defines and runs the TF-IDF pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument('--uris', required=True, help='URIs to process.')
  parser.add_argument(
      '--output', required=True, help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)
  # We use the save_main_session option because one or more DoFns in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
  with beam.Pipeline(options=pipeline_options) as p:

    # Read documents specified by the --uris command-line option.
    metadata_list = FileSystems.match([known_args.uris])[0].metadata_list
    uris = [metadata.path for metadata in metadata_list]
    pcoll = read_documents(p, uris)
    # Compute TF-IDF information for each word.
    output = pcoll | TfIdf()
    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | 'write' >> WriteToText(known_args.output)
    # Exiting the 'with' block runs the pipeline and waits until it completes.


if __name__ == '__main__':
  run()