github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/complete/tfidf.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A TF-IDF workflow (term frequency - inverse document frequency).

For an explanation of the TF-IDF algorithm see the following link:
http://en.wikipedia.org/wiki/Tf-idf
"""

# pytype: skip-file

import argparse
import math
import re

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.io.filesystems import FileSystems
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.pvalue import AsSingleton


def read_documents(pipeline, uris):
  """Reads the documents at the provided uris and returns (uri, line) pairs."""
  pcolls = []
  for uri in uris:
    pcolls.append(
        pipeline
        | 'Read: %s' % uri >> ReadFromText(uri)
        | 'WithKey: %s' % uri >> beam.Map(lambda v, uri: (uri, v), uri))
  return pcolls | 'FlattenReadPColls' >> beam.Flatten()


class TfIdf(beam.PTransform):
  """A transform containing a basic TF-IDF pipeline.

  The input consists of KV objects where the key is the document's URI and
  the value is a piece of the document's content.
  The output is a mapping from terms to scores for each document URI.
  """
  def expand(self, uri_to_content):

    # Compute the total number of documents, and prepare a singleton
    # PCollection to use as a side input.
    total_documents = (
        uri_to_content
        | 'GetUris 1' >> beam.Keys()
        | 'GetUniqueUris' >> beam.Distinct()
        | 'CountUris' >> beam.combiners.Count.Globally())

    # Create a collection of pairs mapping a URI to each of the words
    # in the document associated with that URI.

    def split_into_words(uri_line):
      (uri, line) = uri_line
      return [(uri, w.lower()) for w in re.findall(r'[A-Za-z\']+', line)]

    uri_to_words = (
        uri_to_content
        | 'SplitWords' >> beam.FlatMap(split_into_words))

    # Compute a mapping from each word to the total number of documents
    # in which it appears.
    word_to_doc_count = (
        uri_to_words
        | 'GetUniqueWordsPerDoc' >> beam.Distinct()
        | 'GetWords' >> beam.Values()
        | 'CountDocsPerWord' >> beam.combiners.Count.PerElement())

    # Compute a mapping from each URI to the total number of words in the
    # document associated with that URI.
    uri_to_word_total = (
        uri_to_words
        | 'GetUris 2' >> beam.Keys()
        | 'CountWordsInDoc' >> beam.combiners.Count.PerElement())
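
    # A worked example (illustration only, not part of the pipeline): given
    # two documents, 'a.txt' containing "cat cat dog" and 'b.txt' containing
    # "dog", uri_to_words holds ('a.txt', 'cat') twice plus ('a.txt', 'dog')
    # and ('b.txt', 'dog'); word_to_doc_count maps 'cat' -> 1 and 'dog' -> 2;
    # and uri_to_word_total maps 'a.txt' -> 3 and 'b.txt' -> 1.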

    # Count, for each (URI, word) pair, the number of occurrences of that
    # word in the document associated with the URI.
    uri_and_word_to_count = (
        uri_to_words
        | 'CountWord-DocPairs' >> beam.combiners.Count.PerElement())

    # Adjust the above collection, a mapping from (URI, word) pairs to
    # counts, into an isomorphic mapping from URI to (word, count) pairs,
    # to prepare for a join by the URI key.
    def shift_keys(uri_word_count):
      return (uri_word_count[0][0], (uri_word_count[0][1], uri_word_count[1]))

    uri_to_word_and_count = (
        uri_and_word_to_count
        | 'ShiftKeys' >> beam.Map(shift_keys))

    # Perform a CoGroupByKey (a sort of pre-join) on the prepared
    # uri_to_word_total and uri_to_word_and_count, tagged by the strings
    # 'word totals' and 'word counts'. This yields a mapping from URI to a
    # dictionary that maps each tag string to an iterable containing,
    # respectively, the word total for that URI and its (word, count) pairs.
    #
    # A diagram (in which '[]' just means 'iterable'):
    #
    # URI: {'word totals': [count],  # Total words within this URI's document.
    #       'word counts': [(word, count),  # Counts of specific words
    #                       (word, count),  # within this URI's document.
    #                       ...]}
    uri_to_word_and_count_and_total = (
        {
            'word totals': uri_to_word_total,
            'word counts': uri_to_word_and_count
        }
        | 'CoGroupByUri' >> beam.CoGroupByKey())

    # Compute a mapping from each word to a (URI, term frequency) pair for
    # each URI. A word's term frequency for a document is simply the number
    # of times that word occurs in the document divided by the total number
    # of words in the document.

    def compute_term_frequency(uri_count_and_total):
      (uri, count_and_total) = uri_count_and_total
      word_and_count = count_and_total['word counts']
      # The 'word totals' iterable contains exactly one element, the total
      # word count for this URI, which we extract here.
      [word_total] = count_and_total['word totals']
      for word, count in word_and_count:
        yield word, (uri, float(count) / word_total)

    word_to_uri_and_tf = (
        uri_to_word_and_count_and_total
        | 'ComputeTermFrequencies' >> beam.FlatMap(compute_term_frequency))

    # Compute a mapping from each word to its document frequency.
    # A word's document frequency in a corpus is the number of
    # documents in which the word appears divided by the total
    # number of documents in the corpus.
    #
    # This calculation uses a side input: an auxiliary value, computed by
    # the pipeline, that is presented to each invocation of the function we
    # pass to Map(). The second argument to the function (called total;
    # note that the first argument is a tuple) receives the value we listed
    # after the function in Map(). Additional side inputs (and ordinary
    # Python values, too) can be provided to Map and to DoFns in this way.
    def div_word_count_by_total(word_count, total):
      (word, count) = word_count
      return (word, float(count) / total)

    word_to_df = (
        word_to_doc_count
        | 'ComputeDocFrequencies' >> beam.Map(
            div_word_count_by_total, AsSingleton(total_documents)))

    # Join the term frequency and document frequency collections,
    # each keyed on the word.
    word_to_uri_and_tf_and_df = (
        {
            'tf': word_to_uri_and_tf,
            'df': word_to_df
        }
        | 'CoGroupWordsByTf-df' >> beam.CoGroupByKey())
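
    # Continuing the worked example above (illustration only): with two
    # documents in total, 'cat' appears in one document, so its document
    # frequency is 1/2, while 'dog' appears in both, so its document
    # frequency is 2/2 = 1.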

    # Compute a mapping from each word to a (URI, TF-IDF) score for each
    # URI. There are a variety of definitions of the TF-IDF
    # ("term frequency - inverse document frequency") score; here we use a
    # basic version that multiplies the term frequency by the log of the
    # inverse document frequency.

    def compute_tf_idf(word_tf_and_df):
      (word, tf_and_df) = word_tf_and_df
      [docf] = tf_and_df['df']
      for uri, tf in tf_and_df['tf']:
        yield word, (uri, tf * math.log(1 / docf))

    word_to_uri_and_tfidf = (
        word_to_uri_and_tf_and_df
        | 'ComputeTf-idf' >> beam.FlatMap(compute_tf_idf))

    return word_to_uri_and_tfidf


def run(argv=None, save_main_session=True):
  """Main entry point; defines and runs the tfidf pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument('--uris', required=True, help='URIs to process.')
  parser.add_argument(
      '--output', required=True, help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)
  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
  with beam.Pipeline(options=pipeline_options) as p:

    # Read documents specified by the uris command line option.
    metadata_list = FileSystems.match([known_args.uris])[0].metadata_list
    uris = [metadata.path for metadata in metadata_list]
    pcoll = read_documents(p, uris)
    # Compute TF-IDF information for each word.
    output = pcoll | TfIdf()
    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | 'write' >> WriteToText(known_args.output)
    # Exiting the `with` block runs the pipeline and waits for completion.


if __name__ == '__main__':
  run()
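
# Example invocation (a sketch; the bucket and paths below are hypothetical,
# and --uris accepts any file pattern that FileSystems.match understands):
#
#   python -m apache_beam.examples.complete.tfidf \
#       --uris 'gs://my-bucket/corpus/*' \
#       --output gs://my-bucket/results/tfidf
#
# Each output line is a (word, (uri, score)) pair. In the worked example
# above, 'cat' would score 2/3 * log(2) ~= 0.46 for 'a.txt', while 'dog',
# appearing in every document, would score 0 everywhere.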