#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A workflow emitting the top k most common words for each prefix."""

# pytype: skip-file

import argparse
import logging
import re

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions


def run(argv=None):
    """Parse the command line and assemble the autocomplete pipeline.

    Args:
      argv: argument list handed to ``parse_known_args``. ``--input`` and
        ``--output`` are consumed here (both are required); every other
        argument is forwarded to ``PipelineOptions``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', required=True, help='Input file to process.')
    parser.add_argument(
        '--output', required=True, help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    def format_result(prefix_candidates):
        """Render one ``(prefix, candidates)`` pair as a line of text."""
        key, top_words = prefix_candidates
        return '%s: %s' % (key, top_words)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=options) as p:
        # Each stage is bound to a name so the data flow reads top to bottom.
        lines = p | 'read' >> ReadFromText(known_args.input)
        # A "word" is any run of ASCII letters and apostrophes.
        tokens = lines | 'split' >> beam.FlatMap(
            lambda line: re.findall(r'[A-Za-z\']+', line))
        ranked = tokens | 'TopPerPrefix' >> TopPerPrefix(5)
        rendered = ranked | 'format' >> beam.Map(format_result)
        # The result of the write step is not needed; bind it to a throwaway.
        _ = rendered | 'write' >> WriteToText(known_args.output)


class TopPerPrefix(beam.PTransform):
    """Composite transform: for each prefix, keep the ``count`` largest
    ``(count, word)`` pairs among the words that start with it."""
    def __init__(self, count):
        """Store how many candidates to keep per prefix.

        Args:
          count: number of entries passed to ``Top.LargestPerKey``.
        """
        # TODO(BEAM-6158): Revert the workaround once we can pickle super() on py3.
        # super().__init__()
        beam.PTransform.__init__(self)
        self._count = count

    def expand(self, words):
        """Compute the most common words for each possible prefix.

        Args:
          words: a PCollection of strings

        Returns:
          A PCollection of most common words with each prefix, in the form
          (prefix, [(count, word), (count, word), ...])
        """
        # Tally the words, then fan every tallied word out to all of its
        # prefixes, then keep only the largest entries under each prefix.
        tallied = words | beam.combiners.Count.PerElement()
        by_prefix = tallied | beam.FlatMap(extract_prefixes)
        return by_prefix | beam.combiners.Top.LargestPerKey(self._count)


def extract_prefixes(element):
    """Yield ``(prefix, (count, word))`` for every non-empty prefix of a word.

    Args:
      element: a ``(word, count)`` pair.

    Yields:
      One pair per prefix length, from the first character up to the whole
      word. An empty word yields nothing.
    """
    word, count = element
    # ``end`` runs 1..len(word); the character itself is not needed.
    for end, _ in enumerate(word, start=1):
        yield word[:end], (count, word)


if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    run()