github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/complete/autocomplete.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """A workflow emitting the top k most common words for each prefix."""
    19  
    20  # pytype: skip-file
    21  
    22  import argparse
    23  import logging
    24  import re
    25  
    26  import apache_beam as beam
    27  from apache_beam.io import ReadFromText
    28  from apache_beam.io import WriteToText
    29  from apache_beam.options.pipeline_options import PipelineOptions
    30  from apache_beam.options.pipeline_options import SetupOptions
    31  
    32  
    33  def run(argv=None):
    34  
    35    parser = argparse.ArgumentParser()
    36    parser.add_argument('--input', required=True, help='Input file to process.')
    37    parser.add_argument(
    38        '--output', required=True, help='Output file to write results to.')
    39    known_args, pipeline_args = parser.parse_known_args(argv)
    40    # We use the save_main_session option because one or more DoFn's in this
    41    # workflow rely on global context (e.g., a module imported at module level).
    42    pipeline_options = PipelineOptions(pipeline_args)
    43    pipeline_options.view_as(SetupOptions).save_main_session = True
    44    with beam.Pipeline(options=pipeline_options) as p:
    45  
    46      def format_result(prefix_candidates):
    47        (prefix, candidates) = prefix_candidates
    48        return '%s: %s' % (prefix, candidates)
    49  
    50      (  # pylint: disable=expression-not-assigned
    51          p
    52          | 'read' >> ReadFromText(known_args.input)
    53          | 'split' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
    54          | 'TopPerPrefix' >> TopPerPrefix(5)
    55          | 'format' >> beam.Map(format_result)
    56          | 'write' >> WriteToText(known_args.output))
    57  
    58  
    59  class TopPerPrefix(beam.PTransform):
    60    def __init__(self, count):
    61      # TODO(BEAM-6158): Revert the workaround once we can pickle super() on py3.
    62      # super().__init__()
    63      beam.PTransform.__init__(self)
    64      self._count = count
    65  
    66    def expand(self, words):
    67      """Compute the most common words for each possible prefixes.
    68  
    69      Args:
    70        words: a PCollection of strings
    71  
    72      Returns:
    73        A PCollection of most common words with each prefix, in the form
    74            (prefix, [(count, word), (count, word), ...])
    75      """
    76      return (
    77          words
    78          | beam.combiners.Count.PerElement()
    79          | beam.FlatMap(extract_prefixes)
    80          | beam.combiners.Top.LargestPerKey(self._count))
    81  
    82  
    83  def extract_prefixes(element):
    84    word, count = element
    85    for k in range(1, len(word) + 1):
    86      prefix = word[:k]
    87      yield prefix, (count, word)
    88  
    89  
    90  if __name__ == '__main__':
    91    logging.getLogger().setLevel(logging.INFO)
    92    run()