github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/streaming_wordcount.py

github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/streaming_wordcount.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """A streaming word-counting workflow.
    19  """
    20  
    21  # pytype: skip-file
    22  
    23  import argparse
    24  import logging
    25  
    26  import apache_beam as beam
    27  from apache_beam.examples.wordcount_with_metrics import WordExtractingDoFn
    28  from apache_beam.options.pipeline_options import PipelineOptions
    29  from apache_beam.options.pipeline_options import SetupOptions
    30  from apache_beam.options.pipeline_options import StandardOptions
    31  from apache_beam.transforms import window
    32  
    33  
    34  def run(argv=None, save_main_session=True):
    35    """Build and run the pipeline."""
    36    parser = argparse.ArgumentParser()
    37    parser.add_argument(
    38        '--output_topic',
    39        required=True,
    40        help=(
    41            'Output PubSub topic of the form '
    42            '"projects/<PROJECT>/topics/<TOPIC>".'))
    43    group = parser.add_mutually_exclusive_group(required=True)
    44    group.add_argument(
    45        '--input_topic',
    46        help=(
    47            'Input PubSub topic of the form '
    48            '"projects/<PROJECT>/topics/<TOPIC>".'))
    49    group.add_argument(
    50        '--input_subscription',
    51        help=(
    52            'Input PubSub subscription of the form '
    53            '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."'))
    54    known_args, pipeline_args = parser.parse_known_args(argv)
    55  
    56    # We use the save_main_session option because one or more DoFn's in this
    57    # workflow rely on global context (e.g., a module imported at module level).
    58    pipeline_options = PipelineOptions(pipeline_args)
    59    pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
    60    pipeline_options.view_as(StandardOptions).streaming = True
    61    with beam.Pipeline(options=pipeline_options) as p:
    62  
    63      # Read from PubSub into a PCollection.
    64      if known_args.input_subscription:
    65        messages = (
    66            p
    67            | beam.io.ReadFromPubSub(subscription=known_args.input_subscription).
    68            with_output_types(bytes))
    69      else:
    70        messages = (
    71            p
    72            | beam.io.ReadFromPubSub(
    73                topic=known_args.input_topic).with_output_types(bytes))
    74  
    75      lines = messages | 'decode' >> beam.Map(lambda x: x.decode('utf-8'))
    76  
    77      # Count the occurrences of each word.
    78      def count_ones(word_ones):
    79        (word, ones) = word_ones
    80        return (word, sum(ones))
    81  
    82      counts = (
    83          lines
    84          | 'split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
    85          | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
    86          | beam.WindowInto(window.FixedWindows(15, 0))
    87          | 'group' >> beam.GroupByKey()
    88          | 'count' >> beam.Map(count_ones))
    89  
    90      # Format the counts into a PCollection of strings.
    91      def format_result(word_count):
    92        (word, count) = word_count
    93        return '%s: %d' % (word, count)
    94  
    95      output = (
    96          counts
    97          | 'format' >> beam.Map(format_result)
    98          | 'encode' >>
    99          beam.Map(lambda x: x.encode('utf-8')).with_output_types(bytes))
   100  
   101      # Write to PubSub.
   102      # pylint: disable=expression-not-assigned
   103      output | beam.io.WriteToPubSub(known_args.output_topic)
   104  
   105  
   106  if __name__ == '__main__':
   107    logging.getLogger().setLevel(logging.INFO)
   108    run()