github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/streaming_wordcount_debugging.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """A streaming wordcount example with debugging capabilities.
    19  
It demonstrates the use of logging and assert_that in streaming mode.
    21  
    22  This workflow only works with the DirectRunner
    23  (https://github.com/apache/beam/issues/18709).
    24  
    25  Usage:
    26  python streaming_wordcount_debugging.py
    27  --input_topic projects/$PROJECT_ID/topics/$PUBSUB_INPUT_TOPIC
    28  --output_topic projects/$PROJECT_ID/topics/$PUBSUB_OUTPUT_TOPIC
    29  --streaming
    30  
    31  To publish messages:
    32  gcloud alpha pubsub topics publish $PUBSUB_INPUT_TOPIC --message '210 213 151'
    33  
    34  """
    35  
    36  # pytype: skip-file
    37  
    38  import argparse
    39  import logging
    40  import re
    41  import time
    42  
    43  import apache_beam as beam
    44  from apache_beam.examples.wordcount import WordExtractingDoFn
    45  from apache_beam.options.pipeline_options import PipelineOptions
    46  from apache_beam.options.pipeline_options import SetupOptions
    47  from apache_beam.options.pipeline_options import StandardOptions
    48  from apache_beam.testing.util import assert_that
    49  from apache_beam.testing.util import equal_to_per_window
    50  from apache_beam.transforms import window
    51  from apache_beam.transforms.core import ParDo
    52  
    53  
    54  class PrintFn(beam.DoFn):
    55    """A DoFn that prints label, element, its window, and its timstamp. """
    56    def __init__(self, label):
    57      self.label = label
    58  
    59    def process(
    60        self,
    61        element,
    62        timestamp=beam.DoFn.TimestampParam,
    63        window=beam.DoFn.WindowParam):
    64      # Log at INFO level each element processed.
    65      logging.info('[%s]: %s %s %s', self.label, element, window, timestamp)
    66      yield element
    67  
    68  
    69  class AddTimestampFn(beam.DoFn):
    70    """A DoFn that attaches timestamps to its elements.
    71  
    72    It takes an element and attaches a timestamp of its same value for integer
    73    and current timestamp in other cases.
    74  
    75    For example, 120 and Sometext will result in:
    76    (120, Timestamp(120) and (Sometext, Timestamp(1234567890).
    77    """
    78    def process(self, element):
    79      logging.info('Adding timestamp to: %s', element)
    80      try:
    81        timestamp = int(element)
    82      except ValueError:
    83        timestamp = int(time.time())
    84      yield beam.window.TimestampedValue(element, timestamp)
    85  
    86  
    87  def run(argv=None, save_main_session=True):
    88    """Build and run the pipeline."""
    89    parser = argparse.ArgumentParser()
    90    parser.add_argument(
    91        '--output_topic',
    92        required=True,
    93        help=(
    94            'Output PubSub topic of the form '
    95            '"projects/<PROJECT>/topic/<TOPIC>".'))
    96    group = parser.add_mutually_exclusive_group(required=True)
    97    group.add_argument(
    98        '--input_topic',
    99        help=(
   100            'Input PubSub topic of the form '
   101            '"projects/<PROJECT>/topics/<TOPIC>".'))
   102    group.add_argument(
   103        '--input_subscription',
   104        help=(
   105            'Input PubSub subscription of the form '
   106            '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."'))
   107    known_args, pipeline_args = parser.parse_known_args(argv)
   108  
   109    # We use the save_main_session option because one or more DoFn's in this
   110    # workflow rely on global context (e.g., a module imported at module level).
   111    pipeline_options = PipelineOptions(pipeline_args)
   112    pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
   113    pipeline_options.view_as(StandardOptions).streaming = True
   114    with beam.Pipeline(options=pipeline_options) as p:
   115  
   116      # Read from PubSub into a PCollection.
   117      if known_args.input_subscription:
   118        messages = p | beam.io.ReadFromPubSub(
   119            subscription=known_args.input_subscription)
   120      else:
   121        messages = p | beam.io.ReadFromPubSub(topic=known_args.input_topic)
   122  
   123      lines = messages | 'decode' >> beam.Map(lambda x: x.decode('utf-8'))
   124  
   125      # Count the occurrences of each word.
   126      def count_ones(word_ones):
   127        (word, ones) = word_ones
   128        return (word, sum(ones))
   129  
   130      counts = (
   131          lines
   132          | 'Split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
   133          | 'AddTimestampFn' >> beam.ParDo(AddTimestampFn())
   134          | 'After AddTimestampFn' >> ParDo(PrintFn('After AddTimestampFn'))
   135          | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
   136          | beam.WindowInto(window.FixedWindows(5, 0))
   137          | 'GroupByKey' >> beam.GroupByKey()
   138          | 'CountOnes' >> beam.Map(count_ones))
   139  
   140      # Format the counts into a PCollection of strings.
   141      def format_result(word_count):
   142        (word, count) = word_count
   143        return '%s: %d' % (word, count)
   144  
   145      output = (
   146          counts
   147          | 'format' >> beam.Map(format_result)
   148          | 'encode' >>
   149          beam.Map(lambda x: x.encode('utf-8')).with_output_types(bytes))
   150  
   151      # Write to PubSub.
   152      # pylint: disable=expression-not-assigned
   153      output | beam.io.WriteToPubSub(known_args.output_topic)
   154  
   155      def check_gbk_format():
   156        # A matcher that checks that the output of GBK is of the form word: count.
   157        def matcher(elements):
   158          # pylint: disable=unused-variable
   159          actual_elements_in_window, window = elements
   160          for elm in actual_elements_in_window:
   161            assert re.match(r'\S+:\s+\d+', elm.decode('utf-8')) is not None
   162  
   163        return matcher
   164  
   165      # Check that the format of the output is correct.
   166      assert_that(
   167          output,
   168          check_gbk_format(),
   169          use_global_window=False,
   170          label='Assert word:count format.')
   171  
   172      # Check also that elements are ouput in the right window.
   173      # This expects exactly 1 occurrence of any subset of the elements
   174      # 150, 151, 152, 153, 154 in the window [150, 155)
   175      # or exactly 1 occurrence of any subset of the elements
   176      # 210, 211, 212, 213, 214 in the window [210, 215).
   177      first_window_val = [
   178          '150: 1',
   179          '151: 1',
   180          '152: 1',
   181          '153: 1',
   182          '154: 1',
   183      ]
   184      second_window_val = [
   185          '210: 1',
   186          '211: 1',
   187          '212: 1',
   188          '213: 1',
   189          '214: 1',
   190      ]
   191      expected_window_to_elements = {
   192          window.IntervalWindow(150, 155): [
   193              x.encode('utf-8') for x in first_window_val
   194          ],
   195          window.IntervalWindow(210, 215): [
   196              x.encode('utf-8') for x in second_window_val
   197          ],
   198      }
   199  
   200      # To pass, publish numbers in [150-155) or [210-215) with no repeats.
   201      # To fail, publish a repeated number in the range above range.
   202      # For example: '210 213 151 213'
   203      assert_that(
   204          output,
   205          equal_to_per_window(expected_window_to_elements),
   206          use_global_window=False,
   207          label='Assert correct streaming windowing.')
   208  
   209  
   210  if __name__ == '__main__':
   211    logging.getLogger().setLevel(logging.INFO)
   212    run()