#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A streaming wordcount example with debugging capabilities.

It demonstrates the use of logging and assert_that in streaming mode.

This workflow only works with the DirectRunner
(https://github.com/apache/beam/issues/18709).

Usage:
  python streaming_wordcount_debugging.py
  --input_topic projects/$PROJECT_ID/topics/$PUBSUB_INPUT_TOPIC
  --output_topic projects/$PROJECT_ID/topics/$PUBSUB_OUTPUT_TOPIC
  --streaming

To publish messages:
  gcloud alpha pubsub topics publish $PUBSUB_INPUT_TOPIC --message '210 213 151'

"""

# pytype: skip-file

import argparse
import logging
import re
import time

import apache_beam as beam
from apache_beam.examples.wordcount import WordExtractingDoFn
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to_per_window
from apache_beam.transforms import window
from apache_beam.transforms.core import ParDo


class PrintFn(beam.DoFn):
    """A DoFn that logs a label, the element, its window, and its timestamp.

    Elements are passed through unchanged, so the transform can be inserted
    anywhere in a pipeline purely for inspection.
    """
    def __init__(self, label):
        """Store the label that prefixes every log line of this instance."""
        self.label = label

    def process(
        self,
        element,
        timestamp=beam.DoFn.TimestampParam,
        window=beam.DoFn.WindowParam):
        """Log the element with its window and timestamp, then re-emit it.

        Args:
          element: The element being processed.
          timestamp: Declared with the `beam.DoFn.TimestampParam` default.
          window: Declared with the `beam.DoFn.WindowParam` default.

        Yields:
          The unmodified input element.
        """
        # Log at INFO level each element processed.
        logging.info('[%s]: %s %s %s', self.label, element, window, timestamp)
        yield element


class AddTimestampFn(beam.DoFn):
    """A DoFn that attaches timestamps to its elements.

    It takes an element and attaches a timestamp equal to the element's own
    value when the element parses as an integer, and the current time in all
    other cases.

    For example, 120 and Sometext will result in:
    (120, Timestamp(120)) and (Sometext, Timestamp(1234567890)).
    """
    def process(self, element):
        """Wrap the element in a TimestampedValue.

        Args:
          element: The element to timestamp.

        Yields:
          A `beam.window.TimestampedValue` holding the original element.
        """
        logging.info('Adding timestamp to: %s', element)
        try:
            timestamp = int(element)
        except ValueError:
            # Not an integer literal: fall back to the current wall-clock time.
            timestamp = int(time.time())
        yield beam.window.TimestampedValue(element, timestamp)


def run(argv=None, save_main_session=True):
    """Build and run the pipeline.

    Args:
      argv: Command-line arguments. Arguments not recognised by this
        function's parser are forwarded to `PipelineOptions`.
      save_main_session: Value assigned to
        `SetupOptions.save_main_session`.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--output_topic',
        required=True,
        help=(
            'Output PubSub topic of the form '
            '"projects/<PROJECT>/topics/<TOPIC>".'))
    # Exactly one input source (topic or subscription) must be supplied.
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        '--input_topic',
        help=(
            'Input PubSub topic of the form '
            '"projects/<PROJECT>/topics/<TOPIC>".'))
    group.add_argument(
        '--input_subscription',
        help=(
            'Input PubSub subscription of the form '
            '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>".'))
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module
    # level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
    pipeline_options.view_as(StandardOptions).streaming = True
    with beam.Pipeline(options=pipeline_options) as p:

        # Read from PubSub into a PCollection.
        if known_args.input_subscription:
            messages = p | beam.io.ReadFromPubSub(
                subscription=known_args.input_subscription)
        else:
            messages = p | beam.io.ReadFromPubSub(topic=known_args.input_topic)

        lines = messages | 'decode' >> beam.Map(lambda x: x.decode('utf-8'))

        # Count the occurrences of each word.
        def count_ones(word_ones):
            """Collapse a (word, iterable of ones) pair into (word, total)."""
            (word, ones) = word_ones
            return (word, sum(ones))

        counts = (
            lines
            | 'Split' >> (
                beam.ParDo(WordExtractingDoFn()).with_output_types(str))
            | 'AddTimestampFn' >> beam.ParDo(AddTimestampFn())
            | 'After AddTimestampFn' >> ParDo(PrintFn('After AddTimestampFn'))
            | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
            | beam.WindowInto(window.FixedWindows(5, 0))
            | 'GroupByKey' >> beam.GroupByKey()
            | 'CountOnes' >> beam.Map(count_ones))

        # Format the counts into a PCollection of strings.
        def format_result(word_count):
            """Render a (word, count) pair as the string 'word: count'."""
            (word, count) = word_count
            return '%s: %d' % (word, count)

        output = (
            counts
            | 'format' >> beam.Map(format_result)
            | 'encode' >>
            beam.Map(lambda x: x.encode('utf-8')).with_output_types(bytes))

        # Write to PubSub.
        # pylint: disable=expression-not-assigned
        output | beam.io.WriteToPubSub(known_args.output_topic)

        def check_gbk_format():
            """Return a matcher checking that outputs look like 'word: count'."""
            def matcher(elements):
                # Each input is an (elements_in_window, window) pair; the
                # window is irrelevant for the format check, so it is
                # discarded rather than bound to a name that would shadow
                # the imported `window` module.
                actual_elements_in_window, _ = elements
                for elm in actual_elements_in_window:
                    assert re.match(
                        r'\S+:\s+\d+', elm.decode('utf-8')) is not None

            return matcher

        # Check that the format of the output is correct.
        assert_that(
            output,
            check_gbk_format(),
            use_global_window=False,
            label='Assert word:count format.')

        # Check also that elements are output in the right window.
        # This expects exactly 1 occurrence of any subset of the elements
        # 150, 151, 152, 153, 154 in the window [150, 155)
        # or exactly 1 occurrence of any subset of the elements
        # 210, 211, 212, 213, 214 in the window [210, 215).
        first_window_val = [
            '150: 1',
            '151: 1',
            '152: 1',
            '153: 1',
            '154: 1',
        ]
        second_window_val = [
            '210: 1',
            '211: 1',
            '212: 1',
            '213: 1',
            '214: 1',
        ]
        expected_window_to_elements = {
            window.IntervalWindow(150, 155): [
                x.encode('utf-8') for x in first_window_val
            ],
            window.IntervalWindow(210, 215): [
                x.encode('utf-8') for x in second_window_val
            ],
        }

        # To pass, publish numbers in [150-155) or [210-215) with no repeats.
        # To fail, publish a repeated number from the ranges above.
        # For example: '210 213 151 213'
        assert_that(
            output,
            equal_to_per_window(expected_window_to_elements),
            use_global_window=False,
            label='Assert correct streaming windowing.')


if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    run()