# Source: github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/streaming_wordcount.py
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A streaming word-counting workflow.

Reads text messages from a PubSub topic or subscription, counts how often each
word occurs within fixed windows, and publishes the formatted counts to an
output PubSub topic.
"""

# pytype: skip-file

import argparse
import logging

import apache_beam as beam
from apache_beam.examples.wordcount_with_metrics import WordExtractingDoFn
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.transforms import window


def run(argv=None, save_main_session=True):
  """Build and run the pipeline.

  Args:
    argv: Command-line arguments to parse. ``--output_topic`` is required, and
      exactly one of ``--input_topic`` / ``--input_subscription`` must be
      given. Arguments not recognized by this parser are forwarded to
      ``PipelineOptions``. When ``None``, argparse falls back to
      ``sys.argv[1:]``.
    save_main_session: Value assigned to ``SetupOptions.save_main_session`` on
      the pipeline options.

  Raises:
    SystemExit: Raised by argparse when the required arguments are missing or
      both input sources are supplied.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--output_topic',
      required=True,
      help=(
          'Output PubSub topic of the form '
          '"projects/<PROJECT>/topics/<TOPIC>".'))
  # Exactly one input source must be chosen: a topic or a subscription.
  group = parser.add_mutually_exclusive_group(required=True)
  group.add_argument(
      '--input_topic',
      help=(
          'Input PubSub topic of the form '
          '"projects/<PROJECT>/topics/<TOPIC>".'))
  group.add_argument(
      '--input_subscription',
      help=(
          'Input PubSub subscription of the form '
          # The sentence period belongs after the closing quote; inside the
          # quotes it reads as part of the subscription path.
          '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>".'))
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
  # This example always runs in streaming mode, whatever argv says.
  pipeline_options.view_as(StandardOptions).streaming = True
  with beam.Pipeline(options=pipeline_options) as p:

    # Read from PubSub into a PCollection.
    if known_args.input_subscription:
      messages = (
          p
          | beam.io.ReadFromPubSub(subscription=known_args.input_subscription).
          with_output_types(bytes))
    else:
      messages = (
          p
          | beam.io.ReadFromPubSub(
              topic=known_args.input_topic).with_output_types(bytes))

    # Message payloads are declared as bytes above; decode them to text
    # before splitting into words.
    lines = messages | 'decode' >> beam.Map(lambda x: x.decode('utf-8'))

    # Count the occurrences of each word.
    def count_ones(word_ones):
      """Collapse a grouped (word, ones) pair into (word, total)."""
      (word, ones) = word_ones
      return (word, sum(ones))

    counts = (
        lines
        | 'split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
        | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
        # Window before grouping so that GroupByKey operates per window
        # (FixedWindows with size 15 and offset 0).
        | beam.WindowInto(window.FixedWindows(15, 0))
        | 'group' >> beam.GroupByKey()
        | 'count' >> beam.Map(count_ones))

    # Format the counts into a PCollection of strings.
    def format_result(word_count):
      """Render a (word, count) pair as the text 'word: count'."""
      (word, count) = word_count
      return '%s: %d' % (word, count)

    output = (
        counts
        | 'format' >> beam.Map(format_result)
        # Encode back to bytes, the element type declared for the write below.
        | 'encode' >>
        beam.Map(lambda x: x.encode('utf-8')).with_output_types(bytes))

    # Write to PubSub.
    # pylint: disable=expression-not-assigned
    output | beam.io.WriteToPubSub(known_args.output_topic)


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()