# Source: github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/flink/flink_streaming_impulse.py
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A streaming workflow that uses a synthetic streaming source.

This can only be used with the Flink portable runner.
"""

# pytype: skip-file

import argparse
import logging
import sys

import apache_beam as beam
from apache_beam.io.flink.flink_streaming_impulse_source import FlinkStreamingImpulseSource
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.transforms import window
from apache_beam.transforms.trigger import AccumulationMode
from apache_beam.transforms.trigger import AfterProcessingTime
from apache_beam.transforms.trigger import Repeatedly


def split(s):
  """Split a ``"<key>-<number>"`` string into ``(key, number)``.

  The string is split on every ``"-"``; the first piece is returned as-is and
  the second piece is converted with ``int``. Any further pieces are ignored.

  Raises:
    IndexError: if ``s`` contains no ``"-"``.
    ValueError: if the second piece is not a valid integer literal.
  """
  a = s.split("-")
  return a[0], int(a[1])


def count(x):
  """Return ``(key, total)`` for a ``(key, iterable_of_numbers)`` pair."""
  return x[0], sum(x[1])


def apply_timestamp(element):
  """Yield ``element`` wrapped in a TimestampedValue stamped with time.time()."""
  # Function-scope import kept from the original; presumably this keeps the
  # function self-contained when it is serialized and shipped to workers.
  import time
  yield window.TimestampedValue(element, time.time())


def run(argv=None):
  """Build and run the pipeline.

  Args:
    argv: optional list of extra command-line arguments. They are appended
      after the built-in defaults (PortableRunner, job endpoint
      localhost:8099, streaming mode). ``--count`` and ``--interval_ms`` are
      consumed here; everything else is passed on to ``PipelineOptions``.
  """
  args = [
      "--runner=PortableRunner", "--job_endpoint=localhost:8099", "--streaming"
  ]
  if argv:
    args.extend(argv)

  parser = argparse.ArgumentParser()
  # type=int keeps values supplied on the command line consistent with the
  # integer defaults (argparse would otherwise hand them over as strings) and
  # makes argparse reject non-numeric input up front.
  parser.add_argument(
      '--count',
      dest='count',
      type=int,
      default=0,
      help='Number of triggers to generate '
      '(0 means emit forever).')
  parser.add_argument(
      '--interval_ms',
      dest='interval_ms',
      type=int,
      default=500,
      help='Interval between records per parallel '
      'Flink subtask.')

  known_args, pipeline_args = parser.parse_known_args(args)

  pipeline_options = PipelineOptions(pipeline_args)

  with beam.Pipeline(options=pipeline_options) as p:

    messages = (
        p | FlinkStreamingImpulseSource().set_message_count(
            known_args.count).set_interval_ms(known_args.interval_ms))

    # Every message is mapped to the same ('', 1) pair, so after grouping the
    # 'count' step sums how many messages each trigger firing delivered.
    # NOTE(review): AfterProcessingTime's delay appears to be expressed in
    # seconds in the Python SDK, which would make 5 * 1000 roughly 83 minutes
    # rather than 5 seconds -- confirm the intended firing interval.
    _ = (
        messages | 'decode' >> beam.Map(lambda x: ('', 1))
        | 'window' >> beam.WindowInto(
            window.GlobalWindows(),
            trigger=Repeatedly(AfterProcessingTime(5 * 1000)),
            accumulation_mode=AccumulationMode.DISCARDING)
        | 'group' >> beam.GroupByKey()
        | 'count' >> beam.Map(count)
        # Lazy %-style arguments: the message is only formatted if the record
        # is actually emitted.
        | 'log' >> beam.Map(lambda x: logging.info("%d", x[1])))


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run(sys.argv[1:])