github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/examples/flink/flink_streaming_impulse.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """A streaming workflow that uses a synthetic streaming source.
    19  
    20  This can only be used with the Flink portable runner.
    21  """
    22  
    23  # pytype: skip-file
    24  
    25  import argparse
    26  import logging
    27  import sys
    28  
    29  import apache_beam as beam
    30  from apache_beam.io.flink.flink_streaming_impulse_source import FlinkStreamingImpulseSource
    31  from apache_beam.options.pipeline_options import PipelineOptions
    32  from apache_beam.transforms import window
    33  from apache_beam.transforms.trigger import AccumulationMode
    34  from apache_beam.transforms.trigger import AfterProcessingTime
    35  from apache_beam.transforms.trigger import Repeatedly
    36  
    37  
    38  def split(s):
    39    a = s.split("-")
    40    return a[0], int(a[1])
    41  
    42  
    43  def count(x):
    44    return x[0], sum(x[1])
    45  
    46  
    47  def apply_timestamp(element):
    48    import time
    49    yield window.TimestampedValue(element, time.time())
    50  
    51  
    52  def run(argv=None):
    53    """Build and run the pipeline."""
    54    args = [
    55        "--runner=PortableRunner", "--job_endpoint=localhost:8099", "--streaming"
    56    ]
    57    if argv:
    58      args.extend(argv)
    59  
    60    parser = argparse.ArgumentParser()
    61    parser.add_argument(
    62        '--count',
    63        dest='count',
    64        default=0,
    65        help='Number of triggers to generate '
    66        '(0 means emit forever).')
    67    parser.add_argument(
    68        '--interval_ms',
    69        dest='interval_ms',
    70        default=500,
    71        help='Interval between records per parallel '
    72        'Flink subtask.')
    73  
    74    known_args, pipeline_args = parser.parse_known_args(args)
    75  
    76    pipeline_options = PipelineOptions(pipeline_args)
    77  
    78    with beam.Pipeline(options=pipeline_options) as p:
    79  
    80      messages = (
    81          p | FlinkStreamingImpulseSource().set_message_count(
    82              known_args.count).set_interval_ms(known_args.interval_ms))
    83  
    84      _ = (
    85          messages | 'decode' >> beam.Map(lambda x: ('', 1))
    86          | 'window' >> beam.WindowInto(
    87              window.GlobalWindows(),
    88              trigger=Repeatedly(AfterProcessingTime(5 * 1000)),
    89              accumulation_mode=AccumulationMode.DISCARDING)
    90          | 'group' >> beam.GroupByKey()
    91          | 'count' >> beam.Map(count)
    92          | 'log' >> beam.Map(lambda x: logging.info("%d" % x[1])))
    93  
    94  
    95  if __name__ == '__main__':
    96    logging.getLogger().setLevel(logging.INFO)
    97    run(sys.argv[1:])