github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/tools/teststream_microbenchmark.py

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A microbenchmark for measuring changes in the performance of TestStream
running locally.

This microbenchmark attempts to measure the overhead of the main data paths
of TestStream: adding new elements, advancing the watermark, and advancing
processing time.

It runs a series of pipelines, each containing N parallel branches of M
serial stages. Each stage does the following:

1) Put all the PCollection elements in a window.
2) Wait until the watermark advances past the end of the window.
3) When the watermark passes, change the key and output all the elements.
4) Go back to #1 until all elements in the stream have been consumed.

This exercises the same codepaths that run on the Fn API (and Dataflow)
workers, but is generally easier to run (locally) and more stable.

Run as

   python -m apache_beam.tools.teststream_microbenchmark

"""

# pytype: skip-file

import argparse
import itertools
import logging
import random

import apache_beam as beam
from apache_beam import WindowInto
from apache_beam.runners import DirectRunner
from apache_beam.testing.test_stream import TestStream
from apache_beam.tools import utils
from apache_beam.transforms.window import FixedWindows
from apache_beam.typehints import typehints

# Number of parallel branches built off the TestStream output in each pipeline.
NUM_PARALLEL_STAGES = 7

# Number of rekey/regroup stages chained serially within each branch.
NUM_SERIAL_STAGES = 6


class RekeyElements(beam.DoFn):
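  """Scatters the values of a grouped element across new random keys."""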
  def process(self, element):
    _, values = element
    return [(random.randint(0, 1000), v) for v in values]


def _build_serial_stages(input_pc, num_serial_stages, stage_count):
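  """Chains a GroupByKey plus num_serial_stages rekey/regroup stages.

  Each stage scatters the grouped values onto fresh random keys and
  groups them again, forcing a full shuffle per stage.
  """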
  pc = (input_pc | ('gbk_start_stage%s' % stage_count) >> beam.GroupByKey())

  for i in range(num_serial_stages):
    pc = (
        pc
        | ('stage%s_map%s' % (stage_count, i)) >> beam.ParDo(
            RekeyElements()).with_output_types(typehints.KV[int, int])
        | ('stage%s_gbk%s' % (stage_count, i)) >> beam.GroupByKey())

  return pc


def run_single_pipeline(size):
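  """Returns a no-argument callable that builds and runs one pipeline.

  The pipeline feeds `size` elements through a TestStream in batches of
  100, advancing the watermark past each batch so the downstream
  FixedWindows(100) windows can fire.
  """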
  def _pipeline_runner():
    with beam.Pipeline(runner=DirectRunner()) as p:
      ts = TestStream().advance_watermark_to(0)
      all_elements = iter(range(size))
      watermark = 0
      while True:
        next_batch = list(itertools.islice(all_elements, 100))
        if not next_batch:
          break
        # Emit a batch of 100 elements, then advance the watermark past them
        # so the corresponding fixed window downstream can fire.
        ts = ts.add_elements([(i, random.randint(0, 1000)) for i in next_batch])
        watermark = watermark + 100
        ts = ts.advance_watermark_to(watermark)
      ts = ts.advance_watermark_to_infinity()

      input_pc = p | ts | WindowInto(FixedWindows(100))
      for i in range(NUM_PARALLEL_STAGES):
        _build_serial_stages(input_pc, NUM_SERIAL_STAGES, i)

  return _pipeline_runner


def run_benchmark(
    starting_point=1, num_runs=10, num_elements_step=300, verbose=True):
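  """Runs the benchmark suite and returns the results.

  The element count starts at `starting_point` and grows by
  `num_elements_step` on each of the `num_runs` runs, so that
  utils.run_benchmarks can estimate the per-element cost with a
  linear regression.
  """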
  suite = [
      utils.LinearRegressionBenchmarkConfig(
          run_single_pipeline, starting_point, num_elements_step, num_runs)
  ]
  return utils.run_benchmarks(suite, verbose=verbose)


if __name__ == '__main__':
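  # Flags map to run_benchmark's parameters; an illustrative invocation:
  #   python -m apache_beam.tools.teststream_microbenchmark \
  #       --num_runs=5 --increment=500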
  logging.basicConfig()
  utils.check_compiled('apache_beam.runners.common')

  parser = argparse.ArgumentParser()
  parser.add_argument('--num_runs', default=10, type=int)
  parser.add_argument('--starting_point', default=1, type=int)
  parser.add_argument('--increment', default=300, type=int)
  # type=bool would treat any non-empty string (even 'False') as True, so
  # parse the flag value explicitly.
  parser.add_argument(
      '--verbose',
      default=True,
      type=lambda s: s.lower() in ('true', '1', 'yes'))
  options = parser.parse_args()

  run_benchmark(
      options.starting_point,
      options.num_runs,
      options.increment,
      options.verbose)