github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/tools/fn_api_runner_microbenchmark.py

github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/tools/fn_api_runner_microbenchmark.py (about)

     1  # Licensed to the Apache Software Foundation (ASF) under one or more
     2  # contributor license agreements.  See the NOTICE file distributed with
     3  # this work for additional information regarding copyright ownership.
     4  # The ASF licenses this file to You under the Apache License, Version 2.0
     5  # (the "License"); you may not use this file except in compliance with
     6  # the License.  You may obtain a copy of the License at
     7  #
     8  #    http://www.apache.org/licenses/LICENSE-2.0
     9  #
    10  # Unless required by applicable law or agreed to in writing, software
    11  # distributed under the License is distributed on an "AS IS" BASIS,
    12  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  # See the License for the specific language governing permissions and
    14  # limitations under the License.
    15  #
    16  
    17  """A microbenchmark for measuring changes in the critical path of FnApiRunner.
    18  This microbenchmark attempts to measure the overhead of the main data paths
    19  for the FnApiRunner. Specifically state, timers, and shuffling of data.
    20  
This runs a series of pipelines, each with N parallel branches of M serial
stages. Each stage does the following:
    23  
    24  1) Put all the PCollection elements in state
    25  2) Set a timer for the future
    26  3) When the timer fires, change the key and output all the elements downstream
    27  
    28  This executes the same codepaths that are run on the Fn API (and Dataflow)
workers, but is generally easier to run (locally) and more stable.
    30  
    31  Run as
    32  
    33     python -m apache_beam.tools.fn_api_runner_microbenchmark
    34  
    35  The main metric to work with for this benchmark is Fixed Cost. This represents
the fixed cost of overhead for the data path of the FnApiRunner.
    37  
    38  Initial results were:
    39  
    40  run 1 of 10, per element time cost: 3.6778 sec
    41  run 2 of 10, per element time cost: 0.053498 sec
    42  run 3 of 10, per element time cost: 0.0299434 sec
    43  run 4 of 10, per element time cost: 0.0211154 sec
    44  run 5 of 10, per element time cost: 0.0170031 sec
    45  run 6 of 10, per element time cost: 0.0150809 sec
    46  run 7 of 10, per element time cost: 0.013218 sec
    47  run 8 of 10, per element time cost: 0.0119685 sec
    48  run 9 of 10, per element time cost: 0.0107382 sec
    49  run 10 of 10, per element time cost: 0.0103208 sec
    50  
    51  
    52  Fixed cost   4.537164939085642
    53  Per-element  0.005474923321695039
    54  R^2          0.95189
    55  """
    56  
    57  # pytype: skip-file
    58  
    59  import argparse
    60  import logging
    61  import random
    62  
    63  import apache_beam as beam
    64  from apache_beam.coders import VarIntCoder
    65  from apache_beam.runners.portability.fn_api_runner import FnApiRunner
    66  from apache_beam.tools import utils
    67  from apache_beam.transforms.timeutil import TimeDomain
    68  from apache_beam.transforms.userstate import SetStateSpec
    69  from apache_beam.transforms.userstate import TimerSpec
    70  from apache_beam.transforms.userstate import on_timer
    71  from apache_beam.typehints import typehints
    72  
# Number of independent stage chains that run_single_pipeline adds to each
# benchmark pipeline.
NUM_PARALLEL_STAGES = 7

# Number of stateful ParDo + GroupByKey steps that run_single_pipeline
# requests for every chain.
NUM_SERIAL_STAGES = 7
    76  
    77  
    78  class BagInStateOutputAfterTimer(beam.DoFn):
    79  
    80    SET_STATE = SetStateSpec('buffer', VarIntCoder())
    81    EMIT_TIMER = TimerSpec('emit_timer', TimeDomain.WATERMARK)
    82  
    83    def process(
    84        self,
    85        element,
    86        set_state=beam.DoFn.StateParam(SET_STATE),
    87        emit_timer=beam.DoFn.TimerParam(EMIT_TIMER)):
    88      _, values = element
    89      for v in values:
    90        set_state.add(v)
    91      emit_timer.set(1)
    92  
    93    @on_timer(EMIT_TIMER)
    94    def emit_values(self, set_state=beam.DoFn.StateParam(SET_STATE)):
    95      values = set_state.read()
    96      return [(random.randint(0, 1000), v) for v in values]
    97  
    98  
    99  def _build_serial_stages(
   100      pipeline, num_serial_stages, num_elements, stage_count):
   101    pc = (
   102        pipeline | ('start_stage%s' % stage_count) >> beam.Create(
   103            [(random.randint(0, 1000), i) for i in range(num_elements)])
   104        | ('gbk_start_stage%s' % stage_count) >> beam.GroupByKey())
   105  
   106    for i in range(num_serial_stages):
   107      pc = (
   108          pc
   109          | ('stage%s_map%s' % (stage_count, i)) >> beam.ParDo(
   110              BagInStateOutputAfterTimer()).with_output_types(
   111                  typehints.KV[int, int])
   112          | ('stage%s_gbk%s' % (stage_count, i)) >> beam.GroupByKey())
   113  
   114    return pc
   115  
   116  
   117  def run_single_pipeline(size):
   118    def _pipeline_runner():
   119      with beam.Pipeline(runner=FnApiRunner()) as p:
   120        for i in range(NUM_PARALLEL_STAGES):
   121          _build_serial_stages(p, NUM_SERIAL_STAGES, size, i)
   122  
   123    return _pipeline_runner
   124  
   125  
   126  def run_benchmark(
   127      starting_point=1, num_runs=10, num_elements_step=100, verbose=True):
   128    suite = [
   129        utils.LinearRegressionBenchmarkConfig(
   130            run_single_pipeline, starting_point, num_elements_step, num_runs)
   131    ]
   132    return utils.run_benchmarks(suite, verbose=verbose)
   133  
   134  
   135  if __name__ == '__main__':
   136    logging.basicConfig()
   137    utils.check_compiled('apache_beam.runners.common')
   138  
   139    parser = argparse.ArgumentParser()
   140    parser.add_argument('--num_runs', default=10, type=int)
   141    parser.add_argument('--starting_point', default=1, type=int)
   142    parser.add_argument('--increment', default=100, type=int)
   143    parser.add_argument('--verbose', default=True, type=bool)
   144    options = parser.parse_args()
   145  
   146    run_benchmark(
   147        options.starting_point,
   148        options.num_runs,
   149        options.increment,
   150        options.verbose)