# github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/tools/fn_api_runner_microbenchmark.py
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A microbenchmark for measuring changes in the critical path of FnApiRunner.
This microbenchmark attempts to measure the overhead of the main data paths
for the FnApiRunner. Specifically state, timers, and shuffling of data.

This runs a series of N parallel pipelines with M parallel stages each. Each
stage does the following:

1) Put all the PCollection elements in state
2) Set a timer for the future
3) When the timer fires, change the key and output all the elements downstream

This executes the same codepaths that are run on the Fn API (and Dataflow)
workers, but is generally easier to run (locally) and more stable.

Run as

   python -m apache_beam.tools.fn_api_runner_microbenchmark

The main metric to work with for this benchmark is Fixed Cost. This represents
the fixed cost of overhead for the data path of the FnApiRunner.

Initial results were:

run 1 of 10, per element time cost: 3.6778 sec
run 2 of 10, per element time cost: 0.053498 sec
run 3 of 10, per element time cost: 0.0299434 sec
run 4 of 10, per element time cost: 0.0211154 sec
run 5 of 10, per element time cost: 0.0170031 sec
run 6 of 10, per element time cost: 0.0150809 sec
run 7 of 10, per element time cost: 0.013218 sec
run 8 of 10, per element time cost: 0.0119685 sec
run 9 of 10, per element time cost: 0.0107382 sec
run 10 of 10, per element time cost: 0.0103208 sec


Fixed cost   4.537164939085642
Per-element  0.005474923321695039
R^2          0.95189
"""

# pytype: skip-file

import argparse
import logging
import random

import apache_beam as beam
from apache_beam.coders import VarIntCoder
from apache_beam.runners.portability.fn_api_runner import FnApiRunner
from apache_beam.tools import utils
from apache_beam.transforms.timeutil import TimeDomain
from apache_beam.transforms.userstate import SetStateSpec
from apache_beam.transforms.userstate import TimerSpec
from apache_beam.transforms.userstate import on_timer
from apache_beam.typehints import typehints

# Number of independent stage chains built into each benchmark pipeline.
NUM_PARALLEL_STAGES = 7

# Number of (stateful ParDo + GroupByKey) steps in each chain.
NUM_SERIAL_STAGES = 7


class BagInStateOutputAfterTimer(beam.DoFn):
  """Buffers grouped values in state and re-emits them when a timer fires.

  ``process`` adds every value of the incoming ``(key, values)`` element to
  the ``buffer`` state and sets the ``emit_timer`` watermark timer. When the
  timer fires, ``emit_values`` reads the state back and outputs each value
  paired with a freshly drawn random key.
  """

  SET_STATE = SetStateSpec('buffer', VarIntCoder())
  EMIT_TIMER = TimerSpec('emit_timer', TimeDomain.WATERMARK)

  def process(
      self,
      element,
      set_state=beam.DoFn.StateParam(SET_STATE),
      emit_timer=beam.DoFn.TimerParam(EMIT_TIMER)):
    """Stores the element's values in state and schedules the emit timer.

    Args:
      element: a ``(key, values)`` pair; the key is ignored.
      set_state: state handle for ``SET_STATE``, injected by the runner.
      emit_timer: timer handle for ``EMIT_TIMER``, injected by the runner.
    """
    _, values = element
    for v in values:
      set_state.add(v)
    emit_timer.set(1)

  @on_timer(EMIT_TIMER)
  def emit_values(self, set_state=beam.DoFn.StateParam(SET_STATE)):
    """Returns every buffered value paired with a random key in [0, 1000]."""
    values = set_state.read()
    return [(random.randint(0, 1000), v) for v in values]


def _build_serial_stages(
    pipeline, num_serial_stages, num_elements, stage_count):
  """Appends one chain of serial stateful stages to ``pipeline``.

  The chain starts with ``num_elements`` randomly keyed integers that are
  grouped by key, followed by ``num_serial_stages`` repetitions of
  ``BagInStateOutputAfterTimer`` and a ``GroupByKey``.

  Args:
    pipeline: the ``beam.Pipeline`` to add the transforms to.
    num_serial_stages: how many (ParDo + GroupByKey) steps to chain.
    num_elements: number of input elements created for this chain.
    stage_count: index of this chain; only used to keep labels unique.

  Returns:
    The PCollection produced by the last GroupByKey of the chain.
  """
  pc = (
      pipeline
      | ('start_stage%s' % stage_count) >> beam.Create(
          [(random.randint(0, 1000), i) for i in range(num_elements)])
      | ('gbk_start_stage%s' % stage_count) >> beam.GroupByKey())

  for i in range(num_serial_stages):
    pc = (
        pc
        | ('stage%s_map%s' % (stage_count, i)) >> beam.ParDo(
            BagInStateOutputAfterTimer()).with_output_types(
                typehints.KV[int, int])
        | ('stage%s_gbk%s' % (stage_count, i)) >> beam.GroupByKey())

  return pc


def run_single_pipeline(size):
  """Returns a no-argument callable that runs one benchmark pipeline.

  The returned callable builds ``NUM_PARALLEL_STAGES`` chains of
  ``NUM_SERIAL_STAGES`` stages, each fed with ``size`` elements, and runs
  them on a fresh ``FnApiRunner``.

  Args:
    size: number of input elements per chain.
  """
  def _pipeline_runner():
    with beam.Pipeline(runner=FnApiRunner()) as p:
      for i in range(NUM_PARALLEL_STAGES):
        _build_serial_stages(p, NUM_SERIAL_STAGES, size, i)

  return _pipeline_runner


def run_benchmark(
    starting_point=1, num_runs=10, num_elements_step=100, verbose=True):
  """Runs the linear-regression benchmark over ``run_single_pipeline``.

  Args:
    starting_point: forwarded to ``utils.LinearRegressionBenchmarkConfig``;
      presumably the input size of the first run.
    num_runs: forwarded to the benchmark config; presumably the number of
      runs.
    num_elements_step: forwarded to the benchmark config; presumably the
      input size increment between runs.
    verbose: forwarded to ``utils.run_benchmarks``.

  Returns:
    Whatever ``utils.run_benchmarks`` returns for the single-config suite.
  """
  suite = [
      utils.LinearRegressionBenchmarkConfig(
          run_single_pipeline, starting_point, num_elements_step, num_runs)
  ]
  return utils.run_benchmarks(suite, verbose=verbose)


def _str2bool(value):
  """Parses a command-line boolean such as ``true``/``false`` or ``1``/``0``.

  ``type=bool`` must not be used with argparse: ``bool('False')`` is ``True``
  because every non-empty string is truthy, which made it impossible to turn
  the flag off with a readable value.

  Args:
    value: the raw command-line string (a ``bool`` is returned unchanged).

  Returns:
    The parsed boolean. The empty string maps to ``False``, matching what
    ``bool('')`` returned before.

  Raises:
    argparse.ArgumentTypeError: if the value is not a recognised spelling.
  """
  if isinstance(value, bool):
    return value
  normalized = value.strip().lower()
  if normalized in ('true', 't', 'yes', 'y', 'on', '1'):
    return True
  if normalized in ('false', 'f', 'no', 'n', 'off', '0', ''):
    return False
  raise argparse.ArgumentTypeError(
      'expected a boolean value (e.g. true/false), got %r' % value)


if __name__ == '__main__':
  logging.basicConfig()
  # NOTE(review): presumably verifies that the module is Cython-compiled so
  # that the timings are representative -- confirm against tools/utils.py.
  utils.check_compiled('apache_beam.runners.common')

  parser = argparse.ArgumentParser()
  parser.add_argument('--num_runs', default=10, type=int)
  parser.add_argument('--starting_point', default=1, type=int)
  parser.add_argument('--increment', default=100, type=int)
  parser.add_argument('--verbose', default=True, type=_str2bool)
  options = parser.parse_args()

  run_benchmark(
      options.starting_point,
      options.num_runs,
      options.increment,
      options.verbose)