github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/tools/teststream_microbenchmark.py

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A microbenchmark for measuring changes in the performance of TestStream
running locally.

This microbenchmark attempts to measure the overhead of the main data paths
for the TestStream: specifically, new elements, watermark changes, and
processing-time advances.

It runs a single pipeline containing N parallel chains of M serial stages
each. Each stage does the following:

1) Put all the PCollection elements in a window.
2) Wait until the watermark advances past the end of the window.
3) When the watermark passes, change the key and output all the elements.
4) Go back to #1 until all elements in the stream have been consumed.

This executes the same codepaths that are run on the Fn API (and Dataflow)
workers, but is generally easier to run (locally) and more stable.

Run as

  python -m apache_beam.tools.teststream_microbenchmark
"""

# pytype: skip-file

import argparse
import itertools
import logging
import random

import apache_beam as beam
from apache_beam import WindowInto
from apache_beam.runners import DirectRunner
from apache_beam.testing.test_stream import TestStream
from apache_beam.tools import utils
from apache_beam.transforms.window import FixedWindows
from apache_beam.typehints import typehints

NUM_PARALLEL_STAGES = 7

NUM_SERIAL_STAGES = 6


class RekeyElements(beam.DoFn):
  """Assigns each grouped value a fresh random key."""
  def process(self, element):
    _, values = element
    return [(random.randint(0, 1000), v) for v in values]


def _build_serial_stages(input_pc, num_serial_stages, stage_count):
  pc = (input_pc | ('gbk_start_stage%s' % stage_count) >> beam.GroupByKey())

  # Each serial stage rekeys the grouped elements and groups them again,
  # forcing another windowed GroupByKey that must wait on the watermark.
  for i in range(num_serial_stages):
    pc = (
        pc
        | ('stage%s_map%s' % (stage_count, i)) >> beam.ParDo(
            RekeyElements()).with_output_types(typehints.KV[int, int])
        | ('stage%s_gbk%s' % (stage_count, i)) >> beam.GroupByKey())

  return pc


def run_single_pipeline(size):
  def _pipeline_runner():
    with beam.Pipeline(runner=DirectRunner()) as p:
      # Feed the input in batches of 100 elements, advancing the watermark
      # after each batch so downstream GroupByKeys fire window by window.
      ts = TestStream().advance_watermark_to(0)
      all_elements = iter(range(size))
      watermark = 0
      while True:
        next_batch = list(itertools.islice(all_elements, 100))
        if not next_batch:
          break
        ts = ts.add_elements(
            [(i, random.randint(0, 1000)) for i in next_batch])
        watermark = watermark + 100
        ts = ts.advance_watermark_to(watermark)
      ts = ts.advance_watermark_to_infinity()

      input_pc = p | ts | WindowInto(FixedWindows(100))
      for i in range(NUM_PARALLEL_STAGES):
        _build_serial_stages(input_pc, NUM_SERIAL_STAGES, i)

  return _pipeline_runner
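
# For reference, with the defaults above each _pipeline_runner builds a graph
# shaped roughly like this (a sketch of the topology, not generated output):
#
#   TestStream -> WindowInto(FixedWindows(100))
#     -> 7 parallel chains of:
#          gbk_start_stage{i} -> (ParDo(RekeyElements) -> GroupByKey) x 6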

def run_benchmark(
    starting_point=1, num_runs=10, num_elements_step=300, verbose=True):
  suite = [
      utils.LinearRegressionBenchmarkConfig(
          run_single_pipeline, starting_point, num_elements_step, num_runs)
  ]
  return utils.run_benchmarks(suite, verbose=verbose)


if __name__ == '__main__':
  logging.basicConfig()
  utils.check_compiled('apache_beam.runners.common')

  parser = argparse.ArgumentParser()
  parser.add_argument('--num_runs', default=10, type=int)
  parser.add_argument('--starting_point', default=1, type=int)
  parser.add_argument('--increment', default=300, type=int)
  # NOTE: argparse's type=bool treats any non-empty string (including
  # 'False') as True, so parse the flag text explicitly instead.
  parser.add_argument(
      '--verbose',
      default=True,
      type=lambda s: s.lower() not in ('false', '0', 'no'))
  options = parser.parse_args()

  run_benchmark(
      options.starting_point,
      options.num_runs,
      options.increment,
      options.verbose)
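
# A minimal sketch of invoking the benchmark programmatically rather than via
# the CLI above; run_benchmark simply returns the result of
# utils.run_benchmarks, and the argument values here are illustrative:
#
#   from apache_beam.tools import teststream_microbenchmark
#
#   teststream_microbenchmark.run_benchmark(
#       starting_point=1, num_runs=5, num_elements_step=100, verbose=True)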