github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/testing/load_tests/pardo_test.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
This is a ParDo load test with a Synthetic Source. Besides the standard
input options, there are additional options:
* iterations - number of subsequent ParDo transforms to be performed,
* number_of_counters - number of counter metrics to be created for one
  ParDo transform,
* number_of_counter_operations - number of operations on counters to be
  performed in one ParDo,
* project (optional) - the GCP project, needed when metrics are saved to
  BigQuery (when using the Dataflow runner, the runner's project must be
  specified),
* publish_to_big_query - whether metrics should be published to BigQuery,
* metrics_dataset (optional) - name of the BigQuery dataset where metrics
  will be stored,
* metrics_table (optional) - name of the BigQuery table where metrics
  will be stored,
* input_options - options for the Synthetic Source,
* stateful - when true, use a stateful DoFn,
* state_cache - when true, enable the Python state cache.

Example test run:

python -m apache_beam.testing.load_tests.pardo_test \
    --test-pipeline-options="
    --iterations=1
    --number_of_counters=1
    --number_of_counter_operations=1
    --project=big-query-project
    --region=...
    --publish_to_big_query=true
    --metrics_dataset=python_load_tests
    --metrics_table=pardo
    --input_options='{
      \"num_records\": 300,
      \"key_size\": 5,
      \"value_size\": 15
    }'"

or:

./gradlew -PloadTest.args="
    --publish_to_big_query=true
    --project=...
    --region=...
    --metrics_dataset=python_load_tests
    --metrics_table=pardo
    --input_options='{
      \"num_records\": 1,
      \"key_size\": 1,
      \"value_size\": 1}'
    --runner=DirectRunner" \
    -PloadTest.mainClass=apache_beam.testing.load_tests.pardo_test \
    -Prunner=DirectRunner :sdks:python:apache_beam:testing:load_tests:run
"""

# pytype: skip-file

import logging

import apache_beam as beam
from apache_beam.metrics import Metrics
from apache_beam.options.pipeline_options import DebugOptions
from apache_beam.testing.load_tests.load_test import LoadTest
from apache_beam.testing.load_tests.load_test_metrics_utils import AssignTimestamps
from apache_beam.testing.load_tests.load_test_metrics_utils import MeasureLatency
from apache_beam.testing.load_tests.load_test_metrics_utils import MeasureTime
from apache_beam.testing.synthetic_pipeline import StatefulLoadGenerator
from apache_beam.testing.synthetic_pipeline import SyntheticSource
from apache_beam.transforms import userstate


class ParDoTest(LoadTest):
  def __init__(self):
    super().__init__()
    self.iterations = self.get_option_or_default('iterations')
    self.number_of_counters = self.get_option_or_default(
        'number_of_counters', 1)
    self.number_of_operations = self.get_option_or_default(
        'number_of_counter_operations', 1)
    self.stateful = self.get_option_or_default('stateful', False)
    if self.get_option_or_default('state_cache', False):
      self.pipeline.options.view_as(DebugOptions).add_experiment(
          'state_cache_size=1000')

  def test(self):
    class BaseCounterOperation(beam.DoFn):
      def __init__(self, number_of_counters, number_of_operations):
        self.number_of_operations = number_of_operations
        self.counters = []
        for i in range(number_of_counters):
          self.counters.append(
              Metrics.counter('do-not-publish', 'name-{}'.format(i)))

    class StatefulCounterOperation(BaseCounterOperation):
      state_param = beam.DoFn.StateParam(
          userstate.CombiningValueStateSpec(
              'count',
              beam.coders.IterableCoder(beam.coders.VarIntCoder()),
              sum)) if self.stateful else None

      def process(self, element, state=state_param):
        for _ in range(self.number_of_operations):
          for counter in self.counters:
            counter.inc()
          if state:
            state.add(1)
        yield element

    class CounterOperation(BaseCounterOperation):
      def process(self, element):
        for _ in range(self.number_of_operations):
          for counter in self.counters:
            counter.inc()
        yield element
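
    # The pipeline takes one of two shapes below: with
    # use_stateful_load_generator, elements come from StatefulLoadGenerator,
    # get timestamps assigned and flow through stateful counter steps plus a
    # MeasureLatency ParDo; otherwise they are read from SyntheticSource and
    # flow through plain counter steps. Both variants are bracketed by
    # 'Measure time: Start'/'End' ParDos that record timing metrics under the
    # test's metrics namespace.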
    if self.get_option_or_default('use_stateful_load_generator', False):
      pc = (
          self.pipeline
          | 'LoadGenerator' >> StatefulLoadGenerator(self.input_options)
          | 'Measure time: Start' >> beam.ParDo(
              MeasureTime(self.metrics_namespace))
          | 'Assign timestamps' >> beam.ParDo(AssignTimestamps()))

      for i in range(self.iterations):
        pc |= 'Step: %d' % i >> beam.ParDo(
            StatefulCounterOperation(
                self.number_of_counters, self.number_of_operations))

      pc |= 'Measure latency' >> beam.ParDo(
          MeasureLatency(self.metrics_namespace))
    else:
      pc = (
          self.pipeline
          | 'Read synthetic' >> beam.io.Read(
              SyntheticSource(self.parse_synthetic_source_options()))
          | 'Measure time: Start' >> beam.ParDo(
              MeasureTime(self.metrics_namespace)))

      for i in range(self.iterations):
        pc |= 'Step: %d' % i >> beam.ParDo(
            CounterOperation(
                self.number_of_counters, self.number_of_operations))

    # pylint: disable=expression-not-assigned
    pc | 'Measure time: End' >> beam.ParDo(MeasureTime(self.metrics_namespace))


if __name__ == '__main__':
  logging.basicConfig(level=logging.INFO)
  ParDoTest().run()
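

# A minimal, self-contained sketch of the stateful-DoFn pattern used by
# StatefulCounterOperation above, shown in isolation for readers unfamiliar
# with the user state API. It is not invoked by the load test, and the names
# `_demo_stateful_count` and `CountPerKey` are illustrative only.
def _demo_stateful_count():
  class CountPerKey(beam.DoFn):
    # Same state spec shape as the test: a combining value state that sums
    # the values added for each key and window.
    COUNT_STATE = userstate.CombiningValueStateSpec(
        'count', beam.coders.IterableCoder(beam.coders.VarIntCoder()), sum)

    def process(self, element, count=beam.DoFn.StateParam(COUNT_STATE)):
      count.add(1)
      # Emit the key together with how many of its elements were seen so far.
      yield element[0], count.read()

  # Stateful DoFns require keyed input, hence the (key, value) tuples.
  with beam.Pipeline() as pipeline:
    _ = (
        pipeline
        | beam.Create([('a', 1), ('a', 2), ('b', 3)])
        | beam.ParDo(CountPerKey())
        | beam.Map(logging.info))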