#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
Load test for operations involving side inputs.

The purpose of this test is to measure the cost of materialization and
accessing side inputs. The test uses synthetic source which can be
parametrized to generate records with various sizes of keys and values,
impose delays in the pipeline and simulate other performance challenges.

This test can accept the following parameters:
  * side_input_type (str) - Required. Specifies how the side input will be
    materialized in ParDo operation. Choose from (dict, iter, list).
  * window_count (int) - The number of fixed sized windows to subdivide the
    side input into. Must be a positive integer. By default, a global window
    will be used.
  * access_percentage (int) - Specifies the percentage of elements in the side
    input to be accessed. Must be in range from 0 to 100. By default, all
    elements will be accessed.

Example test run:

python -m apache_beam.testing.load_tests.sideinput_test \
    --test-pipeline-options="
    --side_input_type=iter
    --input_options='{
    \"num_records\": 300,
    \"key_size\": 5,
    \"value_size\": 15
    }'"

or:

./gradlew -PloadTest.args="
    --side_input_type=iter
    --input_options='{
      \"num_records\": 300,
      \"key_size\": 5,
      \"value_size\": 15}'" \
    -PloadTest.mainClass=apache_beam.testing.load_tests.sideinput_test \
    -Prunner=DirectRunner :sdks:python:apache_beam:testing:load_tests:run
"""

# pytype: skip-file

import logging
from typing import Any
from typing import Dict
from typing import Iterable
from typing import Tuple
from typing import Union

import apache_beam as beam
from apache_beam.testing.load_tests.load_test import LoadTest
from apache_beam.testing.load_tests.load_test_metrics_utils import MeasureTime
from apache_beam.testing.synthetic_pipeline import SyntheticSDFAsSource
from apache_beam.transforms import window


class SideInputTest(LoadTest):
  """Load test that feeds a synthetic side input into a ParDo.

  The side input is materialized with one of the views listed in
  ``SIDE_INPUT_TYPES`` and a configurable fraction of its elements is
  accessed for every main-input element.
  """

  # Maps the ``side_input_type`` pipeline option to the view wrapper used to
  # materialize the side input.
  SIDE_INPUT_TYPES = {
      'iter': beam.pvalue.AsIter,
      'list': beam.pvalue.AsList,
      'dict': beam.pvalue.AsDict,
  }
  # Upper bound on the number of seed elements fanned out to the synthetic
  # source when a single (global) window is used.
  SDF_INITIAL_ELEMENTS = 1000

  def __init__(self):
    """Reads and validates the test-specific pipeline options.

    Raises:
      ValueError: if ``window_count`` is not a positive integer, if
        ``access_percentage`` is outside of the 0-100 range, or if
        ``side_input_type`` has not been provided.
    """
    super().__init__()
    self.windows = self.get_option_or_default('window_count', default=1)
    # Fail with a clear message instead of a ZeroDivisionError (or a silently
    # negative element count) when computing elements_per_window below.
    if self.windows < 1:
      raise ValueError(
          'window_count: Invalid value. Should be a positive integer, '
          'got {} instead'.format(self.windows))

    self.access_percentage = self.get_option_or_default(
        'access_percentage', default=100)
    if self.access_percentage < 0 or self.access_percentage > 100:
      raise ValueError(
          'access_percentage: Invalid value. Should be in range '
          'from 0 to 100, got {} instead'.format(self.access_percentage))

    self.elements_per_window = self.input_options['num_records'] // self.windows

    self.side_input_type = self.pipeline.get_option('side_input_type')
    if self.side_input_type is None:
      raise ValueError(
          'side_input_type is required. Please provide one of '
          'these: {}'.format(list(self.SIDE_INPUT_TYPES.keys())))

  def materialize_as(self):
    """Returns the view wrapper selected by the ``side_input_type`` option.

    Raises:
      ValueError: if ``side_input_type`` is not a key of
        ``SIDE_INPUT_TYPES``.
    """
    try:
      return self.SIDE_INPUT_TYPES[self.side_input_type]
    except KeyError:
      # The KeyError carries no extra information, so drop it from the
      # traceback instead of reporting it as an error during handling.
      raise ValueError(
          'Unknown side input type. Please provide one of '
          'these: {}'.format(list(self.SIDE_INPUT_TYPES.keys()))) from None

  def test(self):
    """Builds the pipeline: main input, synthetic side input and the ParDo."""
    class SequenceSideInputTestDoFn(beam.DoFn):
      """Iterate over first n side_input elements."""
      def __init__(self, first_n: int):
        self._first_n = first_n

      def process(  # type: ignore[override]
          self, element: Any, side_input: Iterable[Tuple[bytes,
                                                         bytes]]) -> None:
        # No-op. We only make sure that the elements are accessed.
        # zip() stops as soon as range() is exhausted, so at most first_n
        # elements are pulled from the side input. Only builtins are used so
        # that the DoFn does not depend on module-level names.
        for _ in zip(range(self._first_n), side_input):
          pass

    class MappingSideInputTestDoFn(beam.DoFn):
      """Iterates over first n keys in the dictionary and checks the value."""
      def __init__(self, first_n: int):
        self._first_n = first_n

      def process(  # type: ignore[override]
          self, element: Any, dict_side_input: Dict[bytes, bytes]) -> None:
        for _, key in zip(range(self._first_n), dict_side_input):
          # No-op. We only make sure that the element is accessed.
          dict_side_input[key]

    class AssignTimestamps(beam.DoFn):
      """Produces timestamped values. Timestamps are equal to the value of the
      element."""
      def __init__(self):
        # Avoid having to use save_main_session
        self.window = window

      def process(self, element: int) -> Iterable[window.TimestampedValue]:  # type: ignore[override]
        yield self.window.TimestampedValue(element, element)

    class GetSyntheticSDFOptions(beam.DoFn):
      """Emits one options dict for the synthetic source per input element."""
      def __init__(
          self, elements_per_record: int, key_size: int, value_size: int):
        self.elements_per_record = elements_per_record
        self.key_size = key_size
        self.value_size = value_size

      def process(self, element: Any) -> Iterable[Dict[str, Union[int, str]]]:  # type: ignore[override]
        yield {
            'num_records': self.elements_per_record,
            'key_size': self.key_size,
            'value_size': self.value_size,
            'initial_splitting_num_bundles': 0,
            'initial_splitting_desired_bundle_size': 0,
            'sleep_per_input_record_sec': 0,
            'initial_splitting': 'const'
        }

    num_records = self.input_options['num_records']

    main_input = self.pipeline | 'Create' >> beam.Create(range(self.windows))

    if self.windows > 1:
      # One main-input element per window; element i is stamped with
      # timestamp i and windows are one unit wide. The same windowed
      # collection also seeds the side input.
      main_input = (
          main_input
          | 'Assign timestamps' >> beam.ParDo(AssignTimestamps())
          | 'Apply windows' >> beam.WindowInto(window.FixedWindows(1)))
      side_input = main_input
      initial_elements = self.windows
    else:
      # Each seed element requests num_records // initial_elements records.
      # Cap the fan-out at num_records so that inputs smaller than
      # SDF_INITIAL_ELEMENTS do not round down to zero records per seed
      # (which would leave the side input empty).
      initial_elements = max(1, min(self.SDF_INITIAL_ELEMENTS, num_records))
      side_input = self.pipeline | 'Side input: create' >> beam.Create(
          range(initial_elements))

    side_input = (
        side_input
        | 'Get synthetic SDF options' >> beam.ParDo(
            GetSyntheticSDFOptions(
                num_records // initial_elements,
                self.input_options['key_size'],
                self.input_options['value_size']))
        | 'Generate input' >> beam.ParDo(SyntheticSDFAsSource()))
    main_input |= 'Collect start time metrics' >> beam.ParDo(
        MeasureTime(self.metrics_namespace))

    side_input_type = self.materialize_as()
    elements_to_access = self.elements_per_window * \
                         self.access_percentage // 100
    logging.info(
        '%s out of %s total elements in each window will be accessed.',
        elements_to_access,
        self.elements_per_window)
    # A dict view has to be accessed by key; the other views are iterated.
    if side_input_type is beam.pvalue.AsDict:
      dofn = MappingSideInputTestDoFn(elements_to_access)
    else:
      dofn = SequenceSideInputTestDoFn(elements_to_access)

    _ = (
        main_input
        | beam.ParDo(dofn, side_input_type(side_input))
        | 'Collect end time metrics' >> beam.ParDo(
            MeasureTime(self.metrics_namespace)))


if __name__ == '__main__':
  logging.basicConfig(level=logging.INFO)
  SideInputTest().run()