github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/testing/load_tests/sideinput_test.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """
    19  Load test for operations involving side inputs.
    20  
    21  The purpose of this test is to measure the cost of materialization and
    22  accessing side inputs. The test uses synthetic source which can be
    23  parametrized to generate records with various sizes of keys and values,
    24  impose delays in the pipeline and simulate other performance challenges.
    25  
    26  This test can accept the following parameters:
    27    * side_input_type (str) - Required. Specifies how the side input will be
    28      materialized in ParDo operation. Choose from (dict, iter, list).
    29    * window_count (int) - The number of fixed-size windows to subdivide the
    30      side input into. By default, a global window will be used.
    31    * access_percentage (int) - Specifies the percentage of elements in the side
    32      input to be accessed. By default, all elements will be accessed.
    33  
    34  Example test run:
    35  
    36  python -m apache_beam.testing.load_tests.sideinput_test \
    37      --test-pipeline-options="
    38      --side_input_type=iter
    39      --input_options='{
    40      \"num_records\": 300,
    41      \"key_size\": 5,
    42      \"value_size\": 15
    43      }'"
    44  
    45  or:
    46  
    47  ./gradlew -PloadTest.args="
    48      --side_input_type=iter
    49      --input_options='{
    50        \"num_records\": 300,
    51        \"key_size\": 5,
    52        \"value_size\": 15}'" \
    53  -PloadTest.mainClass=apache_beam.testing.load_tests.sideinput_test \
    54  -Prunner=DirectRunner :sdks:python:apache_beam:testing:load_tests:run
    55  """
    56  
    57  # pytype: skip-file
    58  
    59  import logging
    60  from typing import Any
    61  from typing import Dict
    62  from typing import Iterable
    63  from typing import Tuple
    64  from typing import Union
    65  
    66  import apache_beam as beam
    67  from apache_beam.testing.load_tests.load_test import LoadTest
    68  from apache_beam.testing.load_tests.load_test_metrics_utils import MeasureTime
    69  from apache_beam.testing.synthetic_pipeline import SyntheticSDFAsSource
    70  from apache_beam.transforms import window
    71  
    72  
    73  class SideInputTest(LoadTest):
    74    SIDE_INPUT_TYPES = {
    75        'iter': beam.pvalue.AsIter,
    76        'list': beam.pvalue.AsList,
    77        'dict': beam.pvalue.AsDict,
    78    }
    79    SDF_INITIAL_ELEMENTS = 1000
    80  
    81    def __init__(self):
    82      super().__init__()
    83      self.windows = self.get_option_or_default('window_count', default=1)
    84  
    85      self.access_percentage = self.get_option_or_default(
    86          'access_percentage', default=100)
    87      if self.access_percentage < 0 or self.access_percentage > 100:
    88        raise ValueError(
    89            'access_percentage: Invalid value. Should be in range '
    90            'from 0 to 100, got {} instead'.format(self.access_percentage))
    91  
    92      self.elements_per_window = self.input_options['num_records'] // self.windows
    93  
    94      self.side_input_type = self.pipeline.get_option('side_input_type')
    95      if self.side_input_type is None:
    96        raise ValueError(
    97            'side_input_type is required. Please provide one of '
    98            'these: {}'.format(list(self.SIDE_INPUT_TYPES.keys())))
    99  
   100    def materialize_as(self):
   101      try:
   102        return self.SIDE_INPUT_TYPES[self.side_input_type]
   103      except KeyError:
   104        raise ValueError(
   105            'Unknown side input type. Please provide one of '
   106            'these: {}'.format(list(self.SIDE_INPUT_TYPES.keys())))
   107  
   108    def test(self):
   109      class SequenceSideInputTestDoFn(beam.DoFn):
   110        """Iterate over first n side_input elements."""
   111        def __init__(self, first_n: int):
   112          self._first_n = first_n
   113  
   114        def process(  # type: ignore[override]
   115            self, element: Any, side_input: Iterable[Tuple[bytes,
   116                                                           bytes]]) -> None:
   117          i = 0
   118          it = iter(side_input)
   119          while i < self._first_n:
   120            i += 1
   121            try:
   122              # No-op. We only make sure that the element is accessed.
   123              next(it)
   124            except StopIteration:
   125              break
   126  
   127      class MappingSideInputTestDoFn(beam.DoFn):
   128        """Iterates over first n keys in the dictionary and checks the value."""
   129        def __init__(self, first_n: int):
   130          self._first_n = first_n
   131  
   132        def process(  # type: ignore[override]
   133            self, element: Any, dict_side_input: Dict[bytes, bytes]) -> None:
   134          i = 0
   135          for key in dict_side_input:
   136            if i == self._first_n:
   137              break
   138            # No-op. We only make sure that the element is accessed.
   139            dict_side_input[key]
   140            i += 1
   141  
   142      class AssignTimestamps(beam.DoFn):
   143        """Produces timestamped values. Timestamps are equal to the value of the
   144        element."""
   145        def __init__(self):
   146          # Avoid having to use save_main_session
   147          self.window = window
   148  
   149        def process(self, element: int) -> Iterable[window.TimestampedValue]:  # type: ignore[override]
   150          yield self.window.TimestampedValue(element, element)
   151  
   152      class GetSyntheticSDFOptions(beam.DoFn):
   153        def __init__(
   154            self, elements_per_record: int, key_size: int, value_size: int):
   155          self.elements_per_record = elements_per_record
   156          self.key_size = key_size
   157          self.value_size = value_size
   158  
   159        def process(self, element: Any) -> Iterable[Dict[str, Union[int, str]]]:  # type: ignore[override]
   160          yield {
   161              'num_records': self.elements_per_record,
   162              'key_size': self.key_size,
   163              'value_size': self.value_size,
   164              'initial_splitting_num_bundles': 0,
   165              'initial_splitting_desired_bundle_size': 0,
   166              'sleep_per_input_record_sec': 0,
   167              'initial_splitting': 'const'
   168          }
   169  
   170      main_input = self.pipeline | 'Create' >> beam.Create(range(self.windows))
   171  
   172      initial_elements = self.SDF_INITIAL_ELEMENTS
   173      if self.windows > 1:
   174        main_input = (
   175            main_input
   176            | 'Assign timestamps' >> beam.ParDo(AssignTimestamps())
   177            | 'Apply windows' >> beam.WindowInto(window.FixedWindows(1)))
   178        side_input = main_input
   179        initial_elements = self.windows
   180      else:
   181        side_input = self.pipeline | 'Side input: create' >> beam.Create(
   182            range(initial_elements))
   183  
   184      side_input = (
   185          side_input
   186          | 'Get synthetic SDF options' >> beam.ParDo(
   187              GetSyntheticSDFOptions(
   188                  self.input_options['num_records'] // initial_elements,
   189                  self.input_options['key_size'],
   190                  self.input_options['value_size']))
   191          | 'Generate input' >> beam.ParDo(SyntheticSDFAsSource()))
   192      main_input |= 'Collect start time metrics' >> beam.ParDo(
   193          MeasureTime(self.metrics_namespace))
   194  
   195      side_input_type = self.materialize_as()
   196      elements_to_access = self.elements_per_window * \
   197                           self.access_percentage // 100
   198      logging.info(
   199          '%s out of %s total elements in each window will be accessed.',
   200          elements_to_access,
   201          self.elements_per_window)
   202      if side_input_type is beam.pvalue.AsDict:
   203        dofn = MappingSideInputTestDoFn(elements_to_access)
   204      else:
   205        dofn = SequenceSideInputTestDoFn(elements_to_access)
   206  
   207      _ = (
   208          main_input
   209          | beam.ParDo(dofn, side_input_type(side_input))
   210          | 'Collect end time metrics' >> beam.ParDo(
   211              MeasureTime(self.metrics_namespace)))
   212  
   213  
   214  if __name__ == '__main__':
   215    logging.basicConfig(level=logging.INFO)
   216    SideInputTest().run()