github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/testing/load_tests/pardo_test.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
This is a ParDo load test with Synthetic Source. In addition to the standard
input options, the following options are available:
* iterations - number of consecutive ParDo transforms to be performed,
* number_of_counters - number of counter metrics to be created for one ParDo
transform,
* number_of_counter_operations - number of operations on counters to be
performed in one ParDo,
* project (optional) - the GCP project used when saving metrics to BigQuery
(required when running on the Dataflow runner),
* publish_to_big_query - whether metrics should be published to BigQuery,
* metrics_dataset (optional) - name of the BigQuery dataset where metrics
will be stored,
* metrics_table (optional) - name of the BigQuery table where metrics
will be stored,
* input_options - options for the Synthetic Source,
* stateful - when true, use a stateful DoFn,
* state_cache - when true, enable the Python state cache.

Example test run:

python -m apache_beam.testing.load_tests.pardo_test \
    --test-pipeline-options="
    --iterations=1
    --number_of_counters=1
    --number_of_counter_operations=1
    --project=big-query-project
    --region=...
    --publish_to_big_query=true
    --metrics_dataset=python_load_tests
    --metrics_table=pardo
    --input_options='{
    \"num_records\": 300,
    \"key_size\": 5,
    \"value_size\": 15
    }'"

or:

./gradlew -PloadTest.args="
    --publish_to_big_query=true
    --project=...
    --region=...
    --metrics_dataset=python_load_tests
    --metrics_table=pardo
    --input_options='{
      \"num_records\": 1,
      \"key_size\": 1,
      \"value_size\": 1}'
    --runner=DirectRunner" \
-PloadTest.mainClass=apache_beam.testing.load_tests.pardo_test \
-Prunner=DirectRunner :sdks:python:apache_beam:testing:load_tests:run
"""

# pytype: skip-file

import logging

import apache_beam as beam
from apache_beam.metrics import Metrics
from apache_beam.options.pipeline_options import DebugOptions
from apache_beam.testing.load_tests.load_test import LoadTest
from apache_beam.testing.load_tests.load_test_metrics_utils import AssignTimestamps
from apache_beam.testing.load_tests.load_test_metrics_utils import MeasureLatency
from apache_beam.testing.load_tests.load_test_metrics_utils import MeasureTime
from apache_beam.testing.synthetic_pipeline import StatefulLoadGenerator
from apache_beam.testing.synthetic_pipeline import SyntheticSource
from apache_beam.transforms import userstate


class ParDoTest(LoadTest):
  def __init__(self):
    super().__init__()
    self.iterations = self.get_option_or_default('iterations')
    self.number_of_counters = self.get_option_or_default(
        'number_of_counters', 1)
    self.number_of_operations = self.get_option_or_default(
        'number_of_counter_operations', 1)
    self.stateful = self.get_option_or_default('stateful', False)
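    # When 'state_cache' is set, enable the Python SDK harness state cache by
    # adding the 'state_cache_size=1000' experiment (see the module docstring).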
    if self.get_option_or_default('state_cache', False):
      self.pipeline.options.view_as(DebugOptions).add_experiment(
          'state_cache_size=1000')

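  # Builds the measured pipeline: elements come either from SyntheticSource or
  # from StatefulLoadGenerator, then flow through 'iterations' chained ParDo
  # steps, each incrementing 'number_of_counters' counters
  # 'number_of_operations' times per element.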
  def test(self):
    class BaseCounterOperation(beam.DoFn):
      def __init__(self, number_of_counters, number_of_operations):
        self.number_of_operations = number_of_operations
        self.counters = []
        for i in range(number_of_counters):
          self.counters.append(
              Metrics.counter('do-not-publish', 'name-{}'.format(i)))

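    # StatefulCounterOperation attaches a CombiningValueStateSpec by passing a
    # StateParam as the default value of process()'s 'state' argument; the
    # class body reads self.stateful from the enclosing test() scope, so the
    # state parameter is only created when the 'stateful' option is set.
    # Stateful DoFns need keyed input, which the StatefulLoadGenerator branch
    # below provides.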
    class StatefulCounterOperation(BaseCounterOperation):
      state_param = beam.DoFn.StateParam(
          userstate.CombiningValueStateSpec(
              'count',
              beam.coders.IterableCoder(beam.coders.VarIntCoder()),
              sum)) if self.stateful else None

      def process(self, element, state=state_param):
        for _ in range(self.number_of_operations):
          for counter in self.counters:
            counter.inc()
          if state:
            state.add(1)
        yield element

    class CounterOperation(BaseCounterOperation):
      def process(self, element):
        for _ in range(self.number_of_operations):
          for counter in self.counters:
            counter.inc()
        yield element

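    # Two pipeline shapes: with 'use_stateful_load_generator' the input comes
    # from StatefulLoadGenerator, the (optionally) stateful DoFn is used, and
    # per-element latency is measured via AssignTimestamps/MeasureLatency.
    # Otherwise elements are read from SyntheticSource and only the
    # MeasureTime start/end metrics are collected.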
    if self.get_option_or_default('use_stateful_load_generator', False):
      pc = (
          self.pipeline
          | 'LoadGenerator' >> StatefulLoadGenerator(self.input_options)
          | 'Measure time: Start' >> beam.ParDo(
              MeasureTime(self.metrics_namespace))
          | 'Assign timestamps' >> beam.ParDo(AssignTimestamps()))

      for i in range(self.iterations):
        pc |= 'Step: %d' % i >> beam.ParDo(
            StatefulCounterOperation(
                self.number_of_counters, self.number_of_operations))

      pc |= 'Measure latency' >> beam.ParDo(
          MeasureLatency(self.metrics_namespace))
    else:
      pc = (
          self.pipeline
          | 'Read synthetic' >> beam.io.Read(
              SyntheticSource(self.parse_synthetic_source_options()))
          | 'Measure time: Start' >> beam.ParDo(
              MeasureTime(self.metrics_namespace)))

      for i in range(self.iterations):
        pc |= 'Step: %d' % i >> beam.ParDo(
            CounterOperation(
                self.number_of_counters, self.number_of_operations))

    # pylint: disable=expression-not-assigned
    pc | 'Measure time: End' >> beam.ParDo(MeasureTime(self.metrics_namespace))


if __name__ == '__main__':
  logging.basicConfig(level=logging.INFO)
  ParDoTest().run()