github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/testing/load_tests/combine_test.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """
This is a Combine load test with Synthetic Source. Besides the standard
input options, there are additional options:
* fanout (optional) - number of Combine operations to run in parallel,
* top_count - the argument passed to the Top combiner,
    23  * project (optional) - the gcp project in case of saving
    24  metrics in Big Query (in case of Dataflow Runner
    25  it is required to specify project of runner),
    26  * publish_to_big_query - if metrics should be published in big query,
* metrics_dataset (optional) - name of BigQuery dataset where metrics
will be stored,
    29  * metrics_table (optional) - name of BigQuery table where metrics
    30  will be stored,
    31  * input_options - options for Synthetic Sources.
    32  
    33  Example test run:
    34  
    35  python -m apache_beam.testing.load_tests.combine_test \
    36      --test-pipeline-options="
    37      --project=big-query-project
    38      --region=...
    39      --publish_to_big_query=true
    40      --metrics_dataset=python_load_tests
    41      --metrics_table=combine
    42      --fanout=1
    43      --top_count=1000
    44      --input_options='{
    45      \"num_records\": 300,
    46      \"key_size\": 5,
    47      \"value_size\": 15
    48      }'"
    49  
    50  or:
    51  
    52  ./gradlew -PloadTest.args="
    53      --publish_to_big_query=true
    54      --project=...
    55      --region=...
    56      --metrics_dataset=python_load_tests
    57      --metrics_table=combine
    58      --top_count=1000
    59      --fanout=1
    60      --input_options='{
    61        \"num_records\": 1,
    62        \"key_size\": 1,
    63        \"value_size\": 1}'
    64      --runner=DirectRunner" \
    65  -PloadTest.mainClass=apache_beam.testing.load_tests.combine_test \
    66  -Prunner=DirectRunner :sdks:python:apache_beam:testing:load_tests:run
    67  """
    68  
    69  # pytype: skip-file
    70  
    71  import logging
    72  import sys
    73  
    74  import apache_beam as beam
    75  from apache_beam.testing.load_tests.load_test import LoadTest
    76  from apache_beam.testing.load_tests.load_test_metrics_utils import AssignTimestamps
    77  from apache_beam.testing.load_tests.load_test_metrics_utils import MeasureTime
    78  from apache_beam.testing.synthetic_pipeline import StatefulLoadGenerator
    79  from apache_beam.testing.synthetic_pipeline import SyntheticSource
    80  from apache_beam.transforms.combiners import window
    81  
    82  
    83  class CombineTest(LoadTest):
    84    def __init__(self):
    85      super().__init__()
    86      self.fanout = self.get_option_or_default('fanout', 1)
    87      try:
    88        self.top_count = int(self.pipeline.get_option('top_count'))
    89      except (TypeError, ValueError):
    90        logging.error(
    91            'You should set \"--top_count\" option to use TOP '
    92            'combiners')
    93        sys.exit(1)
    94  
    95    class _GetElement(beam.DoFn):
    96      def process(self, element):
    97        yield element
    98  
    99    def test(self):
   100      if self.get_option_or_default('use_stateful_load_generator', False):
   101        source = (
   102            self.pipeline
   103            | 'LoadGenerator' >> StatefulLoadGenerator(self.input_options)
   104            | beam.ParDo(AssignTimestamps())
   105            | beam.WindowInto(window.FixedWindows(20)))
   106      else:
   107        source = (
   108            self.pipeline
   109            | 'Read synthetic' >> beam.io.Read(
   110                SyntheticSource(self.parse_synthetic_source_options())))
   111  
   112      pc = (
   113          source
   114          | 'Measure time: Start' >> beam.ParDo(
   115              MeasureTime(self.metrics_namespace)))
   116  
   117      for branch in range(self.fanout):
   118        (  # pylint: disable=expression-not-assigned
   119            pc
   120            | 'Combine with Top %i' % branch >> beam.CombineGlobally(
   121                beam.combiners.TopCombineFn(self.top_count)).without_defaults()
   122            | 'Consume %i' % branch >> beam.ParDo(self._GetElement())
   123            | 'Measure time: End %i' % branch >> beam.ParDo(
   124                MeasureTime(self.metrics_namespace)))
   125  
   126  
   127  if __name__ == '__main__':
   128    logging.basicConfig(level=logging.INFO)
   129    CombineTest().run()