#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
Combine load test driven by a Synthetic Source. In addition to the standard
input options, the following options are supported:
* fanout (optional) - number of Combine branches to run in parallel
  (defaults to 1),
* top_count - the argument passed to the Top combiner (required; the test
  exits when it is missing or is not an integer),
* project (optional) - the GCP project used when saving metrics in
  BigQuery (when running on the Dataflow runner it is required to specify
  the project of the runner),
* publish_to_big_query - whether metrics should be published to BigQuery,
* metrics_dataset (optional) - name of the BigQuery dataset where metrics
  will be stored,
* metrics_table (optional) - name of the BigQuery table where metrics
  will be stored,
* input_options - options for the Synthetic Source.

Example test run:

python -m apache_beam.testing.load_tests.combine_test \
    --test-pipeline-options="
    --project=big-query-project
    --region=...
    --publish_to_big_query=true
    --metrics_dataset=python_load_tests
    --metrics_table=combine
    --fanout=1
    --top_count=1000
    --input_options='{
    \"num_records\": 300,
    \"key_size\": 5,
    \"value_size\": 15
    }'"

or:

./gradlew -PloadTest.args="
    --publish_to_big_query=true
    --project=...
    --region=...
    --metrics_dataset=python_load_tests
    --metrics_table=combine
    --top_count=1000
    --fanout=1
    --input_options='{
      \"num_records\": 1,
      \"key_size\": 1,
      \"value_size\": 1}'
    --runner=DirectRunner" \
-PloadTest.mainClass=apache_beam.testing.load_tests.combine_test \
-Prunner=DirectRunner :sdks:python:apache_beam:testing:load_tests:run
"""

# pytype: skip-file

import logging
import sys

import apache_beam as beam
from apache_beam.testing.load_tests.load_test import LoadTest
from apache_beam.testing.load_tests.load_test_metrics_utils import AssignTimestamps
from apache_beam.testing.load_tests.load_test_metrics_utils import MeasureTime
from apache_beam.testing.synthetic_pipeline import StatefulLoadGenerator
from apache_beam.testing.synthetic_pipeline import SyntheticSource
from apache_beam.transforms.combiners import window


class CombineTest(LoadTest):
  """Load test that applies a global Top combine on `fanout` branches."""
  def __init__(self):
    """Reads the `fanout` and `top_count` options.

    `fanout` falls back to 1 when it is not provided. `top_count` is
    mandatory: when it is absent or cannot be converted to an int, an error
    is logged and the process exits with status 1.
    """
    super().__init__()
    self.fanout = self.get_option_or_default('fanout', 1)
    try:
      # int(None) raises TypeError, a non-numeric string raises ValueError.
      parsed_top_count = int(self.pipeline.get_option('top_count'))
    except (TypeError, ValueError):
      logging.error(
          'You should set "--top_count" option to use TOP combiners')
      sys.exit(1)
    self.top_count = parsed_top_count

  class _GetElement(beam.DoFn):
    """Pass-through DoFn that emits every element it receives."""
    def process(self, element):
      yield element

  def test(self):
    """Builds the pipeline: input, start timer, then `fanout` Top branches."""
    stateful = self.get_option_or_default('use_stateful_load_generator', False)
    if stateful:
      generated = (
          self.pipeline
          | 'LoadGenerator' >> StatefulLoadGenerator(self.input_options))
      stamped = generated | beam.ParDo(AssignTimestamps())
      records = stamped | beam.WindowInto(window.FixedWindows(20))
    else:
      synthetic = SyntheticSource(self.parse_synthetic_source_options())
      records = self.pipeline | 'Read synthetic' >> beam.io.Read(synthetic)

    timed = records | 'Measure time: Start' >> beam.ParDo(
        MeasureTime(self.metrics_namespace))

    # Every branch hangs off the same timed PCollection; the branch index is
    # part of each label so that the transform names stay unique.
    for branch in range(self.fanout):
      top_combine = beam.CombineGlobally(
          beam.combiners.TopCombineFn(self.top_count)).without_defaults()
      top_elements = timed | 'Combine with Top %i' % branch >> top_combine
      consumed = top_elements | 'Consume %i' % branch >> beam.ParDo(
          self._GetElement())
      # The resulting PCollection is not used further; applying the
      # transform to the pipeline is the point.
      _ = consumed | 'Measure time: End %i' % branch >> beam.ParDo(
          MeasureTime(self.metrics_namespace))


if __name__ == '__main__':
  logging.basicConfig(level=logging.INFO)
  load_test = CombineTest()
  load_test.run()