#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
Performance PubsubIO streaming test for Write/Read operations.

Caution: only test runners (e.g. TestDataflowRunner) support matchers

Example for TestDataflowRunner:

python -m apache_beam.io.gcp.pubsub_io_perf_test \
    --test-pipeline-options="
    --runner=TestDataflowRunner
    --sdk_location=.../dist/apache-beam-x.x.x.dev0.tar.gz
    --project=<GCP_PROJECT_ID>
    --temp_location=gs://<BUCKET_NAME>/tmp
    --staging_location=gs://<BUCKET_NAME>/staging
    --wait_until_finish_duration=<TIME_IN_MS>
    --pubsub_namespace_prefix=<PUBSUB_NAMESPACE_PREFIX>
    --publish_to_big_query=<OPTIONAL><true/false>
    --metrics_dataset=<OPTIONAL>
    --metrics_table=<OPTIONAL>
    --input_options='{
      \"num_records\": <SIZE_OF_INPUT>
      \"key_size\": 1
      \"value_size\": <SIZE_OF_EACH_MESSAGE>
    }'"
"""

# pytype: skip-file

import logging
import sys

from hamcrest import all_of

import apache_beam as beam
from apache_beam.io import Read
from apache_beam.io import ReadFromPubSub
from apache_beam.io.gcp.tests.pubsub_matcher import PubSubMessageMatcher
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.testing.load_tests.load_test import LoadTest
from apache_beam.testing.load_tests.load_test_metrics_utils import MeasureTime
from apache_beam.testing.synthetic_pipeline import SyntheticSource
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.transforms import trigger
from apache_beam.transforms import window

# pylint: disable=wrong-import-order, wrong-import-position
try:
  from google.cloud import pubsub
except ImportError:
  # Keeps the module importable without the GCP client library; any call on
  # `pubsub` will then fail because it is None.
  pubsub = None
# pylint: enable=wrong-import-order, wrong-import-position

WRITE_METRICS_NAMESPACE = 'pubsub_io_perf_write'
READ_METRICS_NAMESPACE = 'pubsub_io_perf_read'
# Passed to PubSubMessageMatcher as `timeout` / `pull_timeout`.
# NOTE(review): presumably expressed in seconds (15 min / 5 min) - confirm.
MATCHER_TIMEOUT = 60 * 15
MATCHER_PULL_TIMEOUT = 60 * 5


class PubsubIOPerfTest(LoadTest):
  """Shared setup for the Pub/Sub write and read performance tests.

  Validates the required pipeline options and derives the topic and
  subscription paths that the concrete tests create, use and delete.
  """
  def _setup_env(self):
    """Validates required options and computes the Pub/Sub namespace.

    Logs an error and exits the process with status 1 when either
    `pubsub_namespace_prefix` or `wait_until_finish_duration` is not set.
    Sets `self.num_of_messages` and `self.pubsub_namespace`.
    """
    if not self.pipeline.get_option('pubsub_namespace_prefix'):
      logging.error('--pubsub_namespace_prefix argument is required.')
      sys.exit(1)
    if not self.pipeline.get_option('wait_until_finish_duration'):
      logging.error('--wait_until_finish_duration argument is required.')
      sys.exit(1)

    self.num_of_messages = int(self.input_options.get('num_records'))
    pubsub_namespace_prefix = self.pipeline.get_option(
        'pubsub_namespace_prefix')
    # `unique_id` is a module global that is only assigned in the
    # `if __name__ == '__main__'` section at the bottom of this file, so
    # this method only works when the module is run as a script. Both tests
    # read the same value and therefore resolve the same resource names.
    self.pubsub_namespace = pubsub_namespace_prefix + unique_id

  def _setup_pubsub(self):
    """Creates the Pub/Sub clients and computes resource paths.

    Only names are computed here; subclasses create the actual topics and
    subscriptions in their overrides.
    """
    self.pub_client = pubsub.PublisherClient()
    self.topic_name = self.pub_client.topic_path(
        self.project_id, self.pubsub_namespace)

    self.matcher_topic_name = self.pub_client.topic_path(
        self.project_id, self.pubsub_namespace + '_matcher')

    self.sub_client = pubsub.SubscriberClient()
    self.read_sub_name = self.sub_client.subscription_path(
        self.project_id,
        self.pubsub_namespace + '_read',
    )
    self.read_matcher_sub_name = self.sub_client.subscription_path(
        self.project_id,
        self.pubsub_namespace + '_read_matcher',
    )


class PubsubWritePerfTest(PubsubIOPerfTest):
  """Measures writing synthetic records to a Pub/Sub topic."""
  def __init__(self):
    super().__init__(WRITE_METRICS_NAMESPACE)
    self._setup_env()
    self._setup_pubsub()
    self._setup_pipeline()

  def test(self):
    """Builds the pipeline: synthetic source -> PubsubMessage -> topic."""
    def to_pubsub_message(element):
      # Imports are local to the function.
      # NOTE(review): presumably so the names resolve when the function is
      # serialized and executed on remote workers - confirm.
      import uuid
      from apache_beam.io import PubsubMessage
      return PubsubMessage(
          data=element[1],
          attributes={'id': str(uuid.uuid1()).encode('utf-8')},
      )

    _ = (
        self.pipeline
        | 'Create input' >> Read(
            SyntheticSource(self.parse_synthetic_source_options()))
        | 'Format to pubsub message in bytes' >> beam.Map(to_pubsub_message)
        | 'Measure time' >> beam.ParDo(MeasureTime(self.metrics_namespace))
        | 'Write to Pubsub' >> beam.io.WriteToPubSub(
            self.topic_name,
            with_attributes=True,
            id_label='id',
        ))

  def _setup_pipeline(self):
    """Rebuilds the test pipeline from the current options, as streaming."""
    options = PipelineOptions(self.pipeline.get_full_options_as_args())
    options.view_as(StandardOptions).streaming = True
    self.pipeline = TestPipeline(options=options)

  def _setup_pubsub(self):
    """Creates the data topic and the subscription the read test consumes."""
    super()._setup_pubsub()
    _ = self.pub_client.create_topic(name=self.topic_name)

    _ = self.sub_client.create_subscription(
        name=self.read_sub_name,
        topic=self.topic_name,
    )


class PubsubReadPerfTest(PubsubIOPerfTest):
  """Measures reading from Pub/Sub and verifies the message count.

  The pipeline publishes the number of messages it counted to a separate
  "matcher" topic; a PubSubMessageMatcher then checks that the published
  count equals the expected `num_records`.
  """
  def __init__(self):
    super().__init__(READ_METRICS_NAMESPACE)
    self._setup_env()
    self._setup_pubsub()
    self._setup_pipeline()

  def test(self):
    """Builds the pipeline: read -> measure -> count -> publish count."""
    _ = (
        self.pipeline
        | 'Read from pubsub' >> ReadFromPubSub(
            subscription=self.read_sub_name,
            with_attributes=True,
            id_label='id',
        )
        # The payload is irrelevant for counting; every message is replaced
        # with a single zero byte.
        | beam.Map(lambda x: bytes(1)).with_output_types(bytes)
        | 'Measure time' >> beam.ParDo(MeasureTime(self.metrics_namespace))
        # Fire a pane each time `num_of_messages` elements have arrived.
        | 'Window' >> beam.WindowInto(
            window.GlobalWindows(),
            trigger=trigger.Repeatedly(
                trigger.AfterCount(self.num_of_messages)),
            accumulation_mode=trigger.AccumulationMode.DISCARDING)
        | 'Count messages' >> beam.CombineGlobally(
            beam.combiners.CountCombineFn()).without_defaults().
        with_output_types(int)
        | 'Convert to bytes' >>
        beam.Map(lambda count: str(count).encode('utf-8'))
        | 'Write to Pubsub' >> beam.io.WriteToPubSub(self.matcher_topic_name))

  def _setup_pubsub(self):
    """Creates the matcher topic and the subscription the matcher pulls."""
    super()._setup_pubsub()
    _ = self.pub_client.create_topic(name=self.matcher_topic_name)

    _ = self.sub_client.create_subscription(
        name=self.read_matcher_sub_name,
        topic=self.matcher_topic_name,
    )

  def _setup_pipeline(self):
    """Rebuilds the pipeline as streaming with the count matcher attached."""
    pubsub_msg_verifier = PubSubMessageMatcher(
        self.project_id,
        self.read_matcher_sub_name,
        expected_msg=[str(self.num_of_messages).encode('utf-8')],
        timeout=MATCHER_TIMEOUT,
        pull_timeout=MATCHER_PULL_TIMEOUT,
    )
    extra_opts = {
        'on_success_matcher': all_of(pubsub_msg_verifier),
        'streaming': True,
    }
    args = self.pipeline.get_full_options_as_args(**extra_opts)
    self.pipeline = TestPipeline(options=PipelineOptions(args))

  def cleanup(self):
    """Deletes the subscriptions and topics used by the write and read tests.

    Every deletion is attempted even if an earlier one fails, so a single
    failure does not leave the remaining resources behind. Each failure is
    logged, and the first one is re-raised once all deletions have been
    attempted.

    Raises:
      Exception: the first error raised by a deletion call, if any.
    """
    deletions = (
        (
            self.sub_client.delete_subscription,
            {'subscription': self.read_sub_name},
        ),
        (
            self.sub_client.delete_subscription,
            {'subscription': self.read_matcher_sub_name},
        ),
        (self.pub_client.delete_topic, {'topic': self.topic_name}),
        (self.pub_client.delete_topic, {'topic': self.matcher_topic_name}),
    )

    first_error = None
    for delete, kwargs in deletions:
      try:
        delete(**kwargs)
      except Exception as err:  # pylint: disable=broad-except
        # Keep going so the other resources still get a deletion attempt.
        logging.warning('Failed to delete Pub/Sub resource %s: %s', kwargs, err)
        if first_error is None:
          first_error = err

    if first_error is not None:
      raise first_error


if __name__ == '__main__':
  import uuid
  # Shared by both tests (read in PubsubIOPerfTest._setup_env) so that the
  # read test resolves the same topic/subscription names as the write test.
  unique_id = str(uuid.uuid4())

  logging.basicConfig(level=logging.INFO)
  PubsubWritePerfTest().run()
  PubsubReadPerfTest().run()