github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/gcp/pubsub_io_perf_test.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """
    19  Performance PubsubIO streaming test for Write/Read operations.
    20  
    21  Caution: only test runners (e.g. TestDataflowRunner) support matchers
    22  
    23  Example for TestDataflowRunner:
    24  
    25  python -m apache_beam.io.gcp.pubsub_io_perf_test \
    26      --test-pipeline-options="
    27      --runner=TestDataflowRunner
    28      --sdk_location=.../dist/apache-beam-x.x.x.dev0.tar.gz
    29      --project=<GCP_PROJECT_ID>
    30      --temp_location=gs://<BUCKET_NAME>/tmp
    31      --staging_location=gs://<BUCKET_NAME>/staging
    32      --wait_until_finish_duration=<TIME_IN_MS>
    33      --pubsub_namespace_prefix=<PUBSUB_NAMESPACE_PREFIX>
    34      --publish_to_big_query=<OPTIONAL><true/false>
    35      --metrics_dataset=<OPTIONAL>
    36      --metrics_table=<OPTIONAL>
    --input_options='{
      \"num_records\": <SIZE_OF_INPUT>,
      \"key_size\": 1,
      \"value_size\": <SIZE_OF_EACH_MESSAGE>
    41      }'"
    42  """
    43  
    44  # pytype: skip-file
    45  
    46  import logging
    47  import sys
    48  
    49  from hamcrest import all_of
    50  
    51  import apache_beam as beam
    52  from apache_beam.io import Read
    53  from apache_beam.io import ReadFromPubSub
    54  from apache_beam.io.gcp.tests.pubsub_matcher import PubSubMessageMatcher
    55  from apache_beam.options.pipeline_options import PipelineOptions
    56  from apache_beam.options.pipeline_options import StandardOptions
    57  from apache_beam.testing.load_tests.load_test import LoadTest
    58  from apache_beam.testing.load_tests.load_test_metrics_utils import MeasureTime
    59  from apache_beam.testing.synthetic_pipeline import SyntheticSource
    60  from apache_beam.testing.test_pipeline import TestPipeline
    61  from apache_beam.transforms import trigger
    62  from apache_beam.transforms import window
    63  
    64  # pylint: disable=wrong-import-order, wrong-import-position
    65  try:
    66    from google.cloud import pubsub
    67  except ImportError:
    68    pubsub = None
    69  # pylint: enable=wrong-import-order, wrong-import-position
    70  
    71  WRITE_METRICS_NAMESPACE = 'pubsub_io_perf_write'
    72  READ_METRICS_NAMESPACE = 'pubsub_io_perf_read'
    73  MATCHER_TIMEOUT = 60 * 15
    74  MATCHER_PULL_TIMEOUT = 60 * 5
    75  
    76  
    77  class PubsubIOPerfTest(LoadTest):
    78    def _setup_env(self):
    79      if not self.pipeline.get_option('pubsub_namespace_prefix'):
    80        logging.error('--pubsub_namespace_prefix argument is required.')
    81        sys.exit(1)
    82      if not self.pipeline.get_option('wait_until_finish_duration'):
    83        logging.error('--wait_until_finish_duration argument is required.')
    84        sys.exit(1)
    85  
    86      self.num_of_messages = int(self.input_options.get('num_records'))
    87      pubsub_namespace_prefix = self.pipeline.get_option(
    88          'pubsub_namespace_prefix')
    89      self.pubsub_namespace = pubsub_namespace_prefix + unique_id
    90  
    91    def _setup_pubsub(self):
    92      self.pub_client = pubsub.PublisherClient()
    93      self.topic_name = self.pub_client.topic_path(
    94          self.project_id, self.pubsub_namespace)
    95  
    96      self.matcher_topic_name = self.pub_client.topic_path(
    97          self.project_id, self.pubsub_namespace + '_matcher')
    98  
    99      self.sub_client = pubsub.SubscriberClient()
   100      self.read_sub_name = self.sub_client.subscription_path(
   101          self.project_id,
   102          self.pubsub_namespace + '_read',
   103      )
   104      self.read_matcher_sub_name = self.sub_client.subscription_path(
   105          self.project_id,
   106          self.pubsub_namespace + '_read_matcher',
   107      )
   108  
   109  
   110  class PubsubWritePerfTest(PubsubIOPerfTest):
   111    def __init__(self):
   112      super().__init__(WRITE_METRICS_NAMESPACE)
   113      self._setup_env()
   114      self._setup_pubsub()
   115      self._setup_pipeline()
   116  
   117    def test(self):
   118      def to_pubsub_message(element):
   119        import uuid
   120        from apache_beam.io import PubsubMessage
   121        return PubsubMessage(
   122            data=element[1],
   123            attributes={'id': str(uuid.uuid1()).encode('utf-8')},
   124        )
   125  
   126      _ = (
   127          self.pipeline
   128          | 'Create input' >> Read(
   129              SyntheticSource(self.parse_synthetic_source_options()))
   130          | 'Format to pubsub message in bytes' >> beam.Map(to_pubsub_message)
   131          | 'Measure time' >> beam.ParDo(MeasureTime(self.metrics_namespace))
   132          | 'Write to Pubsub' >> beam.io.WriteToPubSub(
   133              self.topic_name,
   134              with_attributes=True,
   135              id_label='id',
   136          ))
   137  
   138    def _setup_pipeline(self):
   139      options = PipelineOptions(self.pipeline.get_full_options_as_args())
   140      options.view_as(StandardOptions).streaming = True
   141      self.pipeline = TestPipeline(options=options)
   142  
   143    def _setup_pubsub(self):
   144      super()._setup_pubsub()
   145      _ = self.pub_client.create_topic(name=self.topic_name)
   146  
   147      _ = self.sub_client.create_subscription(
   148          name=self.read_sub_name,
   149          topic=self.topic_name,
   150      )
   151  
   152  
   153  class PubsubReadPerfTest(PubsubIOPerfTest):
   154    def __init__(self):
   155      super().__init__(READ_METRICS_NAMESPACE)
   156      self._setup_env()
   157      self._setup_pubsub()
   158      self._setup_pipeline()
   159  
   160    def test(self):
   161      _ = (
   162          self.pipeline
   163          | 'Read from pubsub' >> ReadFromPubSub(
   164              subscription=self.read_sub_name,
   165              with_attributes=True,
   166              id_label='id',
   167          )
   168          | beam.Map(lambda x: bytes(1)).with_output_types(bytes)
   169          | 'Measure time' >> beam.ParDo(MeasureTime(self.metrics_namespace))
   170          | 'Window' >> beam.WindowInto(
   171              window.GlobalWindows(),
   172              trigger=trigger.Repeatedly(
   173                  trigger.AfterCount(self.num_of_messages)),
   174              accumulation_mode=trigger.AccumulationMode.DISCARDING)
   175          | 'Count messages' >> beam.CombineGlobally(
   176              beam.combiners.CountCombineFn()).without_defaults().
   177          with_output_types(int)
   178          | 'Convert to bytes' >>
   179          beam.Map(lambda count: str(count).encode('utf-8'))
   180          | 'Write to Pubsub' >> beam.io.WriteToPubSub(self.matcher_topic_name))
   181  
   182    def _setup_pubsub(self):
   183      super()._setup_pubsub()
   184      _ = self.pub_client.create_topic(name=self.matcher_topic_name)
   185  
   186      _ = self.sub_client.create_subscription(
   187          name=self.read_matcher_sub_name,
   188          topic=self.matcher_topic_name,
   189      )
   190  
   191    def _setup_pipeline(self):
   192      pubsub_msg_verifier = PubSubMessageMatcher(
   193          self.project_id,
   194          self.read_matcher_sub_name,
   195          expected_msg=[str(self.num_of_messages).encode('utf-8')],
   196          timeout=MATCHER_TIMEOUT,
   197          pull_timeout=MATCHER_PULL_TIMEOUT,
   198      )
   199      extra_opts = {
   200          'on_success_matcher': all_of(pubsub_msg_verifier),
   201          'streaming': True,
   202      }
   203      args = self.pipeline.get_full_options_as_args(**extra_opts)
   204      self.pipeline = TestPipeline(options=PipelineOptions(args))
   205  
   206    def cleanup(self):
   207      self.sub_client.delete_subscription(subscription=self.read_sub_name)
   208      self.sub_client.delete_subscription(subscription=self.read_matcher_sub_name)
   209      self.pub_client.delete_topic(topic=self.topic_name)
   210      self.pub_client.delete_topic(topic=self.matcher_topic_name)
   211  
   212  
   213  if __name__ == '__main__':
   214    import uuid
   215    unique_id = str(uuid.uuid4())
   216  
   217    logging.basicConfig(level=logging.INFO)
   218    PubsubWritePerfTest().run()
   219    PubsubReadPerfTest().run()