github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/gcp/pubsub_integration_test.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """
    19  Integration test for Google Cloud Pub/Sub.
    20  """
    21  # pytype: skip-file
    22  
    23  import logging
    24  import time
    25  import unittest
    26  import uuid
    27  
    28  import pytest
    29  from hamcrest.core.core.allof import all_of
    30  
    31  from apache_beam.io.gcp import pubsub_it_pipeline
    32  from apache_beam.io.gcp.pubsub import PubsubMessage
    33  from apache_beam.io.gcp.tests.pubsub_matcher import PubSubMessageMatcher
    34  from apache_beam.runners.runner import PipelineState
    35  from apache_beam.testing import test_utils
    36  from apache_beam.testing.pipeline_verifiers import PipelineStateMatcher
    37  from apache_beam.testing.test_pipeline import TestPipeline
    38  
    39  INPUT_TOPIC = 'psit_topic_input'
    40  OUTPUT_TOPIC = 'psit_topic_output'
    41  INPUT_SUB = 'psit_subscription_input'
    42  OUTPUT_SUB = 'psit_subscription_output'
    43  
    44  # How long TestXXXRunner will wait for pubsub_it_pipeline to run before
    45  # cancelling it.
    46  TEST_PIPELINE_DURATION_MS = 8 * 60 * 1000
    47  # How long PubSubMessageMatcher will wait for the correct set of messages to
    48  # appear.
    49  MESSAGE_MATCHER_TIMEOUT_S = 5 * 60
    50  
    51  
    52  class PubSubIntegrationTest(unittest.TestCase):
    53  
    54    ID_LABEL = 'id'
    55    TIMESTAMP_ATTRIBUTE = 'timestamp'
    56    INPUT_MESSAGES = {
    57        # TODO(https://github.com/apache/beam/issues/18939): DirectRunner doesn't
    58        # support reading or writing label_ids, nor writing timestamp attributes.
    59        # Once these features exist, TestDirectRunner and TestDataflowRunner
    60        # should behave identically.
    61        'TestDirectRunner': [
    62            PubsubMessage(b'data001', {}),
    63            # For those elements that have the TIMESTAMP_ATTRIBUTE attribute, the
    64            # IT pipeline writes back the timestamp of each element (as reported
    65            # by Beam), as a TIMESTAMP_ATTRIBUTE + '_out' attribute.
    66            PubsubMessage(
    67                b'data002', {
    68                    TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
    69                }),
    70            PubsubMessage(b'data003\xab\xac', {}),
    71            PubsubMessage(
    72                b'data004\xab\xac', {
    73                    TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
    74                })
    75        ],
    76        'TestDataflowRunner': [
    77            # Use ID_LABEL attribute to deduplicate messages with the same ID.
    78            PubsubMessage(b'data001', {ID_LABEL: 'foo'}),
    79            PubsubMessage(b'data001', {ID_LABEL: 'foo'}),
    80            PubsubMessage(b'data001', {ID_LABEL: 'foo'}),
    81            # For those elements that have the TIMESTAMP_ATTRIBUTE attribute, the
    82            # IT pipeline writes back the timestamp of each element (as reported
    83            # by Beam), as a TIMESTAMP_ATTRIBUTE + '_out' attribute.
    84            PubsubMessage(
    85                b'data002', {
    86                    TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
    87                }),
    88            PubsubMessage(b'data003\xab\xac', {ID_LABEL: 'foo2'}),
    89            PubsubMessage(b'data003\xab\xac', {ID_LABEL: 'foo2'}),
    90            PubsubMessage(b'data003\xab\xac', {ID_LABEL: 'foo2'}),
    91            PubsubMessage(
    92                b'data004\xab\xac', {
    93                    TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
    94                })
    95        ],
    96    }
    97    EXPECTED_OUTPUT_MESSAGES = {
    98        'TestDirectRunner': [
    99            PubsubMessage(b'data001-seen', {'processed': 'IT'}),
   100            PubsubMessage(
   101                b'data002-seen',
   102                {
   103                    TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
   104                    TIMESTAMP_ATTRIBUTE + '_out': '2018-07-11T02:02:50.149000Z',
   105                    'processed': 'IT',
   106                }),
   107            PubsubMessage(b'data003\xab\xac-seen', {'processed': 'IT'}),
   108            PubsubMessage(
   109                b'data004\xab\xac-seen',
   110                {
   111                    TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
   112                    TIMESTAMP_ATTRIBUTE + '_out': '2018-07-11T02:02:50.149000Z',
   113                    'processed': 'IT',
   114                })
   115        ],
   116        'TestDataflowRunner': [
   117            PubsubMessage(b'data001-seen', {'processed': 'IT'}),
   118            PubsubMessage(
   119                b'data002-seen',
   120                {
   121                    TIMESTAMP_ATTRIBUTE + '_out': '2018-07-11T02:02:50.149000Z',
   122                    'processed': 'IT',
   123                }),
   124            PubsubMessage(b'data003\xab\xac-seen', {'processed': 'IT'}),
   125            PubsubMessage(
   126                b'data004\xab\xac-seen',
   127                {
   128                    TIMESTAMP_ATTRIBUTE + '_out': '2018-07-11T02:02:50.149000Z',
   129                    'processed': 'IT',
   130                })
   131        ],
   132    }
   133  
   134    def setUp(self):
   135      self.test_pipeline = TestPipeline(is_integration_test=True)
   136      self.runner_name = type(self.test_pipeline.runner).__name__
   137      self.project = self.test_pipeline.get_option('project')
   138      self.uuid = str(uuid.uuid4())
   139  
   140      # Set up PubSub environment.
   141      from google.cloud import pubsub
   142      self.pub_client = pubsub.PublisherClient()
   143      self.input_topic = self.pub_client.create_topic(
   144          name=self.pub_client.topic_path(self.project, INPUT_TOPIC + self.uuid))
   145      self.output_topic = self.pub_client.create_topic(
   146          name=self.pub_client.topic_path(self.project, OUTPUT_TOPIC + self.uuid))
   147  
   148      self.sub_client = pubsub.SubscriberClient()
   149      self.input_sub = self.sub_client.create_subscription(
   150          name=self.sub_client.subscription_path(
   151              self.project, INPUT_SUB + self.uuid),
   152          topic=self.input_topic.name)
   153      self.output_sub = self.sub_client.create_subscription(
   154          name=self.sub_client.subscription_path(
   155              self.project, OUTPUT_SUB + self.uuid),
   156          topic=self.output_topic.name)
   157      # Add a 30 second sleep after resource creation to ensure subscriptions will
   158      # receive messages.
   159      time.sleep(30)
   160  
   161    def tearDown(self):
   162      test_utils.cleanup_subscriptions(
   163          self.sub_client, [self.input_sub, self.output_sub])
   164      test_utils.cleanup_topics(
   165          self.pub_client, [self.input_topic, self.output_topic])
   166  
   167    def _test_streaming(self, with_attributes):
   168      """Runs IT pipeline with message verifier.
   169  
   170      Args:
   171        with_attributes: False - Reads and writes message data only.
   172          True - Reads and writes message data and attributes. Also verifies
   173          id_label and timestamp_attribute features.
   174      """
   175      # Set on_success_matcher to verify pipeline state and pubsub output. These
   176      # verifications run on a (remote) worker.
   177  
   178      # Expect the state to be RUNNING since a streaming pipeline is usually
   179      # never DONE. The test runner will cancel the pipeline after verification.
   180      state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
   181      expected_messages = self.EXPECTED_OUTPUT_MESSAGES[self.runner_name]
   182      if not with_attributes:
   183        expected_messages = [pubsub_msg.data for pubsub_msg in expected_messages]
   184      if self.runner_name == 'TestDirectRunner':
   185        strip_attributes = None
   186      else:
   187        strip_attributes = [self.ID_LABEL, self.TIMESTAMP_ATTRIBUTE]
   188      pubsub_msg_verifier = PubSubMessageMatcher(
   189          self.project,
   190          self.output_sub.name,
   191          expected_messages,
   192          timeout=MESSAGE_MATCHER_TIMEOUT_S,
   193          with_attributes=with_attributes,
   194          strip_attributes=strip_attributes)
   195      extra_opts = {
   196          'input_subscription': self.input_sub.name,
   197          'output_topic': self.output_topic.name,
   198          'wait_until_finish_duration': TEST_PIPELINE_DURATION_MS,
   199          'on_success_matcher': all_of(state_verifier, pubsub_msg_verifier)
   200      }
   201  
   202      # Generate input data and inject to PubSub.
   203      for msg in self.INPUT_MESSAGES[self.runner_name]:
   204        self.pub_client.publish(
   205            self.input_topic.name, msg.data, **msg.attributes).result()
   206  
   207      # Get pipeline options from command argument: --test-pipeline-options,
   208      # and start pipeline job by calling pipeline main function.
   209      pubsub_it_pipeline.run_pipeline(
   210          argv=self.test_pipeline.get_full_options_as_args(**extra_opts),
   211          with_attributes=with_attributes,
   212          id_label=self.ID_LABEL,
   213          timestamp_attribute=self.TIMESTAMP_ATTRIBUTE)
   214  
   215    @pytest.mark.it_postcommit
   216    def test_streaming_data_only(self):
   217      self._test_streaming(with_attributes=False)
   218  
   219    @pytest.mark.it_postcommit
   220    def test_streaming_with_attributes(self):
   221      self._test_streaming(with_attributes=True)
   222  
   223  
   224  if __name__ == '__main__':
   225    logging.getLogger().setLevel(logging.DEBUG)
   226    unittest.main()