#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Integration test for Google Cloud Pub/Sub."""
# pytype: skip-file

import logging
import time
import unittest
import uuid

import pytest
from hamcrest.core.core.allof import all_of

from apache_beam.io.gcp import pubsub_it_pipeline
from apache_beam.io.gcp.pubsub import PubsubMessage
from apache_beam.io.gcp.tests.pubsub_matcher import PubSubMessageMatcher
from apache_beam.runners.runner import PipelineState
from apache_beam.testing import test_utils
from apache_beam.testing.pipeline_verifiers import PipelineStateMatcher
from apache_beam.testing.test_pipeline import TestPipeline

# Name prefixes for the Pub/Sub resources created by the test; a per-test
# UUID is appended to each prefix when the resource is created.
INPUT_TOPIC = 'psit_topic_input'
OUTPUT_TOPIC = 'psit_topic_output'
INPUT_SUB = 'psit_subscription_input'
OUTPUT_SUB = 'psit_subscription_output'

# Upper bound (in milliseconds) on how long TestXXXRunner lets
# pubsub_it_pipeline run before cancelling it.
TEST_PIPELINE_DURATION_MS = 8 * 60 * 1000
# Upper bound (in seconds) on how long PubSubMessageMatcher waits for the
# expected set of messages to appear.
MESSAGE_MATCHER_TIMEOUT_S = 5 * 60


class PubSubIntegrationTest(unittest.TestCase):
  """Streaming integration test for reading from and writing to Pub/Sub.

  Each test creates uniquely named input/output topics and subscriptions,
  publishes INPUT_MESSAGES to the input topic, runs pubsub_it_pipeline against
  them, and verifies the messages that arrive on the output subscription
  against EXPECTED_OUTPUT_MESSAGES for the active runner.
  """

  ID_LABEL = 'id'
  TIMESTAMP_ATTRIBUTE = 'timestamp'
  INPUT_MESSAGES = {
      # TODO(https://github.com/apache/beam/issues/18939): DirectRunner doesn't
      # support reading or writing label_ids, nor writing timestamp attributes.
      # Once these features exist, TestDirectRunner and TestDataflowRunner
      # should behave identically.
      'TestDirectRunner': [
          PubsubMessage(b'data001', {}),
          # For those elements that have the TIMESTAMP_ATTRIBUTE attribute, the
          # IT pipeline writes back the timestamp of each element (as reported
          # by Beam), as a TIMESTAMP_ATTRIBUTE + '_out' attribute.
          PubsubMessage(
              b'data002', {
                  TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
              }),
          PubsubMessage(b'data003\xab\xac', {}),
          PubsubMessage(
              b'data004\xab\xac', {
                  TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
              })
      ],
      'TestDataflowRunner': [
          # Use ID_LABEL attribute to deduplicate messages with the same ID.
          PubsubMessage(b'data001', {ID_LABEL: 'foo'}),
          PubsubMessage(b'data001', {ID_LABEL: 'foo'}),
          PubsubMessage(b'data001', {ID_LABEL: 'foo'}),
          # For those elements that have the TIMESTAMP_ATTRIBUTE attribute, the
          # IT pipeline writes back the timestamp of each element (as reported
          # by Beam), as a TIMESTAMP_ATTRIBUTE + '_out' attribute.
          PubsubMessage(
              b'data002', {
                  TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
              }),
          PubsubMessage(b'data003\xab\xac', {ID_LABEL: 'foo2'}),
          PubsubMessage(b'data003\xab\xac', {ID_LABEL: 'foo2'}),
          PubsubMessage(b'data003\xab\xac', {ID_LABEL: 'foo2'}),
          PubsubMessage(
              b'data004\xab\xac', {
                  TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
              })
      ],
  }
  EXPECTED_OUTPUT_MESSAGES = {
      'TestDirectRunner': [
          PubsubMessage(b'data001-seen', {'processed': 'IT'}),
          PubsubMessage(
              b'data002-seen',
              {
                  TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
                  TIMESTAMP_ATTRIBUTE + '_out': '2018-07-11T02:02:50.149000Z',
                  'processed': 'IT',
              }),
          PubsubMessage(b'data003\xab\xac-seen', {'processed': 'IT'}),
          PubsubMessage(
              b'data004\xab\xac-seen',
              {
                  TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
                  TIMESTAMP_ATTRIBUTE + '_out': '2018-07-11T02:02:50.149000Z',
                  'processed': 'IT',
              })
      ],
      'TestDataflowRunner': [
          PubsubMessage(b'data001-seen', {'processed': 'IT'}),
          PubsubMessage(
              b'data002-seen',
              {
                  TIMESTAMP_ATTRIBUTE + '_out': '2018-07-11T02:02:50.149000Z',
                  'processed': 'IT',
              }),
          PubsubMessage(b'data003\xab\xac-seen', {'processed': 'IT'}),
          PubsubMessage(
              b'data004\xab\xac-seen',
              {
                  TIMESTAMP_ATTRIBUTE + '_out': '2018-07-11T02:02:50.149000Z',
                  'processed': 'IT',
              })
      ],
  }

  def setUp(self):
    """Creates the test pipeline plus per-test Pub/Sub topics/subscriptions.

    Every created resource is registered for deletion with addCleanup as soon
    as it exists. unittest skips tearDown when setUp raises, so relying on
    tearDown would leak the already-created resources if a later creation
    call failed; registered cleanups still run in that case.
    """
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.runner_name = type(self.test_pipeline.runner).__name__
    self.project = self.test_pipeline.get_option('project')
    self.uuid = str(uuid.uuid4())

    # Set up PubSub environment.
    from google.cloud import pubsub
    self.pub_client = pubsub.PublisherClient()
    self.input_topic = self._create_topic(INPUT_TOPIC)
    self.output_topic = self._create_topic(OUTPUT_TOPIC)

    self.sub_client = pubsub.SubscriberClient()
    self.input_sub = self._create_subscription(INPUT_SUB, self.input_topic)
    self.output_sub = self._create_subscription(OUTPUT_SUB, self.output_topic)
    # Add a 30 second sleep after resource creation to ensure subscriptions will
    # receive messages.
    time.sleep(30)

  def _create_topic(self, prefix):
    """Creates topic `prefix + self.uuid` and schedules its deletion."""
    topic = self.pub_client.create_topic(
        name=self.pub_client.topic_path(self.project, prefix + self.uuid))
    self.addCleanup(test_utils.cleanup_topics, self.pub_client, [topic])
    return topic

  def _create_subscription(self, prefix, topic):
    """Creates subscription `prefix + self.uuid` on `topic`.

    The deletion is scheduled immediately. Cleanups run in reverse
    registration order, so subscriptions (registered after the topics) are
    deleted before the topics they are attached to.
    """
    subscription_path = self.sub_client.subscription_path(
        self.project, prefix + self.uuid)
    subscription = self.sub_client.create_subscription(
        name=subscription_path, topic=topic.name)
    self.addCleanup(
        test_utils.cleanup_subscriptions, self.sub_client, [subscription])
    return subscription

  def _test_streaming(self, with_attributes):
    """Runs IT pipeline with message verifier.

    Args:
      with_attributes: False - Reads and writes message data only.
        True - Reads and writes message data and attributes. Also verifies
        id_label and timestamp_attribute features.
    """
    # Set on_success_matcher to verify pipeline state and pubsub output. These
    # verifications run on a (remote) worker.

    # Expect the state to be RUNNING since a streaming pipeline is usually
    # never DONE. The test runner will cancel the pipeline after verification.
    state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
    expected_messages = self.EXPECTED_OUTPUT_MESSAGES[self.runner_name]
    if not with_attributes:
      # Data-only mode: compare payload bytes rather than PubsubMessage objects.
      expected_messages = [pubsub_msg.data for pubsub_msg in expected_messages]
    if self.runner_name == 'TestDirectRunner':
      strip_attributes = None
    else:
      strip_attributes = [self.ID_LABEL, self.TIMESTAMP_ATTRIBUTE]
    pubsub_msg_verifier = PubSubMessageMatcher(
        self.project,
        self.output_sub.name,
        expected_messages,
        timeout=MESSAGE_MATCHER_TIMEOUT_S,
        with_attributes=with_attributes,
        strip_attributes=strip_attributes)
    extra_opts = {
        'input_subscription': self.input_sub.name,
        'output_topic': self.output_topic.name,
        'wait_until_finish_duration': TEST_PIPELINE_DURATION_MS,
        'on_success_matcher': all_of(state_verifier, pubsub_msg_verifier)
    }

    # Generate input data and inject to PubSub. Calling result() blocks until
    # each publish call has completed before the pipeline is started.
    for msg in self.INPUT_MESSAGES[self.runner_name]:
      self.pub_client.publish(
          self.input_topic.name, msg.data, **msg.attributes).result()

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    pubsub_it_pipeline.run_pipeline(
        argv=self.test_pipeline.get_full_options_as_args(**extra_opts),
        with_attributes=with_attributes,
        id_label=self.ID_LABEL,
        timestamp_attribute=self.TIMESTAMP_ATTRIBUTE)

  @pytest.mark.it_postcommit
  def test_streaming_data_only(self):
    """Runs the streaming pipeline comparing message payloads only."""
    self._test_streaming(with_attributes=False)

  @pytest.mark.it_postcommit
  def test_streaming_with_attributes(self):
    """Runs the streaming pipeline comparing payloads and attributes."""
    self._test_streaming(with_attributes=True)


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.DEBUG)
  unittest.main()