#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
Integration test for Python cross-language pipelines for Java KinesisIO.

If you want to run the tests on localstack then run it just with pipeline
options.

To test it on a real AWS account you need to pass some additional params, e.g.:
python setup.py nosetests \
--tests=apache_beam.io.external.xlang_kinesisio_it_test \
--test-pipeline-options="
  --use_real_aws
  --aws_kinesis_stream=<STREAM_NAME>
  --aws_access_key=<AWS_ACCESS_KEY>
  --aws_secret_key=<AWS_SECRET_KEY>
  --aws_region=<AWS_REGION>
  --runner=FlinkRunner"
"""

# pytype: skip-file

import argparse
import logging
import time
import unittest
import uuid

import apache_beam as beam
from apache_beam.io.kinesis import InitialPositionInStream
from apache_beam.io.kinesis import ReadDataFromKinesis
from apache_beam.io.kinesis import WatermarkPolicy
from apache_beam.io.kinesis import WriteToKinesis
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to

# pylint: disable=wrong-import-order, wrong-import-position, ungrouped-imports
try:
  import boto3
except ImportError:
  boto3 = None

try:
  from testcontainers.core.container import DockerContainer
except ImportError:
  DockerContainer = None
# pylint: enable=wrong-import-order, wrong-import-position, ungrouped-imports

LOCALSTACK_VERSION = '0.11.3'
NUM_RECORDS = 10
MAX_READ_TIME = 5 * 60 * 1000  # 5min
NOW_SECONDS = time.time()
NOW_MILLIS = NOW_SECONDS * 1000
REQUEST_RECORDS_LIMIT = 1000
# Random payload prefix, generated once per module import.
RECORD = b'record' + str(uuid.uuid4()).encode()


@unittest.skipUnless(DockerContainer, 'testcontainers is not installed.')
@unittest.skipUnless(boto3, 'boto3 is not installed.')
@unittest.skipUnless(
    TestPipeline().get_pipeline_options().view_as(StandardOptions).runner,
    'Do not run this test on precommit suites.')
class CrossLanguageKinesisIOTest(unittest.TestCase):
  """Writes to and reads from Kinesis through the cross-language transforms.

  When ``--use_real_aws`` is not given, ``setUp`` starts a localstack
  container and creates the stream in it; ``tearDown`` deletes the stream and
  stops the container again.
  """
  @unittest.skipUnless(
      TestPipeline().get_option('aws_kinesis_stream'),
      'Cannot test on real aws without pipeline options provided')
  def test_kinesis_io_roundtrip(self):
    """Run the write pipeline followed by the read pipeline."""
    # TODO: enable this test for localstack once
    # https://github.com/apache/beam/issues/20416 is resolved
    self.run_kinesis_write()
    self.run_kinesis_read()

  @unittest.skipIf(
      TestPipeline().get_option('aws_kinesis_stream'),
      'Do not test on localstack when pipeline options were provided')
  def test_kinesis_write(self):
    """Run the write pipeline and check the stream contents via boto3."""
    # TODO: remove this test once
    # https://github.com/apache/beam/issues/20416 is resolved
    self.run_kinesis_write()
    records = self.kinesis_helper.read_from_stream(self.aws_kinesis_stream)
    self.assertEqual(
        sorted(records),
        sorted([RECORD + str(i).encode() for i in range(NUM_RECORDS)]))

  def run_kinesis_write(self):
    """Write NUM_RECORDS byte records (RECORD + index) with WriteToKinesis."""
    with TestPipeline(options=PipelineOptions(self.pipeline_args)) as p:
      p.not_use_test_runner_api = True
      _ = (
          p
          | 'Impulse' >> beam.Impulse()
          | 'Generate' >> beam.FlatMap(lambda x: range(NUM_RECORDS))  # pylint: disable=bad-option-value
          | 'Map to bytes' >>
          beam.Map(lambda x: RECORD + str(x).encode()).with_output_types(bytes)
          | 'WriteToKinesis' >> WriteToKinesis(
              stream_name=self.aws_kinesis_stream,
              aws_access_key=self.aws_access_key,
              aws_secret_key=self.aws_secret_key,
              region=self.aws_region,
              service_endpoint=self.aws_service_endpoint,
              verify_certificate=(not self.use_localstack),
              partition_key='1',
              producer_properties=self.producer_properties,
          ))

  def run_kinesis_read(self):
    """Read with ReadDataFromKinesis and assert the expected records arrive."""
    records = [RECORD + str(i).encode() for i in range(NUM_RECORDS)]

    with TestPipeline(options=PipelineOptions(self.pipeline_args)) as p:
      result = (
          p
          | 'ReadFromKinesis' >> ReadDataFromKinesis(
              stream_name=self.aws_kinesis_stream,
              aws_access_key=self.aws_access_key,
              aws_secret_key=self.aws_secret_key,
              region=self.aws_region,
              service_endpoint=self.aws_service_endpoint,
              verify_certificate=not self.use_localstack,
              max_num_records=NUM_RECORDS,
              max_read_time=MAX_READ_TIME,
              request_records_limit=REQUEST_RECORDS_LIMIT,
              watermark_policy=WatermarkPolicy.ARRIVAL_TIME,
              watermark_idle_duration_threshold=MAX_READ_TIME,
              initial_position_in_stream=InitialPositionInStream.AT_TIMESTAMP,
              initial_timestamp_in_stream=NOW_MILLIS,
          ).with_output_types(bytes))
      assert_that(result, equal_to(records))

  def set_localstack(self):
    """Start a localstack container and point aws_service_endpoint at it.

    Starting the container is attempted up to four times; the exception from
    the last failed attempt is re-raised.
    """
    self.localstack = (
        DockerContainer(
            'localstack/localstack:{}'.format(LOCALSTACK_VERSION)).with_env(
                'SERVICES', 'kinesis').with_env('KINESIS_PORT', '4568').with_env(
                    'USE_SSL', 'true').with_exposed_ports(4568).
        with_volume_mapping('/var/run/docker.sock', '/var/run/docker.sock', 'rw'))

    # Repeat if ReadTimeout is raised.
    for i in range(4):
      try:
        self.localstack.start()
        break
      except Exception:  # pylint: disable=broad-except
        if i == 3:
          logging.error('Could not initialize localstack container')
          raise

    self.aws_service_endpoint = 'https://{}:{}'.format(
        self.localstack.get_container_host_ip(),
        self.localstack.get_exposed_port('4568'),
    )

  def _stop_localstack(self):
    """Stop the localstack container, logging instead of raising on failure."""
    localstack = getattr(self, 'localstack', None)
    if localstack is None:
      return
    try:
      localstack.stop()
    except Exception:  # pylint: disable=broad-except
      # Best effort: a failure to stop must not mask the test outcome.
      logging.exception('Could not stop the localstack container')

  def setUp(self):
    """Parse test options and prepare the Kinesis stream.

    Options not recognised by the local parser are kept in
    ``self.pipeline_args`` and handed to the pipelines. Unless
    ``--use_real_aws`` is set, a localstack container is started and the
    stream is created in it.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--aws_kinesis_stream',
        default='beam_kinesis_xlang',
        help='Kinesis stream name',
    )
    parser.add_argument(
        '--aws_access_key',
        default='accesskey',
        help='Aws access key',
    )
    parser.add_argument(
        '--aws_secret_key',
        default='secretkey',
        help='Aws secret key',
    )
    parser.add_argument(
        '--aws_region',
        default='us-east-1',
        help='Aws region',
    )
    parser.add_argument(
        '--aws_service_endpoint',
        default=None,
        help='Url to external aws endpoint',
    )
    parser.add_argument(
        '--use_real_aws',
        default=False,
        dest='use_real_aws',
        action='store_true',
        help='Flag whether to use real aws for the tests purpose',
    )
    parser.add_argument(
        '--expansion_service',
        help='Url to externally launched expansion service.',
    )

    pipeline = TestPipeline()
    argv = pipeline.get_full_options_as_args()

    known_args, self.pipeline_args = parser.parse_known_args(argv)

    self.aws_kinesis_stream = known_args.aws_kinesis_stream
    self.aws_access_key = known_args.aws_access_key
    self.aws_secret_key = known_args.aws_secret_key
    self.aws_region = known_args.aws_region
    self.aws_service_endpoint = known_args.aws_service_endpoint
    self.use_localstack = not known_args.use_real_aws
    self.expansion_service = known_args.expansion_service
    self.producer_properties = {
        'CollectionMaxCount': str(NUM_RECORDS),
        'ConnectTimeout': str(MAX_READ_TIME),
    }

    try:
      if self.use_localstack:
        self.set_localstack()

      # The helper talks to the endpoint over plain http.
      self.kinesis_helper = KinesisHelper(
          self.aws_access_key,
          self.aws_secret_key,
          self.aws_region,
          self.aws_service_endpoint.replace('https', 'http')
          if self.aws_service_endpoint else None,
      )

      if self.use_localstack:
        self.kinesis_helper.create_stream(self.aws_kinesis_stream)
    except Exception:
      # unittest does not run tearDown when setUp raises, so the container
      # has to be stopped here to avoid leaking it.
      if self.use_localstack:
        self._stop_localstack()
      raise

  def tearDown(self):
    """Delete the stream and stop the container when localstack was used."""
    if not self.use_localstack:
      return

    try:
      self.kinesis_helper.delete_stream(self.aws_kinesis_stream)
    finally:
      # Stop the container even if deleting the stream failed.
      self._stop_localstack()


class KinesisHelper:
  """Thin wrapper around a boto3 Kinesis client used to set up and verify."""
  def __init__(self, access_key, secret_key, region, service_endpoint):
    """Create the boto3 ``kinesis`` client.

    Args:
      access_key: passed to boto3 as ``aws_access_key_id``.
      secret_key: passed to boto3 as ``aws_secret_access_key``.
      region: passed to boto3 as ``region_name``.
      service_endpoint: passed to boto3 as ``endpoint_url``; may be None.
    """
    self.kinesis_client = boto3.client(
        service_name='kinesis',
        region_name=region,
        endpoint_url=service_endpoint,
        aws_access_key_id=access_key,
        aws_secret_access_key=secret_key,
    )

  def create_stream(self, stream_name):
    """Create a one-shard stream and wait until it reports ACTIVE.

    Creation is attempted up to ten times with a two second pause between
    attempts; the exception from the last failed attempt is re-raised.
    """
    # localstack could not have initialized in the container yet so repeat
    retries = 10
    for attempt in range(retries):
      try:
        self.kinesis_client.create_stream(
            StreamName=stream_name,
            ShardCount=1,
        )
      except Exception:  # pylint: disable=broad-except
        if attempt == retries - 1:
          logging.error('Could not create kinesis stream')
          raise
        # Give the service time to come up before the next attempt.
        time.sleep(2)
      else:
        time.sleep(2)
        break

    # Wait for the stream to be active
    self.get_first_shard_id(stream_name)

  def delete_stream(self, stream_name):
    """Delete the stream, enforcing consumer deletion."""
    self.kinesis_client.delete_stream(
        StreamName=stream_name,
        EnforceConsumerDeletion=True,
    )

  def get_first_shard_id(self, stream_name):
    """Return the id of the stream's first shard once the stream is ACTIVE.

    The stream status is polled up to ten times, two seconds apart.

    Raises:
      RuntimeError: if the stream is still not ACTIVE after the last poll.
    """
    retries = 10
    for attempt in range(retries):
      stream = self.kinesis_client.describe_stream(StreamName=stream_name)
      description = stream['StreamDescription']
      if description['StreamStatus'] == 'ACTIVE':
        return description['Shards'][0]['ShardId']
      if attempt < retries - 1:
        time.sleep(2)

    logging.error('Could not initialize kinesis stream')
    raise RuntimeError(
        'Unable to initialize Kinesis Stream %s. Status: %s' %
        (description['StreamName'], description['StreamStatus']))

  def read_from_stream(self, stream_name):
    """Return the ``Data`` payloads read from the stream's first shard.

    A single ``get_records`` call is made, limited to NUM_RECORDS and
    starting at the NOW_SECONDS timestamp.
    """
    shard_id = self.get_first_shard_id(stream_name)

    shard_iterator = self.kinesis_client.get_shard_iterator(
        StreamName=stream_name,
        ShardId=shard_id,
        ShardIteratorType=InitialPositionInStream.AT_TIMESTAMP,
        Timestamp=str(NOW_SECONDS),
    )

    result = self.kinesis_client.get_records(
        ShardIterator=shard_iterator['ShardIterator'],
        Limit=NUM_RECORDS,
    )

    return [record['Data'] for record in result['Records']]


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  unittest.main()