github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/external/xlang_kinesisio_it_test.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """
    19  Integration test for Python cross-language pipelines for Java KinesisIO.
    20  
To run the tests against localstack, pass only the regular pipeline options;
no AWS-specific parameters are needed.
    23  
    24  To test it on a real AWS account you need to pass some additional params, e.g.:
    25  python setup.py nosetests \
    26  --tests=apache_beam.io.external.xlang_kinesisio_it_test \
    27  --test-pipeline-options="
    28    --use_real_aws
    29    --aws_kinesis_stream=<STREAM_NAME>
    30    --aws_access_key=<AWS_ACCESS_KEY>
    31    --aws_secret_key=<AWS_SECRET_KEY>
    32    --aws_region=<AWS_REGION>
    33    --runner=FlinkRunner"
    34  """
    35  
    36  # pytype: skip-file
    37  
    38  import argparse
    39  import logging
    40  import time
    41  import unittest
    42  import uuid
    43  
    44  import apache_beam as beam
    45  from apache_beam.io.kinesis import InitialPositionInStream
    46  from apache_beam.io.kinesis import ReadDataFromKinesis
    47  from apache_beam.io.kinesis import WatermarkPolicy
    48  from apache_beam.io.kinesis import WriteToKinesis
    49  from apache_beam.options.pipeline_options import PipelineOptions
    50  from apache_beam.options.pipeline_options import StandardOptions
    51  from apache_beam.testing.test_pipeline import TestPipeline
    52  from apache_beam.testing.util import assert_that
    53  from apache_beam.testing.util import equal_to
    54  
    55  # pylint: disable=wrong-import-order, wrong-import-position, ungrouped-imports
    56  try:
    57    import boto3
    58  except ImportError:
    59    boto3 = None
    60  
    61  try:
    62    from testcontainers.core.container import DockerContainer
    63  except ImportError:
    64    DockerContainer = None
    65  # pylint: enable=wrong-import-order, wrong-import-position, ungrouped-imports
    66  
    67  LOCALSTACK_VERSION = '0.11.3'
    68  NUM_RECORDS = 10
    69  MAX_READ_TIME = 5 * 60 * 1000  # 5min
    70  NOW_SECONDS = time.time()
    71  NOW_MILLIS = NOW_SECONDS * 1000
    72  REQUEST_RECORDS_LIMIT = 1000
    73  RECORD = b'record' + str(uuid.uuid4()).encode()
    74  
    75  
    76  @unittest.skipUnless(DockerContainer, 'testcontainers is not installed.')
    77  @unittest.skipUnless(boto3, 'boto3 is not installed.')
    78  @unittest.skipUnless(
    79      TestPipeline().get_pipeline_options().view_as(StandardOptions).runner,
    80      'Do not run this test on precommit suites.')
    81  class CrossLanguageKinesisIOTest(unittest.TestCase):
    82    @unittest.skipUnless(
    83        TestPipeline().get_option('aws_kinesis_stream'),
    84        'Cannot test on real aws without pipeline options provided')
    85    def test_kinesis_io_roundtrip(self):
    86      # TODO: enable this test for localstack once
    87      # https://github.com/apache/beam/issues/20416 is resolved
    88      self.run_kinesis_write()
    89      self.run_kinesis_read()
    90  
    91    @unittest.skipIf(
    92        TestPipeline().get_option('aws_kinesis_stream'),
    93        'Do not test on localstack when pipeline options were provided')
    94    def test_kinesis_write(self):
    95      # TODO: remove this test once
    96      # https://github.com/apache/beam/issues/20416 is resolved
    97      self.run_kinesis_write()
    98      records = self.kinesis_helper.read_from_stream(self.aws_kinesis_stream)
    99      self.assertEqual(
   100          sorted(records),
   101          sorted([RECORD + str(i).encode() for i in range(NUM_RECORDS)]))
   102  
   103    def run_kinesis_write(self):
   104      with TestPipeline(options=PipelineOptions(self.pipeline_args)) as p:
   105        p.not_use_test_runner_api = True
   106        _ = (
   107            p
   108            | 'Impulse' >> beam.Impulse()
   109            | 'Generate' >> beam.FlatMap(lambda x: range(NUM_RECORDS))  # pylint: disable=bad-option-value
   110            | 'Map to bytes' >>
   111            beam.Map(lambda x: RECORD + str(x).encode()).with_output_types(bytes)
   112            | 'WriteToKinesis' >> WriteToKinesis(
   113                stream_name=self.aws_kinesis_stream,
   114                aws_access_key=self.aws_access_key,
   115                aws_secret_key=self.aws_secret_key,
   116                region=self.aws_region,
   117                service_endpoint=self.aws_service_endpoint,
   118                verify_certificate=(not self.use_localstack),
   119                partition_key='1',
   120                producer_properties=self.producer_properties,
   121            ))
   122  
   123    def run_kinesis_read(self):
   124      records = [RECORD + str(i).encode() for i in range(NUM_RECORDS)]
   125  
   126      with TestPipeline(options=PipelineOptions(self.pipeline_args)) as p:
   127        result = (
   128            p
   129            | 'ReadFromKinesis' >> ReadDataFromKinesis(
   130                stream_name=self.aws_kinesis_stream,
   131                aws_access_key=self.aws_access_key,
   132                aws_secret_key=self.aws_secret_key,
   133                region=self.aws_region,
   134                service_endpoint=self.aws_service_endpoint,
   135                verify_certificate=not self.use_localstack,
   136                max_num_records=NUM_RECORDS,
   137                max_read_time=MAX_READ_TIME,
   138                request_records_limit=REQUEST_RECORDS_LIMIT,
   139                watermark_policy=WatermarkPolicy.ARRIVAL_TIME,
   140                watermark_idle_duration_threshold=MAX_READ_TIME,
   141                initial_position_in_stream=InitialPositionInStream.AT_TIMESTAMP,
   142                initial_timestamp_in_stream=NOW_MILLIS,
   143            ).with_output_types(bytes))
   144        assert_that(result, equal_to(records))
   145  
   146    def set_localstack(self):
   147      self.localstack = DockerContainer('localstack/localstack:{}'
   148                                        .format(LOCALSTACK_VERSION))\
   149        .with_env('SERVICES', 'kinesis')\
   150        .with_env('KINESIS_PORT', '4568')\
   151        .with_env('USE_SSL', 'true')\
   152        .with_exposed_ports(4568)\
   153        .with_volume_mapping('/var/run/docker.sock', '/var/run/docker.sock', 'rw')
   154  
   155      # Repeat if ReadTimeout is raised.
   156      for i in range(4):
   157        try:
   158          self.localstack.start()
   159          break
   160        except Exception as e:  # pylint: disable=bare-except
   161          if i == 3:
   162            logging.error('Could not initialize localstack container')
   163            raise e
   164  
   165      self.aws_service_endpoint = 'https://{}:{}'.format(
   166          self.localstack.get_container_host_ip(),
   167          self.localstack.get_exposed_port('4568'),
   168      )
   169  
   170    def setUp(self):
   171      parser = argparse.ArgumentParser()
   172  
   173      parser.add_argument(
   174          '--aws_kinesis_stream',
   175          default='beam_kinesis_xlang',
   176          help='Kinesis stream name',
   177      )
   178      parser.add_argument(
   179          '--aws_access_key',
   180          default='accesskey',
   181          help=('Aws access key'),
   182      )
   183      parser.add_argument(
   184          '--aws_secret_key',
   185          default='secretkey',
   186          help='Aws secret key',
   187      )
   188      parser.add_argument(
   189          '--aws_region',
   190          default='us-east-1',
   191          help='Aws region',
   192      )
   193      parser.add_argument(
   194          '--aws_service_endpoint',
   195          default=None,
   196          help='Url to external aws endpoint',
   197      )
   198      parser.add_argument(
   199          '--use_real_aws',
   200          default=False,
   201          dest='use_real_aws',
   202          action='store_true',
   203          help='Flag whether to use real aws for the tests purpose',
   204      )
   205      parser.add_argument(
   206          '--expansion_service',
   207          help='Url to externally launched expansion service.',
   208      )
   209  
   210      pipeline = TestPipeline()
   211      argv = pipeline.get_full_options_as_args()
   212  
   213      known_args, self.pipeline_args = parser.parse_known_args(argv)
   214  
   215      self.aws_kinesis_stream = known_args.aws_kinesis_stream
   216      self.aws_access_key = known_args.aws_access_key
   217      self.aws_secret_key = known_args.aws_secret_key
   218      self.aws_region = known_args.aws_region
   219      self.aws_service_endpoint = known_args.aws_service_endpoint
   220      self.use_localstack = not known_args.use_real_aws
   221      self.expansion_service = known_args.expansion_service
   222      self.producer_properties = {
   223          'CollectionMaxCount': str(NUM_RECORDS),
   224          'ConnectTimeout': str(MAX_READ_TIME),
   225      }
   226  
   227      if self.use_localstack:
   228        self.set_localstack()
   229  
   230      self.kinesis_helper = KinesisHelper(
   231          self.aws_access_key,
   232          self.aws_secret_key,
   233          self.aws_region,
   234          self.aws_service_endpoint.replace('https', 'http')
   235          if self.aws_service_endpoint else None,
   236      )
   237  
   238      if self.use_localstack:
   239        self.kinesis_helper.create_stream(self.aws_kinesis_stream)
   240  
   241    def tearDown(self):
   242      if self.use_localstack:
   243        self.kinesis_helper.delete_stream(self.aws_kinesis_stream)
   244  
   245        try:
   246          self.localstack.stop()
   247        except:  # pylint: disable=bare-except
   248          logging.error('Could not stop the localstack container')
   249  
   250  
   251  class KinesisHelper:
   252    def __init__(self, access_key, secret_key, region, service_endpoint):
   253      self.kinesis_client = boto3.client(
   254          service_name='kinesis',
   255          region_name=region,
   256          endpoint_url=service_endpoint,
   257          aws_access_key_id=access_key,
   258          aws_secret_access_key=secret_key,
   259      )
   260  
   261    def create_stream(self, stream_name):
   262      # localstack could not have initialized in the container yet so repeat
   263      retries = 10
   264      for i in range(retries):
   265        try:
   266          self.kinesis_client.create_stream(
   267              StreamName=stream_name,
   268              ShardCount=1,
   269          )
   270          time.sleep(2)
   271          break
   272        except Exception as e:
   273          if i == retries - 1:
   274            logging.error('Could not create kinesis stream')
   275            raise e
   276  
   277      # Wait for the stream to be active
   278      self.get_first_shard_id(stream_name)
   279  
   280    def delete_stream(self, stream_name):
   281      self.kinesis_client.delete_stream(
   282          StreamName=stream_name,
   283          EnforceConsumerDeletion=True,
   284      )
   285  
   286    def get_first_shard_id(self, stream_name):
   287      retries = 10
   288      stream = self.kinesis_client.describe_stream(StreamName=stream_name)
   289      for i in range(retries):
   290        if stream['StreamDescription']['StreamStatus'] == 'ACTIVE':
   291          break
   292        time.sleep(2)
   293        if i == retries - 1:
   294          logging.error('Could not initialize kinesis stream')
   295          raise RuntimeError(
   296              "Unable to initialize Kinesis Stream %s. Status: %s",
   297              stream['StreamDescription']['StreamName'],
   298              stream['StreamDescription']['StreamStatus'])
   299        stream = self.kinesis_client.describe_stream(StreamName=stream_name)
   300  
   301      return stream['StreamDescription']['Shards'][0]['ShardId']
   302  
   303    def read_from_stream(self, stream_name):
   304      shard_id = self.get_first_shard_id(stream_name)
   305  
   306      shard_iterator = self.kinesis_client.get_shard_iterator(
   307          StreamName=stream_name,
   308          ShardId=shard_id,
   309          ShardIteratorType=InitialPositionInStream.AT_TIMESTAMP,
   310          Timestamp=str(NOW_SECONDS),
   311      )
   312  
   313      result = self.kinesis_client.get_records(
   314          ShardIterator=shard_iterator['ShardIterator'],
   315          Limit=NUM_RECORDS,
   316      )
   317  
   318      return [record['Data'] for record in result['Records']]
   319  
   320  
   321  if __name__ == '__main__':
   322    logging.getLogger().setLevel(logging.INFO)
   323    unittest.main()