github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/kinesis.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""PTransforms for supporting Kinesis streaming in Python pipelines.

  These transforms are currently supported by the Beam Flink and Spark portable
  runners.

  **Setup**

  Transforms provided in this module are cross-language transforms
  implemented in the Beam Java SDK. During pipeline construction, the Python
  SDK will connect to a Java expansion service to expand these transforms.
  To facilitate this, a small amount of setup is needed before using these
  transforms in a Beam Python pipeline.

  There are several ways to set up cross-language Kinesis transforms.

  * Option 1: use the default expansion service
  * Option 2: specify a custom expansion service

  See below for details regarding each of these options.

  *Option 1: Use the default expansion service*

  This is the recommended and easiest setup option for using Python Kinesis
  transforms. This option is only available for Beam 2.25.0 and later.

  This option requires the following prerequisites before running the Beam
  pipeline.

  * Install a Java runtime on the computer from which the pipeline is
    constructed and make sure that the 'java' command is available.

  In this option, the Python SDK will either download (for released Beam
  versions) or build (when running from a Beam Git clone) an expansion service
  jar and use that to expand transforms. Currently, Kinesis transforms use the
  'beam-sdks-java-io-kinesis-expansion-service' jar for this purpose.

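  For example, with the default expansion service, a write pipeline could look
  like the following sketch (the stream name, credentials, and region are
  placeholders; a portable runner such as Flink or Spark is assumed)::

    with beam.Pipeline(options=pipeline_options) as p:
      (p
       | beam.Create([b'hello', b'world'])  # Kinesis messages are byte arrays.
       | WriteToKinesis(
           stream_name='your-stream',  # placeholder
           aws_access_key='your-access-key',  # placeholder
           aws_secret_key='your-secret-key',  # placeholder
           region='us-east-1',
           partition_key='1'))
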
  *Option 2: specify a custom expansion service*

  In this option, you start up your own expansion service and provide that as
  a parameter when using the transforms provided in this module.

  This option requires the following prerequisites before running the Beam
  pipeline.

  * Start up your own expansion service.
  * Update your pipeline to provide the expansion service address when
    initiating the Kinesis transforms provided in this module.

  Flink users can use the built-in expansion service of the Flink Runner's
  Job Server. If you start Flink's Job Server, the expansion service will be
  started on port 8097. For a different address, please set the
  expansion_service parameter.
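
  For example, the following sketch passes the address of an already-running
  expansion service directly to a transform (here assumed to be Flink's Job
  Server expansion service on localhost:8097; stream name, credentials, and
  region are placeholders)::

    records = (p
               | ReadDataFromKinesis(
                   stream_name='your-stream',  # placeholder
                   aws_access_key='your-access-key',  # placeholder
                   aws_secret_key='your-secret-key',  # placeholder
                   region='us-east-1',
                   expansion_service='localhost:8097'))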

  **More information**

  For more information regarding cross-language transforms see:
  - https://beam.apache.org/roadmap/portability/

  For more information specific to the Flink runner see:
  - https://beam.apache.org/documentation/runners/flink/
"""

# pytype: skip-file

import logging
import time
from typing import Mapping
from typing import NamedTuple
from typing import Optional

from apache_beam import BeamJarExpansionService
from apache_beam import ExternalTransform
from apache_beam import NamedTupleBasedPayloadBuilder

__all__ = [
    'WriteToKinesis',
    'ReadDataFromKinesis',
    'InitialPositionInStream',
    'WatermarkPolicy',
]


def default_io_expansion_service():
  return BeamJarExpansionService(
      'sdks:java:io:kinesis:expansion-service:shadowJar')


WriteToKinesisSchema = NamedTuple(
    'WriteToKinesisSchema',
    [
        ('stream_name', str),
        ('aws_access_key', str),
        ('aws_secret_key', str),
        ('region', str),
        ('partition_key', str),
        ('service_endpoint', Optional[str]),
        ('verify_certificate', Optional[bool]),
        ('producer_properties', Optional[Mapping[str, str]]),
    ],
)


class WriteToKinesis(ExternalTransform):
  """
    An external PTransform which writes a byte-array stream to Amazon Kinesis.

    Experimental; no backwards compatibility guarantees.
  """
  URN = 'beam:transform:org.apache.beam:kinesis_write:v1'

  def __init__(
      self,
      stream_name,
      aws_access_key,
      aws_secret_key,
      region,
      partition_key,
      service_endpoint=None,
      verify_certificate=None,
      producer_properties=None,
      expansion_service=None,
  ):
    """
    Initializes a write operation to Kinesis.

    :param stream_name: Kinesis stream name.
    :param aws_access_key: AWS access key.
    :param aws_secret_key: AWS secret access key.
    :param region: AWS region. Example: 'us-east-1'.
    :param partition_key: Specify the default partition key.
    :param service_endpoint: Kinesis service endpoint (optional).
    :param verify_certificate: Enable or disable certificate verification.
        Never set this to False in production. True by default.
    :param producer_properties: Specify the configuration properties for the
        Kinesis Producer Library (KPL) as a dictionary.
        Example: {'CollectionMaxCount': '1000', 'ConnectTimeout': '10000'}
    :param expansion_service: The address (host:port) of the ExpansionService.
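
    For example, a minimal write sketch (the stream name, credentials, and
    region below are placeholders)::

      p | beam.Create([b'some-bytes']) | WriteToKinesis(
          stream_name='your-stream',  # placeholder
          aws_access_key='your-access-key',  # placeholder
          aws_secret_key='your-secret-key',  # placeholder
          region='us-east-1',
          partition_key='1',
          producer_properties={'CollectionMaxCount': '1000'})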
    """
    super().__init__(
        self.URN,
        NamedTupleBasedPayloadBuilder(
            WriteToKinesisSchema(
                stream_name=stream_name,
                aws_access_key=aws_access_key,
                aws_secret_key=aws_secret_key,
                region=region,
                partition_key=partition_key,
                service_endpoint=service_endpoint,
                verify_certificate=verify_certificate,
                producer_properties=producer_properties,
            )),
        expansion_service or default_io_expansion_service(),
    )


ReadFromKinesisSchema = NamedTuple(
    'ReadFromKinesisSchema',
    [
        ('stream_name', str),
        ('aws_access_key', str),
        ('aws_secret_key', str),
        ('region', str),
        ('service_endpoint', Optional[str]),
        ('verify_certificate', Optional[bool]),
        ('max_num_records', Optional[int]),
        ('max_read_time', Optional[int]),
        ('initial_position_in_stream', Optional[str]),
        ('initial_timestamp_in_stream', Optional[int]),
        ('request_records_limit', Optional[int]),
        ('up_to_date_threshold', Optional[int]),
        ('max_capacity_per_shard', Optional[int]),
        ('watermark_policy', Optional[str]),
        ('watermark_idle_duration_threshold', Optional[int]),
        ('rate_limit', Optional[int]),
    ],
)


class ReadDataFromKinesis(ExternalTransform):
  """
    An external PTransform which reads a byte-array stream from Amazon Kinesis.

    Experimental; no backwards compatibility guarantees.
  """
  URN = 'beam:transform:org.apache.beam:kinesis_read_data:v1'

  def __init__(
      self,
      stream_name,
      aws_access_key,
      aws_secret_key,
      region,
      service_endpoint=None,
      verify_certificate=None,
      max_num_records=None,
      max_read_time=None,
      initial_position_in_stream=None,
      initial_timestamp_in_stream=None,
      request_records_limit=None,
      up_to_date_threshold=None,
      max_capacity_per_shard=None,
      watermark_policy=None,
      watermark_idle_duration_threshold=None,
      rate_limit=None,
      expansion_service=None,
  ):
    """
    Initializes a read operation from Kinesis.

    :param stream_name: Kinesis stream name.
    :param aws_access_key: AWS access key.
    :param aws_secret_key: AWS secret access key.
    :param region: AWS region. Example: 'us-east-1'.
    :param service_endpoint: Kinesis service endpoint (optional).
    :param verify_certificate: Enable or disable certificate verification.
        Never set this to False in production. True by default.
    :param max_num_records: Read at most the given number of records.
        Must be greater than 0.
    :param max_read_time: Read records for at most the given number of
        milliseconds.
    :param initial_timestamp_in_stream: Start reading at the given timestamp,
        expressed in milliseconds since the epoch. Must be in the past.
    :param initial_position_in_stream: Start reading from the given initial
        position in the stream. Possible values:
        LATEST - Start after the most recent data record (fetch new data).
        TRIM_HORIZON - Start from the oldest available data record.
        AT_TIMESTAMP - Start from the record at or after the specified
        server-side timestamp.
    :param request_records_limit: Specifies the maximum number of records
        returned in a GetRecordsResult by a single GetRecords call, which is
        capped at 10,000 records. It should be adjusted according to the
        average size of a data record to prevent shard overloading. More at:
        docs.aws.amazon.com/kinesis/latest/APIReference/API_GetRecords.html
    :param up_to_date_threshold: Specifies how late, in milliseconds, records
        consumed by this source can be to still be considered on time. Defaults
        to zero.
    :param max_capacity_per_shard: Specifies the maximum number of messages per
        shard. Defaults to 10,000.
    :param watermark_policy: Specifies the watermark policy. Possible values:
        PROCESSING_TYPE, ARRIVAL_TIME. Defaults to ARRIVAL_TIME.
    :param watermark_idle_duration_threshold: Use only when the watermark
        policy is ARRIVAL_TIME. Denotes the duration for which the watermark
        can be idle. Passed in milliseconds.
    :param rate_limit: Sets a fixed rate-limit policy with the given value in
        milliseconds. By default there is no rate limit.
    :param expansion_service: The address (host:port) of the ExpansionService.
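
    For example, a minimal read sketch (the stream name, credentials, and
    region below are placeholders)::

      position = InitialPositionInStream.TRIM_HORIZON
      records = (p
                 | ReadDataFromKinesis(
                     stream_name='your-stream',  # placeholder
                     aws_access_key='your-access-key',  # placeholder
                     aws_secret_key='your-secret-key',  # placeholder
                     region='us-east-1',
                     initial_position_in_stream=position,
                     max_num_records=100))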
    """
    WatermarkPolicy.validate_param(watermark_policy)
    InitialPositionInStream.validate_param(initial_position_in_stream)

    # An idle-duration threshold only makes sense for the ARRIVAL_TIME policy.
    if watermark_idle_duration_threshold:
      assert watermark_policy == WatermarkPolicy.ARRIVAL_TIME, (
          'watermark_idle_duration_threshold requires the ARRIVAL_TIME '
          'watermark policy.')

    # The Kinesis GetRecords API returns at most 10,000 records per call.
    if request_records_limit:
      assert 0 < request_records_limit <= 10000, (
          'request_records_limit must be in the range (0, 10000].')

    initial_timestamp_in_stream = int(
        initial_timestamp_in_stream) if initial_timestamp_in_stream else None

    # The timestamp is expressed in milliseconds and must lie in the past;
    # warn if it points to the future.
    if (initial_timestamp_in_stream and
        initial_timestamp_in_stream > time.time() * 1000):
      logging.warning('Provided initial timestamp is not in the past.')

    super().__init__(
        self.URN,
        NamedTupleBasedPayloadBuilder(
            ReadFromKinesisSchema(
                stream_name=stream_name,
                aws_access_key=aws_access_key,
                aws_secret_key=aws_secret_key,
                region=region,
                service_endpoint=service_endpoint,
                verify_certificate=verify_certificate,
                max_num_records=max_num_records,
                max_read_time=max_read_time,
                initial_position_in_stream=initial_position_in_stream,
                initial_timestamp_in_stream=initial_timestamp_in_stream,
                request_records_limit=request_records_limit,
                up_to_date_threshold=up_to_date_threshold,
                max_capacity_per_shard=max_capacity_per_shard,
                watermark_policy=watermark_policy,
                watermark_idle_duration_threshold=(
                    watermark_idle_duration_threshold),
                rate_limit=rate_limit,
            )),
        expansion_service or default_io_expansion_service(),
    )


class InitialPositionInStream:
  """Valid values for ReadDataFromKinesis's initial_position_in_stream."""
  LATEST = 'LATEST'
  TRIM_HORIZON = 'TRIM_HORIZON'
  AT_TIMESTAMP = 'AT_TIMESTAMP'

  @staticmethod
  def validate_param(param):
    if param and not hasattr(InitialPositionInStream, param):
      raise RuntimeError('Invalid initial position in stream: {}'.format(param))


class WatermarkPolicy:
  """Valid values for ReadDataFromKinesis's watermark_policy."""
  PROCESSING_TYPE = 'PROCESSING_TYPE'
  ARRIVAL_TIME = 'ARRIVAL_TIME'

  @staticmethod
  def validate_param(param):
    if param and not hasattr(WatermarkPolicy, param):
      raise RuntimeError('Invalid watermark policy: {}'.format(param))