# Origin: github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/kinesis.py
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""PTransforms for supporting Kinesis streaming in Python pipelines.

  These transforms are currently supported by Beam Flink and Spark portable
  runners.

  **Setup**

  Transforms provided in this module are cross-language transforms
  implemented in the Beam Java SDK. During the pipeline construction, Python SDK
  will connect to a Java expansion service to expand these transforms.
  To facilitate this, a small amount of setup is needed before using these
  transforms in a Beam Python pipeline.

  There are several ways to set up cross-language Kinesis transforms.

  * Option 1: use the default expansion service
  * Option 2: specify a custom expansion service

  See below for details regarding each of these options.

  *Option 1: Use the default expansion service*

  This is the recommended and easiest setup option for using Python Kinesis
  transforms. This option is only available for Beam 2.25.0 and later.

  This option requires the following prerequisites before running the Beam
  pipeline.

  * Install a Java runtime on the computer from where the pipeline is
    constructed and make sure that the 'java' command is available.

  In this option, Python SDK will either download (for released Beam version) or
  build (when running from a Beam Git clone) an expansion service jar and use
  that to expand transforms. Currently Kinesis transforms use the
  'beam-sdks-java-io-kinesis-expansion-service' jar for this purpose.

  *Option 2: specify a custom expansion service*

  In this option, you start up your own expansion service and provide that as
  a parameter when using the transforms provided in this module.

  This option requires the following prerequisites before running the Beam
  pipeline.

  * Start up your own expansion service.
  * Update your pipeline to provide the expansion service address when
    initiating Kinesis transforms provided in this module.

  Flink Users can use the built-in Expansion Service of the Flink Runner's
  Job Server. If you start Flink's Job Server, the expansion service will be
  started on port 8097. For a different address, please set the
  expansion_service parameter.

  **More information**

  For more information regarding cross-language transforms see:
  - https://beam.apache.org/roadmap/portability/

  For more information specific to Flink runner see:
  - https://beam.apache.org/documentation/runners/flink/
"""

# pytype: skip-file

import logging
import time
from typing import Mapping
from typing import NamedTuple
from typing import Optional

from apache_beam import BeamJarExpansionService
from apache_beam import ExternalTransform
from apache_beam import NamedTupleBasedPayloadBuilder

__all__ = [
    'WriteToKinesis',
    'ReadDataFromKinesis',
    'InitialPositionInStream',
    'WatermarkPolicy',
]


def default_io_expansion_service():
  """Returns the expansion service used when the caller supplies none.

  The service is a BeamJarExpansionService built from the Kinesis
  expansion-service shadow jar Gradle target.
  """
  return BeamJarExpansionService(
      'sdks:java:io:kinesis:expansion-service:shadowJar')


# Configuration payload sent to the expansion service for WriteToKinesis.
WriteToKinesisSchema = NamedTuple(
    'WriteToKinesisSchema',
    [
        ('stream_name', str),
        ('aws_access_key', str),
        ('aws_secret_key', str),
        ('region', str),
        ('partition_key', str),
        ('service_endpoint', Optional[str]),
        ('verify_certificate', Optional[bool]),
        ('producer_properties', Optional[Mapping[str, str]]),
    ],
)


class WriteToKinesis(ExternalTransform):
  """
    An external PTransform which writes byte array stream to Amazon Kinesis.

    Experimental; no backwards compatibility guarantees.
  """
  URN = 'beam:transform:org.apache.beam:kinesis_write:v1'

  def __init__(
      self,
      stream_name,
      aws_access_key,
      aws_secret_key,
      region,
      partition_key,
      service_endpoint=None,
      verify_certificate=None,
      producer_properties=None,
      expansion_service=None,
  ):
    """
    Initializes a write operation to Kinesis.

    :param stream_name: Kinesis stream name.
    :param aws_access_key: Kinesis access key.
    :param aws_secret_key: Kinesis access key secret.
    :param region: AWS region. Example: 'us-east-1'.
    :param partition_key: Specify default partition key.
    :param service_endpoint: Kinesis service endpoint
    :param verify_certificate: Enable or disable certificate verification.
        Never set to False on production. True by default.
    :param producer_properties: Specify the configuration properties for Kinesis
        Producer Library (KPL) as dictionary.
        Example: {'CollectionMaxCount': '1000', 'ConnectTimeout': '10000'}
    :param expansion_service: The address (host:port) of the ExpansionService.
        When omitted, default_io_expansion_service() is used.
    """
    super().__init__(
        self.URN,
        NamedTupleBasedPayloadBuilder(
            WriteToKinesisSchema(
                stream_name=stream_name,
                aws_access_key=aws_access_key,
                aws_secret_key=aws_secret_key,
                region=region,
                partition_key=partition_key,
                service_endpoint=service_endpoint,
                verify_certificate=verify_certificate,
                producer_properties=producer_properties,
            )),
        expansion_service or default_io_expansion_service(),
    )


# Configuration payload sent to the expansion service for ReadDataFromKinesis.
ReadFromKinesisSchema = NamedTuple(
    'ReadFromKinesisSchema',
    [
        ('stream_name', str),
        ('aws_access_key', str),
        ('aws_secret_key', str),
        ('region', str),
        ('service_endpoint', Optional[str]),
        ('verify_certificate', Optional[bool]),
        ('max_num_records', Optional[int]),
        ('max_read_time', Optional[int]),
        ('initial_position_in_stream', Optional[str]),
        ('initial_timestamp_in_stream', Optional[int]),
        ('request_records_limit', Optional[int]),
        ('up_to_date_threshold', Optional[int]),
        ('max_capacity_per_shard', Optional[int]),
        ('watermark_policy', Optional[str]),
        ('watermark_idle_duration_threshold', Optional[int]),
        ('rate_limit', Optional[int]),
    ],
)


class ReadDataFromKinesis(ExternalTransform):
  """
    An external PTransform which reads byte array stream from Amazon Kinesis.

    Experimental; no backwards compatibility guarantees.
  """
  URN = 'beam:transform:org.apache.beam:kinesis_read_data:v1'

  def __init__(
      self,
      stream_name,
      aws_access_key,
      aws_secret_key,
      region,
      service_endpoint=None,
      verify_certificate=None,
      max_num_records=None,
      max_read_time=None,
      initial_position_in_stream=None,
      initial_timestamp_in_stream=None,
      request_records_limit=None,
      up_to_date_threshold=None,
      max_capacity_per_shard=None,
      watermark_policy=None,
      watermark_idle_duration_threshold=None,
      rate_limit=None,
      expansion_service=None,
  ):
    """
    Initializes a read operation from Kinesis.

    :param stream_name: Kinesis stream name.
    :param aws_access_key: Kinesis access key.
    :param aws_secret_key: Kinesis access key secret.
    :param region: AWS region. Example: 'us-east-1'.
    :param service_endpoint: Kinesis service endpoint
    :param verify_certificate: Enable or disable certificate verification.
        Never set to False on production. True by default.
    :param max_num_records: Specifies to read at most a given number of records.
        Must be greater than 0.
    :param max_read_time: Specifies to read records during x milliseconds.
    :param initial_timestamp_in_stream: Specify reading beginning at the given
        timestamp in milliseconds. Must be in the past.
    :param initial_position_in_stream: Specify reading from some initial
        position in stream. Possible values:
        LATEST - Start after the most recent data record (fetch new data).
        TRIM_HORIZON - Start from the oldest available data record.
        AT_TIMESTAMP - Start from the record at or after the specified
        server-side timestamp.
    :param request_records_limit: Specifies the maximum number of records in
        GetRecordsResult returned by GetRecords call which is limited by 10K
        records. It should be adjusted according to average size of data record
        to prevent shard overloading. More at:
        docs.aws.amazon.com/kinesis/latest/APIReference/API_GetRecords.html
    :param up_to_date_threshold: Specifies how late in milliseconds records
        consumed by this source can be to still be considered on time. Defaults
        to zero.
    :param max_capacity_per_shard: Specifies the maximum number of messages per
        one shard. Defaults to 10'000.
    :param watermark_policy: Specifies the watermark policy. Possible values:
        PROCESSING_TYPE, ARRIVAL_TIME. Defaults to ARRIVAL_TIME.
    :param watermark_idle_duration_threshold: Use only when watermark policy is
        ARRIVAL_TIME. Denotes the duration for which the watermark can be idle.
        Passed in milliseconds.
    :param rate_limit: Sets fixed rate policy for given milliseconds value. By
        default there is no rate limit.
    :param expansion_service: The address (host:port) of the ExpansionService.
        When omitted, default_io_expansion_service() is used.
    :raises RuntimeError: if watermark_policy or initial_position_in_stream is
        not one of the supported values.
    :raises AssertionError: if watermark_idle_duration_threshold is given
        without the ARRIVAL_TIME policy, or request_records_limit is outside
        the range 1..10000.
    """
    WatermarkPolicy.validate_param(watermark_policy)
    InitialPositionInStream.validate_param(initial_position_in_stream)

    if watermark_idle_duration_threshold:
      assert WatermarkPolicy.ARRIVAL_TIME == watermark_policy, (
          'watermark_idle_duration_threshold requires watermark_policy to be '
          'ARRIVAL_TIME')

    # Compare against None so that an explicit 0 is rejected by the range
    # check instead of silently bypassing it.
    if request_records_limit is not None:
      assert 0 < request_records_limit <= 10000, (
          'request_records_limit must be in the range 1..10000')

    initial_timestamp_in_stream = int(
        initial_timestamp_in_stream) if initial_timestamp_in_stream else None

    if initial_timestamp_in_stream is not None:
      # The timestamp is expressed in milliseconds while time.time() returns
      # seconds, so scale "now" before comparing. Only a timestamp that lies
      # in the future violates the "must be in the past" requirement.
      now_millis = int(time.time() * 1000)
      if initial_timestamp_in_stream > now_millis:
        logging.warning('Provided timestamp emplaced not in the past.')

    super().__init__(
        self.URN,
        NamedTupleBasedPayloadBuilder(
            ReadFromKinesisSchema(
                stream_name=stream_name,
                aws_access_key=aws_access_key,
                aws_secret_key=aws_secret_key,
                region=region,
                service_endpoint=service_endpoint,
                verify_certificate=verify_certificate,
                max_num_records=max_num_records,
                max_read_time=max_read_time,
                initial_position_in_stream=initial_position_in_stream,
                initial_timestamp_in_stream=initial_timestamp_in_stream,
                request_records_limit=request_records_limit,
                up_to_date_threshold=up_to_date_threshold,
                max_capacity_per_shard=max_capacity_per_shard,
                watermark_policy=watermark_policy,
                watermark_idle_duration_threshold=
                watermark_idle_duration_threshold,
                rate_limit=rate_limit,
            )),
        expansion_service or default_io_expansion_service(),
    )


class InitialPositionInStream:
  """String constants naming where in the stream reading starts."""
  LATEST = 'LATEST'
  TRIM_HORIZON = 'TRIM_HORIZON'
  AT_TIMESTAMP = 'AT_TIMESTAMP'

  @staticmethod
  def validate_param(param):
    """Raises RuntimeError unless param is falsy or a supported position.

    :param param: Candidate value; None (or any falsy value) is accepted and
        means "not specified".
    :raises RuntimeError: if param is truthy and not one of LATEST,
        TRIM_HORIZON or AT_TIMESTAMP.
    """
    # Membership is checked against the explicit constants rather than with
    # hasattr(), which would also accept unrelated attribute names such as
    # 'validate_param' or '__doc__'.
    supported = (
        InitialPositionInStream.LATEST,
        InitialPositionInStream.TRIM_HORIZON,
        InitialPositionInStream.AT_TIMESTAMP,
    )
    if param and param not in supported:
      raise RuntimeError('Invalid initial position in stream: {}'.format(param))


class WatermarkPolicy:
  """String constants naming the supported watermark policies."""
  PROCESSING_TYPE = 'PROCESSING_TYPE'
  ARRIVAL_TIME = 'ARRIVAL_TIME'

  @staticmethod
  def validate_param(param):
    """Raises RuntimeError unless param is falsy or a supported policy.

    :param param: Candidate value; None (or any falsy value) is accepted and
        means "not specified".
    :raises RuntimeError: if param is truthy and not one of PROCESSING_TYPE
        or ARRIVAL_TIME.
    """
    # Membership is checked against the explicit constants rather than with
    # hasattr(), which would also accept unrelated attribute names such as
    # 'validate_param' or '__doc__'.
    supported = (
        WatermarkPolicy.PROCESSING_TYPE,
        WatermarkPolicy.ARRIVAL_TIME,
    )
    if param and param not in supported:
      raise RuntimeError('Invalid watermark policy: {}'.format(param))