github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/kafka.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Unbounded source and sink transforms for
`Kafka <http://kafka.apache.org/>`_.

These transforms are currently supported by Beam portable runners (for
example, portable Flink and Spark) as well as the Dataflow runner.

**Setup**

Transforms provided in this module are cross-language transforms
implemented in the Beam Java SDK. During pipeline construction, the Python SDK
will connect to a Java expansion service to expand these transforms.
To facilitate this, a small amount of setup is needed before using these
transforms in a Beam Python pipeline.

There are several ways to set up cross-language Kafka transforms.

* Option 1: use the default expansion service
* Option 2: specify a custom expansion service

See below for details regarding each of these options.

*Option 1: Use the default expansion service*

This is the recommended and easiest setup option for using Python Kafka
transforms. This option is only available for Beam 2.22.0 and later.

This option requires the following prerequisites before running the Beam
pipeline.

* Install a Java runtime on the computer from which the pipeline is
  constructed and make sure that the 'java' command is available.

In this option, the Python SDK will either download (for released Beam
versions) or build (when running from a Beam Git clone) an expansion service
jar and use that to expand transforms. Currently Kafka transforms use the
'beam-sdks-java-io-expansion-service' jar for this purpose.

*Option 2: specify a custom expansion service*

In this option, you start up your own expansion service and provide that as
a parameter when using the transforms provided in this module.

This option requires the following prerequisites before running the Beam
pipeline.

* Start up your own expansion service.
* Update your pipeline to provide the expansion service address when
  initiating Kafka transforms provided in this module.

Flink users can use the built-in expansion service of the Flink runner's
job server. If you start Flink's job server, the expansion service will be
started on port 8097. For a different address, please set the
expansion_service parameter.
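
For example, a minimal read sketch with an explicitly provided expansion
service address (the broker address, topic name, and expansion service
address below are illustrative placeholders, and ``pipeline`` is assumed to
be an existing ``Pipeline`` object)::

    records = (
        pipeline
        | ReadFromKafka(
            consumer_config={'bootstrap.servers': 'localhost:9092'},
            topics=['my_topic'],
            # Omit expansion_service to use the default expansion service
            # (Option 1); pass an address to use a custom one (Option 2),
            # e.g. the Flink job server's expansion service on port 8097.
            expansion_service='localhost:8097'))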

**More information**

For more information regarding cross-language transforms see:
- https://beam.apache.org/roadmap/portability/

For more information specific to Flink runner see:
- https://beam.apache.org/documentation/runners/flink/
"""

# pytype: skip-file

import typing

from apache_beam.transforms.external import BeamJarExpansionService
from apache_beam.transforms.external import ExternalTransform
from apache_beam.transforms.external import NamedTupleBasedPayloadBuilder

ReadFromKafkaSchema = typing.NamedTuple(
    'ReadFromKafkaSchema',
    [('consumer_config', typing.Mapping[str, str]),
     ('topics', typing.List[str]), ('key_deserializer', str),
     ('value_deserializer', str), ('start_read_time', typing.Optional[int]),
     ('max_num_records', typing.Optional[int]),
     ('max_read_time', typing.Optional[int]),
     ('commit_offset_in_finalize', bool), ('timestamp_policy', str)])


def default_io_expansion_service(append_args=None):
  return BeamJarExpansionService(
      'sdks:java:io:expansion-service:shadowJar', append_args=append_args)


class ReadFromKafka(ExternalTransform):
  """
  An external PTransform which reads from Kafka and returns a KV pair for
  each item in the specified Kafka topics. If no Kafka Deserializer for
  key/value is provided, then the data will be returned as a raw byte array.

  Experimental; no backwards compatibility guarantees.
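
  For example, a minimal sketch of reading 64-bit integer keys and values
  (the broker address and topic name below are illustrative placeholders,
  and ``pipeline`` is assumed to be an existing ``Pipeline`` object)::

    pipeline | ReadFromKafka(
        consumer_config={
            'bootstrap.servers': 'localhost:9092',
            'auto.offset.reset': 'earliest'
        },
        topics=['my_topic'],
        key_deserializer='org.apache.kafka.common.serialization.'
                         'LongDeserializer',
        value_deserializer='org.apache.kafka.common.serialization.'
                           'LongDeserializer')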
  """

  # Returns the key/value data as raw byte arrays
  byte_array_deserializer = (
      'org.apache.kafka.common.serialization.ByteArrayDeserializer')

  processing_time_policy = 'ProcessingTime'
  create_time_policy = 'CreateTime'
  log_append_time = 'LogAppendTime'

  URN_WITH_METADATA = (
      'beam:transform:org.apache.beam:kafka_read_with_metadata:v1')
  URN_WITHOUT_METADATA = (
      'beam:transform:org.apache.beam:kafka_read_without_metadata:v1')

  def __init__(
      self,
      consumer_config,
      topics,
      key_deserializer=byte_array_deserializer,
      value_deserializer=byte_array_deserializer,
      start_read_time=None,
      max_num_records=None,
      max_read_time=None,
      commit_offset_in_finalize=False,
      timestamp_policy=processing_time_policy,
      with_metadata=False,
      expansion_service=None,
  ):
    """
    Initializes a read operation from Kafka.

    :param consumer_config: A dictionary containing the consumer configuration.
    :param topics: A list of topic strings.
    :param key_deserializer: A fully-qualified Java class name of a Kafka
      Deserializer for the topic's key, e.g.
      'org.apache.kafka.common.serialization.LongDeserializer'.
      Default: 'org.apache.kafka.common.serialization.ByteArrayDeserializer'.
    :param value_deserializer: A fully-qualified Java class name of a Kafka
      Deserializer for the topic's value, e.g.
      'org.apache.kafka.common.serialization.LongDeserializer'.
      Default: 'org.apache.kafka.common.serialization.ByteArrayDeserializer'.
    :param start_read_time: A timestamp, in milliseconds since the epoch,
      used to set the start offset.
    :param max_num_records: Maximum number of records to be read. Mainly used
      for tests and demo applications.
    :param max_read_time: Maximum amount of time in seconds the transform
      executes. Mainly used for tests and demo applications.
    :param commit_offset_in_finalize: Whether to commit offsets when
      finalizing.
    :param timestamp_policy: The built-in timestamp policy which is used for
      extracting the timestamp from a KafkaRecord.
    :param with_metadata: Whether the returned PCollection should contain
      Kafka-related metadata or not. If False (default), elements of the
      returned PCollection will be of type 'bytes'; if True, elements will be
      of type 'Row'. Note that this currently only works when using the
      default key and value deserializers, where the Java Kafka reader reads
      keys and values as 'byte[]'.
    :param expansion_service: The address (host:port) of the ExpansionService.
    """
    if timestamp_policy not in [ReadFromKafka.processing_time_policy,
                                ReadFromKafka.create_time_policy,
                                ReadFromKafka.log_append_time]:
      raise ValueError(
          'timestamp_policy should be one of '
          '[ProcessingTime, CreateTime, LogAppendTime]')

    super().__init__(
        self.URN_WITH_METADATA if with_metadata else self.URN_WITHOUT_METADATA,
        NamedTupleBasedPayloadBuilder(
            ReadFromKafkaSchema(
                consumer_config=consumer_config,
                topics=topics,
                key_deserializer=key_deserializer,
                value_deserializer=value_deserializer,
                max_num_records=max_num_records,
                max_read_time=max_read_time,
                start_read_time=start_read_time,
                commit_offset_in_finalize=commit_offset_in_finalize,
                timestamp_policy=timestamp_policy)),
        expansion_service or default_io_expansion_service())


WriteToKafkaSchema = typing.NamedTuple(
    'WriteToKafkaSchema',
    [
        ('producer_config', typing.Mapping[str, str]),
        ('topic', str),
        ('key_serializer', str),
        ('value_serializer', str),
    ])


class WriteToKafka(ExternalTransform):
  """
  An external PTransform which writes KV data to a specified Kafka topic.
  If no Kafka Serializer for key/value is provided, then key/value are
  assumed to be byte arrays.

  Experimental; no backwards compatibility guarantees.
  """

  # Default serializer which passes raw bytes to Kafka
  byte_array_serializer = (
      'org.apache.kafka.common.serialization.ByteArraySerializer')

  URN = 'beam:transform:org.apache.beam:kafka_write:v1'

  def __init__(
      self,
      producer_config,
      topic,
      key_serializer=byte_array_serializer,
      value_serializer=byte_array_serializer,
      expansion_service=None):
    """
    Initializes a write operation to Kafka.

    :param producer_config: A dictionary containing the producer configuration.
    :param topic: A Kafka topic name.
    :param key_serializer: A fully-qualified Java class name of a Kafka
      Serializer for the topic's key, e.g.
      'org.apache.kafka.common.serialization.LongSerializer'.
      Default: 'org.apache.kafka.common.serialization.ByteArraySerializer'.
    :param value_serializer: A fully-qualified Java class name of a Kafka
      Serializer for the topic's value, e.g.
      'org.apache.kafka.common.serialization.LongSerializer'.
      Default: 'org.apache.kafka.common.serialization.ByteArraySerializer'.
    :param expansion_service: The address (host:port) of the ExpansionService.
    """
    super().__init__(
        self.URN,
        NamedTupleBasedPayloadBuilder(
            WriteToKafkaSchema(
                producer_config=producer_config,
                topic=topic,
                key_serializer=key_serializer,
                value_serializer=value_serializer,
            )),
        expansion_service or default_io_expansion_service())