github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/kafka.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Unbounded source and sink transforms for
   `Kafka <http://kafka.apache.org/>`_.

  These transforms are currently supported by Beam portable runners (for
  example, portable Flink and Spark) as well as the Dataflow runner.

  **Setup**

  Transforms provided in this module are cross-language transforms
  implemented in the Beam Java SDK. During pipeline construction, the Python
  SDK will connect to a Java expansion service to expand these transforms.
  To facilitate this, a small amount of setup is needed before using these
  transforms in a Beam Python pipeline.

  There are two ways to set up cross-language Kafka transforms.

  * Option 1: use the default expansion service
  * Option 2: specify a custom expansion service

  See below for details regarding each of these options.

  *Option 1: Use the default expansion service*

  This is the recommended and easiest setup option for using Python Kafka
  transforms. This option is only available for Beam 2.22.0 and later.

  This option requires the following prerequisites before running the Beam
  pipeline.

  * Install a Java runtime on the computer from which the pipeline is
    constructed and make sure that the 'java' command is available.

  In this option, the Python SDK will either download (for released Beam
  versions) or build (when running from a Beam Git clone) an expansion service
  jar and use that to expand transforms. Currently, Kafka transforms use the
  'beam-sdks-java-io-expansion-service' jar for this purpose.
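
  For example, a minimal read pipeline using the default expansion service
  might look like the following (a sketch only; the broker address and topic
  name are placeholders, not values from this module). No 'expansion_service'
  argument is needed; the default service is started automatically::

    import apache_beam as beam
    from apache_beam.io.kafka import ReadFromKafka

    with beam.Pipeline() as pipeline:
      _ = (
          pipeline
          | ReadFromKafka(
              consumer_config={'bootstrap.servers': 'localhost:9092'},
              topics=['my_topic'])
          | beam.Map(print))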

  *Option 2: specify a custom expansion service*

  In this option, you start up your own expansion service and provide that as
  a parameter when using the transforms provided in this module.

  This option requires the following prerequisites before running the Beam
  pipeline.

  * Start up your own expansion service.
  * Update your pipeline to provide the expansion service address when
    initiating Kafka transforms provided in this module.

  Flink users can use the built-in Expansion Service of the Flink Runner's
  Job Server. If you start Flink's Job Server, the expansion service will be
  started on port 8097. For a different address, please set the
  expansion_service parameter.
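
  For example (a sketch; 'pipeline' is an already constructed Pipeline, and
  'localhost:8097' stands in for the address of whatever expansion service
  you started)::

    from apache_beam.io.kafka import ReadFromKafka

    kafka_records = pipeline | ReadFromKafka(
        consumer_config={'bootstrap.servers': 'localhost:9092'},
        topics=['my_topic'],
        expansion_service='localhost:8097')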

  **More information**

  For more information regarding cross-language transforms see:
  - https://beam.apache.org/roadmap/portability/

  For more information specific to the Flink runner see:
  - https://beam.apache.org/documentation/runners/flink/
"""

# pytype: skip-file

import typing

from apache_beam.transforms.external import BeamJarExpansionService
from apache_beam.transforms.external import ExternalTransform
from apache_beam.transforms.external import NamedTupleBasedPayloadBuilder

ReadFromKafkaSchema = typing.NamedTuple(
    'ReadFromKafkaSchema',
    [('consumer_config', typing.Mapping[str, str]),
     ('topics', typing.List[str]), ('key_deserializer', str),
     ('value_deserializer', str), ('start_read_time', typing.Optional[int]),
     ('max_num_records', typing.Optional[int]),
     ('max_read_time', typing.Optional[int]),
     ('commit_offset_in_finalize', bool), ('timestamp_policy', str)])


def default_io_expansion_service(append_args=None):
  """Returns the default expansion service for Kafka IO transforms."""
  return BeamJarExpansionService(
      'sdks:java:io:expansion-service:shadowJar', append_args=append_args)


class ReadFromKafka(ExternalTransform):
  """
    An external PTransform which reads from Kafka and returns a KV pair for
    each item in the specified Kafka topics. If no Kafka Deserializer for
    key/value is provided, then the data will be returned as a raw byte array.
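
    For example, with the default deserializers each element is a
    '(key, value)' tuple of bytes. A sketch of decoding the values downstream
    ('pipeline' is an already constructed Pipeline; the UTF-8 encoding is an
    assumption about the producer, not something this transform enforces)::

      import apache_beam as beam
      from apache_beam.io.kafka import ReadFromKafka

      records = (
          pipeline
          | ReadFromKafka(
              consumer_config={'bootstrap.servers': 'localhost:9092'},
              topics=['my_topic'])
          | beam.Map(lambda kv: (kv[0], kv[1].decode('utf-8'))))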

    Experimental; no backwards compatibility guarantees.
  """

  # Returns the key/value data as raw byte arrays
  byte_array_deserializer = (
      'org.apache.kafka.common.serialization.ByteArrayDeserializer')

  processing_time_policy = 'ProcessingTime'
  create_time_policy = 'CreateTime'
  log_append_time = 'LogAppendTime'

  URN_WITH_METADATA = (
      'beam:transform:org.apache.beam:kafka_read_with_metadata:v1')
  URN_WITHOUT_METADATA = (
      'beam:transform:org.apache.beam:kafka_read_without_metadata:v1')

  def __init__(
      self,
      consumer_config,
      topics,
      key_deserializer=byte_array_deserializer,
      value_deserializer=byte_array_deserializer,
      start_read_time=None,
      max_num_records=None,
      max_read_time=None,
      commit_offset_in_finalize=False,
      timestamp_policy=processing_time_policy,
      with_metadata=False,
      expansion_service=None,
  ):
    """
    Initializes a read operation from Kafka.

    :param consumer_config: A dictionary containing the consumer configuration.
    :param topics: A list of topic strings.
    :param key_deserializer: A fully-qualified Java class name of a Kafka
        Deserializer for the topic's key, e.g.
        'org.apache.kafka.common.serialization.LongDeserializer'.
        Default: 'org.apache.kafka.common.serialization.ByteArrayDeserializer'.
    :param value_deserializer: A fully-qualified Java class name of a Kafka
        Deserializer for the topic's value, e.g.
        'org.apache.kafka.common.serialization.LongDeserializer'.
        Default: 'org.apache.kafka.common.serialization.ByteArrayDeserializer'.
    :param start_read_time: A timestamp, in milliseconds since the epoch, used
        to set the start offset for reading.
    :param max_num_records: Maximum number of records to be read. Mainly used
        for tests and demo applications.
    :param max_read_time: Maximum amount of time in seconds the transform
        executes. Mainly used for tests and demo applications.
    :param commit_offset_in_finalize: Whether to commit offsets when finalizing.
    :param timestamp_policy: The built-in timestamp policy which is used for
        extracting the timestamp from the KafkaRecord.
    :param with_metadata: Whether the returned PCollection should contain
        Kafka-related metadata. If False (default), elements of the returned
        PCollection will be of type 'bytes'; if True, elements of the returned
        PCollection will be of type 'Row'. Note that, currently, this only
        works when using the default key and value deserializers, where the
        Java Kafka reader reads keys and values as 'byte[]'.
    :param expansion_service: The address (host:port) of the ExpansionService.
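
    For example, 'start_read_time' can be derived from a Python 'datetime'
    (a sketch; only the conversion to milliseconds since the epoch is the
    point here, the chosen date is arbitrary)::

      import datetime
      start = datetime.datetime(2023, 1, 1, tzinfo=datetime.timezone.utc)
      start_read_time = int(start.timestamp() * 1000)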
    """
    if timestamp_policy not in [ReadFromKafka.processing_time_policy,
                                ReadFromKafka.create_time_policy,
                                ReadFromKafka.log_append_time]:
      raise ValueError(
          'timestamp_policy should be one of '
          '[ProcessingTime, CreateTime, LogAppendTime]')

    super().__init__(
        self.URN_WITH_METADATA if with_metadata else self.URN_WITHOUT_METADATA,
        NamedTupleBasedPayloadBuilder(
            ReadFromKafkaSchema(
                consumer_config=consumer_config,
                topics=topics,
                key_deserializer=key_deserializer,
                value_deserializer=value_deserializer,
                max_num_records=max_num_records,
                max_read_time=max_read_time,
                start_read_time=start_read_time,
                commit_offset_in_finalize=commit_offset_in_finalize,
                timestamp_policy=timestamp_policy)),
        expansion_service or default_io_expansion_service())


WriteToKafkaSchema = typing.NamedTuple(
    'WriteToKafkaSchema',
    [
        ('producer_config', typing.Mapping[str, str]),
        ('topic', str),
        ('key_serializer', str),
        ('value_serializer', str),
    ])


class WriteToKafka(ExternalTransform):
  """
    An external PTransform which writes KV data to a specified Kafka topic.
    If no Kafka Serializer for key/value is provided, then key/value are
    assumed to be byte arrays.
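
    For example, Python strings can be encoded to key/value byte pairs before
    writing (a sketch; 'pipeline' is an already constructed Pipeline, and the
    broker address, topic name, and use of an empty key are placeholders)::

      import apache_beam as beam
      from apache_beam.io.kafka import WriteToKafka

      _ = (
          pipeline
          | beam.Create(['hello', 'world'])
          | beam.Map(lambda s: (b'', s.encode('utf-8')))
          | WriteToKafka(
              producer_config={'bootstrap.servers': 'localhost:9092'},
              topic='my_topic'))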

    Experimental; no backwards compatibility guarantees.
  """

  # Default serializer which passes raw bytes to Kafka
  byte_array_serializer = (
      'org.apache.kafka.common.serialization.ByteArraySerializer')

  URN = 'beam:transform:org.apache.beam:kafka_write:v1'

  def __init__(
      self,
      producer_config,
      topic,
      key_serializer=byte_array_serializer,
      value_serializer=byte_array_serializer,
      expansion_service=None):
    """
    Initializes a write operation to Kafka.

    :param producer_config: A dictionary containing the producer configuration.
    :param topic: A Kafka topic name.
    :param key_serializer: A fully-qualified Java class name of a Kafka
        Serializer for the topic's key, e.g.
        'org.apache.kafka.common.serialization.LongSerializer'.
        Default: 'org.apache.kafka.common.serialization.ByteArraySerializer'.
    :param value_serializer: A fully-qualified Java class name of a Kafka
        Serializer for the topic's value, e.g.
        'org.apache.kafka.common.serialization.LongSerializer'.
        Default: 'org.apache.kafka.common.serialization.ByteArraySerializer'.
    :param expansion_service: The address (host:port) of the ExpansionService.
    """
    super().__init__(
        self.URN,
        NamedTupleBasedPayloadBuilder(
            WriteToKafkaSchema(
                producer_config=producer_config,
                topic=topic,
                key_serializer=key_serializer,
                value_serializer=value_serializer,
            )),
        expansion_service or default_io_expansion_service())