github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/external/gcp/pubsub.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  # pytype: skip-file
    19  
    20  import typing
    21  
    22  import apache_beam as beam
    23  from apache_beam.io.gcp import pubsub
    24  from apache_beam.transforms import Map
    25  from apache_beam.transforms.external import ExternalTransform
    26  from apache_beam.transforms.external import NamedTupleBasedPayloadBuilder
    27  
    28  ReadFromPubsubSchema = typing.NamedTuple(
    29      'ReadFromPubsubSchema',
    30      [
    31          ('topic', typing.Optional[str]),
    32          ('subscription', typing.Optional[str]),
    33          ('id_label', typing.Optional[str]),
    34          ('with_attributes', bool),
    35          ('timestamp_attribute', typing.Optional[str]),
    36      ])
    37  
    38  
    39  class ReadFromPubSub(beam.PTransform):
    40    """An external ``PTransform`` for reading from Cloud Pub/Sub.
    41  
    42    Experimental; no backwards compatibility guarantees.  It requires special
    43    preparation of the Java SDK.  See https://github.com/apache/beam/issues/19728.
    44    """
    45  
    46    URN = 'beam:transform:org.apache.beam:pubsub_read:v1'
    47  
    48    def __init__(
    49        self,
    50        topic=None,
    51        subscription=None,
    52        id_label=None,
    53        with_attributes=False,
    54        timestamp_attribute=None,
    55        expansion_service=None):
    56      """Initializes ``ReadFromPubSub``.
    57  
    58      Args:
    59        topic: Cloud Pub/Sub topic in the form
    60          "projects/<project>/topics/<topic>". If provided, subscription must be
    61          None.
    62        subscription: Existing Cloud Pub/Sub subscription to use in the
    63          form "projects/<project>/subscriptions/<subscription>". If not
    64          specified, a temporary subscription will be created from the specified
    65          topic. If provided, topic must be None.
    66        id_label: The attribute on incoming Pub/Sub messages to use as a unique
    67          record identifier. When specified, the value of this attribute (which
    68          can be any string that uniquely identifies the record) will be used for
    69          deduplication of messages. If not provided, we cannot guarantee
    70          that no duplicate data will be delivered on the Pub/Sub stream. In this
    71          case, deduplication of the stream will be strictly best effort.
    72        with_attributes:
    73          True - output elements will be
    74          :class:`~apache_beam.io.gcp.pubsub.PubsubMessage` objects.
    75          False - output elements will be of type ``bytes`` (message
    76          data only).
    77        timestamp_attribute: Message value to use as element timestamp. If None,
    78          uses message publishing time as the timestamp.
    79  
    80          Timestamp values should be in one of two formats:
    81  
    82          - A numerical value representing the number of milliseconds since the
    83            Unix epoch.
    84          - A string in RFC 3339 format, UTC timezone. Example:
    85            ``2015-10-29T23:41:41.123Z``. The sub-second component of the
    86            timestamp is optional, and digits beyond the first three (i.e., time
    87            units smaller than milliseconds) may be ignored.
    88      """
    89      self.params = ReadFromPubsubSchema(
    90          topic=topic,
    91          subscription=subscription,
    92          id_label=id_label,
    93          with_attributes=with_attributes,
    94          timestamp_attribute=timestamp_attribute)
    95      self.expansion_service = expansion_service
    96  
    97    def expand(self, pbegin):
    98      pcoll = pbegin.apply(
    99          ExternalTransform(
   100              self.URN,
   101              NamedTupleBasedPayloadBuilder(self.params),
   102              self.expansion_service))
   103  
   104      if self.params.with_attributes:
   105        pcoll = pcoll | 'FromProto' >> Map(pubsub.PubsubMessage._from_proto_str)
   106        pcoll.element_type = pubsub.PubsubMessage
   107      else:
   108        pcoll.element_type = bytes
   109      return pcoll
   110  
   111  
   112  WriteToPubsubSchema = typing.NamedTuple(
   113      'WriteToPubsubSchema',
   114      [
   115          ('topic', str),
   116          ('id_label', typing.Optional[str]),
   117          # this is not implemented yet on the Java side:
   118          # ('with_attributes', bool),
   119          ('timestamp_attribute', typing.Optional[str]),
   120      ])
   121  
   122  
   123  class WriteToPubSub(beam.PTransform):
   124    """An external ``PTransform`` for writing messages to Cloud Pub/Sub.
   125  
   126    Experimental; no backwards compatibility guarantees.  It requires special
   127    preparation of the Java SDK.  See https://github.com/apache/beam/issues/19728.
   128    """
   129  
   130    URN = 'beam:transform:org.apache.beam:pubsub_write:v1'
   131  
   132    def __init__(
   133        self,
   134        topic,
   135        with_attributes=False,
   136        id_label=None,
   137        timestamp_attribute=None,
   138        expansion_service=None):
   139      """Initializes ``WriteToPubSub``.
   140  
   141      Args:
   142        topic: Cloud Pub/Sub topic in the form "/topics/<project>/<topic>".
   143        with_attributes:
   144          True - input elements will be
   145          :class:`~apache_beam.io.gcp.pubsub.PubsubMessage` objects.
   146          False - input elements will be of type ``bytes`` (message
   147          data only).
   148        id_label: If set, will set an attribute for each Cloud Pub/Sub message
   149          with the given name and a unique value. This attribute can then be used
   150          in a ReadFromPubSub PTransform to deduplicate messages.
   151        timestamp_attribute: If set, will set an attribute for each Cloud Pub/Sub
   152          message with the given name and the message's publish time as the value.
   153      """
   154      self.params = WriteToPubsubSchema(
   155          topic=topic,
   156          id_label=id_label,
   157          # with_attributes=with_attributes,
   158          timestamp_attribute=timestamp_attribute)
   159      self.expansion_service = expansion_service
   160      self.with_attributes = with_attributes
   161  
   162    def expand(self, pvalue):
   163      if self.with_attributes:
   164        pcoll = pvalue | 'ToProto' >> Map(pubsub.WriteToPubSub.to_proto_str)
   165      else:
   166        pcoll = pvalue | 'ToProto' >> Map(
   167            lambda x: pubsub.PubsubMessage(x, {})._to_proto_str())
   168      pcoll.element_type = bytes
   169  
   170      return pcoll.apply(
   171          ExternalTransform(
   172              self.URN,
   173              NamedTupleBasedPayloadBuilder(self.params),
   174              self.expansion_service))