#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Google Cloud PubSub sources and sinks.

Cloud Pub/Sub sources and sinks are currently supported only in streaming
pipelines, during remote execution.

This API is currently under development and is subject to change.

**Updates to the I/O connector code**

For any significant updates to this I/O connector, please consider involving
corresponding code reviewers mentioned in
https://github.com/apache/beam/blob/master/sdks/python/OWNERS
"""

# pytype: skip-file

import re
from typing import Any
from typing import List
from typing import NamedTuple
from typing import Optional
from typing import Tuple

from apache_beam import coders
from apache_beam.io.iobase import Read
from apache_beam.io.iobase import Write
from apache_beam.runners.dataflow.native_io import iobase as dataflow_io
from apache_beam.transforms import Flatten
from apache_beam.transforms import Map
from apache_beam.transforms import PTransform
from apache_beam.transforms.display import DisplayDataItem
from apache_beam.utils.annotations import deprecated

try:
  from google.cloud import pubsub
except ImportError:
  pubsub = None

__all__ = [
    'MultipleReadFromPubSub',
    'PubsubMessage',
    'PubSubSourceDescriptor',
    'ReadFromPubSub',
    'ReadStringsFromPubSub',
    'WriteStringsToPubSub',
    'WriteToPubSub'
]


class PubsubMessage(object):
  """Represents a Cloud Pub/Sub message.

  Message payload includes the data and attributes fields. For the payload to be
  valid, at least one of its fields must be non-empty.

  Attributes:
    data: (bytes) Message data. May be None.
    attributes: (dict) Key-value map of str to str, containing both user-defined
      and service generated attributes (such as id_label and
      timestamp_attribute). May be None.
    message_id: (str) ID of the message, assigned by the pubsub service when the
      message is published. Guaranteed to be unique within the topic. Will be
      reset to None if the message is being written to pubsub.
    publish_time: (datetime) Time at which the message was published. Will be
      reset to None if the Message is being written to pubsub.
    ordering_key: (str) If non-empty, identifies related messages for which
      publish order is respected by the PubSub subscription.
  """
  def __init__(
      self,
      data,
      attributes,
      message_id=None,
      publish_time=None,
      ordering_key=""):
    """Initializes a ``PubsubMessage``.

    Raises:
      ValueError: If ``data`` is None and ``attributes`` is empty or None.
    """
    if data is None and not attributes:
      # Interpolate the values into the message; passing them as extra
      # ValueError arguments would leave the '%r' placeholders unformatted.
      raise ValueError(
          'Either data (%r) or attributes (%r) must be set.' %
          (data, attributes))
    self.data = data
    self.attributes = attributes
    self.message_id = message_id
    self.publish_time = publish_time
    self.ordering_key = ordering_key

  def __hash__(self):
    # ``attributes`` may legitimately be None (data-only message), so fall back
    # to an empty mapping rather than failing on ``None.items()``.
    return hash((self.data, frozenset((self.attributes or {}).items())))

  def __eq__(self, other):
    # Equality only considers the payload (data and attributes).
    return isinstance(other, PubsubMessage) and (
        self.data == other.data and self.attributes == other.attributes)

  def __repr__(self):
    return 'PubsubMessage(%s, %s)' % (self.data, self.attributes)

  @staticmethod
  def _from_proto_str(proto_msg):
    # type: (bytes) -> PubsubMessage

    """Construct from serialized form of ``PubsubMessage``.

    Args:
      proto_msg: String containing a serialized protobuf of type
      https://cloud.google.com/pubsub/docs/reference/rpc/google.pubsub.v1#google.pubsub.v1.PubsubMessage

    Returns:
      A new PubsubMessage object.
    """
    msg = pubsub.types.PubsubMessage.deserialize(proto_msg)
    # Convert ScalarMapContainer to dict.
    attributes = {key: msg.attributes[key] for key in msg.attributes}
    return PubsubMessage(
        msg.data,
        attributes,
        msg.message_id,
        msg.publish_time,
        msg.ordering_key)

  def _to_proto_str(self, for_publish=False):
    """Get serialized form of ``PubsubMessage``.

    The serialized message is validated against pubsub message limits specified
    at https://cloud.google.com/pubsub/quotas#resource_limits

    Args:
      for_publish: bool, if True strip out message fields which cannot be
        published (currently message_id and publish_time) per
        https://cloud.google.com/pubsub/docs/reference/rpc/google.pubsub.v1#pubsubmessage

    Returns:
      A str containing a serialized protobuf of type
      https://cloud.google.com/pubsub/docs/reference/rpc/google.pubsub.v1#google.pubsub.v1.PubsubMessage
      containing the payload of this object.

    Raises:
      ValueError: If the data, attributes, ordering key or the serialized
        message exceed the limits checked below.
    """
    msg = pubsub.types.PubsubMessage()
    # An attributes-only message has data=None; skip the size check and the
    # assignment in that case instead of failing on len(None).
    if self.data is not None:
      if len(self.data) > (10 << 20):
        raise ValueError('A pubsub message data field must not exceed 10MB')
      msg.data = self.data

    if self.attributes:
      if len(self.attributes) > 100:
        raise ValueError(
            'A pubsub message must not have more than 100 attributes.')
      for key, value in self.attributes.items():
        if len(key) > 256:
          raise ValueError(
              'A pubsub message attribute key must not exceed 256 bytes.')
        if len(value) > 1024:
          raise ValueError(
              'A pubsub message attribute value must not exceed 1024 bytes')
        msg.attributes[key] = value

    if not for_publish:
      if self.message_id:
        msg.message_id = self.message_id
      if self.publish_time:
        msg.publish_time = self.publish_time

    # An empty (or None) ordering key is the proto default; nothing to set.
    if self.ordering_key:
      if len(self.ordering_key) > 1024:
        raise ValueError(
            'A pubsub message ordering key must not exceed 1024 bytes.')
      msg.ordering_key = self.ordering_key

    serialized = pubsub.types.PubsubMessage.serialize(msg)
    if len(serialized) > (10 << 20):
      raise ValueError(
          'Serialized pubsub message exceeds the publish request limit of 10MB')
    return serialized

  @staticmethod
  def _from_message(msg):
    # type: (Any) -> PubsubMessage

    """Construct from ``google.cloud.pubsub_v1.subscriber.message.Message``.

    https://googleapis.github.io/google-cloud-python/latest/pubsub/subscriber/api/message.html
    """
    # Convert ScalarMapContainer to dict.
    attributes = {key: msg.attributes[key] for key in msg.attributes}
    pubsubmessage = PubsubMessage(msg.data, attributes)
    # Only overwrite the constructor defaults with non-empty values.
    if msg.message_id:
      pubsubmessage.message_id = msg.message_id
    if msg.publish_time:
      pubsubmessage.publish_time = msg.publish_time
    if msg.ordering_key:
      pubsubmessage.ordering_key = msg.ordering_key
    return pubsubmessage


class ReadFromPubSub(PTransform):
  """A ``PTransform`` for reading from Cloud Pub/Sub."""

  # Implementation note: This ``PTransform`` is overridden by Directrunner.

  def __init__(
      self,
      topic=None,  # type: Optional[str]
      subscription=None,  # type: Optional[str]
      id_label=None,  # type: Optional[str]
      with_attributes=False,  # type: bool
      timestamp_attribute=None  # type: Optional[str]
  ):
    # type: (...) -> None

    """Initializes ``ReadFromPubSub``.

    Args:
      topic: Cloud Pub/Sub topic in the form
        "projects/<project>/topics/<topic>". If provided, subscription must be
        None.
      subscription: Existing Cloud Pub/Sub subscription to use in the
        form "projects/<project>/subscriptions/<subscription>". If not
        specified, a temporary subscription will be created from the specified
        topic. If provided, topic must be None.
      id_label: The attribute on incoming Pub/Sub messages to use as a unique
        record identifier. When specified, the value of this attribute (which
        can be any string that uniquely identifies the record) will be used for
        deduplication of messages. If not provided, we cannot guarantee
        that no duplicate data will be delivered on the Pub/Sub stream. In this
        case, deduplication of the stream will be strictly best effort.
      with_attributes:
        True - output elements will be :class:`~PubsubMessage` objects.
        False - output elements will be of type ``bytes`` (message
        data only).
      timestamp_attribute: Message value to use as element timestamp. If None,
        uses message publishing time as the timestamp.

        Timestamp values should be in one of two formats:

        - A numerical value representing the number of milliseconds since the
          Unix epoch.
        - A string in RFC 3339 format, UTC timezone. Example:
          ``2015-10-29T23:41:41.123Z``. The sub-second component of the
          timestamp is optional, and digits beyond the first three (i.e., time
          units smaller than milliseconds) may be ignored.
    """
    super().__init__()
    self.with_attributes = with_attributes
    self._source = _PubSubSource(
        topic=topic,
        subscription=subscription,
        id_label=id_label,
        with_attributes=self.with_attributes,
        timestamp_attribute=timestamp_attribute)

  def expand(self, pvalue):
    pcoll = pvalue.pipeline | Read(self._source)
    pcoll.element_type = bytes
    if self.with_attributes:
      # The source emits serialized protos; decode them into PubsubMessage.
      pcoll = pcoll | Map(PubsubMessage._from_proto_str)
      pcoll.element_type = PubsubMessage
    return pcoll

  def to_runner_api_parameter(self, context):
    # Required as this is identified by type in PTransformOverrides.
    # TODO(https://github.com/apache/beam/issues/18713): Use an actual URN here.
    return self.to_runner_api_pickled(context)


@deprecated(since='2.7.0', extra_message='Use ReadFromPubSub instead.')
def ReadStringsFromPubSub(topic=None, subscription=None, id_label=None):
  return _ReadStringsFromPubSub(topic, subscription, id_label)


class _ReadStringsFromPubSub(PTransform):
  """This class is deprecated. Use ``ReadFromPubSub`` instead."""
  def __init__(self, topic=None, subscription=None, id_label=None):
    super().__init__()
    self.topic = topic
    self.subscription = subscription
    self.id_label = id_label

  def expand(self, pvalue):
    p = (
        pvalue.pipeline
        | ReadFromPubSub(
            self.topic, self.subscription, self.id_label, with_attributes=False)
        | 'DecodeString' >> Map(lambda b: b.decode('utf-8')))
    p.element_type = str
    return p


@deprecated(since='2.7.0', extra_message='Use WriteToPubSub instead.')
def WriteStringsToPubSub(topic):
  return _WriteStringsToPubSub(topic)


class _WriteStringsToPubSub(PTransform):
  """This class is deprecated. Use ``WriteToPubSub`` instead."""
  def __init__(self, topic):
    """Initializes ``_WriteStringsToPubSub``.

    Attributes:
      topic: Cloud Pub/Sub topic in the form
        "projects/<project>/topics/<topic>".
    """
    super().__init__()
    self.topic = topic

  def expand(self, pcoll):
    pcoll = pcoll | 'EncodeString' >> Map(lambda s: s.encode('utf-8'))
    pcoll.element_type = bytes
    return pcoll | WriteToPubSub(self.topic)


class WriteToPubSub(PTransform):
  """A ``PTransform`` for writing messages to Cloud Pub/Sub."""

  # Implementation note: This ``PTransform`` is overridden by Directrunner.

  def __init__(
      self,
      topic,  # type: str
      with_attributes=False,  # type: bool
      id_label=None,  # type: Optional[str]
      timestamp_attribute=None  # type: Optional[str]
  ):
    # type: (...) -> None

    """Initializes ``WriteToPubSub``.

    Args:
      topic: Cloud Pub/Sub topic in the form
        "projects/<project>/topics/<topic>".
      with_attributes:
        True - input elements will be :class:`~PubsubMessage` objects.
        False - input elements will be of type ``bytes`` (message
        data only).
      id_label: If set, will set an attribute for each Cloud Pub/Sub message
        with the given name and a unique value. This attribute can then be used
        in a ReadFromPubSub PTransform to deduplicate messages.
      timestamp_attribute: If set, will set an attribute for each Cloud Pub/Sub
        message with the given name and the message's publish time as the value.

    Raises:
      ValueError: If ``topic`` is not accepted by ``parse_topic``.
    """
    super().__init__()
    self.with_attributes = with_attributes
    self.id_label = id_label
    self.timestamp_attribute = timestamp_attribute
    self.project, self.topic_name = parse_topic(topic)
    self.full_topic = topic
    self._sink = _PubSubSink(topic, id_label, timestamp_attribute)

  @staticmethod
  def message_to_proto_str(element):
    # type: (PubsubMessage) -> bytes
    if not isinstance(element, PubsubMessage):
      raise TypeError(
          'Unexpected element. Type: %s (expected: PubsubMessage), '
          'value: %r' % (type(element), element))
    return element._to_proto_str(for_publish=True)

  @staticmethod
  def bytes_to_proto_str(element):
    # type: (bytes) -> bytes
    msg = PubsubMessage(element, {})
    return msg._to_proto_str(for_publish=True)

  def expand(self, pcoll):
    if self.with_attributes:
      pcoll = pcoll | 'ToProtobuf' >> Map(self.message_to_proto_str)
    else:
      pcoll = pcoll | 'ToProtobuf' >> Map(self.bytes_to_proto_str)
    pcoll.element_type = bytes
    return pcoll | Write(self._sink)

  def to_runner_api_parameter(self, context):
    # Required as this is identified by type in PTransformOverrides.
    # TODO(https://github.com/apache/beam/issues/18713): Use an actual URN here.
    return self.to_runner_api_pickled(context)

  def display_data(self):
    # Report the configured ``with_attributes`` value (not a constant), and
    # drop the optional items when unset, as _PubSubSource.display_data does.
    return {
        'topic': DisplayDataItem(self.full_topic, label='Pubsub Topic'),
        'id_label': DisplayDataItem(
            self.id_label, label='ID Label Attribute').drop_if_none(),
        'with_attributes': DisplayDataItem(
            self.with_attributes, label='With Attributes').drop_if_none(),
        'timestamp_attribute': DisplayDataItem(
            self.timestamp_attribute,
            label='Timestamp Attribute').drop_if_none(),
    }


PROJECT_ID_REGEXP = '[a-z][-a-z0-9:.]{4,61}[a-z0-9]'
SUBSCRIPTION_REGEXP = 'projects/([^/]+)/subscriptions/(.+)'
TOPIC_REGEXP = 'projects/([^/]+)/topics/(.+)'


def parse_topic(full_topic: str) -> Tuple[str, str]:
  """Splits a full topic path into ``(project, topic_name)``.

  Args:
    full_topic: Topic in the form "projects/<project>/topics/<topic>".

  Returns:
    A ``(project, topic_name)`` tuple.

  Raises:
    ValueError: If the path does not match ``TOPIC_REGEXP`` or the project
      part does not match ``PROJECT_ID_REGEXP``.
  """
  match = re.match(TOPIC_REGEXP, full_topic)
  if not match:
    raise ValueError(
        'PubSub topic must be in the form "projects/<project>/topics'
        '/<topic>" (got %r).' % full_topic)
  project, topic_name = match.group(1), match.group(2)
  if not re.match(PROJECT_ID_REGEXP, project):
    raise ValueError('Invalid PubSub project name: %r.' % project)
  return project, topic_name


def parse_subscription(full_subscription: str) -> Tuple[str, str]:
  """Splits a full subscription path into ``(project, subscription_name)``.

  Args:
    full_subscription: Subscription in the form
      "projects/<project>/subscriptions/<subscription>".

  Returns:
    A ``(project, subscription_name)`` tuple.

  Raises:
    ValueError: If the path does not match ``SUBSCRIPTION_REGEXP`` or the
      project part does not match ``PROJECT_ID_REGEXP``.
  """
  match = re.match(SUBSCRIPTION_REGEXP, full_subscription)
  if not match:
    raise ValueError(
        'PubSub subscription must be in the form "projects/<project>'
        '/subscriptions/<subscription>" (got %r).' % full_subscription)
  project, subscription_name = match.group(1), match.group(2)
  if not re.match(PROJECT_ID_REGEXP, project):
    raise ValueError('Invalid PubSub project name: %r.' % project)
  return project, subscription_name


class _PubSubSource(dataflow_io.NativeSource):
  """Source for a Cloud Pub/Sub topic or subscription.

  This ``NativeSource`` is overridden by a native Pubsub implementation.

  Attributes:
    with_attributes: If False, will fetch just message data. Otherwise,
      fetches ``PubsubMessage`` protobufs.
  """
  def __init__(
      self,
      topic=None,  # type: Optional[str]
      subscription=None,  # type: Optional[str]
      id_label=None,  # type: Optional[str]
      with_attributes=False,  # type: bool
      timestamp_attribute=None  # type: Optional[str]
  ):
    self.coder = coders.BytesCoder()
    self.full_topic = topic
    self.full_subscription = subscription
    self.topic_name = None
    self.subscription_name = None
    self.id_label = id_label
    self.with_attributes = with_attributes
    self.timestamp_attribute = timestamp_attribute

    # Perform some validation on the topic and subscription.
    if not (topic or subscription):
      raise ValueError('Either a topic or subscription must be provided.')
    if topic and subscription:
      raise ValueError('Only one of topic or subscription should be provided.')

    if topic:
      self.project, self.topic_name = parse_topic(topic)
    if subscription:
      self.project, self.subscription_name = parse_subscription(subscription)

  @property
  def format(self):
    """Source format name required for remote execution."""
    return 'pubsub'

  def display_data(self):
    return {
        'id_label': DisplayDataItem(self.id_label,
                                    label='ID Label Attribute').drop_if_none(),
        'topic': DisplayDataItem(self.full_topic,
                                 label='Pubsub Topic').drop_if_none(),
        'subscription': DisplayDataItem(
            self.full_subscription, label='Pubsub Subscription').drop_if_none(),
        'with_attributes': DisplayDataItem(
            self.with_attributes, label='With Attributes').drop_if_none(),
        'timestamp_attribute': DisplayDataItem(
            self.timestamp_attribute,
            label='Timestamp Attribute').drop_if_none(),
    }

  def reader(self):
    raise NotImplementedError

  def is_bounded(self):
    return False


class _PubSubSink(dataflow_io.NativeSink):
  """Sink for a Cloud Pub/Sub topic.

  This ``NativeSink`` is overridden by a native Pubsub implementation.
  """
  def __init__(
      self,
      topic: str,
      id_label: Optional[str],
      timestamp_attribute: Optional[str],
  ):
    self.coder = coders.BytesCoder()
    self.full_topic = topic
    self.id_label = id_label
    self.timestamp_attribute = timestamp_attribute

    self.project, self.topic_name = parse_topic(topic)

  @property
  def format(self):
    """Sink format name required for remote execution."""
    return 'pubsub'

  def writer(self):
    raise NotImplementedError


class PubSubSourceDescriptor(NamedTuple):
  """A PubSub source descriptor for ``MultipleReadFromPubSub``

  Attributes:
    source: Existing Cloud Pub/Sub topic or subscription to use in the
      form "projects/<project>/topics/<topic>" or
      "projects/<project>/subscriptions/<subscription>"
    id_label: The attribute on incoming Pub/Sub messages to use as a unique
      record identifier. When specified, the value of this attribute (which
      can be any string that uniquely identifies the record) will be used for
      deduplication of messages. If not provided, we cannot guarantee
      that no duplicate data will be delivered on the Pub/Sub stream. In this
      case, deduplication of the stream will be strictly best effort.
    timestamp_attribute: Message value to use as element timestamp. If None,
      uses message publishing time as the timestamp.

      Timestamp values should be in one of two formats:

      - A numerical value representing the number of milliseconds since the
        Unix epoch.
      - A string in RFC 3339 format, UTC timezone. Example:
        ``2015-10-29T23:41:41.123Z``. The sub-second component of the
        timestamp is optional, and digits beyond the first three (i.e., time
        units smaller than milliseconds) may be ignored.
  """
  source: str
  id_label: Optional[str] = None
  timestamp_attribute: Optional[str] = None


PUBSUB_DESCRIPTOR_REGEXP = 'projects/([^/]+)/(topics|subscriptions)/(.+)'


class MultipleReadFromPubSub(PTransform):
  """A ``PTransform`` that expands ``ReadFromPubSub`` to read from multiple
  ``PubSubSourceDescriptor``.

  The `MultipleReadFromPubSub` transform allows you to read multiple topics
  and/or subscriptions using just one transform. It is the recommended transform
  to read multiple Pub/Sub sources when the output `PCollection` are going to be
  flattened. The transform takes a list of `PubSubSourceDescriptor` and organize
  them by type (topic / subscription) and project::

    topic_1 = PubSubSourceDescriptor('projects/myproject/topics/a_topic')
    topic_2 = PubSubSourceDescriptor(
                'projects/myproject2/topics/b_topic',
                'my_label',
                'my_timestamp_attribute')
    subscription_1 = PubSubSourceDescriptor(
                'projects/myproject/subscriptions/a_subscription')

    results = pipeline | MultipleReadFromPubSub(
                [topic_1, topic_2, subscription_1])
  """
  def __init__(
      self,
      pubsub_source_descriptors,  # type: List[PubSubSourceDescriptor]
      with_attributes=False,  # type: bool
  ):
    """Initializes ``MultipleReadFromPubSub``.

    Args:
      pubsub_source_descriptors: List of Cloud Pub/Sub topics or subscriptions
        of type `~PubSubSourceDescriptor`.
      with_attributes:
        True - output elements will be :class:`~PubsubMessage` objects.
        False - output elements will be of type ``bytes`` (message data only).

    Raises:
      ValueError: If a descriptor's ``source`` does not match
        ``PUBSUB_DESCRIPTOR_REGEXP``.
    """
    super().__init__()
    self.pubsub_source_descriptors = pubsub_source_descriptors
    self.with_attributes = with_attributes

    # Validate eagerly so a malformed source fails at construction time
    # rather than inside expand().
    for descriptor in self.pubsub_source_descriptors:
      match_descriptor = re.match(PUBSUB_DESCRIPTOR_REGEXP, descriptor.source)

      if not match_descriptor:
        raise ValueError(
            'PubSub source descriptor must be in the form "projects/<project>'
            '/topics/<topic>" or "projects/<project>/subscriptions'
            '/<subscription>" (got %r).' % descriptor.source)

  def expand(self, pcol):
    sources_pcol = []
    for descriptor in self.pubsub_source_descriptors:
      source_match = re.match(PUBSUB_DESCRIPTOR_REGEXP, descriptor.source)
      source_project = source_match.group(1)
      source_type = source_match.group(2)
      source_name = source_match.group(3)

      # The step label is built from type, project and name of the source.
      read_step_name = 'PubSub %s/project:%s/Read %s' % (
          source_type, source_project, source_name)

      if source_type == 'topics':
        current_source = pcol | read_step_name >> ReadFromPubSub(
            topic=descriptor.source,
            id_label=descriptor.id_label,
            with_attributes=self.with_attributes,
            timestamp_attribute=descriptor.timestamp_attribute)
      else:
        current_source = pcol | read_step_name >> ReadFromPubSub(
            subscription=descriptor.source,
            id_label=descriptor.id_label,
            with_attributes=self.with_attributes,
            timestamp_attribute=descriptor.timestamp_attribute)

      sources_pcol.append(current_source)

    return tuple(sources_pcol) | Flatten()