github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/pipeline.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Pipeline, the top-level Beam object.

A pipeline holds a DAG of data transforms. Conceptually the nodes of the DAG
are transforms (:class:`~apache_beam.transforms.ptransform.PTransform` objects)
and the edges are values (mostly :class:`~apache_beam.pvalue.PCollection`
objects). The transforms take as inputs one or more PValues and output one or
more :class:`~apache_beam.pvalue.PValue` s.

The pipeline offers functionality to traverse the graph. The actual operation
to be executed for each node visited is specified through a runner object.

Typical usage::

  # Create a pipeline object using a local runner for execution.
  with beam.Pipeline('DirectRunner') as p:

    # Add to the pipeline a "Create" transform. When executed this
    # transform will produce a PCollection object with the specified values.
    pcoll = p | 'Create' >> beam.Create([1, 2, 3])

    # Another transform could be applied to pcoll, e.g., writing to a text
    # file. For other transforms, refer to the transforms/ directory.
    pcoll | 'Write' >> beam.io.WriteToText('./output')

    # run() will execute the DAG stored in the pipeline. The execution of the
    # nodes visited is done using the specified local runner.

"""

# pytype: skip-file
# mypy: disallow-untyped-defs

import abc
import logging
import os
import re
import shutil
import tempfile
import unicodedata
from collections import defaultdict
from typing import TYPE_CHECKING
from typing import Any
from typing import Dict
from typing import FrozenSet
from typing import Iterable
from typing import List
from typing import Mapping
from typing import Optional
from typing import Sequence
from typing import Set
from typing import Tuple
from typing import Type
from typing import Union

from google.protobuf import message

from apache_beam import pvalue
from apache_beam.internal import pickler
from apache_beam.io.filesystems import FileSystems
from apache_beam.options.pipeline_options import CrossLanguageOptions
from apache_beam.options.pipeline_options import DebugOptions
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.options.pipeline_options import TypeOptions
from apache_beam.options.pipeline_options_validator import PipelineOptionsValidator
from apache_beam.portability import common_urns
from apache_beam.portability.api import beam_runner_api_pb2
from apache_beam.runners import PipelineRunner
from apache_beam.runners import create_runner
from apache_beam.transforms import ParDo
from apache_beam.transforms import ptransform
from apache_beam.transforms.display import DisplayData
from apache_beam.transforms.resources import merge_resource_hints
from apache_beam.transforms.resources import resource_hints_from_options
from apache_beam.transforms.sideinputs import get_sideinput_index
from apache_beam.typehints import TypeCheckError
from apache_beam.typehints import typehints
from apache_beam.utils import proto_utils
from apache_beam.utils import subprocess_server
from apache_beam.utils.annotations import deprecated
from apache_beam.utils.interactive_utils import alter_label_if_ipython
from apache_beam.utils.interactive_utils import is_in_ipython

if TYPE_CHECKING:
  from types import TracebackType
  from apache_beam.runners.pipeline_context import PipelineContext
  from apache_beam.runners.runner import PipelineResult
  from apache_beam.transforms import environments

__all__ = ['Pipeline', 'PTransformOverride']


class Pipeline(object):
  """A pipeline object that manages a DAG of
  :class:`~apache_beam.pvalue.PValue` s and their
  :class:`~apache_beam.transforms.ptransform.PTransform` s.

  Conceptually the :class:`~apache_beam.pvalue.PValue` s are the DAG's nodes
  and the :class:`~apache_beam.transforms.ptransform.PTransform` s computing
  the :class:`~apache_beam.pvalue.PValue` s are the edges.

  All the transforms applied to the pipeline must have distinct full labels.
  If the same transform instance needs to be applied more than once, the
  right shift operator should be used to designate a new name
  (e.g. ``input | "label" >> my_transform``).
  """
  @classmethod
  def runner_implemented_transforms(cls):
    # type: () -> FrozenSet[str]

    # This set should only contain transforms which are required to be
    # implemented by a runner.
    return frozenset([
        common_urns.primitives.GROUP_BY_KEY.urn,
        common_urns.primitives.IMPULSE.urn,
    ])

  def __init__(self, runner=None, options=None, argv=None):
    # type: (Optional[Union[str, PipelineRunner]], Optional[PipelineOptions], Optional[List[str]]) -> None

    """Initialize a pipeline object.

    Args:
      runner (~apache_beam.runners.runner.PipelineRunner): An object of
        type :class:`~apache_beam.runners.runner.PipelineRunner` that will be
        used to execute the pipeline. For registered runners, the runner name
        can be specified; otherwise a runner object must be supplied.
      options (~apache_beam.options.pipeline_options.PipelineOptions):
        A configured
        :class:`~apache_beam.options.pipeline_options.PipelineOptions` object
        containing arguments that should be used for running the Beam job.
      argv (List[str]): A list of arguments (such as :data:`sys.argv`)
        to be used for building a
        :class:`~apache_beam.options.pipeline_options.PipelineOptions` object.
        This will only be used if the **options** argument is :data:`None`.

    Raises:
      ValueError: if either the runner or options argument is not
        of the expected type.
    """
    # Initializing logging configuration in case the user did not set it up.
    logging.basicConfig()

    if options is not None:
      if isinstance(options, PipelineOptions):
        self._options = options
      else:
        raise ValueError(
            'Parameter options, if specified, must be of type '
            'PipelineOptions. Received : %r' % options)
    elif argv is not None:
      if isinstance(argv, list):
        self._options = PipelineOptions(argv)
      else:
        raise ValueError(
            'Parameter argv, if specified, must be a list. Received : %r' %
            argv)
    else:
      self._options = PipelineOptions([])

    FileSystems.set_options(self._options)

    pickle_library = self._options.view_as(SetupOptions).pickle_library
    pickler.set_library(pickle_library)

    if runner is None:
      runner = self._options.view_as(StandardOptions).runner
      if runner is None:
        runner = StandardOptions.DEFAULT_RUNNER
        logging.info((
            'Missing pipeline option (runner). Executing pipeline '
            'using the default runner: %s.'),
                     runner)

    if isinstance(runner, str):
      runner = create_runner(runner)
    elif not isinstance(runner, PipelineRunner):
      raise TypeError(
          'Runner %s is not a PipelineRunner object or the '
          'name of a registered runner.' % runner)

    # Validate pipeline options.
    errors = PipelineOptionsValidator(self._options, runner).validate()
    if errors:
      raise ValueError(
          'Pipeline has validation errors: \n' + '\n'.join(errors))

    # Set default experiments for portable runners
    # (needs to occur prior to pipeline construction).
    if runner.is_fnapi_compatible():
      experiments = (self._options.view_as(DebugOptions).experiments or [])
      if 'beam_fn_api' not in experiments:
        experiments.append('beam_fn_api')
        self._options.view_as(DebugOptions).experiments = experiments

    self.local_tempdir = tempfile.mkdtemp(prefix='beam-pipeline-temp')

    # Default runner to be used.
    self.runner = runner
    # Stack of transforms generated by nested apply() calls. The stack will
    # contain a root node as an enclosing (parent) node for top transforms.
    self.transforms_stack = [AppliedPTransform(None, None, '', None)]
    # Set of transform labels (full labels) applied to the pipeline.
    # If a transform is applied and the full label is already in the set
    # then the transform will have to be cloned with a new label.
    self.applied_labels = set()  # type: Set[str]
    # Hints supplied via pipeline options are considered the outermost hints.
    self._root_transform().resource_hints = resource_hints_from_options(
        options)
    # Create a ComponentIdMap for assigning IDs to components. Ensures that
    # any components that receive an ID during pipeline construction (for
    # example in ExternalTransform) will receive the same component ID when
    # generating the full pipeline proto.
    self.component_id_map = ComponentIdMap()

    # Records whether this pipeline contains any external transforms.
    self.contains_external_transforms = False
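
  # Example: constructing a pipeline directly from flags (a minimal sketch;
  # the flag values shown are illustrative only):
  #
  #   options = PipelineOptions(['--runner=DirectRunner'])
  #   p = Pipeline(options=options)
  #   # ... or, equivalently, let the constructor build the options:
  #   p = Pipeline(argv=['--runner=DirectRunner'])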

  @property  # type: ignore[misc]  # decorated property not supported
  @deprecated(
      since='First stable release',
      extra_message='References to <pipeline>.options'
      ' will not be supported')
  def options(self):
    # type: () -> PipelineOptions
    return self._options

  @property
  def allow_unsafe_triggers(self):
    # type: () -> bool
    return self._options.view_as(TypeOptions).allow_unsafe_triggers

  def _current_transform(self):
    # type: () -> AppliedPTransform

    """Returns the transform currently on the top of the stack."""
    return self.transforms_stack[-1]

  def _root_transform(self):
    # type: () -> AppliedPTransform

    """Returns the root transform of the transform stack."""
    return self.transforms_stack[0]

  def _remove_labels_recursively(self, applied_transform):
    # type: (AppliedPTransform) -> None
    for part in applied_transform.parts:
      if part.full_label in self.applied_labels:
        self.applied_labels.remove(part.full_label)
        self._remove_labels_recursively(part)

  def _replace(self, override):
    # type: (PTransformOverride) -> None
    assert isinstance(override, PTransformOverride)

    # From original transform output --> replacement transform output.
    output_map = {}  # type: Dict[pvalue.PValue, pvalue.PValue]
    output_replacements = {
    }  # type: Dict[AppliedPTransform, List[Tuple[pvalue.PValue, Optional[str]]]]
    input_replacements = {
    }  # type: Dict[AppliedPTransform, Mapping[str, Union[pvalue.PBegin, pvalue.PCollection]]]
    side_input_replacements = {
    }  # type: Dict[AppliedPTransform, List[pvalue.AsSideInput]]

    class TransformUpdater(PipelineVisitor):  # pylint: disable=used-before-assignment
      """A visitor that replaces the matching PTransforms."""
      def __init__(self, pipeline):
        # type: (Pipeline) -> None
        self.pipeline = pipeline

      def _replace_if_needed(self, original_transform_node):
        # type: (AppliedPTransform) -> None
        if override.matches(original_transform_node):
          assert isinstance(original_transform_node, AppliedPTransform)
          replacement_transform = (
              override.get_replacement_transform_for_applied_ptransform(
                  original_transform_node))
          if replacement_transform is original_transform_node.transform:
            return
          replacement_transform.side_inputs = tuple(
              original_transform_node.transform.side_inputs)

          replacement_transform_node = AppliedPTransform(
              original_transform_node.parent,
              replacement_transform,
              original_transform_node.full_label,
              original_transform_node.main_inputs)

          # TODO(https://github.com/apache/beam/issues/21178): Merge rather
          # than override.
          replacement_transform_node.resource_hints = (
              original_transform_node.resource_hints)

          # Transform execution could depend on the order in which nodes are
          # considered. Hence we insert the replacement transform node at the
          # same index as the original transform node. Note that this
          # operation removes the original transform node.
          if original_transform_node.parent:
            assert isinstance(original_transform_node.parent, AppliedPTransform)
            parent_parts = original_transform_node.parent.parts
            parent_parts[parent_parts.index(original_transform_node)] = (
                replacement_transform_node)
          else:
            # Original transform has to be a root.
            roots = self.pipeline.transforms_stack[0].parts
            assert original_transform_node in roots
            roots[roots.index(original_transform_node)] = (
                replacement_transform_node)

          inputs = override.get_replacement_inputs(original_transform_node)
          if len(inputs) > 1:
            transform_input = inputs
          elif len(inputs) == 1:
            transform_input = inputs[0]
          elif len(inputs) == 0:
            transform_input = pvalue.PBegin(self.pipeline)
          try:
            # We have to add the new AppliedTransform to the stack before
            # expand() and pop it out later to make sure that parts get added
            # correctly.
            self.pipeline.transforms_stack.append(replacement_transform_node)

            # Keeping the same label for the replaced node but recursively
            # removing labels of child transforms of the original transform,
            # since they will be replaced during the expand below. This is
            # needed in case the replacement contains children whose labels
            # conflict with the labels of the children of the original.
            self.pipeline._remove_labels_recursively(original_transform_node)

            new_output = replacement_transform.expand(transform_input)
            assert isinstance(
                new_output, (dict, pvalue.PValue, pvalue.DoOutputsTuple))

            if isinstance(new_output, pvalue.PValue):
              new_output.element_type = None
              self.pipeline._infer_result_type(
                  replacement_transform, inputs, new_output)

            if isinstance(new_output, dict):
              for new_tag, new_pcoll in new_output.items():
                replacement_transform_node.add_output(new_pcoll, new_tag)
            elif isinstance(new_output, pvalue.DoOutputsTuple):
              replacement_transform_node.add_output(
                  new_output, new_output._main_tag)
            else:
              replacement_transform_node.add_output(new_output, new_output.tag)

            # Recording updated outputs. This cannot be done in the same
            # visitor since if we dynamically update output type here, we'll
            # run into errors when visiting child nodes.
            #
            # NOTE: When replacing multiple outputs, the replacement
            # PCollection tags must have a matching tag in the original
            # transform.
            if isinstance(new_output, pvalue.PValue):
              if not new_output.producer:
                new_output.producer = replacement_transform_node
              output_map[original_transform_node.outputs[new_output.tag]] = \
                  new_output
            elif isinstance(new_output, (pvalue.DoOutputsTuple, tuple)):
              for pcoll in new_output:
                if not pcoll.producer:
                  pcoll.producer = replacement_transform_node
                output_map[original_transform_node.outputs[pcoll.tag]] = pcoll
            elif isinstance(new_output, dict):
              for tag, pcoll in new_output.items():
                if not pcoll.producer:
                  pcoll.producer = replacement_transform_node
                output_map[original_transform_node.outputs[tag]] = pcoll
          finally:
            self.pipeline.transforms_stack.pop()

      def enter_composite_transform(self, transform_node):
        # type: (AppliedPTransform) -> None
        self._replace_if_needed(transform_node)

      def visit_transform(self, transform_node):
        # type: (AppliedPTransform) -> None
        self._replace_if_needed(transform_node)

    self.visit(TransformUpdater(self))

    # Ensure no type information is lost.
    for old, new in output_map.items():
      if new.element_type == typehints.Any:
        # TODO(robertwb): Perhaps take the intersection?
        new.element_type = old.element_type

    # Adjusting inputs and outputs.
    class InputOutputUpdater(PipelineVisitor):  # pylint: disable=used-before-assignment
      """A visitor that records input and output values to be replaced.

      Input and output values that should be updated are recorded in the maps
      input_replacements and output_replacements respectively.

      We cannot update input and output values while visiting, since that
      results in validation errors.
      """
      def __init__(self, pipeline):
        # type: (Pipeline) -> None
        self.pipeline = pipeline

      def enter_composite_transform(self, transform_node):
        # type: (AppliedPTransform) -> None
        self.visit_transform(transform_node)

      def visit_transform(self, transform_node):
        # type: (AppliedPTransform) -> None
        replace_output = False
        for tag in transform_node.outputs:
          if transform_node.outputs[tag] in output_map:
            replace_output = True
            break

        replace_input = False
        for input in transform_node.inputs:
          if input in output_map:
            replace_input = True
            break

        replace_side_inputs = False
        for side_input in transform_node.side_inputs:
          if side_input.pvalue in output_map:
            replace_side_inputs = True
            break

        if replace_output:
          output_replacements[transform_node] = []
          for original, replacement in output_map.items():
            for tag, output in transform_node.outputs.items():
              if output == original:
                output_replacements[transform_node].append((tag, replacement))

        if replace_input:
          new_inputs = {
              tag: input if input not in output_map else output_map[input]
              for (tag, input) in transform_node.main_inputs.items()
          }
          input_replacements[transform_node] = new_inputs

        if replace_side_inputs:
          new_side_inputs = []
          for side_input in transform_node.side_inputs:
            if side_input.pvalue in output_map:
              side_input.pvalue = output_map[side_input.pvalue]
            new_side_inputs.append(side_input)
          side_input_replacements[transform_node] = new_side_inputs

    self.visit(InputOutputUpdater(self))

    for transform, output_replacement in output_replacements.items():
      for tag, output in output_replacement:
        transform.replace_output(output, tag=tag)

    for transform, input_replacement in input_replacements.items():
      transform.replace_inputs(input_replacement)

    for transform, side_input_replacement in side_input_replacements.items():
      transform.replace_side_inputs(side_input_replacement)

  def _check_replacement(self, override):
    # type: (PTransformOverride) -> None
    class ReplacementValidator(PipelineVisitor):
      def visit_transform(self, transform_node):
        # type: (AppliedPTransform) -> None
        if override.matches(transform_node):
          raise RuntimeError(
              'Transform node %r was not replaced as expected.' %
              transform_node)

    self.visit(ReplacementValidator())

  def replace_all(self, replacements):
    # type: (Iterable[PTransformOverride]) -> None

    """Dynamically replaces PTransforms in the currently populated hierarchy.

    Currently this only works for replacements where input and output types
    are exactly the same.

    TODO: Update this to also work for transform overrides where input and
    output types are different.

    Args:
      replacements (List[~apache_beam.pipeline.PTransformOverride]): a list of
        :class:`~apache_beam.pipeline.PTransformOverride` objects.
    """
    for override in replacements:
      assert isinstance(override, PTransformOverride)
      self._replace(override)

    # Check that the PTransforms have been successfully replaced. This will
    # result in a failure if a PTransform that was replaced in a given
    # override gets re-added in a subsequent override. This is not allowed,
    # and the ordering of PTransformOverride objects in 'replacements' is
    # important.
    for override in replacements:
      self._check_replacement(override)
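
  # Example: a hedged sketch of an override passed to replace_all();
  # ``_CreateOverride`` and the replacement values are hypothetical:
  #
  #   class _CreateOverride(PTransformOverride):
  #     def matches(self, applied_ptransform):
  #       return applied_ptransform.full_label == 'Create'
  #
  #     def get_replacement_transform_for_applied_ptransform(
  #         self, applied_ptransform):
  #       return beam.Create([0])
  #
  #   p.replace_all([_CreateOverride()])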

  def run(self, test_runner_api='AUTO'):
    # type: (Union[bool, str]) -> PipelineResult

    """Runs the pipeline. Returns whatever our runner returns after running."""

    # Records whether this pipeline contains any cross-language transforms.
    self.contains_external_transforms = (
        ExternalTransformFinder.contains_external_transforms(self))

    try:
      if test_runner_api == 'AUTO':
        # Don't pay the cost of a round-trip if we're going to be going
        # through the FnApi anyway...
        is_fnapi_compatible = self.runner.is_fnapi_compatible() or (
            # DirectRunner uses the Fn API for batch only.
            self.runner.__class__.__name__ == 'SwitchingDirectRunner' and
            not self._options.view_as(StandardOptions).streaming)

        # Multi-language pipelines that contain external pipeline segments
        # may not be able to create a Python pipeline object graph. Hence the
        # following runner API check should be skipped for such pipelines.

        # The InteractiveRunner relies on a constant pipeline reference; skip
        # it.
        test_runner_api = (
            not is_fnapi_compatible and
            not self.contains_external_transforms and
            self.runner.__class__.__name__ != 'InteractiveRunner')

      # When possible, invoke a round trip through the runner API.
      if test_runner_api and self._verify_runner_api_compatible():
        return Pipeline.from_runner_api(
            self.to_runner_api(use_fake_coders=True),
            self.runner,
            self._options).run(False)

      if (self._options.view_as(TypeOptions).runtime_type_check and
          self._options.view_as(TypeOptions).performance_runtime_type_check):
        raise RuntimeError(
            'You cannot turn on runtime_type_check '
            'and performance_runtime_type_check simultaneously. '
            'Pick one or the other.')

      if self._options.view_as(TypeOptions).runtime_type_check:
        from apache_beam.typehints import typecheck
        self.visit(typecheck.TypeCheckVisitor())

      if self._options.view_as(TypeOptions).performance_runtime_type_check:
        from apache_beam.typehints import typecheck
        self.visit(typecheck.PerformanceTypeCheckVisitor())

      if self._options.view_as(SetupOptions).save_main_session:
        # If this option is chosen, verify we can pickle the main session
        # early.
        tmpdir = tempfile.mkdtemp()
        try:
          pickler.dump_session(os.path.join(tmpdir, 'main_session.pickle'))
        finally:
          shutil.rmtree(tmpdir)
      return self.runner.run_pipeline(self, self._options)
    finally:
      if not is_in_ipython():
        shutil.rmtree(self.local_tempdir, ignore_errors=True)
      # else interactive beam handles the cleanup.

  def __enter__(self):
    # type: () -> Pipeline
    self._extra_context = subprocess_server.JavaJarServer.beam_services(
        self._options.view_as(CrossLanguageOptions).beam_services)
    self._extra_context.__enter__()
    return self

  def __exit__(
      self,
      exc_type,  # type: Optional[Type[BaseException]]
      exc_val,  # type: Optional[BaseException]
      exc_tb  # type: Optional[TracebackType]
  ):
    # type: (...) -> None

    try:
      if not exc_type:
        self.result = self.run()
        self.result.wait_until_finish()
    finally:
      self._extra_context.__exit__(exc_type, exc_val, exc_tb)
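
  # Example: explicit run without the context manager (a minimal sketch;
  # ``beam`` refers to the apache_beam package and the transform shown is
  # illustrative):
  #
  #   p = Pipeline(options=PipelineOptions(['--runner=DirectRunner']))
  #   _ = p | 'Create' >> beam.Create([1, 2, 3])
  #   result = p.run()
  #   result.wait_until_finish()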

  def visit(self, visitor):
    # type: (PipelineVisitor) -> None

    """Visits depth-first every node of a pipeline's DAG.

    Runner-internal implementation detail; no backwards-compatibility
    guarantees.

    Args:
      visitor (~apache_beam.pipeline.PipelineVisitor):
        :class:`~apache_beam.pipeline.PipelineVisitor` object whose callbacks
        will be called for each node visited. See
        :class:`~apache_beam.pipeline.PipelineVisitor` comments.

    Raises:
      TypeError: if node is specified and is not a
        :class:`~apache_beam.pvalue.PValue`.
      ~apache_beam.error.PipelineError: if node is specified and does not
        belong to this pipeline instance.
    """

    visited = set()  # type: Set[pvalue.PValue]
    self._root_transform().visit(visitor, self, visited)

  def apply(
      self,
      transform,  # type: ptransform.PTransform
      pvalueish=None,  # type: Optional[pvalue.PValue]
      label=None  # type: Optional[str]
  ):
    # type: (...) -> pvalue.PValue

    """Applies a custom transform using the pvalueish specified.

    Args:
      transform (~apache_beam.transforms.ptransform.PTransform): the
        :class:`~apache_beam.transforms.ptransform.PTransform` to apply.
      pvalueish (~apache_beam.pvalue.PCollection): the input for the
        :class:`~apache_beam.transforms.ptransform.PTransform` (typically a
        :class:`~apache_beam.pvalue.PCollection`).
      label (str): label of the
        :class:`~apache_beam.transforms.ptransform.PTransform`.

    Raises:
      TypeError: if the transform object extracted from the
        argument list is not a
        :class:`~apache_beam.transforms.ptransform.PTransform`.
      RuntimeError: if the transform object was already applied to
        this pipeline and needs to be cloned in order to apply again.
    """
    if isinstance(transform, ptransform._NamedPTransform):
      return self.apply(
          transform.transform, pvalueish, label or transform.label)

    if not isinstance(transform, ptransform.PTransform):
      raise TypeError("Expected a PTransform object, got %s" % transform)

    if label:
      # Fix self.label as it is inspected by some PTransform operations
      # (e.g. to produce error messages for type hint violations).
      old_label, transform.label = transform.label, label
      try:
        return self.apply(transform, pvalueish)
      finally:
        transform.label = old_label

    # Attempt to alter the label of the transform to be applied only when
    # it's a top-level transform, so that the cell number will not be
    # prepended to every child transform in a composite.
    if self._current_transform() is self._root_transform():
      alter_label_if_ipython(transform, pvalueish)

    full_label = '/'.join(
        [self._current_transform().full_label, label or
         transform.label]).lstrip('/')
    if full_label in self.applied_labels:
      raise RuntimeError(
          'A transform with label "%s" already exists in the pipeline. '
          'To apply a transform with a specified label, write '
          'pvalue | "label" >> transform' % full_label)
    self.applied_labels.add(full_label)

    pvalueish, inputs = transform._extract_input_pvalues(pvalueish)
    try:
      if not isinstance(inputs, dict):
        inputs = {str(ix): input for (ix, input) in enumerate(inputs)}
    except TypeError:
      raise NotImplementedError(
          'Unable to extract PValue inputs from %s; either %s does not accept '
          'inputs of this format, or it does not properly override '
          '_extract_input_pvalues' % (pvalueish, transform))
    for t, leaf_input in inputs.items():
      if not isinstance(leaf_input, pvalue.PValue) or not isinstance(t, str):
        raise NotImplementedError(
            '%s does not properly override _extract_input_pvalues, '
            'returned %s from %s' % (transform, inputs, pvalueish))

    current = AppliedPTransform(
        self._current_transform(), transform, full_label, inputs)
    self._current_transform().add_part(current)

    try:
      self.transforms_stack.append(current)

      type_options = self._options.view_as(TypeOptions)
      if type_options.pipeline_type_check:
        transform.type_check_inputs(pvalueish)

      pvalueish_result = self.runner.apply(transform, pvalueish, self._options)

      if type_options is not None and type_options.pipeline_type_check:
        transform.type_check_outputs(pvalueish_result)

      for tag, result in ptransform.get_named_nested_pvalues(pvalueish_result):
        assert isinstance(result, (pvalue.PValue, pvalue.DoOutputsTuple))

        # Make sure we set the producer only for a leaf node in the transform
        # DAG. This way we preserve the last transform of a composite
        # transform as being the real producer of the result.
        if result.producer is None:
          result.producer = current

        # TODO(BEAM-1833): Pass full tuples dict.
        self._infer_result_type(transform, tuple(inputs.values()), result)

        assert isinstance(result.producer.inputs, tuple)
        # The DoOutputsTuple adds the PCollection to the outputs when
        # accessed, except for the main tag. Add the main tag here.
        if isinstance(result, pvalue.DoOutputsTuple):
          current.add_output(result, result._main_tag)
          continue

        # If there is already a tag with the same name, increase a counter
        # for the name. This can happen, for example, when a composite
        # outputs a list of PCollections where all the tags are None.
        base = tag
        counter = 0
        while tag in current.outputs:
          counter += 1
          tag = '%s_%d' % (base, counter)

        current.add_output(result, tag)

      if (type_options is not None and
          type_options.type_check_strictness == 'ALL_REQUIRED' and
          transform.get_type_hints().output_types is None):
        ptransform_name = '%s(%s)' % (transform.__class__.__name__, full_label)
        raise TypeCheckError(
            'Pipeline type checking is enabled, however no '
            'output type-hint was found for the '
            'PTransform %s' % ptransform_name)
    finally:
      self.transforms_stack.pop()
    return pvalueish_result
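
  # Example: the ``|`` and ``>>`` operators route through apply() (a sketch;
  # ``beam`` refers to the apache_beam package):
  #
  #   pcoll = p | 'MakeData' >> beam.Create([1, 2, 3])
  #   # ... is shorthand for:
  #   pcoll = p.apply(beam.Create([1, 2, 3]), pvalueish=p, label='MakeData')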

  def _infer_result_type(
      self,
      transform,  # type: ptransform.PTransform
      inputs,  # type: Sequence[Union[pvalue.PBegin, pvalue.PCollection]]
      result_pcollection  # type: Union[pvalue.PValue, pvalue.DoOutputsTuple]
  ):
    # type: (...) -> None
    # TODO(robertwb): Multi-input inference.
    type_options = self._options.view_as(TypeOptions)
    if type_options is None or not type_options.pipeline_type_check:
      return
    if (isinstance(result_pcollection, pvalue.PCollection) and
        (not result_pcollection.element_type
         # TODO(robertwb): Ideally we'd do intersection here.
         or result_pcollection.element_type == typehints.Any)):
      # {Single, multi}-input, single-output inference.
      input_element_types_tuple = tuple(i.element_type for i in inputs)
      input_element_type = (
          input_element_types_tuple[0] if len(input_element_types_tuple) == 1
          else typehints.Union[input_element_types_tuple])
      type_hints = transform.get_type_hints()
      declared_output_type = type_hints.simple_output_type(transform.label)
      if declared_output_type:
        input_types = type_hints.input_types
        if input_types and input_types[0]:
          declared_input_type = input_types[0][0]
          result_element_type = typehints.bind_type_variables(
              declared_output_type,
              typehints.match_type_variables(
                  declared_input_type, input_element_type))
        else:
          result_element_type = declared_output_type
      else:
        result_element_type = transform.infer_output_type(input_element_type)
      # Any remaining type variables have no bindings higher than this scope.
      result_pcollection.element_type = typehints.bind_type_variables(
          result_element_type, {'*': typehints.Any})
    elif isinstance(result_pcollection, pvalue.DoOutputsTuple):
      # {Single, multi}-input, multi-output inference.
      # TODO(https://github.com/apache/beam/issues/18957): Add support for
      # tagged type hints.
      # https://github.com/apache/beam/pull/9810#discussion_r338765251
      for pcoll in result_pcollection:
        if pcoll.element_type is None:
          pcoll.element_type = typehints.Any
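
  # A concrete instance of the variable-binding step above (illustrative
  # only): with a declared input type ``T`` and a declared output type
  # ``Tuple[T, T]``, an input element type of ``int`` binds ``T -> int``:
  #
  #   T = typehints.TypeVariable('T')
  #   bindings = typehints.match_type_variables(T, int)  # {'T': int}
  #   typehints.bind_type_variables(typehints.Tuple[T, T], bindings)
  #   # -> Tuple[int, int]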

  def __reduce__(self):
    # type: () -> Tuple[Type, Tuple[str, ...]]
    # Some transforms contain a reference to their enclosing pipeline,
    # which in turn references all other transforms (resulting in quadratic
    # time/space to pickle each transform individually). As we don't
    # require pickled pipelines to be executable, break the chain here.
    return str, ('Pickled pipeline stub.', )

  def _verify_runner_api_compatible(self):
    # type: () -> bool
    if self._options.view_as(TypeOptions).runtime_type_check:
      # This option is incompatible with the runner API as it requires
      # the runner to inspect non-serialized hints on the transform
      # itself.
      return False

    class Visitor(PipelineVisitor):  # pylint: disable=used-before-assignment
      ok = True  # Really a nonlocal.

      def enter_composite_transform(self, transform_node):
        # type: (AppliedPTransform) -> None
        pass

      def visit_transform(self, transform_node):
        # type: (AppliedPTransform) -> None
        try:
          # Transforms must be picklable.
          pickler.loads(
              pickler.dumps(transform_node.transform, enable_trace=False),
              enable_trace=False)
        except Exception:
          Visitor.ok = False

      def visit_value(self, value, _):
        # type: (pvalue.PValue, AppliedPTransform) -> None
        if isinstance(value, pvalue.PDone):
          Visitor.ok = False

    self.visit(Visitor())
    return Visitor.ok
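
  # Example: round-tripping a pipeline through the Runner API proto (a
  # minimal sketch):
  #
  #   proto = p.to_runner_api()
  #   p2 = Pipeline.from_runner_api(proto, p.runner, p._options)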

  def to_runner_api(
      self,
      return_context=False,  # type: bool
      context=None,  # type: Optional[PipelineContext]
      use_fake_coders=False,  # type: bool
      default_environment=None  # type: Optional[environments.Environment]
  ):
    # type: (...) -> beam_runner_api_pb2.Pipeline

    """For internal use only; no backwards-compatibility guarantees."""
    from apache_beam.runners import pipeline_context
    if context is None:
      context = pipeline_context.PipelineContext(
          use_fake_coders=use_fake_coders,
          component_id_map=self.component_id_map,
          default_environment=default_environment)
    elif default_environment is not None:
      raise ValueError(
          'Only one of context or default_environment may be specified.')

    # The RunnerAPI spec requires certain transforms and side-inputs to have
    # KV inputs (and corresponding outputs).
    # Currently we only upgrade to KV pairs. If there is a need for more
    # general shapes, potential conflicts will have to be resolved.
    # We also only handle single-input, and (for fixing the output)
    # single-output, which is sufficient.
    # Also marks such values as requiring deterministic key coders.
    deterministic_key_coders = not self._options.view_as(
        TypeOptions).allow_non_deterministic_key_coders

    class ForceKvInputTypes(PipelineVisitor):
      def enter_composite_transform(self, transform_node):
        # type: (AppliedPTransform) -> None
        self.visit_transform(transform_node)

      def visit_transform(self, transform_node):
        # type: (AppliedPTransform) -> None
        if not transform_node.transform:
          return
        if transform_node.transform.runner_api_requires_keyed_input():
          pcoll = transform_node.inputs[0]
          pcoll.element_type = typehints.coerce_to_kv_type(
              pcoll.element_type, transform_node.full_label)
          pcoll.requires_deterministic_key_coder = (
              deterministic_key_coders and transform_node.full_label)
          if len(transform_node.outputs) == 1:
            # The runner often has expectations about the output types as
            # well.
            output, = transform_node.outputs.values()
            if not output.element_type:
              output.element_type = transform_node.transform.infer_output_type(
                  pcoll.element_type)
            if (isinstance(output.element_type,
                           typehints.TupleHint.TupleConstraint) and
                len(output.element_type.tuple_types) == 2 and
                pcoll.element_type.tuple_types[0] ==
                output.element_type.tuple_types[0]):
              output.requires_deterministic_key_coder = (
                  deterministic_key_coders and transform_node.full_label)
        for side_input in transform_node.transform.side_inputs:
          if side_input.requires_keyed_input():
            side_input.pvalue.element_type = typehints.coerce_to_kv_type(
                side_input.pvalue.element_type,
                transform_node.full_label,
                side_input_producer=side_input.pvalue.producer.full_label)
            side_input.pvalue.requires_deterministic_key_coder = (
                deterministic_key_coders and transform_node.full_label)

    self.visit(ForceKvInputTypes())

    # Mutates context; placing inline would force dependence on
    # argument evaluation order.
    root_transform_id = context.transforms.get_id(self._root_transform())
    proto = beam_runner_api_pb2.Pipeline(
        root_transform_ids=[root_transform_id],
        components=context.to_runner_api(),
        requirements=context.requirements())
    proto.components.transforms[root_transform_id].unique_name = (
        root_transform_id)
    self.merge_compatible_environments(proto)
    if return_context:
      return proto, context  # type: ignore  # too complicated for now
    else:
      return proto

  @staticmethod
  def merge_compatible_environments(proto):
    """Tries to minimize the number of distinct environments by merging
    those that are compatible (currently defined as identical).

    Mutates proto as contexts may have references to proto.components.
    """
    env_map = {}
    canonical_env = {}
    files_by_hash = {}
    for env_id, env in proto.components.environments.items():
      # First deduplicate any file dependencies by their hash.
      for dep in env.dependencies:
        if dep.type_urn == common_urns.artifact_types.FILE.urn:
          file_payload = beam_runner_api_pb2.ArtifactFilePayload.FromString(
              dep.type_payload)
          if file_payload.sha256:
            if file_payload.sha256 in files_by_hash:
              file_payload.path = files_by_hash[file_payload.sha256]
              dep.type_payload = file_payload.SerializeToString()
            else:
              files_by_hash[file_payload.sha256] = file_payload.path
      # Next check if we've ever seen this environment before.
      normalized = env.SerializeToString(deterministic=True)
      if normalized in canonical_env:
        env_map[env_id] = canonical_env[normalized]
      else:
        canonical_env[normalized] = env_id
    for old_env, new_env in env_map.items():
      for transform in proto.components.transforms.values():
        if transform.environment_id == old_env:
          transform.environment_id = new_env
      for windowing_strategy in proto.components.windowing_strategies.values():
        if windowing_strategy.environment_id == old_env:
          windowing_strategy.environment_id = new_env
      del proto.components.environments[old_env]
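
  # merge_compatible_environments() is invoked from to_runner_api() above,
  # but being a staticmethod it can also be applied to an existing pipeline
  # proto (a minimal sketch):
  #
  #   Pipeline.merge_compatible_environments(proto)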

  @staticmethod
  def from_runner_api(
      proto,  # type: beam_runner_api_pb2.Pipeline
      runner,  # type: PipelineRunner
      options,  # type: PipelineOptions
      return_context=False,  # type: bool
  ):
    # type: (...) -> Pipeline

    """For internal use only; no backwards-compatibility guarantees."""
    p = Pipeline(runner=runner, options=options)
    from apache_beam.runners import pipeline_context
    context = pipeline_context.PipelineContext(
        proto.components, requirements=proto.requirements)
    if proto.root_transform_ids:
      root_transform_id, = proto.root_transform_ids
      p.transforms_stack = [context.transforms.get_by_id(root_transform_id)]
    else:
      p.transforms_stack = [AppliedPTransform(None, None, '', None)]
    # TODO(robertwb): These are only needed to continue construction. Omit?
    p.applied_labels = {
        t.unique_name
        for t in proto.components.transforms.values()
    }
    for id in proto.components.pcollections:
      pcollection = context.pcollections.get_by_id(id)
      pcollection.pipeline = p
      if not pcollection.producer:
        raise ValueError('No producer for %s' % id)

    # Inject PBegin input where necessary.
    from apache_beam.io.iobase import Read
    from apache_beam.transforms.core import Create
    has_pbegin = [Read, Create]
    for id in proto.components.transforms:
      transform = context.transforms.get_by_id(id)
      if not transform.inputs and transform.transform.__class__ in has_pbegin:
        transform.main_inputs = {'None': pvalue.PBegin(p)}

    if return_context:
      return p, context  # type: ignore  # too complicated for now
    else:
      return p


class PipelineVisitor(object):
  """For internal use only; no backwards-compatibility guarantees.

  Visitor pattern class used to traverse a DAG of transforms
  (used internally by Pipeline for bookkeeping purposes).
  """
  def visit_value(self, value, producer_node):
    # type: (pvalue.PValue, AppliedPTransform) -> None

    """Callback for visiting a PValue in the pipeline DAG.

    Args:
      value: PValue visited (typically a PCollection instance).
      producer_node: AppliedPTransform object whose transform produced the
        pvalue.
    """
    pass

  def visit_transform(self, transform_node):
    # type: (AppliedPTransform) -> None

    """Callback for visiting a transform leaf node in the pipeline DAG."""
    pass

  def enter_composite_transform(self, transform_node):
    # type: (AppliedPTransform) -> None

    """Callback for entering traversal of a composite transform node."""
    pass

  def leave_composite_transform(self, transform_node):
    # type: (AppliedPTransform) -> None

    """Callback for leaving traversal of a composite transform node."""
    pass
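
# Example: a visitor that counts leaf transforms (a minimal sketch; the
# class name is hypothetical):
#
#   class LeafCounter(PipelineVisitor):
#     def __init__(self):
#       self.count = 0
#
#     def visit_transform(self, transform_node):
#       self.count += 1
#
#   counter = LeafCounter()
#   p.visit(counter)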


class ExternalTransformFinder(PipelineVisitor):
  """Looks for any external transforms in the pipeline and, if found,
  records this.
  """
  def __init__(self):
    self._contains_external_transforms = False

  @staticmethod
  def contains_external_transforms(pipeline):
    visitor = ExternalTransformFinder()
    pipeline.visit(visitor)
    return visitor._contains_external_transforms

  def _perform_external_transform_test(self, transform):
    if not transform:
      return
    from apache_beam.transforms import ExternalTransform
    if isinstance(transform, ExternalTransform):
      self._contains_external_transforms = True

  def visit_transform(self, transform_node):
    # type: (AppliedPTransform) -> None
    self._perform_external_transform_test(transform_node.transform)

  def enter_composite_transform(self, transform_node):
    # type: (AppliedPTransform) -> None
    # The Python SDK object graph may represent an external transform that
    # is a leaf of the pipeline graph as a composite without sub-transforms.
    # Note that this visitor is just used to identify pipelines with external
    # transforms. A Runner API pipeline proto generated from the Pipeline
    # object will include external sub-transforms.
    self._perform_external_transform_test(transform_node.transform)


class AppliedPTransform(object):
  """For internal use only; no backwards-compatibility guarantees.

  A transform node representing an instance of applying a PTransform
  (used internally by Pipeline for bookkeeping purposes).
  """
  def __init__(
      self,
      parent,  # type: Optional[AppliedPTransform]
      transform,  # type: Optional[ptransform.PTransform]
      full_label,  # type: str
      main_inputs,  # type: Optional[Mapping[str, Union[pvalue.PBegin, pvalue.PCollection]]]
      environment_id=None,  # type: Optional[str]
      annotations=None,  # type: Optional[Dict[str, bytes]]
  ):
    # type: (...) -> None
    self.parent = parent
    self.transform = transform
    # Note that we want the PipelineVisitor classes to use the full_label,
    # inputs, side_inputs, and outputs fields from this instance instead of
    # the ones of the PTransform instance associated with it. Doing this
    # permits reusing PTransform instances in different contexts (apply()
    # calls) without any interference. This is particularly useful for
    # composite transforms.
    self.full_label = full_label
    self.main_inputs = dict(main_inputs or {})

    self.side_inputs = tuple() if transform is None else transform.side_inputs
    self.outputs = {}  # type: Dict[Union[str, int, None], pvalue.PValue]
    self.parts = []  # type: List[AppliedPTransform]
    self.environment_id = environment_id if environment_id else None  # type: Optional[str]
    # We may need to merge the hints with environment-provided hints here
    # once environments are first-class citizens in the Beam graph and we
    # have access to the actual environment, not just an id.
    self.resource_hints = dict(
        transform.get_resource_hints()) if transform else {
        }  # type: Dict[str, bytes]

    if annotations is None and transform:

      def annotation_to_bytes(key, a: Any) -> bytes:
        if isinstance(a, bytes):
          return a
        elif isinstance(a, str):
          return a.encode('ascii')
        elif isinstance(a, message.Message):
          return a.SerializeToString()
        else:
          raise TypeError(
              'Unknown annotation type %r (type %s) for %s' %
              (a, type(a), key))

      annotations = {
          key: annotation_to_bytes(key, a)
          for key, a in transform.annotations().items()
      }
    self.annotations = annotations

  @property
  def inputs(self):
    return tuple(self.main_inputs.values())

  def __repr__(self):
    # type: () -> str
    return "%s(%s, %s)" % (
        self.__class__.__name__,
        self.full_label,
        type(self.transform).__name__)

  def replace_output(
      self,
      output,  # type: Union[pvalue.PValue, pvalue.DoOutputsTuple]
      tag=None  # type: Union[str, int, None]
  ):
    # type: (...) -> None

    """Replaces the output defined by the given tag with the given output.

    Args:
      output: replacement output.
      tag: tag of the output to be replaced.
    """
    if isinstance(output, pvalue.DoOutputsTuple):
      self.replace_output(output[output._main_tag])
    elif isinstance(output, pvalue.PValue):
      self.outputs[tag] = output
    elif isinstance(output, dict):
      for output_tag, out in output.items():
        self.outputs[output_tag] = out
    else:
      raise TypeError("Unexpected output type: %s" % output)

    # Importing locally to prevent circular dependency issues.
    from apache_beam.transforms import external
    if isinstance(self.transform, external.ExternalTransform):
      self.transform.replace_named_outputs(self.named_outputs())

  def replace_inputs(self, main_inputs):
    self.main_inputs = main_inputs

    # Importing locally to prevent circular dependency issues.
    from apache_beam.transforms import external
    if isinstance(self.transform, external.ExternalTransform):
      self.transform.replace_named_inputs(self.named_inputs())

  def replace_side_inputs(self, side_inputs):
    self.side_inputs = side_inputs

    # Importing locally to prevent circular dependency issues.
    from apache_beam.transforms import external
    if isinstance(self.transform, external.ExternalTransform):
      self.transform.replace_named_inputs(self.named_inputs())

  def add_output(
      self,
      output,  # type: Union[pvalue.DoOutputsTuple, pvalue.PValue]
      tag  # type: Union[str, int, None]
  ):
    # type: (...) -> None
    if isinstance(output, pvalue.DoOutputsTuple):
      self.add_output(output[tag], tag)
    elif isinstance(output, pvalue.PValue):
      assert tag not in self.outputs
      self.outputs[tag] = output
    else:
      raise TypeError("Unexpected output type: %s" % output)

  def add_part(self, part):
    # type: (AppliedPTransform) -> None
    assert isinstance(part, AppliedPTransform)
    part._merge_outer_resource_hints()
    self.parts.append(part)

  def is_composite(self):
    # type: () -> bool

    """Returns whether this is a composite transform.

    A composite transform has parts (inner transforms) or isn't the
    producer for any of its outputs. (An example of a transform that
    is not a producer is one that returns its inputs instead.)
    """
    return bool(self.parts) or all(
        pval.producer is not self for pval in self.outputs.values())

  def visit(
      self,
      visitor,  # type: PipelineVisitor
      pipeline,  # type: Pipeline
      visited  # type: Set[pvalue.PValue]
  ):
    # type: (...) -> None

    """Visits all nodes reachable from the current node."""

    for in_pval in self.inputs:
      if in_pval not in visited and not isinstance(in_pval, pvalue.PBegin):
        if in_pval.producer is not None:
          in_pval.producer.visit(visitor, pipeline, visited)
          # The value should be visited now since we visit outputs too.
          assert in_pval in visited, in_pval

    # Visit side inputs.
    for side_input in self.side_inputs:
      if isinstance(side_input, pvalue.AsSideInput) \
          and side_input.pvalue not in visited:
        pval = side_input.pvalue  # Unpack marker-object-wrapped pvalue.
        if pval.producer is not None:
          pval.producer.visit(visitor, pipeline, visited)
          # The value should be visited now since we visit outputs too.
          assert pval in visited
          # TODO(silviuc): Is there a way to signal that we are visiting a
          # side value? The issue is that the same PValue can be reachable
          # through multiple paths and therefore it is not guaranteed that
          # the value will be visited as a side value.

    # Visit a composite or primitive transform.
    if self.is_composite():
      visitor.enter_composite_transform(self)
      for part in self.parts:
        part.visit(visitor, pipeline, visited)
      visitor.leave_composite_transform(self)
    else:
      visitor.visit_transform(self)

    # Visit the outputs (one or more). It is essential to mark as visited
    # the tagged PCollections of the DoOutputsTuple object. A tagged
    # PCollection is connected directly with its producer (a multi-output
    # ParDo), but the output of such a transform is the containing
    # DoOutputsTuple, not the PCollection inside it. Without the code below a
    # tagged PCollection will not be marked as visited while visiting its
    # producer.
    for out_pval in self.outputs.values():
      if isinstance(out_pval, pvalue.DoOutputsTuple):
        pvals = (v for v in out_pval)
      else:
        pvals = (out_pval, )
      for v in pvals:
        if v not in visited:
          visited.add(v)
          visitor.visit_value(v, self)

  def named_inputs(self):
    # type: () -> Dict[str, pvalue.PValue]
    if self.transform is None:
      assert not self.main_inputs and not self.side_inputs
      return {}
    else:
      named_inputs = self.transform._named_inputs(
          self.main_inputs, self.side_inputs)
      if not self.parts:
        for name, pc_out in self.outputs.items():
          if pc_out.producer is not self and pc_out not in named_inputs.values(
          ):
            named_inputs[f'__implicit_input_{name}'] = pc_out
      return named_inputs

  def named_outputs(self):
    # type: () -> Dict[str, pvalue.PCollection]
    if self.transform is None:
      assert not self.outputs
      return {}
    else:
      return self.transform._named_outputs(self.outputs)

  def to_runner_api(self, context):
    # type: (PipelineContext) -> beam_runner_api_pb2.PTransform
    # External transforms require more splicing than just setting the spec.
    from apache_beam.transforms import external
    if isinstance(self.transform, external.ExternalTransform):
      # TODO(https://github.com/apache/beam/issues/18371): Support resource
      # hints in XLang transforms. In particular, make sure hints on
      # composites are properly propagated.
      return self.transform.to_runner_api_transform(context, self.full_label)

    def transform_to_runner_api(
        transform,  # type: Optional[ptransform.PTransform]
        context  # type: PipelineContext
    ):
      # type: (...) -> Optional[beam_runner_api_pb2.FunctionSpec]
      if transform is None:
        return None
      else:
        # We only populate inputs information to ParDo in order to expose
        # key_coder and window_coder to stateful DoFn.
        if isinstance(transform, ParDo):
          return transform.to_runner_api(
              context,
              has_parts=bool(self.parts),
              named_inputs=self.named_inputs())
        return transform.to_runner_api(context, has_parts=bool(self.parts))

    # Iterate over inputs and outputs by sorted key order, so that ids are
    # consistently generated for multiple runs of the same pipeline.
    transform_spec = transform_to_runner_api(self.transform, context)
    environment_id = self.environment_id
    transform_urn = transform_spec.urn if transform_spec else None
    if (not environment_id and
        (transform_urn not in Pipeline.runner_implemented_transforms())):
      environment_id = context.get_environment_id_for_resource_hints(
          self.resource_hints)

    return beam_runner_api_pb2.PTransform(
        unique_name=self.full_label,
        spec=transform_spec,
        subtransforms=[
            context.transforms.get_id(part, label=part.full_label)
            for part in self.parts
        ],
        inputs={
            tag: context.pcollections.get_id(pc)
            for tag, pc in sorted(self.named_inputs().items())
        },
        outputs={
            tag: context.pcollections.get_id(out)
            for tag, out in sorted(self.named_outputs().items())
        },
        environment_id=environment_id,
        annotations=self.annotations,
        # TODO(https://github.com/apache/beam/issues/18012): Add display_data.
        display_data=DisplayData.create_from(self.transform).to_proto()
        if self.transform else None)

  @staticmethod
  def from_runner_api(
      proto,  # type: beam_runner_api_pb2.PTransform
      context  # type: PipelineContext
  ):
    # type: (...) -> AppliedPTransform

    if common_urns.primitives.PAR_DO.urn == proto.spec.urn:
      # Preserving side input tags.
      pardo_payload = (
          proto_utils.parse_Bytes(
              proto.spec.payload, beam_runner_api_pb2.ParDoPayload))
      side_input_tags = list(pardo_payload.side_inputs.keys())
    else:
      pardo_payload = None
      side_input_tags = []

    main_inputs = {
        tag: context.pcollections.get_by_id(id)
        for (tag, id) in proto.inputs.items() if tag not in side_input_tags
    }

    transform = ptransform.PTransform.from_runner_api(proto, context)
    if transform and proto.environment_id:
      resource_hints = context.environments.get_by_id(
          proto.environment_id).resource_hints()
      if resource_hints:
        transform._resource_hints = dict(resource_hints)

    # Ordering is important here.
    # TODO(https://github.com/apache/beam/issues/20136): use key, value pairs
    # instead of depending on tags with an index as a suffix.
    indexed_side_inputs = [
        (get_sideinput_index(tag), context.pcollections.get_by_id(id))
        for tag, id in proto.inputs.items() if tag in side_input_tags
    ]
    side_inputs = [si for _, si in sorted(indexed_side_inputs)]

    result = AppliedPTransform(
        parent=None,
        transform=transform,
        full_label=proto.unique_name,
        main_inputs=main_inputs,
        environment_id=None,
        annotations=proto.annotations)

    if result.transform and result.transform.side_inputs:
      for si, pcoll in zip(result.transform.side_inputs, side_inputs):
        si.pvalue = pcoll
      result.side_inputs = tuple(result.transform.side_inputs)
    result.parts = []
    for transform_id in proto.subtransforms:
      part = context.transforms.get_by_id(transform_id)
      part.parent = result
      result.add_part(part)
    result.outputs = {
        None if tag == 'None' else tag: context.pcollections.get_by_id(id)
        for tag, id in proto.outputs.items()
    }
    # This annotation is expected by some runners.
    if proto.spec.urn == common_urns.primitives.PAR_DO.urn:
      result.transform.output_tags = set(proto.outputs.keys()).difference(
          {'None'})
    if not result.parts:
      for tag, pcoll_id in proto.outputs.items():
        if pcoll_id not in proto.inputs.values():
          pc = context.pcollections.get_by_id(pcoll_id)
          pc.producer = result
          pc.tag = None if tag == 'None' else tag
    return result

  def _merge_outer_resource_hints(self):
    if (self.parent is not None and self.parent.resource_hints):
      self.resource_hints = merge_resource_hints(
          outer_hints=self.parent.resource_hints,
          inner_hints=self.resource_hints)
    if self.resource_hints:
      for part in self.parts:
        part._merge_outer_resource_hints()


class PTransformOverride(metaclass=abc.ABCMeta):
  """For internal use only; no backwards-compatibility guarantees.

  Gives a matcher and replacements for matching PTransforms.

  TODO: Update this to support cases where input and/or output types are
  different.
  """
  @abc.abstractmethod
  def matches(self, applied_ptransform):
    # type: (AppliedPTransform) -> bool

    """Determines whether the given AppliedPTransform matches.

    Note that the matching will happen *after* Runner API proto translation.
    If matching is done via type checks, to/from_runner_api[_parameter]
    methods must be implemented to preserve the type (and other data) through
    proto serialization.

    Consider URN-based translation instead.

    Args:
      applied_ptransform: AppliedPTransform to be matched.

    Returns:
      a bool indicating whether the given AppliedPTransform is a match.
    """
    raise NotImplementedError

  def get_replacement_transform_for_applied_ptransform(
      self, applied_ptransform):
    # type: (AppliedPTransform) -> ptransform.PTransform

    """Provides a runner-specific override for a given `AppliedPTransform`.

    Args:
      applied_ptransform: `AppliedPTransform` containing the `PTransform` to
        be replaced.

    Returns:
      A `PTransform` that will be the replacement for the `PTransform` inside
      the `AppliedPTransform` given as an argument.
    """
    # Returns a PTransformReplacement.
    return self.get_replacement_transform(applied_ptransform.transform)

  @deprecated(
      since='2.24', current='get_replacement_transform_for_applied_ptransform')
  def get_replacement_transform(self, ptransform):
    # type: (Optional[ptransform.PTransform]) -> ptransform.PTransform

    """Provides a runner-specific override for a given PTransform.

    Args:
      ptransform: PTransform to be replaced.

    Returns:
      A PTransform that will be the replacement for the PTransform given as
      an argument.
    """
    # Returns a PTransformReplacement.
    raise NotImplementedError

  def get_replacement_inputs(self, applied_ptransform):
    # type: (AppliedPTransform) -> Iterable[pvalue.PValue]

    """Provides inputs that will be passed to the replacement PTransform.

    Args:
      applied_ptransform: Original AppliedPTransform containing the
        PTransform to be replaced.

    Returns:
      An iterable of PValues that will be passed to the expand() method of
      the replacement PTransform.
    """
    return tuple(applied_ptransform.inputs) + tuple(
        side_input.pvalue for side_input in applied_ptransform.side_inputs)


class ComponentIdMap(object):
  """A utility for assigning unique component ids to Beam components.

  Component ID assignments are only guaranteed to be unique and consistent
  within the scope of a ComponentIdMap instance.
  """
  def __init__(self, namespace="ref"):
    self.namespace = namespace
    self._counters = defaultdict(lambda: 0)  # type: Dict[type, int]
    self._obj_to_id = {}  # type: Dict[Any, str]

  def get_or_assign(self, obj=None, obj_type=None, label=None):
    if obj not in self._obj_to_id:
      self._obj_to_id[obj] = self._unique_ref(obj, obj_type, label)

    return self._obj_to_id[obj]

  def _normalize(self, str_value):
    str_value = unicodedata.normalize('NFC', str_value)
    return re.sub(r'[^a-zA-Z0-9-_]+', '-', str_value)

  def _unique_ref(self, obj=None, obj_type=None, label=None):
    # Normalize, trim, and uniquify.
    prefix = self._normalize(
        '%s_%s_%s' %
        (self.namespace, obj_type.__name__, label or type(obj).__name__))[0:100]
    self._counters[obj_type] += 1
    return '%s_%d' % (prefix, self._counters[obj_type])
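

# Example of the ID scheme implemented above (illustrative; the label is
# hypothetical):
#
#   id_map = ComponentIdMap()
#   id_map.get_or_assign(obj_type=Pipeline, label='MyPipeline')
#   # -> 'ref_Pipeline_MyPipeline_1'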