github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/pipeline.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Pipeline, the top-level Beam object.

A pipeline holds a DAG of data transforms. Conceptually the nodes of the DAG
are transforms (:class:`~apache_beam.transforms.ptransform.PTransform` objects)
and the edges are values (mostly :class:`~apache_beam.pvalue.PCollection`
objects). The transforms take as inputs one or more PValues and output one or
more :class:`~apache_beam.pvalue.PValue` s.

The pipeline offers functionality to traverse the graph.  The actual operation
to be executed for each node visited is specified through a runner object.

Typical usage::

  # Create a pipeline object using a local runner for execution.
  with beam.Pipeline('DirectRunner') as p:

    # Add to the pipeline a "Create" transform. When executed this
    # transform will produce a PCollection object with the specified values.
    pcoll = p | 'Create' >> beam.Create([1, 2, 3])

    # Another transform could be applied to pcoll, e.g., writing to a text
    # file. For other transforms, refer to the transforms/ directory.
    pcoll | 'Write' >> beam.io.WriteToText('./output')

    # run() will execute the DAG stored in the pipeline.  The execution of the
    # nodes visited is done using the specified local runner.

"""

# pytype: skip-file
# mypy: disallow-untyped-defs

import abc
import logging
import os
import re
import shutil
import tempfile
import unicodedata
from collections import defaultdict
from typing import TYPE_CHECKING
from typing import Any
from typing import Dict
from typing import FrozenSet
from typing import Iterable
from typing import List
from typing import Mapping
from typing import Optional
from typing import Sequence
from typing import Set
from typing import Tuple
from typing import Type
from typing import Union

from google.protobuf import message

from apache_beam import pvalue
from apache_beam.internal import pickler
from apache_beam.io.filesystems import FileSystems
from apache_beam.options.pipeline_options import CrossLanguageOptions
from apache_beam.options.pipeline_options import DebugOptions
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.options.pipeline_options import TypeOptions
from apache_beam.options.pipeline_options_validator import PipelineOptionsValidator
from apache_beam.portability import common_urns
from apache_beam.portability.api import beam_runner_api_pb2
from apache_beam.runners import PipelineRunner
from apache_beam.runners import create_runner
from apache_beam.transforms import ParDo
from apache_beam.transforms import ptransform
from apache_beam.transforms.display import DisplayData
from apache_beam.transforms.resources import merge_resource_hints
from apache_beam.transforms.resources import resource_hints_from_options
from apache_beam.transforms.sideinputs import get_sideinput_index
from apache_beam.typehints import TypeCheckError
from apache_beam.typehints import typehints
from apache_beam.utils import proto_utils
from apache_beam.utils import subprocess_server
from apache_beam.utils.annotations import deprecated
from apache_beam.utils.interactive_utils import alter_label_if_ipython
from apache_beam.utils.interactive_utils import is_in_ipython

if TYPE_CHECKING:
  from types import TracebackType
  from apache_beam.runners.pipeline_context import PipelineContext
  from apache_beam.runners.runner import PipelineResult
  from apache_beam.transforms import environments

__all__ = ['Pipeline', 'PTransformOverride']


class Pipeline(object):
  """A pipeline object that manages a DAG of
  :class:`~apache_beam.pvalue.PValue` s and their
  :class:`~apache_beam.transforms.ptransform.PTransform` s.

  Conceptually the :class:`~apache_beam.pvalue.PValue` s are the DAG's nodes and
  the :class:`~apache_beam.transforms.ptransform.PTransform` s computing
  the :class:`~apache_beam.pvalue.PValue` s are the edges.

  All the transforms applied to the pipeline must have distinct full labels.
  If the same transform instance needs to be applied more than once, the
  right shift operator should be used to designate new names
  (e.g. ``input | "label" >> my_transform``).
  """
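
  # A minimal sketch of the labeling rule above (the names below are
  # illustrative): reusing one transform instance requires a distinct label
  # per application.
  #
  #   double = beam.Map(lambda x: x * 2)
  #   doubled = pcoll | 'DoubleOnce' >> double
  #   quadrupled = doubled | 'DoubleAgain' >> double
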
  @classmethod
  def runner_implemented_transforms(cls):
    # type: () -> FrozenSet[str]

    # This set should only contain transforms which are required to be
    # implemented by a runner.
    return frozenset([
        common_urns.primitives.GROUP_BY_KEY.urn,
        common_urns.primitives.IMPULSE.urn,
    ])
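
  # Illustrative result, assuming the standard Beam URN values for these two
  # primitives:
  #
  #   Pipeline.runner_implemented_transforms()
  #   # frozenset({'beam:transform:group_by_key:v1',
  #   #            'beam:transform:impulse:v1'})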

  def __init__(self, runner=None, options=None, argv=None):
    # type: (Optional[Union[str, PipelineRunner]], Optional[PipelineOptions], Optional[List[str]]) -> None

    """Initialize a pipeline object.

    Args:
      runner (~apache_beam.runners.runner.PipelineRunner): An object of
        type :class:`~apache_beam.runners.runner.PipelineRunner` that will be
        used to execute the pipeline. For registered runners, the runner name
        can be specified; otherwise a runner object must be supplied.
      options (~apache_beam.options.pipeline_options.PipelineOptions):
        A configured
        :class:`~apache_beam.options.pipeline_options.PipelineOptions` object
        containing arguments that should be used for running the Beam job.
      argv (List[str]): a list of arguments (such as :data:`sys.argv`)
        to be used for building a
        :class:`~apache_beam.options.pipeline_options.PipelineOptions` object.
        This will only be used if argument **options** is :data:`None`.

    Raises:
      ValueError: if the options or argv argument is not of the
        expected type.
      TypeError: if the runner argument is neither a
        :class:`~apache_beam.runners.runner.PipelineRunner` nor the name
        of a registered runner.
    """
    # Initializing logging configuration in case the user did not set it up.
    logging.basicConfig()

    if options is not None:
      if isinstance(options, PipelineOptions):
        self._options = options
      else:
        raise ValueError(
            'Parameter options, if specified, must be of type PipelineOptions. '
            'Received: %r' % options)
    elif argv is not None:
      if isinstance(argv, list):
        self._options = PipelineOptions(argv)
      else:
        raise ValueError(
            'Parameter argv, if specified, must be a list. Received: %r' %
            argv)
    else:
      self._options = PipelineOptions([])

    FileSystems.set_options(self._options)

    pickle_library = self._options.view_as(SetupOptions).pickle_library
    pickler.set_library(pickle_library)

    if runner is None:
      runner = self._options.view_as(StandardOptions).runner
      if runner is None:
        runner = StandardOptions.DEFAULT_RUNNER
        logging.info(
            'Missing pipeline option (runner). Executing pipeline '
            'using the default runner: %s.',
            runner)

    if isinstance(runner, str):
      runner = create_runner(runner)
    elif not isinstance(runner, PipelineRunner):
      raise TypeError(
          'Runner %s is not a PipelineRunner object or the '
          'name of a registered runner.' % runner)

    # Validate pipeline options.
    errors = PipelineOptionsValidator(self._options, runner).validate()
    if errors:
      raise ValueError(
          'Pipeline has validation errors:\n' + '\n'.join(errors))

    # Set default experiments for portable runners
    # (needs to occur prior to pipeline construction).
    if runner.is_fnapi_compatible():
      experiments = (self._options.view_as(DebugOptions).experiments or [])
      if 'beam_fn_api' not in experiments:
        experiments.append('beam_fn_api')
        self._options.view_as(DebugOptions).experiments = experiments

    self.local_tempdir = tempfile.mkdtemp(prefix='beam-pipeline-temp')

    # Default runner to be used.
    self.runner = runner
    # Stack of transforms generated by nested apply() calls. The stack will
    # contain a root node as an enclosing (parent) node for top transforms.
    self.transforms_stack = [AppliedPTransform(None, None, '', None)]
    # Set of transform labels (full labels) applied to the pipeline.
    # If a transform is applied and the full label is already in the set
    # then the transform will have to be cloned with a new label.
    self.applied_labels = set()  # type: Set[str]
    # Hints supplied via pipeline options are considered the outermost hints.
    self._root_transform().resource_hints = resource_hints_from_options(options)
    # Create a ComponentIdMap for assigning IDs to components. Ensures that any
    # components that receive an ID during pipeline construction (for example
    # in ExternalTransform) will receive the same component ID when generating
    # the full pipeline proto.
    self.component_id_map = ComponentIdMap()

    # Records whether this pipeline contains any external transforms.
    self.contains_external_transforms = False
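
    # A minimal construction sketch under the defaults above (the default
    # runner is assumed to be the DirectRunner):
    #
    #   options = PipelineOptions(['--runner=DirectRunner'])
    #   p = Pipeline(options=options)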

  @property  # type: ignore[misc]  # decorated property not supported
  @deprecated(
      since='First stable release',
      extra_message='References to <pipeline>.options'
      ' will not be supported')
  def options(self):
    # type: () -> PipelineOptions
    return self._options

  @property
  def allow_unsafe_triggers(self):
    # type: () -> bool
    return self._options.view_as(TypeOptions).allow_unsafe_triggers

  def _current_transform(self):
    # type: () -> AppliedPTransform

    """Returns the transform currently on the top of the stack."""
    return self.transforms_stack[-1]

  def _root_transform(self):
    # type: () -> AppliedPTransform

    """Returns the root transform of the transform stack."""
    return self.transforms_stack[0]

  def _remove_labels_recursively(self, applied_transform):
    # type: (AppliedPTransform) -> None
    for part in applied_transform.parts:
      if part.full_label in self.applied_labels:
        self.applied_labels.remove(part.full_label)
        self._remove_labels_recursively(part)

  def _replace(self, override):
    # type: (PTransformOverride) -> None
    assert isinstance(override, PTransformOverride)

    # From original transform output --> replacement transform output
    output_map = {}  # type: Dict[pvalue.PValue, pvalue.PValue]
    output_replacements = {
    }  # type: Dict[AppliedPTransform, List[Tuple[pvalue.PValue, Optional[str]]]]
    input_replacements = {
    }  # type: Dict[AppliedPTransform, Mapping[str, Union[pvalue.PBegin, pvalue.PCollection]]]
    side_input_replacements = {
    }  # type: Dict[AppliedPTransform, List[pvalue.AsSideInput]]

    class TransformUpdater(PipelineVisitor):  # pylint: disable=used-before-assignment
      """A visitor that replaces the matching PTransforms."""
      def __init__(self, pipeline):
        # type: (Pipeline) -> None
        self.pipeline = pipeline

      def _replace_if_needed(self, original_transform_node):
        # type: (AppliedPTransform) -> None
        if override.matches(original_transform_node):
          assert isinstance(original_transform_node, AppliedPTransform)
          replacement_transform = (
              override.get_replacement_transform_for_applied_ptransform(
                  original_transform_node))
          if replacement_transform is original_transform_node.transform:
            return
          replacement_transform.side_inputs = tuple(
              original_transform_node.transform.side_inputs)

          replacement_transform_node = AppliedPTransform(
              original_transform_node.parent,
              replacement_transform,
              original_transform_node.full_label,
              original_transform_node.main_inputs)

          # TODO(https://github.com/apache/beam/issues/21178): Merge rather
          # than override.
          replacement_transform_node.resource_hints = (
              original_transform_node.resource_hints)

          # Transform execution could depend on the order in which nodes are
          # considered, so we insert the replacement transform node at the
          # same index as the original transform node. Note that this
          # operation removes the original transform node.
          if original_transform_node.parent:
            assert isinstance(original_transform_node.parent, AppliedPTransform)
            parent_parts = original_transform_node.parent.parts
            parent_parts[parent_parts.index(original_transform_node)] = (
                replacement_transform_node)
          else:
            # Original transform has to be a root.
            roots = self.pipeline.transforms_stack[0].parts
            assert original_transform_node in roots
            roots[roots.index(original_transform_node)] = (
                replacement_transform_node)

          inputs = override.get_replacement_inputs(original_transform_node)
          if len(inputs) > 1:
            transform_input = inputs
          elif len(inputs) == 1:
            transform_input = inputs[0]
          else:
            transform_input = pvalue.PBegin(self.pipeline)
          try:
            # We have to add the new AppliedPTransform to the stack before
            # expand() and pop it out later to make sure that parts get added
            # correctly.
            self.pipeline.transforms_stack.append(replacement_transform_node)

            # Keep the same label for the replaced node, but recursively
            # remove the labels of child transforms of the original transform,
            # since they will be replaced during the expand below. This is
            # needed in case the replacement contains children whose labels
            # conflict with the labels of the children of the original.
            self.pipeline._remove_labels_recursively(original_transform_node)

            new_output = replacement_transform.expand(transform_input)
            assert isinstance(
                new_output, (dict, pvalue.PValue, pvalue.DoOutputsTuple))

            if isinstance(new_output, pvalue.PValue):
              new_output.element_type = None
              self.pipeline._infer_result_type(
                  replacement_transform, inputs, new_output)

            if isinstance(new_output, dict):
              for new_tag, new_pcoll in new_output.items():
                replacement_transform_node.add_output(new_pcoll, new_tag)
            elif isinstance(new_output, pvalue.DoOutputsTuple):
              replacement_transform_node.add_output(
                  new_output, new_output._main_tag)
            else:
              replacement_transform_node.add_output(new_output, new_output.tag)

            # Recording updated outputs. This cannot be done in the same
            # visitor since if we dynamically update output type here, we'll
            # run into errors when visiting child nodes.
            #
            # NOTE: When replacing multiple outputs, the replacement
            # PCollection tags must have a matching tag in the original
            # transform.
            if isinstance(new_output, pvalue.PValue):
              if not new_output.producer:
                new_output.producer = replacement_transform_node
              output_map[original_transform_node.outputs[new_output.tag]] = \
                  new_output
            elif isinstance(new_output, (pvalue.DoOutputsTuple, tuple)):
              for pcoll in new_output:
                if not pcoll.producer:
                  pcoll.producer = replacement_transform_node
                output_map[original_transform_node.outputs[pcoll.tag]] = pcoll
            elif isinstance(new_output, dict):
              for tag, pcoll in new_output.items():
                if not pcoll.producer:
                  pcoll.producer = replacement_transform_node
                output_map[original_transform_node.outputs[tag]] = pcoll
          finally:
            self.pipeline.transforms_stack.pop()

      def enter_composite_transform(self, transform_node):
        # type: (AppliedPTransform) -> None
        self._replace_if_needed(transform_node)

      def visit_transform(self, transform_node):
        # type: (AppliedPTransform) -> None
        self._replace_if_needed(transform_node)

    self.visit(TransformUpdater(self))

    # Ensure no type information is lost.
    for old, new in output_map.items():
      if new.element_type == typehints.Any:
        # TODO(robertwb): Perhaps take the intersection?
        new.element_type = old.element_type

    # Adjusting inputs and outputs.
    class InputOutputUpdater(PipelineVisitor):  # pylint: disable=used-before-assignment
      """A visitor that records input and output values to be replaced.

      Input and output values that should be updated are recorded in the maps
      input_replacements and output_replacements respectively.

      We cannot update input and output values while visiting since that
      results in validation errors.
      """
      def __init__(self, pipeline):
        # type: (Pipeline) -> None
        self.pipeline = pipeline

      def enter_composite_transform(self, transform_node):
        # type: (AppliedPTransform) -> None
        self.visit_transform(transform_node)

      def visit_transform(self, transform_node):
        # type: (AppliedPTransform) -> None
        replace_output = False
        for tag in transform_node.outputs:
          if transform_node.outputs[tag] in output_map:
            replace_output = True
            break

        replace_input = False
        for input in transform_node.inputs:
          if input in output_map:
            replace_input = True
            break

        replace_side_inputs = False
        for side_input in transform_node.side_inputs:
          if side_input.pvalue in output_map:
            replace_side_inputs = True
            break

        if replace_output:
          output_replacements[transform_node] = []
          for original, replacement in output_map.items():
            for tag, output in transform_node.outputs.items():
              if output == original:
                output_replacements[transform_node].append((tag, replacement))

        if replace_input:
          new_inputs = {
              tag: input if input not in output_map else output_map[input]
              for (tag, input) in transform_node.main_inputs.items()
          }
          input_replacements[transform_node] = new_inputs

        if replace_side_inputs:
          new_side_inputs = []
          for side_input in transform_node.side_inputs:
            if side_input.pvalue in output_map:
              side_input.pvalue = output_map[side_input.pvalue]
            new_side_inputs.append(side_input)
          side_input_replacements[transform_node] = new_side_inputs

    self.visit(InputOutputUpdater(self))

    for transform, output_replacement in output_replacements.items():
      for tag, output in output_replacement:
        transform.replace_output(output, tag=tag)

    for transform, input_replacement in input_replacements.items():
      transform.replace_inputs(input_replacement)

    for transform, side_input_replacement in side_input_replacements.items():
      transform.replace_side_inputs(side_input_replacement)

  def _check_replacement(self, override):
    # type: (PTransformOverride) -> None
    class ReplacementValidator(PipelineVisitor):
      def visit_transform(self, transform_node):
        # type: (AppliedPTransform) -> None
        if override.matches(transform_node):
          raise RuntimeError(
              'Transform node %r was not replaced as expected.' %
              transform_node)

    self.visit(ReplacementValidator())

  def replace_all(self, replacements):
    # type: (Iterable[PTransformOverride]) -> None

    """Dynamically replaces PTransforms in the currently populated hierarchy.

    Currently this only works for replacements where input and output types
    are exactly the same.

    TODO: Update this to also work for transform overrides where input and
    output types are different.

    Args:
      replacements (List[~apache_beam.pipeline.PTransformOverride]): a list of
        :class:`~apache_beam.pipeline.PTransformOverride` objects.
    """
    for override in replacements:
      assert isinstance(override, PTransformOverride)
      self._replace(override)

    # Checking if the PTransforms have been successfully replaced. This will
    # result in a failure if a PTransform that was replaced in a given override
    # gets re-added in a subsequent override. This is not allowed and ordering
    # of PTransformOverride objects in 'replacements' is important.
    for override in replacements:
      self._check_replacement(override)
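
  # A hedged sketch of replace_all usage; MyOverride and MyReplacement are
  # hypothetical, and PTransformOverride is defined later in this module:
  #
  #   class MyOverride(PTransformOverride):
  #     def matches(self, applied_ptransform):
  #       return applied_ptransform.full_label == 'Write'
  #
  #     def get_replacement_transform_for_applied_ptransform(
  #         self, applied_ptransform):
  #       return MyReplacement()
  #
  #   pipeline.replace_all([MyOverride()])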

  def run(self, test_runner_api='AUTO'):
    # type: (Union[bool, str]) -> PipelineResult

    """Runs the pipeline. Returns whatever our runner returns after running."""

    # Records whether this pipeline contains any cross-language transforms.
    self.contains_external_transforms = (
        ExternalTransformFinder.contains_external_transforms(self))

    try:
      if test_runner_api == 'AUTO':
        # Don't pay the cost of a round-trip if we're going to be going through
        # the FnApi anyway...
        is_fnapi_compatible = self.runner.is_fnapi_compatible() or (
            # DirectRunner uses the Fn API for batch only.
            self.runner.__class__.__name__ == 'SwitchingDirectRunner' and
            not self._options.view_as(StandardOptions).streaming)

        # Multi-language pipelines that contain external pipeline segments may
        # not be able to create a Python pipeline object graph. Hence the
        # following runner API check should be skipped for such pipelines.

        # The InteractiveRunner relies on a constant pipeline reference; skip
        # it as well.
        test_runner_api = (
            not is_fnapi_compatible and
            not self.contains_external_transforms and
            self.runner.__class__.__name__ != 'InteractiveRunner')

      # When possible, invoke a round trip through the runner API.
      if test_runner_api and self._verify_runner_api_compatible():
        return Pipeline.from_runner_api(
            self.to_runner_api(use_fake_coders=True),
            self.runner,
            self._options).run(False)

      if (self._options.view_as(TypeOptions).runtime_type_check and
          self._options.view_as(TypeOptions).performance_runtime_type_check):
        raise RuntimeError(
            'You cannot turn on runtime_type_check '
            'and performance_runtime_type_check simultaneously. '
            'Pick one or the other.')

      if self._options.view_as(TypeOptions).runtime_type_check:
        from apache_beam.typehints import typecheck
        self.visit(typecheck.TypeCheckVisitor())

      if self._options.view_as(TypeOptions).performance_runtime_type_check:
        from apache_beam.typehints import typecheck
        self.visit(typecheck.PerformanceTypeCheckVisitor())

      if self._options.view_as(SetupOptions).save_main_session:
        # If this option is chosen, verify we can pickle the main session early.
        tmpdir = tempfile.mkdtemp()
        try:
          pickler.dump_session(os.path.join(tmpdir, 'main_session.pickle'))
        finally:
          shutil.rmtree(tmpdir)
      return self.runner.run_pipeline(self, self._options)
    finally:
      if not is_in_ipython():
        shutil.rmtree(self.local_tempdir, ignore_errors=True)
      # else interactive beam handles the cleanup.

  def __enter__(self):
    # type: () -> Pipeline
    self._extra_context = subprocess_server.JavaJarServer.beam_services(
        self._options.view_as(CrossLanguageOptions).beam_services)
    self._extra_context.__enter__()
    return self

  def __exit__(
      self,
      exc_type,  # type: Optional[Type[BaseException]]
      exc_val,  # type: Optional[BaseException]
      exc_tb  # type: Optional[TracebackType]
  ):
    # type: (...) -> None

    try:
      if not exc_type:
        self.result = self.run()
        self.result.wait_until_finish()
    finally:
      self._extra_context.__exit__(exc_type, exc_val, exc_tb)
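
  # The context-manager protocol above underlies the common idiom
  #
  #   with Pipeline(options=options) as p:
  #     p | beam.Create([1, 2, 3]) | beam.Map(print)
  #
  # which runs the pipeline and waits for it to finish on a normal exit from
  # the block.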

  def visit(self, visitor):
    # type: (PipelineVisitor) -> None

    """Visits depth-first every node of a pipeline's DAG.

    Runner-internal implementation detail; no backwards-compatibility
    guarantees.

    Args:
      visitor (~apache_beam.pipeline.PipelineVisitor):
        :class:`~apache_beam.pipeline.PipelineVisitor` object whose callbacks
        will be called for each node visited. See
        :class:`~apache_beam.pipeline.PipelineVisitor` comments.

    Raises:
      TypeError: if node is specified and is not a
        :class:`~apache_beam.pvalue.PValue`.
      ~apache_beam.error.PipelineError: if node is specified and does not
        belong to this pipeline instance.
    """

    visited = set()  # type: Set[pvalue.PValue]
    self._root_transform().visit(visitor, self, visited)
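
  # A minimal visitor sketch (LabelPrinter is hypothetical) showing how
  # visit() drives the PipelineVisitor callbacks defined later in this file:
  #
  #   class LabelPrinter(PipelineVisitor):
  #     def visit_transform(self, transform_node):
  #       print(transform_node.full_label)
  #
  #   p.visit(LabelPrinter())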

  def apply(
      self,
      transform,  # type: ptransform.PTransform
      pvalueish=None,  # type: Optional[pvalue.PValue]
      label=None  # type: Optional[str]
  ):
    # type: (...) -> pvalue.PValue

    """Applies a custom transform using the pvalueish specified.

    Args:
      transform (~apache_beam.transforms.ptransform.PTransform): the
        :class:`~apache_beam.transforms.ptransform.PTransform` to apply.
      pvalueish (~apache_beam.pvalue.PCollection): the input for the
        :class:`~apache_beam.transforms.ptransform.PTransform` (typically a
        :class:`~apache_beam.pvalue.PCollection`).
      label (str): label of the
        :class:`~apache_beam.transforms.ptransform.PTransform`.

    Raises:
      TypeError: if the transform object extracted from the
        argument list is not a
        :class:`~apache_beam.transforms.ptransform.PTransform`.
      RuntimeError: if the transform object was already applied to
        this pipeline and needs to be cloned in order to apply again.
    """
    if isinstance(transform, ptransform._NamedPTransform):
      return self.apply(
          transform.transform, pvalueish, label or transform.label)

    if not isinstance(transform, ptransform.PTransform):
      raise TypeError("Expected a PTransform object, got %s" % transform)

    if label:
      # Fix self.label as it is inspected by some PTransform operations
      # (e.g. to produce error messages for type hint violations).
      old_label, transform.label = transform.label, label
      try:
        return self.apply(transform, pvalueish)
      finally:
        transform.label = old_label

    # Attempts to alter the label of the transform to be applied only when it's
    # a top-level transform so that the cell number will not be prepended to
    # every child transform in a composite.
    if self._current_transform() is self._root_transform():
      alter_label_if_ipython(transform, pvalueish)

    full_label = '/'.join(
        [self._current_transform().full_label, label or
         transform.label]).lstrip('/')
    if full_label in self.applied_labels:
      raise RuntimeError(
          'A transform with label "%s" already exists in the pipeline. '
          'To apply a transform with a specified label write '
          'pvalue | "label" >> transform' % full_label)
    self.applied_labels.add(full_label)

    pvalueish, inputs = transform._extract_input_pvalues(pvalueish)
    try:
      if not isinstance(inputs, dict):
        inputs = {str(ix): input for (ix, input) in enumerate(inputs)}
    except TypeError:
      raise NotImplementedError(
          'Unable to extract PValue inputs from %s; either %s does not accept '
          'inputs of this format, or it does not properly override '
          '_extract_input_pvalues' % (pvalueish, transform))
    for t, leaf_input in inputs.items():
      if not isinstance(leaf_input, pvalue.PValue) or not isinstance(t, str):
        raise NotImplementedError(
            '%s does not properly override _extract_input_pvalues, '
            'returned %s from %s' % (transform, inputs, pvalueish))

    current = AppliedPTransform(
        self._current_transform(), transform, full_label, inputs)
    self._current_transform().add_part(current)

    try:
      self.transforms_stack.append(current)

      type_options = self._options.view_as(TypeOptions)
      if type_options.pipeline_type_check:
        transform.type_check_inputs(pvalueish)

      pvalueish_result = self.runner.apply(transform, pvalueish, self._options)

      if type_options is not None and type_options.pipeline_type_check:
        transform.type_check_outputs(pvalueish_result)

      for tag, result in ptransform.get_named_nested_pvalues(pvalueish_result):
        assert isinstance(result, (pvalue.PValue, pvalue.DoOutputsTuple))

        # Make sure we set the producer only for a leaf node in the transform
        # DAG. This way we preserve the last transform of a composite transform
        # as being the real producer of the result.
        if result.producer is None:
          result.producer = current

        # TODO(BEAM-1833): Pass full tuples dict.
        self._infer_result_type(transform, tuple(inputs.values()), result)

        assert isinstance(result.producer.inputs, tuple)
        # The DoOutputsTuple adds the PCollection to the outputs when accessed
        # except for the main tag. Add the main tag here.
        if isinstance(result, pvalue.DoOutputsTuple):
          current.add_output(result, result._main_tag)
          continue

        # If there is already a tag with the same name, increase a counter for
        # the name. This can happen, for example, when a composite outputs a
        # list of PCollections where all the tags are None.
        base = tag
        counter = 0
        while tag in current.outputs:
          counter += 1
          tag = '%s_%d' % (base, counter)

        current.add_output(result, tag)

      if (type_options is not None and
          type_options.type_check_strictness == 'ALL_REQUIRED' and
          transform.get_type_hints().output_types is None):
        ptransform_name = '%s(%s)' % (transform.__class__.__name__, full_label)
        raise TypeCheckError(
            'Pipeline type checking is enabled, however no '
            'output type-hint was found for the '
            'PTransform %s' % ptransform_name)
    finally:
      self.transforms_stack.pop()
    return pvalueish_result
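
  # The operator forms used throughout Beam delegate to apply(); both
  #
  #   pcoll | my_transform
  #   pcoll | 'Label' >> my_transform
  #
  # end up here, the latter via the ptransform._NamedPTransform branch at the
  # top of this method.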

  def _infer_result_type(
      self,
      transform,  # type: ptransform.PTransform
      inputs,  # type: Sequence[Union[pvalue.PBegin, pvalue.PCollection]]
      result_pcollection  # type: Union[pvalue.PValue, pvalue.DoOutputsTuple]
  ):
    # type: (...) -> None
    # TODO(robertwb): Multi-input inference.
    type_options = self._options.view_as(TypeOptions)
    if type_options is None or not type_options.pipeline_type_check:
      return
    if (isinstance(result_pcollection, pvalue.PCollection) and
        (not result_pcollection.element_type
         # TODO(robertwb): Ideally we'd do intersection here.
         or result_pcollection.element_type == typehints.Any)):
      # {Single, multi}-input, single-output inference.
      input_element_types_tuple = tuple(i.element_type for i in inputs)
      input_element_type = (
          input_element_types_tuple[0] if len(input_element_types_tuple) == 1
          else typehints.Union[input_element_types_tuple])
      type_hints = transform.get_type_hints()
      declared_output_type = type_hints.simple_output_type(transform.label)
      if declared_output_type:
        input_types = type_hints.input_types
        if input_types and input_types[0]:
          declared_input_type = input_types[0][0]
          result_element_type = typehints.bind_type_variables(
              declared_output_type,
              typehints.match_type_variables(
                  declared_input_type, input_element_type))
        else:
          result_element_type = declared_output_type
      else:
        result_element_type = transform.infer_output_type(input_element_type)
      # Any remaining type variables have no bindings higher than this scope.
      result_pcollection.element_type = typehints.bind_type_variables(
          result_element_type, {'*': typehints.Any})
    elif isinstance(result_pcollection, pvalue.DoOutputsTuple):
      # {Single, multi}-input, multi-output inference.
      # TODO(https://github.com/apache/beam/issues/18957): Add support for
      #   tagged type hints.
      #   https://github.com/apache/beam/pull/9810#discussion_r338765251
      for pcoll in result_pcollection:
        if pcoll.element_type is None:
          pcoll.element_type = typehints.Any

  def __reduce__(self):
    # type: () -> Tuple[Type, Tuple[str, ...]]
    # Some transforms contain a reference to their enclosing pipeline,
    # which in turn references all other transforms (resulting in quadratic
    # time/space to pickle each transform individually).  As we don't
    # require pickled pipelines to be executable, break the chain here.
    return str, ('Pickled pipeline stub.', )
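
  # As a consequence of __reduce__ above, pickling a Pipeline round-trips to
  # a plain string rather than an executable pipeline (p is any Pipeline
  # instance):
  #
  #   import pickle
  #   pickle.loads(pickle.dumps(p))  # -> 'Pickled pipeline stub.'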

  def _verify_runner_api_compatible(self):
    # type: () -> bool
    if self._options.view_as(TypeOptions).runtime_type_check:
      # This option is incompatible with the runner API as it requires
      # the runner to inspect non-serialized hints on the transform
      # itself.
      return False

    class Visitor(PipelineVisitor):  # pylint: disable=used-before-assignment
      ok = True  # Really a nonlocal.

      def enter_composite_transform(self, transform_node):
        # type: (AppliedPTransform) -> None
        pass

      def visit_transform(self, transform_node):
        # type: (AppliedPTransform) -> None
        try:
          # Transforms must be picklable.
          pickler.loads(
              pickler.dumps(transform_node.transform, enable_trace=False),
              enable_trace=False)
        except Exception:
          Visitor.ok = False

      def visit_value(self, value, _):
        # type: (pvalue.PValue, AppliedPTransform) -> None
        if isinstance(value, pvalue.PDone):
          Visitor.ok = False

    self.visit(Visitor())
    return Visitor.ok
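
  # In short: the runner-API round trip in run() is attempted only when every
  # transform pickles cleanly, no PDone values appear in the graph, and
  # runtime_type_check is off.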

  def to_runner_api(
      self,
      return_context=False,  # type: bool
      context=None,  # type: Optional[PipelineContext]
      use_fake_coders=False,  # type: bool
      default_environment=None  # type: Optional[environments.Environment]
  ):
    # type: (...) -> beam_runner_api_pb2.Pipeline

    """For internal use only; no backwards-compatibility guarantees."""
    from apache_beam.runners import pipeline_context
    if context is None:
      context = pipeline_context.PipelineContext(
          use_fake_coders=use_fake_coders,
          component_id_map=self.component_id_map,
          default_environment=default_environment)
    elif default_environment is not None:
      raise ValueError(
          'Only one of context or default_environment may be specified.')

    # The RunnerAPI spec requires certain transforms and side-inputs to have KV
    # inputs (and corresponding outputs).
    # Currently we only upgrade to KV pairs.  If there is a need for more
    # general shapes, potential conflicts will have to be resolved.
    # We also only handle single-input, and (for fixing the output) single
    # output, which is sufficient.
    # Also marks such values as requiring deterministic key coders.
    deterministic_key_coders = not self._options.view_as(
        TypeOptions).allow_non_deterministic_key_coders

    class ForceKvInputTypes(PipelineVisitor):
      def enter_composite_transform(self, transform_node):
        # type: (AppliedPTransform) -> None
        self.visit_transform(transform_node)

      def visit_transform(self, transform_node):
        # type: (AppliedPTransform) -> None
        if not transform_node.transform:
          return
        if transform_node.transform.runner_api_requires_keyed_input():
          pcoll = transform_node.inputs[0]
          pcoll.element_type = typehints.coerce_to_kv_type(
              pcoll.element_type, transform_node.full_label)
          pcoll.requires_deterministic_key_coder = (
              deterministic_key_coders and transform_node.full_label)
          if len(transform_node.outputs) == 1:
            # The runner often has expectations about the output types as well.
            output, = transform_node.outputs.values()
            if not output.element_type:
              output.element_type = transform_node.transform.infer_output_type(
                  pcoll.element_type)
            if (isinstance(output.element_type,
                           typehints.TupleHint.TupleConstraint) and
                len(output.element_type.tuple_types) == 2 and
                pcoll.element_type.tuple_types[0] ==
                output.element_type.tuple_types[0]):
              output.requires_deterministic_key_coder = (
                  deterministic_key_coders and transform_node.full_label)
        for side_input in transform_node.transform.side_inputs:
          if side_input.requires_keyed_input():
            side_input.pvalue.element_type = typehints.coerce_to_kv_type(
                side_input.pvalue.element_type,
                transform_node.full_label,
                side_input_producer=side_input.pvalue.producer.full_label)
            side_input.pvalue.requires_deterministic_key_coder = (
                deterministic_key_coders and transform_node.full_label)

    self.visit(ForceKvInputTypes())

    # Mutates context; placing inline would force dependence on
    # argument evaluation order.
    root_transform_id = context.transforms.get_id(self._root_transform())
    proto = beam_runner_api_pb2.Pipeline(
        root_transform_ids=[root_transform_id],
        components=context.to_runner_api(),
        requirements=context.requirements())
    proto.components.transforms[root_transform_id].unique_name = (
        root_transform_id)
    self.merge_compatible_environments(proto)
    if return_context:
      return proto, context  # type: ignore  # too complicated for now
    else:
      return proto

  @staticmethod
  def merge_compatible_environments(proto):
    """Tries to minimize the number of distinct environments by merging
    those that are compatible (currently defined as identical).

    Mutates proto as contexts may have references to proto.components.
    """
    env_map = {}
    canonical_env = {}
    files_by_hash = {}
    for env_id, env in proto.components.environments.items():
      # First deduplicate any file dependencies by their hash.
      for dep in env.dependencies:
        if dep.type_urn == common_urns.artifact_types.FILE.urn:
          file_payload = beam_runner_api_pb2.ArtifactFilePayload.FromString(
              dep.type_payload)
          if file_payload.sha256:
            if file_payload.sha256 in files_by_hash:
              file_payload.path = files_by_hash[file_payload.sha256]
              dep.type_payload = file_payload.SerializeToString()
            else:
              files_by_hash[file_payload.sha256] = file_payload.path
      # Next check if we've ever seen this environment before.
      normalized = env.SerializeToString(deterministic=True)
      if normalized in canonical_env:
        env_map[env_id] = canonical_env[normalized]
      else:
        canonical_env[normalized] = env_id
    for old_env, new_env in env_map.items():
      for transform in proto.components.transforms.values():
        if transform.environment_id == old_env:
          transform.environment_id = new_env
      for windowing_strategy in proto.components.windowing_strategies.values():
        if windowing_strategy.environment_id == old_env:
          windowing_strategy.environment_id = new_env
      del proto.components.environments[old_env]
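
  # In effect: environments that serialize to the same deterministic byte
  # string are collapsed onto one canonical id, references in transforms and
  # windowing strategies are rewritten to it, and the duplicates are deleted.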

  @staticmethod
  def from_runner_api(
      proto,  # type: beam_runner_api_pb2.Pipeline
      runner,  # type: PipelineRunner
      options,  # type: PipelineOptions
      return_context=False,  # type: bool
  ):
    # type: (...) -> Pipeline

    """For internal use only; no backwards-compatibility guarantees."""
    p = Pipeline(runner=runner, options=options)
    from apache_beam.runners import pipeline_context
    context = pipeline_context.PipelineContext(
        proto.components, requirements=proto.requirements)
    if proto.root_transform_ids:
      root_transform_id, = proto.root_transform_ids
      p.transforms_stack = [context.transforms.get_by_id(root_transform_id)]
    else:
      p.transforms_stack = [AppliedPTransform(None, None, '', None)]
    # TODO(robertwb): These are only needed to continue construction. Omit?
    p.applied_labels = {
        t.unique_name
        for t in proto.components.transforms.values()
    }
    for id in proto.components.pcollections:
      pcollection = context.pcollections.get_by_id(id)
      pcollection.pipeline = p
      if not pcollection.producer:
        raise ValueError('No producer for %s' % id)

    # Inject PBegin input where necessary.
    from apache_beam.io.iobase import Read
    from apache_beam.transforms.core import Create
    has_pbegin = [Read, Create]
    for id in proto.components.transforms:
      transform = context.transforms.get_by_id(id)
      if not transform.inputs and transform.transform.__class__ in has_pbegin:
        transform.main_inputs = {'None': pvalue.PBegin(p)}

    if return_context:
      return p, context  # type: ignore  # too complicated for now
    else:
      return p
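
  # A hedged round-trip sketch combining to_runner_api() and from_runner_api(),
  # reusing an existing pipeline's runner and (private) options:
  #
  #   proto = p.to_runner_api()
  #   p2 = Pipeline.from_runner_api(proto, p.runner, p._options)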


class PipelineVisitor(object):
  """For internal use only; no backwards-compatibility guarantees.

  Visitor pattern class used to traverse a DAG of transforms
  (used internally by Pipeline for bookkeeping purposes).
  """
  def visit_value(self, value, producer_node):
    # type: (pvalue.PValue, AppliedPTransform) -> None

    """Callback for visiting a PValue in the pipeline DAG.

    Args:
      value: PValue visited (typically a PCollection instance).
      producer_node: AppliedPTransform object whose transform produced the
        pvalue.
    """
    pass

  def visit_transform(self, transform_node):
    # type: (AppliedPTransform) -> None

    """Callback for visiting a transform leaf node in the pipeline DAG."""
    pass

  def enter_composite_transform(self, transform_node):
    # type: (AppliedPTransform) -> None

    """Callback for entering traversal of a composite transform node."""
    pass

  def leave_composite_transform(self, transform_node):
    # type: (AppliedPTransform) -> None

    """Callback for leaving traversal of a composite transform node."""
    pass


class ExternalTransformFinder(PipelineVisitor):
  """Looks for any external transforms in the pipeline and records whether
  one was found.
  """
  def __init__(self):
    self._contains_external_transforms = False

  @staticmethod
  def contains_external_transforms(pipeline):
    visitor = ExternalTransformFinder()
    pipeline.visit(visitor)
    return visitor._contains_external_transforms

  def _perform_external_transform_test(self, transform):
    if not transform:
      return
    from apache_beam.transforms import ExternalTransform
    if isinstance(transform, ExternalTransform):
      self._contains_external_transforms = True

  def visit_transform(self, transform_node):
    # type: (AppliedPTransform) -> None
    self._perform_external_transform_test(transform_node.transform)

  def enter_composite_transform(self, transform_node):
    # type: (AppliedPTransform) -> None
    # The Python SDK object graph may represent an external transform that is
    # a leaf of the pipeline graph as a composite without sub-transforms.
    # Note that this visitor is just used to identify pipelines with external
    # transforms. A Runner API pipeline proto generated from the Pipeline
    # object will include external sub-transforms.
    self._perform_external_transform_test(transform_node.transform)


class AppliedPTransform(object):
  """For internal use only; no backwards-compatibility guarantees.

  A transform node representing an instance of applying a PTransform
  (used internally by Pipeline for bookkeeping purposes).
  """
  def __init__(
      self,
      parent,  # type: Optional[AppliedPTransform]
      transform,  # type: Optional[ptransform.PTransform]
      full_label,  # type: str
      main_inputs,  # type: Optional[Mapping[str, Union[pvalue.PBegin, pvalue.PCollection]]]
      environment_id=None,  # type: Optional[str]
      annotations=None,  # type: Optional[Dict[str, bytes]]
  ):
    # type: (...) -> None
    self.parent = parent
    self.transform = transform
    # Note that we want the PipelineVisitor classes to use the full_label,
    # inputs, side_inputs, and outputs fields from this instance instead of the
    # ones of the PTransform instance associated with it. Doing this permits
    # reusing PTransform instances in different contexts (apply() calls) without
    # any interference. This is particularly useful for composite transforms.
    self.full_label = full_label
    self.main_inputs = dict(main_inputs or {})

    self.side_inputs = tuple() if transform is None else transform.side_inputs
    self.outputs = {}  # type: Dict[Union[str, int, None], pvalue.PValue]
    self.parts = []  # type: List[AppliedPTransform]
    self.environment_id = environment_id if environment_id else None  # type: Optional[str]
    # We may need to merge the hints with environment-provided hints here
    # once environment is a first-class citizen in the Beam graph and we have
    # access to the actual environment, not just an id.
    self.resource_hints = dict(
        transform.get_resource_hints()) if transform else {
        }  # type: Dict[str, bytes]

    if annotations is None and transform:

      def annotation_to_bytes(key, a: Any) -> bytes:
        if isinstance(a, bytes):
          return a
        elif isinstance(a, str):
          return a.encode('ascii')
        elif isinstance(a, message.Message):
          return a.SerializeToString()
        else:
          raise TypeError(
              'Unknown annotation type %r (type %s) for %s' % (a, type(a), key))

      annotations = {
          key: annotation_to_bytes(key, a)
          for key, a in transform.annotations().items()
      }
    self.annotations = annotations

  @property
  def inputs(self):
    return tuple(self.main_inputs.values())

  def __repr__(self):
    # type: () -> str
    return "%s(%s, %s)" % (
        self.__class__.__name__, self.full_label, type(self.transform).__name__)

  def replace_output(
      self,
      output,  # type: Union[pvalue.PValue, pvalue.DoOutputsTuple]
      tag=None  # type: Union[str, int, None]
  ):
    # type: (...) -> None

    """Replaces the output defined by the given tag with the given output.

    Args:
      output: replacement output
      tag: tag of the output to be replaced.
    """
    if isinstance(output, pvalue.DoOutputsTuple):
      self.replace_output(output[output._main_tag])
    elif isinstance(output, pvalue.PValue):
      self.outputs[tag] = output
    elif isinstance(output, dict):
      for output_tag, out in output.items():
        self.outputs[output_tag] = out
    else:
      raise TypeError("Unexpected output type: %s" % output)

    # Importing locally to prevent circular dependency issues.
    from apache_beam.transforms import external
    if isinstance(self.transform, external.ExternalTransform):
      self.transform.replace_named_outputs(self.named_outputs())

  def replace_inputs(self, main_inputs):
    self.main_inputs = main_inputs

    # Importing locally to prevent circular dependency issues.
    from apache_beam.transforms import external
    if isinstance(self.transform, external.ExternalTransform):
      self.transform.replace_named_inputs(self.named_inputs())

  def replace_side_inputs(self, side_inputs):
    self.side_inputs = side_inputs

    # Importing locally to prevent circular dependency issues.
    from apache_beam.transforms import external
    if isinstance(self.transform, external.ExternalTransform):
      self.transform.replace_named_inputs(self.named_inputs())

  def add_output(
      self,
      output,  # type: Union[pvalue.DoOutputsTuple, pvalue.PValue]
      tag  # type: Union[str, int, None]
  ):
    # type: (...) -> None
    if isinstance(output, pvalue.DoOutputsTuple):
      self.add_output(output[tag], tag)
    elif isinstance(output, pvalue.PValue):
      assert tag not in self.outputs
      self.outputs[tag] = output
    else:
      raise TypeError("Unexpected output type: %s" % output)

  def add_part(self, part):
    # type: (AppliedPTransform) -> None
    assert isinstance(part, AppliedPTransform)
    part._merge_outer_resource_hints()
    self.parts.append(part)
  1212  
  1213    def is_composite(self):
  1214      # type: () -> bool
  1215  
  1216      """Returns whether this is a composite transform.
  1217  
  1218      A composite transform has parts (inner transforms) or isn't the
  1219      producer for any of its outputs. (An example of a transform that
  1220      is not a producer is one that returns its inputs instead.)
  1221      """
  1222      return bool(self.parts) or all(
  1223          pval.producer is not self for pval in self.outputs.values())
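          # For example (a sketch; CountPerKey is illustrative, not part of
          # Beam):
          #
          #   class CountPerKey(beam.PTransform):
          #     def expand(self, pcoll):
          #       return (
          #           pcoll
          #           | beam.Map(lambda x: (x, 1))
          #           | beam.CombinePerKey(sum))
          #
          # Applying CountPerKey yields a node whose parts are the inner
          # applications, so is_composite() is True; a bare ParDo application
          # has no parts and produces its own outputs, so it is False.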
  1224  
  1225    def visit(
  1226        self,
  1227        visitor,  # type: PipelineVisitor
  1228        pipeline,  # type: Pipeline
  1229        visited  # type: Set[pvalue.PValue]
  1230    ):
  1231      # type: (...) -> None
  1232  
  1233      """Visits all nodes reachable from the current node."""
  1234  
  1235      for in_pval in self.inputs:
  1236        if in_pval not in visited and not isinstance(in_pval, pvalue.PBegin):
  1237          if in_pval.producer is not None:
  1238            in_pval.producer.visit(visitor, pipeline, visited)
  1239            # The value should be visited now since we visit outputs too.
  1240            assert in_pval in visited, in_pval
  1241  
  1242      # Visit side inputs.
  1243      for side_input in self.side_inputs:
  1244        if (isinstance(side_input, pvalue.AsSideInput) and
  1245            side_input.pvalue not in visited):
  1246          pval = side_input.pvalue  # Unpack marker-object-wrapped pvalue.
  1247          if pval.producer is not None:
  1248            pval.producer.visit(visitor, pipeline, visited)
  1249            # The value should be visited now since we visit outputs too.
  1250            assert pval in visited
  1251            # TODO(silviuc): Is there a way to signal that we are visiting a side
  1252            # value? The issue is that the same PValue can be reachable through
  1253            # multiple paths and therefore it is not guaranteed that the value
  1254            # will be visited as a side value.
  1255  
  1256      # Visit a composite or primitive transform.
  1257      if self.is_composite():
  1258        visitor.enter_composite_transform(self)
  1259        for part in self.parts:
  1260          part.visit(visitor, pipeline, visited)
  1261        visitor.leave_composite_transform(self)
  1262      else:
  1263        visitor.visit_transform(self)
  1264  
  1265      # Visit the outputs (one or more). It is essential to mark as visited the
  1266      # tagged PCollections of the DoOutputsTuple object. A tagged PCollection is
  1267      # connected directly with its producer (a multi-output ParDo), but the
  1268      # output of such a transform is the containing DoOutputsTuple, not the
  1269      # PCollection inside it. Without the code below a tagged PCollection will
  1270      # not be marked as visited while visiting its producer.
  1271      for out_pval in self.outputs.values():
  1272        if isinstance(out_pval, pvalue.DoOutputsTuple):
  1273          pvals = (v for v in out_pval)
  1274        else:
  1275          pvals = (out_pval, )
  1276        for v in pvals:
  1277          if v not in visited:
  1278            visited.add(v)
  1279            visitor.visit_value(v, self)
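          # A minimal traversal sketch (CountingVisitor is an illustrative
          # name, not part of this module):
          #
          #   class CountingVisitor(PipelineVisitor):
          #     def __init__(self):
          #       self.count = 0
          #
          #     def visit_transform(self, applied_ptransform):
          #       self.count += 1
          #
          #   visitor = CountingVisitor()
          #   pipeline.visit(visitor)  # Visits every primitive transform once.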
  1280  
  1281    def named_inputs(self):
  1282      # type: () -> Dict[str, pvalue.PValue]
  1283      if self.transform is None:
  1284        assert not self.main_inputs and not self.side_inputs
  1285        return {}
  1286      else:
  1287        named_inputs = self.transform._named_inputs(
  1288            self.main_inputs, self.side_inputs)
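              # Presumably to keep the serialized graph connected: a leaf
              # transform that merely returns an upstream PCollection is not
              # that PCollection's producer, so record it as an implicit
              # input here.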
  1289        if not self.parts:
  1290          for name, pc_out in self.outputs.items():
  1291            if (pc_out.producer is not self and
  1292                pc_out not in named_inputs.values()):
  1293              named_inputs[f'__implicit_input_{name}'] = pc_out
  1294        return named_inputs
  1295  
  1296    def named_outputs(self):
  1297      # type: () -> Dict[str, pvalue.PCollection]
  1298      if self.transform is None:
  1299        assert not self.outputs
  1300        return {}
  1301      else:
  1302        return self.transform._named_outputs(self.outputs)
  1303  
  1304    def to_runner_api(self, context):
  1305      # type: (PipelineContext) -> beam_runner_api_pb2.PTransform
  1306      # External transforms require more splicing than just setting the spec.
  1307      from apache_beam.transforms import external
  1308      if isinstance(self.transform, external.ExternalTransform):
  1309        # TODO(https://github.com/apache/beam/issues/18371): Support resource
  1310        # hints in XLang transforms. In particular, make sure hints on composites
  1311        # are properly propagated.
  1312        return self.transform.to_runner_api_transform(context, self.full_label)
  1313  
  1314      def transform_to_runner_api(
  1315          transform,  # type: Optional[ptransform.PTransform]
  1316          context  # type: PipelineContext
  1317      ):
  1318        # type: (...) -> Optional[beam_runner_api_pb2.FunctionSpec]
  1319        if transform is None:
  1320          return None
  1321        else:
  1322          # We only populate inputs information to ParDo in order to expose
  1323          # key_coder and window_coder to stateful DoFn.
  1324          if isinstance(transform, ParDo):
  1325            return transform.to_runner_api(
  1326                context,
  1327                has_parts=bool(self.parts),
  1328                named_inputs=self.named_inputs())
  1329          return transform.to_runner_api(context, has_parts=bool(self.parts))
  1330  
  1331      # Iterate over inputs and outputs by sorted key order, so that ids are
  1332      # consistently generated for multiple runs of the same pipeline.
  1333      transform_spec = transform_to_runner_api(self.transform, context)
  1334      environment_id = self.environment_id
  1335      transform_urn = transform_spec.urn if transform_spec else None
  1336      if (not environment_id and
  1337          (transform_urn not in Pipeline.runner_implemented_transforms())):
  1338        environment_id = context.get_environment_id_for_resource_hints(
  1339            self.resource_hints)
  1340  
  1341      return beam_runner_api_pb2.PTransform(
  1342          unique_name=self.full_label,
  1343          spec=transform_spec,
  1344          subtransforms=[
  1345              context.transforms.get_id(part, label=part.full_label)
  1346              for part in self.parts
  1347          ],
  1348          inputs={
  1349              tag: context.pcollections.get_id(pc)
  1350              for tag,
  1351              pc in sorted(self.named_inputs().items())
  1352          },
  1353          outputs={
  1354              tag: context.pcollections.get_id(out)
  1355              for tag,
  1356              out in sorted(self.named_outputs().items())
  1357          },
  1358          environment_id=environment_id,
  1359          annotations=self.annotations,
  1360          # TODO(https://github.com/apache/beam/issues/18012): Add display_data.
  1361          display_data=DisplayData.create_from(self.transform).to_proto()
  1362          if self.transform else None)
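          # Round-trip sketch (illustrative): every node in a pipeline passes
          # through this method when the whole pipeline is serialized:
          #
          #   proto = pipeline.to_runner_api()
          #   restored = Pipeline.from_runner_api(proto, runner, options)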
  1363  
  1364    @staticmethod
  1365    def from_runner_api(
  1366        proto,  # type: beam_runner_api_pb2.PTransform
  1367        context  # type: PipelineContext
  1368    ):
  1369      # type: (...) -> AppliedPTransform
  1370  
  1371      if common_urns.primitives.PAR_DO.urn == proto.spec.urn:
  1372        # Preserving side input tags.
  1373        pardo_payload = (
  1374            proto_utils.parse_Bytes(
  1375                proto.spec.payload, beam_runner_api_pb2.ParDoPayload))
  1376        side_input_tags = list(pardo_payload.side_inputs.keys())
  1377      else:
  1378        pardo_payload = None
  1379        side_input_tags = []
  1380  
  1381      main_inputs = {
  1382          tag: context.pcollections.get_by_id(id)
  1383          for (tag, id) in proto.inputs.items() if tag not in side_input_tags
  1384      }
  1385  
  1386      transform = ptransform.PTransform.from_runner_api(proto, context)
  1387      if transform and proto.environment_id:
  1388        resource_hints = context.environments.get_by_id(
  1389            proto.environment_id).resource_hints()
  1390        if resource_hints:
  1391          transform._resource_hints = dict(resource_hints)
  1392  
  1393      # Ordering is important here.
  1394      # TODO(https://github.com/apache/beam/issues/20136): use key, value pairs
  1395      # instead of depending on tags with index as a suffix.
  1396      indexed_side_inputs = [
  1397          (get_sideinput_index(tag), context.pcollections.get_by_id(id)) for tag,
  1398          id in proto.inputs.items() if tag in side_input_tags
  1399      ]
  1400      side_inputs = [si for _, si in sorted(indexed_side_inputs)]
  1401  
  1402      result = AppliedPTransform(
  1403          parent=None,
  1404          transform=transform,
  1405          full_label=proto.unique_name,
  1406          main_inputs=main_inputs,
  1407          environment_id=None,
  1408          annotations=proto.annotations)
  1409  
  1410      if result.transform and result.transform.side_inputs:
  1411        for si, pcoll in zip(result.transform.side_inputs, side_inputs):
  1412          si.pvalue = pcoll
  1413        result.side_inputs = tuple(result.transform.side_inputs)
  1414      result.parts = []
  1415      for transform_id in proto.subtransforms:
  1416        part = context.transforms.get_by_id(transform_id)
  1417        part.parent = result
  1418        result.add_part(part)
  1419      result.outputs = {
  1420          None if tag == 'None' else tag: context.pcollections.get_by_id(id)
  1421          for tag,
  1422          id in proto.outputs.items()
  1423      }
  1424      # This annotation is expected by some runners.
  1425      if proto.spec.urn == common_urns.primitives.PAR_DO.urn:
  1426        result.transform.output_tags = set(proto.outputs.keys()).difference(
  1427            {'None'})
  1428      if not result.parts:
  1429        for tag, pcoll_id in proto.outputs.items():
  1430          if pcoll_id not in proto.inputs.values():
  1431            pc = context.pcollections.get_by_id(pcoll_id)
  1432            pc.producer = result
  1433            pc.tag = None if tag == 'None' else tag
  1434      return result
  1435  
  1436    def _merge_outer_resource_hints(self):
  1437      if (self.parent is not None and self.parent.resource_hints):
  1438        self.resource_hints = merge_resource_hints(
  1439            outer_hints=self.parent.resource_hints,
  1440            inner_hints=self.resource_hints)
  1441      if self.resource_hints:
  1442        for part in self.parts:
  1443          part._merge_outer_resource_hints()
  1444  
  1445  
  1446  class PTransformOverride(metaclass=abc.ABCMeta):
  1447    """For internal use only; no backwards-compatibility guarantees.
  1448  
  1449    Provides a matcher and a replacement transform for matched PTransforms.
  1450  
  1451    TODO: Update this to support cases where input and/or output types are
  1452    different.
  1453    """
  1454    @abc.abstractmethod
  1455    def matches(self, applied_ptransform):
  1456      # type: (AppliedPTransform) -> bool
  1457  
  1458      """Determines whether the given AppliedPTransform matches.
  1459  
  1460      Note that the matching will happen *after* Runner API proto translation.
  1461      If matching is done via type checks, to/from_runner_api[_parameter] methods
  1462      must be implemented to preserve the type (and other data) through proto
  1463      serialization.
  1464  
  1465      Consider URN-based translation instead.
  1466  
  1467      Args:
  1468        applied_ptransform: AppliedPTransform to be matched.
  1469  
  1470      Returns:
  1471        a bool indicating whether the given AppliedPTransform is a match.
  1472      """
  1473      raise NotImplementedError
  1474  
  1475    def get_replacement_transform_for_applied_ptransform(
  1476        self, applied_ptransform):
  1477      # type: (AppliedPTransform) -> ptransform.PTransform
  1478  
  1479      """Provides a runner specific override for a given `AppliedPTransform`.
  1480  
  1481      Args:
  1482        applied_ptransform: `AppliedPTransform` containing the `PTransform` to be
  1483          replaced.
  1484  
  1485      Returns:
  1486        A `PTransform` that will be the replacement for the `PTransform` inside
  1487        the `AppliedPTransform` given as an argument.
  1488      """
  1489      # By default, delegates to the deprecated get_replacement_transform.
  1490      return self.get_replacement_transform(applied_ptransform.transform)
  1491  
  1492    @deprecated(
  1493        since='2.24', current='get_replacement_transform_for_applied_ptransform')
  1494    def get_replacement_transform(self, ptransform):
  1495      # type: (Optional[ptransform.PTransform]) -> ptransform.PTransform
  1496  
  1497      """Provides a runner specific override for a given PTransform.
  1498  
  1499      Args:
  1500        ptransform: PTransform to be replaced.
  1501  
  1502      Returns:
  1503        A PTransform that will be the replacement for the PTransform given as an
  1504        argument.
  1505      """
  1506      # Subclasses must return the replacement PTransform.
  1507      raise NotImplementedError
  1508  
  1509    def get_replacement_inputs(self, applied_ptransform):
  1510      # type: (AppliedPTransform) -> Iterable[pvalue.PValue]
  1511  
  1512      """Provides inputs that will be passed to the replacement PTransform.
  1513  
  1514      Args:
  1515        applied_ptransform: Original AppliedPTransform containing the PTransform
  1516          to be replaced.
  1517  
  1518      Returns:
  1519        An iterable of PValues that will be passed to the expand() method of the
  1520        replacement PTransform.
  1521      """
  1522      return tuple(applied_ptransform.inputs) + tuple(
  1523          side_input.pvalue for side_input in applied_ptransform.side_inputs)
  1524  
  1525  
  1526  class ComponentIdMap(object):
  1527    """A utility for assigning unique component ids to Beam components.
  1528  
  1529    Component ID assignments are only guaranteed to be unique and consistent
  1530    within the scope of a ComponentIdMap instance.
  1531    """
  1532    def __init__(self, namespace="ref"):
  1533      self.namespace = namespace
  1534      self._counters = defaultdict(int)  # type: Dict[type, int]
  1535      self._obj_to_id = {}  # type: Dict[Any, str]
  1536  
  1537    def get_or_assign(self, obj=None, obj_type=None, label=None):
  1538      if obj not in self._obj_to_id:
  1539        self._obj_to_id[obj] = self._unique_ref(obj, obj_type, label)
  1540  
  1541      return self._obj_to_id[obj]
  1542  
  1543    def _normalize(self, str_value):
  1544      str_value = unicodedata.normalize('NFC', str_value)
  1545      return re.sub(r'[^a-zA-Z0-9-_]+', '-', str_value)
  1546  
  1547    def _unique_ref(self, obj=None, obj_type=None, label=None):
  1548      # Normalize, trim, and make unique.
  1549      prefix = self._normalize(
  1550          '%s_%s_%s' %
  1551          (self.namespace, obj_type.__name__, label or type(obj).__name__))[0:100]
  1552      self._counters[obj_type] += 1
  1553      return '%s_%d' % (prefix, self._counters[obj_type])
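          # E.g. (illustrative): _normalize('Map(<lambda>)') yields
          # 'Map-lambda-', so a transform could be assigned an id such as
          # 'ref_AppliedPTransform_Map-lambda-_7'.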