github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/runner.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""PipelineRunner, an abstract base runner object."""

# pytype: skip-file

import importlib
import logging
import os
import shelve
import shutil
import tempfile
from typing import TYPE_CHECKING
from typing import Optional

from apache_beam.options.pipeline_options import StandardOptions

if TYPE_CHECKING:
  from apache_beam import pvalue
  from apache_beam import PTransform
  from apache_beam.options.pipeline_options import PipelineOptions
  from apache_beam.pipeline import AppliedPTransform
  from apache_beam.pipeline import Pipeline
  from apache_beam.pipeline import PipelineVisitor

__all__ = ['PipelineRunner', 'PipelineState', 'PipelineResult']

_RUNNER_MAP = {
    path.rsplit('.', maxsplit=1)[-1].lower(): path
    for path in StandardOptions.ALL_KNOWN_RUNNERS
}

# Allow this alias, but don't make public.
_RUNNER_MAP['pythonrpcdirectrunner'] = (
    'apache_beam.runners.experimental'
    '.python_rpc_direct.python_rpc_direct_runner.PythonRPCDirectRunner')

_LOGGER = logging.getLogger(__name__)


def create_runner(runner_name):
  # type: (str) -> PipelineRunner

  """For internal use only; no backwards-compatibility guarantees.

  Creates a runner instance from a runner class name.

  Args:
    runner_name: Name of the pipeline runner. Possible values are listed in
      _RUNNER_MAP above.

  Returns:
    A runner object.

  Raises:
    ValueError: if an invalid runner name is used.
    ImportError: if the runner's module cannot be imported.
  """

  # Get the qualified runner name by looking up the lower-cased runner name.
  # If that fails, try again with 'runner' appended to the name.
  # If that also fails, use the given runner name as is.
  runner_name = _RUNNER_MAP.get(
      runner_name.lower(),
      _RUNNER_MAP.get(runner_name.lower() + 'runner', runner_name))

  if '.' in runner_name:
    module, runner = runner_name.rsplit('.', 1)
    try:
      return getattr(importlib.import_module(module), runner)()
    except ImportError:
      if 'dataflow' in runner_name.lower():
        raise ImportError(
            'Google Cloud Dataflow runner not available, '
            'please install apache_beam[gcp]')
      elif 'interactive' in runner_name.lower():
        raise ImportError(
            'Interactive runner not available, '
            'please install apache_beam[interactive]')
      else:
        raise
  else:
    raise ValueError(
        'Unexpected pipeline runner: %s. Valid values are %s '
        'or the fully qualified name of a PipelineRunner subclass.' %
        (runner_name, ', '.join(StandardOptions.KNOWN_RUNNER_NAMES)))

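# A minimal usage sketch (illustrative, not part of this module): the lookup
# above is case-insensitive and appends 'runner' when needed, so all of the
# following resolve to the same class:
#
#   from apache_beam.runners.runner import create_runner
#   create_runner('DirectRunner')  # short name from _RUNNER_MAP
#   create_runner('direct')        # 'runner' suffix appended automatically
#   create_runner('apache_beam.runners.direct.direct_runner.DirectRunner')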

class PipelineRunner(object):
  """A runner of a pipeline object.

  The base runner provides a run() method for visiting every node in the
  pipeline's DAG and executing the transforms computing the PValue in the node.

  A custom runner will typically provide implementations for some of the
  transform methods (ParDo, GroupByKey, Create, etc.). It may also
  provide a new implementation for clear_pvalue(), which is used to wipe out
  materialized values in order to reduce footprint.
  """

  def run(self,
          transform,  # type: PTransform
          options=None  # type: Optional[PipelineOptions]
         ):
    # type: (...) -> PipelineResult

    """Run the given transform or callable with this runner.

    Blocks until the pipeline is complete.  See also `PipelineRunner.run_async`.
    """
    result = self.run_async(transform, options)
    result.wait_until_finish()
    return result

  def run_async(self,
                transform,  # type: PTransform
                options=None  # type: Optional[PipelineOptions]
               ):
    # type: (...) -> PipelineResult

    """Run the given transform or callable with this runner.

    May return immediately, executing the pipeline in the background.
    The returned result object can be queried for progress, and
    `wait_until_finish` may be called to block until completion.
    """
    # Imported here to avoid circular dependencies.
    # pylint: disable=wrong-import-order, wrong-import-position
    from apache_beam import PTransform
    from apache_beam.pvalue import PBegin
    from apache_beam.pipeline import Pipeline
    p = Pipeline(runner=self, options=options)
    if isinstance(transform, PTransform):
      p | transform
    else:
      transform(PBegin(p))
    return p.run()

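  # Illustrative sketch (not part of this module): run() accepts either a
  # PTransform or a callable taking a PBegin, matching the two branches
  # above. Assuming the direct runner is available:
  #
  #   import apache_beam as beam
  #   from apache_beam.runners.direct.direct_runner import DirectRunner
  #
  #   # run() blocks until completion; run_async() returns immediately.
  #   result = DirectRunner().run(
  #       lambda begin: begin | beam.Create([1, 2, 3]) | beam.Map(print))
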
  def run_pipeline(
      self,
      pipeline,  # type: Pipeline
      options  # type: PipelineOptions
  ):
    # type: (...) -> PipelineResult

    """Execute the entire pipeline or the sub-DAG reachable from a node.

    Runners should override this method.
    """
    raise NotImplementedError

  def apply(self,
            transform,  # type: PTransform
            input,  # type: Optional[pvalue.PValue]
            options  # type: PipelineOptions
           ):
    """Runner callback for a pipeline.apply call.

    Args:
      transform: the transform to apply.
      input: transform's input (typically a PCollection).

    A concrete implementation of the Runner class may want to do custom
    pipeline construction for a given transform.  To override the behavior
    for a transform class Xyz, implement an apply_Xyz method with this same
    signature.
    """
    for cls in transform.__class__.mro():
      m = getattr(self, 'apply_%s' % cls.__name__, None)
      if m:
        return m(transform, input, options)
    raise NotImplementedError(
        'Execution of [%s] not implemented in runner %s.' % (transform, self))

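  # Illustrative dispatch sketch (hypothetical subclass): because apply()
  # walks the transform's MRO, a runner can special-case one transform class
  # and let everything else fall through to apply_PTransform below:
  #
  #   class MyRunner(PipelineRunner):
  #     def apply_GroupByKey(self, transform, input, options):
  #       ...  # custom pipeline construction for GroupByKey
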
  def visit_transforms(
      self,
      pipeline,  # type: Pipeline
      options  # type: PipelineOptions
  ):
    # type: (...) -> None
    # Imported here to avoid circular dependencies.
    # pylint: disable=wrong-import-order, wrong-import-position
    from apache_beam.pipeline import PipelineVisitor

    class RunVisitor(PipelineVisitor):
      def __init__(self, runner):
        # type: (PipelineRunner) -> None
        self.runner = runner

      def visit_transform(self, transform_node):
        try:
          self.runner.run_transform(transform_node, options)
        except:
          _LOGGER.error('Error while visiting %s', transform_node.full_label)
          raise

    pipeline.visit(RunVisitor(self))

  def apply_PTransform(self, transform, input, options):
    # The base case of apply is to call the transform's expand.
    return transform.expand(input)

  def run_transform(self,
                    transform_node,  # type: AppliedPTransform
                    options  # type: PipelineOptions
                   ):
    """Runner callback for a pipeline.run call.

    Args:
      transform_node: transform node for the transform to run.

    A concrete implementation of the Runner class must implement run_Abc for
    some class Abc in the method resolution order of every non-composite
    transform in the pipeline.
    """
    for cls in transform_node.transform.__class__.mro():
      m = getattr(self, 'run_%s' % cls.__name__, None)
      if m:
        return m(transform_node, options)
    raise NotImplementedError(
        'Execution of [%s] not implemented in runner %s.' %
        (transform_node.transform, self))

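  # Illustrative (hypothetical subclass): mirroring apply_Xyz above, a runner
  # executes the ParDo primitive by defining a matching run_<ClassName>
  # method:
  #
  #   class MyRunner(PipelineRunner):
  #     def run_ParDo(self, transform_node, options):
  #       ...  # evaluate the DoFn against its cached input
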
  def is_fnapi_compatible(self):
    """Whether to enable the beam_fn_api experiment by default."""
    return True


class PValueCache(object):
  """For internal use only; no backwards-compatibility guarantees.

  Local cache for arbitrary information computed for PValue objects."""
  def __init__(self, use_disk_backed_cache=False):
    # Cache of values computed while a runner executes a pipeline. This is a
    # dictionary mapping PValues to their computed values. Note that in
    # principle the runner could contain PValues from several pipelines
    # without clashes since a PValue is associated with one and only one
    # pipeline. The keys of the dictionary are tuples of the producing
    # transform's full label and the output tag (see to_cache_key below).

    self._use_disk_backed_cache = use_disk_backed_cache
    if use_disk_backed_cache:
      self._tempdir = tempfile.mkdtemp()
      self._cache = shelve.open(os.path.join(self._tempdir, 'shelve'))
    else:
      self._cache = {}

  def __del__(self):
    if self._use_disk_backed_cache:
      self._cache.close()
      shutil.rmtree(self._tempdir)

  def __len__(self):
    return len(self._cache)

  def to_cache_key(self, transform, tag):
    return transform.full_label, tag

  def _ensure_pvalue_has_real_producer(self, pvalue):
    """Ensure the passed-in PValue has the real_producer attribute.

    Args:
      pvalue: A PValue instance whose cached value is requested.

    During the runner's execution only the results of the primitive transforms
    are cached. Whenever we are looking for a PValue that is the output of a
    composite transform we need to find the output of its rightmost transform
    part.
    """
    if not hasattr(pvalue, 'real_producer'):
      real_producer = pvalue.producer
      while real_producer.parts:
        real_producer = real_producer.parts[-1]
      pvalue.real_producer = real_producer

  def is_cached(self, pobj):
    from apache_beam.pipeline import AppliedPTransform
    if isinstance(pobj, AppliedPTransform):
      transform = pobj
      tag = None
    else:
      self._ensure_pvalue_has_real_producer(pobj)
      transform = pobj.real_producer
      tag = pobj.tag
    return self.to_cache_key(transform, tag) in self._cache

  def cache_output(self, transform, tag_or_value, value=None):
    if value is None:
      value = tag_or_value
      tag = None
    else:
      tag = tag_or_value
    self._cache[self.to_cache_key(transform, tag)] = value

  def get_pvalue(self, pvalue):
    """Gets the value associated with a PValue from the cache."""
    self._ensure_pvalue_has_real_producer(pvalue)
    try:
      return self._cache[self.key(pvalue)]
    except KeyError:
      if (pvalue.tag is not None and
          self.to_cache_key(pvalue.real_producer, None) in self._cache):
        # This is an undeclared, empty output of a DoFn executed
        # in the local runner before this output was referenced.
        return []
      else:
        raise

  def get_unwindowed_pvalue(self, pvalue):
    return [v.value for v in self.get_pvalue(pvalue)]

  def clear_pvalue(self, pvalue):
    """Removes a PValue from the cache."""
    if self.is_cached(pvalue):
      del self._cache[self.key(pvalue)]

  def key(self, pobj):
    self._ensure_pvalue_has_real_producer(pobj)
    return self.to_cache_key(pobj.real_producer, pobj.tag)

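# Illustrative PValueCache sketch (internal API; applied_transform is a
# hypothetical AppliedPTransform instance):
#
#   cache = PValueCache()
#   cache.cache_output(applied_transform, [1, 2, 3])    # main output, tag None
#   cache.cache_output(applied_transform, 'side', [4])  # tagged output
#   cache.is_cached(applied_transform)  # True: keyed by (full_label, None)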

# FIXME: replace with PipelineState(str, enum.Enum)
class PipelineState(object):
  """State of the Pipeline, as returned by :attr:`PipelineResult.state`.

  This is meant to be the union of all the states any runner can put a
  pipeline in. Currently, it represents the values of the Dataflow
  API JobState enum.
  """
  UNKNOWN = 'UNKNOWN'  # not specified by a runner, or unknown to a runner.
  STARTING = 'STARTING'  # not yet started
  STOPPED = 'STOPPED'  # paused or not yet started
  RUNNING = 'RUNNING'  # currently running
  DONE = 'DONE'  # successfully completed (terminal state)
  FAILED = 'FAILED'  # failed (terminal state)
  CANCELLED = 'CANCELLED'  # explicitly cancelled (terminal state)
  UPDATED = 'UPDATED'  # replaced by another job (terminal state)
  DRAINING = 'DRAINING'  # still processing, no longer reading data
  DRAINED = 'DRAINED'  # draining completed (terminal state)
  PENDING = 'PENDING'  # the job has been created but is not yet running.
  CANCELLING = 'CANCELLING'  # job has been explicitly cancelled and is
  # in the process of stopping
  RESOURCE_CLEANING_UP = 'RESOURCE_CLEANING_UP'  # job's resources are being
  # cleaned up
  UNRECOGNIZED = 'UNRECOGNIZED'  # the job state reported by a runner cannot be
  # interpreted by the SDK.

  @classmethod
  def is_terminal(cls, state):
    return state in [
        cls.DONE, cls.FAILED, cls.CANCELLED, cls.UPDATED, cls.DRAINED
    ]

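# Illustrative: is_terminal distinguishes finished jobs from in-flight ones;
# note that DRAINING is not terminal while DRAINED is:
#
#   PipelineState.is_terminal(PipelineState.DRAINED)   # True
#   PipelineState.is_terminal(PipelineState.DRAINING)  # False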

class PipelineResult(object):
  """A :class:`PipelineResult` provides access to info about a pipeline."""
  def __init__(self, state):
    self._state = state

  @property
  def state(self):
    """Return the current state of the pipeline execution."""
    return self._state

  def wait_until_finish(self, duration=None):
    """Waits until the pipeline finishes and returns the final status.

    Args:
      duration (int): The time to wait (in milliseconds) for the job to
        finish. If it is set to :data:`None`, it will wait indefinitely
        until the job is finished.

    Raises:
      IOError: If there is a persistent problem getting job
        information.
      NotImplementedError: If the runner does not support this
        operation.

    Returns:
      The final state of the pipeline, or :data:`None` on timeout.
    """
    raise NotImplementedError

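  # Illustrative usage sketch (assumes a concrete runner's result object):
  #
  #   result = pipeline.run()
  #   final_state = result.wait_until_finish(duration=60 * 1000)  # 60s
  #   if final_state is None:
  #     ...  # timed out; the job may still be running
  #   elif final_state == PipelineState.DONE:
  #     ...  # job succeeded
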
  def cancel(self):
    """Cancels the pipeline execution.

    Raises:
      IOError: If there is a persistent problem getting job
        information.
      NotImplementedError: If the runner does not support this
        operation.

    Returns:
      The final state of the pipeline.
    """
    raise NotImplementedError

  def metrics(self):
    """Returns a :class:`~apache_beam.metrics.metric.MetricResults` object to
    query metrics from the runner.

    Raises:
      NotImplementedError: If the runner does not support this
        operation.
    """
    raise NotImplementedError

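  # Illustrative metrics query (sketch; assumes the runner implements
  # metrics(); MetricsFilter lives in apache_beam.metrics.metric):
  #
  #   from apache_beam.metrics.metric import MetricsFilter
  #   counters = result.metrics().query(
  #       MetricsFilter().with_name('my_counter'))['counters']
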
  # pylint: disable=unused-argument
  def aggregated_values(self, aggregator_or_name):
    """Return a dict of step names to values of the Aggregator."""
    _LOGGER.warning(
        '%s does not implement aggregated_values', self.__class__.__name__)
    return {}