github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/interactive/user_pipeline_tracker.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """Class that tracks derived/pipeline fragments from user pipelines.
    19  
    20  For internal use only; no backwards-compatibility guarantees.
    21  In the InteractiveRunner the design is to keep the user pipeline unchanged,
    22  create a copy of the user pipeline, and modify the copy. When the derived
    23  pipeline runs, there should only be per-user pipeline state. This makes sure
    24  that derived pipelines can link back to the parent user pipeline.
    25  """
    26  
    27  import shutil
    28  from typing import Iterator
    29  from typing import Optional
    30  
    31  import apache_beam as beam  # type: ignore
    32  
    33  
    34  class UserPipelineTracker:
    35    """Tracks user pipelines from derived pipelines.
    36  
    37    This data structure is similar to a disjoint set data structure. A derived
    38    pipeline can only have one parent user pipeline. A user pipeline can have many
    39    derived pipelines.
    40    """
    41    def __init__(self):
    42      self._user_pipelines: dict[beam.Pipeline, list[beam.Pipeline]] = {}
    43      self._derived_pipelines: dict[beam.Pipeline] = {}
    44      self._pid_to_pipelines: dict[beam.Pipeline] = {}
    45  
    46    def __iter__(self) -> Iterator[beam.Pipeline]:
    47      """Iterates through all the user pipelines."""
    48      for p in self._user_pipelines:
    49        yield p
    50  
    51    def _key(self, pipeline: beam.Pipeline) -> str:
    52      return str(id(pipeline))
    53  
    54    def evict(self, pipeline: beam.Pipeline) -> None:
    55      """Evicts the pipeline.
    56  
    57      Removes the given pipeline and derived pipelines if a user pipeline.
    58      Otherwise, removes the given derived pipeline.
    59      """
    60      user_pipeline = self.get_user_pipeline(pipeline)
    61      if user_pipeline:
    62        for d in self._user_pipelines[user_pipeline]:
    63          del self._derived_pipelines[d]
    64        del self._user_pipelines[user_pipeline]
    65      elif pipeline in self._derived_pipelines:
    66        del self._derived_pipelines[pipeline]
    67  
    68    def clear(self) -> None:
    69      """Clears the tracker of all user and derived pipelines."""
    70      # Remove all local_tempdir of created pipelines.
    71      for p in self._pid_to_pipelines.values():
    72        shutil.rmtree(p.local_tempdir, ignore_errors=True)
    73  
    74      self._user_pipelines.clear()
    75      self._derived_pipelines.clear()
    76      self._pid_to_pipelines.clear()
    77  
    78    def get_pipeline(self, pid: str) -> Optional[beam.Pipeline]:
    79      """Returns the pipeline corresponding to the given pipeline id."""
    80      return self._pid_to_pipelines.get(pid, None)
    81  
    82    def add_user_pipeline(self, p: beam.Pipeline) -> beam.Pipeline:
    83      """Adds a user pipeline with an empty set of derived pipelines."""
    84      self._memoize_pipieline(p)
    85  
    86      # Create a new node for the user pipeline if it doesn't exist already.
    87      user_pipeline = self.get_user_pipeline(p)
    88      if not user_pipeline:
    89        user_pipeline = p
    90        self._user_pipelines[p] = []
    91  
    92      return user_pipeline
    93  
    94    def _memoize_pipieline(self, p: beam.Pipeline) -> None:
    95      """Memoizes the pid of the pipeline to the pipeline object."""
    96      pid = self._key(p)
    97      if pid not in self._pid_to_pipelines:
    98        self._pid_to_pipelines[pid] = p
    99  
   100    def add_derived_pipeline(
   101        self, maybe_user_pipeline: beam.Pipeline,
   102        derived_pipeline: beam.Pipeline) -> None:
   103      """Adds a derived pipeline with the user pipeline.
   104  
   105      If the `maybe_user_pipeline` is a user pipeline, then the derived pipeline
   106      will be added to its set. Otherwise, the derived pipeline will be added to
   107      the user pipeline of the `maybe_user_pipeline`.
   108  
   109      By doing the above one can do:
   110      p = beam.Pipeline()
   111  
   112      derived1 = beam.Pipeline()
   113      derived2 = beam.Pipeline()
   114  
   115      ut = UserPipelineTracker()
   116      ut.add_derived_pipeline(p, derived1)
   117      ut.add_derived_pipeline(derived1, derived2)
   118  
   119      # Returns p.
   120      ut.get_user_pipeline(derived2)
   121      """
   122      self._memoize_pipieline(maybe_user_pipeline)
   123      self._memoize_pipieline(derived_pipeline)
   124  
   125      # Cannot add a derived pipeline twice.
   126      assert derived_pipeline not in self._derived_pipelines
   127  
   128      # Get the "true" user pipeline. This allows for the user to derive a
   129      # pipeline from another derived pipeline, use both as arguments, and this
   130      # method will still get the correct user pipeline.
   131      user = self.add_user_pipeline(maybe_user_pipeline)
   132  
   133      # Map the derived pipeline to the user pipeline.
   134      self._derived_pipelines[derived_pipeline] = user
   135      self._user_pipelines[user].append(derived_pipeline)
   136  
   137    def get_user_pipeline(self, p: beam.Pipeline) -> Optional[beam.Pipeline]:
   138      """Returns the user pipeline of the given pipeline.
   139  
   140      If the given pipeline has no user pipeline, i.e. not added to this tracker,
   141      then this returns None. If the given pipeline is a user pipeline then this
   142      returns the same pipeline. If the given pipeline is a derived pipeline then
   143      this returns the user pipeline.
   144      """
   145  
   146      # If `p` is a user pipeline then return it.
   147      if p in self._user_pipelines:
   148        return p
   149  
   150      # If `p` exists then return its user pipeline.
   151      if p in self._derived_pipelines:
   152        return self._derived_pipelines[p]
   153  
   154      # Otherwise, `p` is not in this tracker.
   155      return None