#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Class that tracks derived/pipeline fragments from user pipelines.

For internal use only; no backwards-compatibility guarantees.
In the InteractiveRunner the design is to keep the user pipeline unchanged,
create a copy of the user pipeline, and modify the copy. When the derived
pipeline runs, there should only be per-user pipeline state. This makes sure
that derived pipelines can link back to the parent user pipeline.
"""

import shutil
from typing import Iterator
from typing import Optional

import apache_beam as beam  # type: ignore


class UserPipelineTracker:
  """Tracks user pipelines from derived pipelines.

  This data structure is similar to a disjoint set data structure. A derived
  pipeline can only have one parent user pipeline. A user pipeline can have many
  derived pipelines.
  """
  def __init__(self):
    # User pipeline -> list of the pipelines derived from it.
    self._user_pipelines: dict[beam.Pipeline, list[beam.Pipeline]] = {}
    # Derived pipeline -> its (single) parent user pipeline.
    self._derived_pipelines: dict[beam.Pipeline, beam.Pipeline] = {}
    # Pipeline id (see `_key`) -> pipeline object, for every pipeline that was
    # ever added, user or derived.
    self._pid_to_pipelines: dict[str, beam.Pipeline] = {}

  def __iter__(self) -> Iterator[beam.Pipeline]:
    """Iterates through all the user pipelines."""
    yield from self._user_pipelines

  def _key(self, pipeline: beam.Pipeline) -> str:
    """Returns the id used to memoize the pipeline: `str(id(pipeline))`."""
    return str(id(pipeline))

  def evict(self, pipeline: beam.Pipeline) -> None:
    """Evicts the pipeline.

    Removes the given pipeline and derived pipelines if a user pipeline.
    Otherwise, removes the given derived pipeline. Pipelines that are not
    tracked are ignored.

    The pid memo used by `get_pipeline` and `clear` is left untouched, so
    `clear` still removes the local temp directory of evicted pipelines.
    """
    if pipeline in self._user_pipelines:
      # Drop the user pipeline together with every pipeline derived from it.
      for derived in self._user_pipelines.pop(pipeline):
        self._derived_pipelines.pop(derived, None)
      return

    user_pipeline = self._derived_pipelines.pop(pipeline, None)
    if user_pipeline is None:
      # Not tracked at all; nothing to evict.
      return

    # Unlink the derived pipeline from its parent so that evicting the parent
    # later does not try to delete an entry that is already gone.
    siblings = self._user_pipelines.get(user_pipeline)
    if siblings is not None and pipeline in siblings:
      siblings.remove(pipeline)

  def clear(self) -> None:
    """Clears the tracker of all user and derived pipelines."""
    # Remove all local_tempdir of created pipelines. Errors are ignored so
    # that a missing or undeletable directory does not abort the cleanup.
    for p in self._pid_to_pipelines.values():
      shutil.rmtree(p.local_tempdir, ignore_errors=True)

    self._user_pipelines.clear()
    self._derived_pipelines.clear()
    self._pid_to_pipelines.clear()

  def get_pipeline(self, pid: str) -> Optional[beam.Pipeline]:
    """Returns the pipeline corresponding to the given pipeline id.

    Returns None if no pipeline with that id was added to this tracker.
    """
    return self._pid_to_pipelines.get(pid)

  def add_user_pipeline(self, p: beam.Pipeline) -> beam.Pipeline:
    """Adds a user pipeline with an empty set of derived pipelines.

    If `p` is already tracked (as a user or as a derived pipeline), nothing is
    added and the user pipeline it belongs to is returned instead.
    """
    self._memoize_pipieline(p)

    # Create a new node for the user pipeline if it doesn't exist already.
    user_pipeline = self.get_user_pipeline(p)
    if user_pipeline is None:
      user_pipeline = p
      self._user_pipelines[p] = []

    return user_pipeline

  def _memoize_pipieline(self, p: beam.Pipeline) -> None:
    """Memoizes the pid of the pipeline to the pipeline object."""
    # setdefault keeps the first object registered under a given pid.
    self._pid_to_pipelines.setdefault(self._key(p), p)

  def add_derived_pipeline(
      self, maybe_user_pipeline: beam.Pipeline,
      derived_pipeline: beam.Pipeline) -> None:
    """Adds a derived pipeline with the user pipeline.

    If the `maybe_user_pipeline` is a user pipeline, then the derived pipeline
    will be added to its set. Otherwise, the derived pipeline will be added to
    the user pipeline of the `maybe_user_pipeline`.

    By doing the above one can do:
    p = beam.Pipeline()

    derived1 = beam.Pipeline()
    derived2 = beam.Pipeline()

    ut = UserPipelineTracker()
    ut.add_derived_pipeline(p, derived1)
    ut.add_derived_pipeline(derived1, derived2)

    # Returns p.
    ut.get_user_pipeline(derived2)

    Raises:
      AssertionError: if `derived_pipeline` was already added as a derived
        pipeline.
    """
    self._memoize_pipieline(maybe_user_pipeline)
    self._memoize_pipieline(derived_pipeline)

    # Cannot add a derived pipeline twice. Raised explicitly (rather than via
    # an `assert` statement) so the check also holds when Python runs with -O.
    if derived_pipeline in self._derived_pipelines:
      raise AssertionError('Cannot add a derived pipeline twice.')

    # Get the "true" user pipeline. This allows for the user to derive a
    # pipeline from another derived pipeline, use both as arguments, and this
    # method will still get the correct user pipeline.
    user = self.add_user_pipeline(maybe_user_pipeline)

    # Map the derived pipeline to the user pipeline.
    self._derived_pipelines[derived_pipeline] = user
    self._user_pipelines[user].append(derived_pipeline)

  def get_user_pipeline(self, p: beam.Pipeline) -> Optional[beam.Pipeline]:
    """Returns the user pipeline of the given pipeline.

    If the given pipeline has no user pipeline, i.e. not added to this tracker,
    then this returns None. If the given pipeline is a user pipeline then this
    returns the same pipeline. If the given pipeline is a derived pipeline then
    this returns the user pipeline.
    """

    # If `p` is a user pipeline then return it.
    if p in self._user_pipelines:
      return p

    # If `p` is a derived pipeline then return its user pipeline; otherwise
    # `p` is not in this tracker and the lookup yields None.
    return self._derived_pipelines.get(p)