#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import *

import apache_beam as beam
from apache_beam.dataframe import convert
from apache_beam.dataframe import expressions


class ExpressionCache(object):
  """Utility class for caching deferred DataFrames expressions.

  This cache is currently a light-weight wrapper around the
  TO_PCOLLECTION_CACHE in the beam.dataframes.convert module and the
  computed_pcollections in the interactive module.

  Example::

    df : beam.dataframe.DeferredDataFrame = ...
    ...
    cache = ExpressionCache()
    cache.replace_with_cached(df._expr)

  This will automatically link the instance to the existing caches. After it is
  created, the cache can then be used to modify an existing deferred dataframe
  expression tree to replace nodes with computed PCollections.

  This object can be created and destroyed whenever. This class holds no state
  of its own (only references to the linked caches) and the only side-effect is
  modifying the given expression.
  """
  def __init__(self, pcollection_cache=None, computed_cache=None):
    """Links this instance to the caches it reads from.

    Args:
      pcollection_cache: mapping looked up by expression ``_id`` that yields
        the PCollection associated with that expression. Defaults to
        ``convert.TO_PCOLLECTION_CACHE`` when None.
      computed_cache: container queried with ``pc in computed_cache`` to decide
        whether a PCollection has been computed. Defaults to the current
        interactive environment's ``computed_pcollections`` when None.
    """
    # Imported at call time rather than at module import time.
    from apache_beam.runners.interactive import interactive_environment as ie

    self._pcollection_cache = (
        convert.TO_PCOLLECTION_CACHE
        if pcollection_cache is None else pcollection_cache)
    self._computed_cache = (
        ie.current_env().computed_pcollections
        if computed_cache is None else computed_cache)

  def replace_with_cached(
      self, expr: expressions.Expression) -> Dict[str, expressions.Expression]:
    """Replaces any previously computed expressions with PlaceholderExpressions.

    This is used to correctly read any expressions that were cached in previous
    runs. This enables the InteractiveRunner to prune off old calculations from
    the expression tree.

    Args:
      expr: root of the expression tree to rewrite. The tree is modified in
        place; the root itself is never replaced, only its (transitive) args.

    Returns:
      A dict mapping the ``_id`` of every replaced input expression to the
      ``PlaceholderExpression`` that now stands in for it.
    """

    replaced_inputs: Dict[str, expressions.Expression] = {}
    self._replace_with_cached_recur(expr, replaced_inputs)
    return replaced_inputs

  def _replace_with_cached_recur(
      self,
      expr: expressions.Expression,
      replaced_inputs: Dict[str, expressions.Expression]) -> None:
    """Recursive call for `replace_with_cached`.

    Recurses through the expression tree and replaces any cached inputs with
    `PlaceholderExpression`s.

    Args:
      expr: expression whose args are inspected and rewritten in place.
      replaced_inputs: accumulator mapping replaced expression ``_id`` to its
        placeholder; updated in place and shared across the whole recursion.
    """

    final_inputs = []

    for arg in expr.args():
      pc = self._get_cached(arg)

      # Only read from cache when the PCollection has been fully computed.
      # This is so that no partial results are used.
      if self._is_computed(pc):

        # Reuse previously seen cached expressions. This is so that the same
        # value isn't cached multiple times.
        if arg._id not in replaced_inputs:
          # `pc` is the entry already fetched from the PCollection cache for
          # this id, so there is no need to look it up a second time.
          replaced_inputs[arg._id] = expressions.PlaceholderExpression(
              arg.proxy(), pc)
        final_inputs.append(replaced_inputs[arg._id])
      else:
        # Not cached (or not fully computed): keep the node and keep looking
        # for cached expressions further down its own inputs.
        final_inputs.append(arg)
        self._replace_with_cached_recur(arg, replaced_inputs)
    expr._args = tuple(final_inputs)

  def _get_cached(self,
                  expr: expressions.Expression) -> Optional[beam.PCollection]:
    """Returns the PCollection associated with the expression.

    Returns None when the expression's ``_id`` has no entry in the PCollection
    cache.
    """
    return self._pcollection_cache.get(expr._id, None)

  def _is_computed(self, pc: beam.PCollection) -> bool:
    """Returns True if the PCollection has been run and computed.

    A None ``pc`` (no cache entry) is never considered computed.
    """
    return pc is not None and pc in self._computed_cache