github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/interactive/caching/expression_cache.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  
    17  from typing import *
    18  
    19  import apache_beam as beam
    20  from apache_beam.dataframe import convert
    21  from apache_beam.dataframe import expressions
    22  
    23  
    24  class ExpressionCache(object):
    25    """Utility class for caching deferred DataFrames expressions.
    26  
    27    This is cache is currently a light-weight wrapper around the
    28    TO_PCOLLECTION_CACHE in the beam.dataframes.convert module and the
    29    computed_pcollections in the interactive module.
    30  
    31    Example::
    32  
    33      df : beam.dataframe.DeferredDataFrame = ...
    34      ...
    35      cache = ExpressionCache()
    36      cache.replace_with_cached(df._expr)
    37  
    38    This will automatically link the instance to the existing caches. After it is
    39    created, the cache can then be used to modify an existing deferred dataframe
    40    expression tree to replace nodes with computed PCollections.
    41  
    42    This object can be created and destroyed whenever. This class holds no state
    43    and the only side-effect is modifying the given expression.
    44    """
    45    def __init__(self, pcollection_cache=None, computed_cache=None):
    46      from apache_beam.runners.interactive import interactive_environment as ie
    47  
    48      self._pcollection_cache = (
    49          convert.TO_PCOLLECTION_CACHE
    50          if pcollection_cache is None else pcollection_cache)
    51      self._computed_cache = (
    52          ie.current_env().computed_pcollections
    53          if computed_cache is None else computed_cache)
    54  
    55    def replace_with_cached(
    56        self, expr: expressions.Expression) -> Dict[str, expressions.Expression]:
    57      """Replaces any previously computed expressions with PlaceholderExpressions.
    58  
    59      This is used to correctly read any expressions that were cached in previous
    60      runs. This enables the InteractiveRunner to prune off old calculations from
    61      the expression tree.
    62      """
    63  
    64      replaced_inputs: Dict[str, expressions.Expression] = {}
    65      self._replace_with_cached_recur(expr, replaced_inputs)
    66      return replaced_inputs
    67  
    68    def _replace_with_cached_recur(
    69        self,
    70        expr: expressions.Expression,
    71        replaced_inputs: Dict[str, expressions.Expression]) -> None:
    72      """Recursive call for `replace_with_cached`.
    73  
    74      Recurses through the expression tree and replaces any cached inputs with
    75      `PlaceholderExpression`s.
    76      """
    77  
    78      final_inputs = []
    79  
    80      for input in expr.args():
    81        pc = self._get_cached(input)
    82  
    83        # Only read from cache when there is the PCollection has been fully
    84        # computed. This is so that no partial results are used.
    85        if self._is_computed(pc):
    86  
    87          # Reuse previously seen cached expressions. This is so that the same
    88          # value isn't cached multiple times.
    89          if input._id in replaced_inputs:
    90            cached = replaced_inputs[input._id]
    91          else:
    92            cached = expressions.PlaceholderExpression(
    93                input.proxy(), self._pcollection_cache[input._id])
    94  
    95            replaced_inputs[input._id] = cached
    96          final_inputs.append(cached)
    97        else:
    98          final_inputs.append(input)
    99          self._replace_with_cached_recur(input, replaced_inputs)
   100      expr._args = tuple(final_inputs)
   101  
   102    def _get_cached(self,
   103                    expr: expressions.Expression) -> Optional[beam.PCollection]:
   104      """Returns the PCollection associated with the expression."""
   105      return self._pcollection_cache.get(expr._id, None)
   106  
   107    def _is_computed(self, pc: beam.PCollection) -> bool:
   108      """Returns True if the PCollection has been run and computed."""
   109      return pc is not None and pc in self._computed_cache