github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/interactive/caching/cacheable.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """Module for dataclasses to hold metadata of cacheable PCollections in the user
    19  code scope.
    20  
    21  For internal use only; no backwards-compatibility guarantees.
    22  """
    23  
    24  # pytype: skip-file
    25  
    26  from dataclasses import dataclass
    27  
    28  import apache_beam as beam
    29  
    30  
    31  @dataclass
    32  class Cacheable:
    33    var: str
    34    version: str
    35    producer_version: str
    36    pcoll: beam.pvalue.PCollection
    37  
    38    def __hash__(self):
    39      return hash((self.var, self.version, self.producer_version, self.pcoll))
    40  
    41    @staticmethod
    42    def from_pcoll(
    43        pcoll_name: str, pcoll: beam.pvalue.PCollection) -> 'Cacheable':
    44      return Cacheable(pcoll_name, str(id(pcoll)), str(id(pcoll.producer)), pcoll)
    45  
    46    def to_key(self):
    47      return CacheKey(
    48          self.var,
    49          self.version,
    50          self.producer_version,
    51          str(id(self.pcoll.pipeline)))
    52  
    53  
    54  @dataclass
    55  class CacheKey:
    56    """The identifier of a cacheable PCollection in cache.
    57  
    58    It contains 4 stringified components:
    59    var: The obfuscated variable name of the PCollection.
    60    version: The id() of the PCollection.
    61    producer_version: The id() of the producer of the PCollection.
    62    pipeline_id: The id() of the pipeline the PCollection belongs to.
    63    """
    64    var: str
    65    version: str
    66    producer_version: str
    67    pipeline_id: str
    68  
    69    def __post_init__(self):
    70      from apache_beam.runners.interactive.utils import obfuscate
    71      # Normalize arbitrary variable name to a fixed length hex str.
    72      self.var = obfuscate(self.var)[:10]
    73  
    74    def __hash__(self):
    75      return hash(
    76          (self.var, self.version, self.producer_version, self.pipeline_id))
    77  
    78    @staticmethod
    79    def from_str(r: str) -> 'CacheKey':
    80      r_split = r.split('-')
    81      ck = CacheKey(*r_split)
    82      # Avoid double obfuscation.
    83      ck.var = r_split[0]
    84      return ck
    85  
    86    @staticmethod
    87    def from_pcoll(pcoll_name: str, pcoll: beam.pvalue.PCollection) -> 'CacheKey':
    88      return CacheKey(
    89          pcoll_name,
    90          str(id(pcoll)),
    91          str(id(pcoll.producer)),
    92          str(id(pcoll.pipeline)))
    93  
    94    def to_str(self):
    95      return '-'.join(
    96          [self.var, self.version, self.producer_version, self.pipeline_id])
    97  
    98    def __repr__(self):
    99      return self.to_str()
   100  
   101    def __str__(self):
   102      return self.to_str()