# github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/interactive/caching/cacheable.py
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Module for dataclasses to hold metadata of cacheable PCollections in the user
code scope.

For internal use only; no backwards-compatibility guarantees.
"""

# pytype: skip-file

from dataclasses import dataclass

import apache_beam as beam


@dataclass
class Cacheable:
  """Metadata describing one cacheable PCollection.

  Fields:
    var: The variable name associated with the PCollection.
    version: The stringified id() of the PCollection (see from_pcoll).
    producer_version: The stringified id() of the PCollection's producer
      (see from_pcoll).
    pcoll: The PCollection itself.
  """
  var: str
  version: str
  producer_version: str
  pcoll: beam.pvalue.PCollection

  def __hash__(self):
    # Defined explicitly so instances stay hashable: a plain @dataclass with
    # the default eq=True would otherwise set __hash__ to None.
    identity = (self.var, self.version, self.producer_version, self.pcoll)
    return hash(identity)

  @staticmethod
  def from_pcoll(
      pcoll_name: str, pcoll: beam.pvalue.PCollection) -> 'Cacheable':
    """Builds a Cacheable for `pcoll`, known under the name `pcoll_name`.

    The version fields are derived from the id() of the PCollection and of
    its producer.
    """
    return Cacheable(
        var=pcoll_name,
        version=str(id(pcoll)),
        producer_version=str(id(pcoll.producer)),
        pcoll=pcoll)

  def to_key(self):
    """Returns the CacheKey that corresponds to this Cacheable.

    The pipeline component of the key is the stringified id() of the pipeline
    that owns the PCollection.
    """
    pipeline_id = str(id(self.pcoll.pipeline))
    return CacheKey(
        var=self.var,
        version=self.version,
        producer_version=self.producer_version,
        pipeline_id=pipeline_id)


@dataclass
class CacheKey:
  """The identifier of a cacheable PCollection in cache.

  A key is made of 4 stringified components:
    var: The obfuscated variable name of the PCollection.
    version: The id() of the PCollection.
    producer_version: The id() of the producer of the PCollection.
    pipeline_id: The id() of the pipeline the PCollection belongs to.

  The string form of a key (see to_str / from_str) is the 4 components joined
  with '-'.
  """
  var: str
  version: str
  producer_version: str
  pipeline_id: str

  def __post_init__(self):
    # Imported at call time rather than at module import time.
    from apache_beam.runners.interactive.utils import obfuscate

    # Normalize arbitrary variable name to a fixed length hex str by keeping
    # only the first 10 characters of the obfuscated name.
    obfuscated = obfuscate(self.var)
    self.var = obfuscated[:10]

  def __hash__(self):
    # Defined explicitly so keys stay hashable: a plain @dataclass with the
    # default eq=True would otherwise set __hash__ to None.
    components = (
        self.var, self.version, self.producer_version, self.pipeline_id)
    return hash(components)

  @staticmethod
  def from_str(r: str) -> 'CacheKey':
    """Parses a CacheKey from its '-'-joined string form.

    The first component of `r` is taken to be an already obfuscated variable
    name and is stored as-is.
    """
    parts = r.split('-')
    key = CacheKey(*parts)
    # __post_init__ has just obfuscated the first component a second time;
    # put the original value back to avoid double obfuscation.
    key.var = parts[0]
    return key

  @staticmethod
  def from_pcoll(pcoll_name: str, pcoll: beam.pvalue.PCollection) -> 'CacheKey':
    """Builds the CacheKey for `pcoll`, known under the name `pcoll_name`.

    The remaining components are the stringified id() of the PCollection, of
    its producer and of its pipeline.
    """
    return CacheKey(
        var=pcoll_name,
        version=str(id(pcoll)),
        producer_version=str(id(pcoll.producer)),
        pipeline_id=str(id(pcoll.pipeline)))

  def to_str(self):
    """Returns the 4 components of the key joined with '-'."""
    components = (
        self.var, self.version, self.producer_version, self.pipeline_id)
    return '-'.join(components)

  def __repr__(self):
    # The repr is the plain string form rather than a dataclass-style repr.
    return self.to_str()

  def __str__(self):
    return self.to_str()