# Source: github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/sdf_utils.py
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# pytype: skip-file

"""Common utility class to help SDK harness to execute an SDF. """

import logging
import threading
from typing import TYPE_CHECKING
from typing import Any
from typing import NamedTuple
from typing import Optional
from typing import Tuple
from typing import Union

from apache_beam.transforms.core import WatermarkEstimatorProvider
from apache_beam.utils.timestamp import Duration
from apache_beam.utils.timestamp import Timestamp
from apache_beam.utils.windowed_value import WindowedValue

if TYPE_CHECKING:
  # Needed only by type comments; importing them at runtime is avoided here
  # and done lazily inside the functions that need the real classes.
  from apache_beam.io.iobase import RestrictionProgress
  from apache_beam.io.iobase import RestrictionTracker
  from apache_beam.io.iobase import WatermarkEstimator

_LOGGER = logging.getLogger(__name__)


# Primary half of a split: a single field holding the windowed value.
class SplitResultPrimary(NamedTuple):
  primary_value: WindowedValue


# Residual half of a split: the windowed value together with a watermark
# and an optional deferral duration.
class SplitResultResidual(NamedTuple):
  residual_value: WindowedValue
  current_watermark: Timestamp
  deferred_timestamp: Optional[Duration]


class ThreadsafeRestrictionTracker(object):
  """A thread-safe wrapper which wraps a `RestrictionTracker`.

  Every call forwarded to the wrapped tracker is made while holding a
  re-entrant lock, which synchronizes reads and modifications of the
  restriction across threads.
  """
  def __init__(self, restriction_tracker):
    # type: (RestrictionTracker) -> None
    """Wraps ``restriction_tracker``.

    Raises:
      ValueError: if ``restriction_tracker`` is not a ``RestrictionTracker``.
    """
    # Imported lazily; at module level the name is only available under
    # TYPE_CHECKING.
    from apache_beam.io.iobase import RestrictionTracker
    if not isinstance(restriction_tracker, RestrictionTracker):
      raise ValueError(
          'Initialize ThreadsafeRestrictionTracker requires '
          'RestrictionTracker.')
    self._restriction_tracker = restriction_tracker
    # Records an absolute timestamp when defer_remainder is called.
    self._timestamp = None
    # Re-entrant because defer_remainder() calls try_split() while already
    # holding the lock.
    self._lock = threading.RLock()
    self._deferred_residual = None
    self._deferred_timestamp = None  # type: Optional[Union[Timestamp, Duration]]

  def current_restriction(self):
    """Returns the wrapped tracker's current restriction, under the lock."""
    with self._lock:
      return self._restriction_tracker.current_restriction()

  def try_claim(self, position):
    """Forwards ``try_claim(position)`` to the wrapped tracker, under the lock.
    """
    with self._lock:
      return self._restriction_tracker.try_claim(position)

  def defer_remainder(self, deferred_time=None):
    """Performs self-checkpoint on current processing restriction with an
    expected resuming time.

    Self-checkpoint could happen during processing elements. When executing a
    DoFn.process(), you may want to stop processing an element and resume
    later if the current element has been processed for quite a long time or
    you also want to have some outputs from other elements.
    ``defer_remainder()`` can be called on per element if needed.

    Args:
      deferred_time: A relative ``Duration`` that indicates the ideal time gap
        between now and resuming, or an absolute ``Timestamp`` for resuming
        execution time. If the deferred_time is None, the deferred work will
        be executed as soon as possible.

    Raises:
      ValueError: if ``deferred_time`` is truthy and is neither a ``Duration``
        nor a ``Timestamp``.
    """
    # Validate before touching any state so a rejected call leaves the
    # tracker exactly as it was.
    if deferred_time and not isinstance(deferred_time, (Duration, Timestamp)):
      raise ValueError(
          'The timestamp of defer_remainder() should be a '
          'Duration or a Timestamp, or None.')
    with self._lock:
      # Record current time for calculating deferred_time later.
      self._timestamp = Timestamp.now()
      self._deferred_timestamp = deferred_time
      # Splitting at fraction 0 asks the tracker to hand back everything that
      # has not been processed yet as the residual.
      checkpoint = self.try_split(0)
      if checkpoint:
        _, self._deferred_residual = checkpoint

  def check_done(self):
    """Forwards ``check_done()`` to the wrapped tracker, under the lock."""
    with self._lock:
      return self._restriction_tracker.check_done()

  def current_progress(self):
    # type: () -> RestrictionProgress
    """Returns the wrapped tracker's current progress, under the lock."""
    with self._lock:
      return self._restriction_tracker.current_progress()

  def try_split(self, fraction_of_remainder):
    """Forwards ``try_split(fraction_of_remainder)`` to the wrapped tracker,
    under the lock.
    """
    with self._lock:
      return self._restriction_tracker.try_split(fraction_of_remainder)

  def deferred_status(self):
    # type: () -> Optional[Tuple[Any, Duration]]

    """Returns deferred work which is produced by ``defer_remainder()``.

    When there is a self-checkpoint performed, the system needs to fulfill the
    DelayedBundleApplication with deferred_work for a ProcessBundleResponse.
    The system calls this API to get deferred_residual with watermark together
    to help the runner to schedule a future work.

    The delay is computed from the values recorded by ``defer_remainder()``
    without modifying them, so calling this method repeatedly does not
    subtract the elapsed time more than once.

    Returns: (deferred_residual, time_delay) if having any residual, else None.
    """
    with self._lock:
      if not self._deferred_residual:
        return None
      requested = self._deferred_timestamp
      if not requested:
        # No resume time was requested: resume as soon as possible.
        delay = Duration()
      elif isinstance(requested, Timestamp):
        # If an absolute timestamp is provided, calculate the delta between
        # the absolute time and the time deferred_status() is called.
        delay = requested - Timestamp.now()
      elif isinstance(requested, Duration):
        # If a Duration is provided, the deferred time should be:
        # provided duration - the spent time since the defer_remainder() is
        # called.
        delay = requested - (Timestamp.now() - self._timestamp)
      else:
        delay = requested
      return self._deferred_residual, delay

  def is_bounded(self):
    """Forwards ``is_bounded()`` to the wrapped tracker, under the lock."""
    with self._lock:
      return self._restriction_tracker.is_bounded()


class RestrictionTrackerView(object):
  """A DoFn view of thread-safe RestrictionTracker.

  The RestrictionTrackerView wraps a ThreadsafeRestrictionTracker and only
  exposes APIs that will be called by a ``DoFn.process()``. During execution
  time, the RestrictionTrackerView will be fed into the ``DoFn.process`` as a
  restriction_tracker.
  """
  def __init__(self, threadsafe_restriction_tracker):
    # type: (ThreadsafeRestrictionTracker) -> None
    """Wraps ``threadsafe_restriction_tracker``.

    Raises:
      ValueError: if the argument is not a ``ThreadsafeRestrictionTracker``.
    """
    if not isinstance(threadsafe_restriction_tracker,
                      ThreadsafeRestrictionTracker):
      raise ValueError(
          'Initialize RestrictionTrackerView requires '
          'ThreadsafeRestrictionTracker.')
    self._threadsafe_restriction_tracker = threadsafe_restriction_tracker

  def current_restriction(self):
    """Returns the current restriction of the wrapped tracker."""
    return self._threadsafe_restriction_tracker.current_restriction()

  def try_claim(self, position):
    """Forwards ``try_claim(position)`` to the wrapped tracker."""
    return self._threadsafe_restriction_tracker.try_claim(position)

  def defer_remainder(self, deferred_time=None):
    """Forwards ``defer_remainder(deferred_time)`` to the wrapped tracker."""
    self._threadsafe_restriction_tracker.defer_remainder(deferred_time)

  def is_bounded(self):
    """Returns the wrapped tracker's ``is_bounded()`` result."""
    # The result must be returned; without the `return` callers always
    # received None regardless of the wrapped tracker's answer.
    return self._threadsafe_restriction_tracker.is_bounded()


class ThreadsafeWatermarkEstimator(object):
  """A threadsafe wrapper which wraps a WatermarkEstimator with locking
  mechanism to guarantee multi-thread safety.
  """
  def __init__(self, watermark_estimator):
    # type: (WatermarkEstimator) -> None
    """Wraps ``watermark_estimator``.

    Raises:
      ValueError: if ``watermark_estimator`` is not a ``WatermarkEstimator``.
    """
    # Imported lazily; at module level the name is only available under
    # TYPE_CHECKING.
    from apache_beam.io.iobase import WatermarkEstimator
    if not isinstance(watermark_estimator, WatermarkEstimator):
      raise ValueError('Initializing Threadsafe requires a WatermarkEstimator')
    self._watermark_estimator = watermark_estimator
    self._lock = threading.Lock()

  def __getattr__(self, attr):
    """Exposes any other attribute of the wrapped estimator as a callable that
    invokes it while holding the lock.

    Raises:
      AttributeError: if the wrapped estimator has no such attribute.
    """
    # __getattr__ only runs when normal lookup fails. If one of this
    # wrapper's own attributes is missing (e.g. on an instance created
    # without __init__), reading self._watermark_estimator below would
    # re-enter __getattr__ without end, so fail fast instead.
    if attr in ('_watermark_estimator', '_lock'):
      raise AttributeError(attr)
    if hasattr(self._watermark_estimator, attr):

      def method_wrapper(*args, **kw):
        with self._lock:
          return getattr(self._watermark_estimator, attr)(*args, **kw)

      return method_wrapper
    raise AttributeError(attr)

  def get_estimator_state(self):
    """Returns the wrapped estimator's state, under the lock."""
    with self._lock:
      return self._watermark_estimator.get_estimator_state()

  def current_watermark(self):
    # type: () -> Timestamp
    """Returns the wrapped estimator's current watermark, under the lock."""
    with self._lock:
      return self._watermark_estimator.current_watermark()

  def observe_timestamp(self, timestamp):
    # type: (Timestamp) -> None
    """Forwards ``timestamp`` to the wrapped estimator, under the lock.

    Raises:
      ValueError: if ``timestamp`` is not a ``Timestamp``.
    """
    if not isinstance(timestamp, Timestamp):
      raise ValueError(
          'Input of observe_timestamp should be a Timestamp '
          'object')
    with self._lock:
      self._watermark_estimator.observe_timestamp(timestamp)


class NoOpWatermarkEstimatorProvider(WatermarkEstimatorProvider):
  """A WatermarkEstimatorProvider which creates NoOpWatermarkEstimator for the
  framework.
  """
  def initial_estimator_state(self, element, restriction):
    """Returns None; the no-op estimator keeps no state."""
    return None

  def create_watermark_estimator(self, estimator_state):
    """Returns a new estimator whose methods all do nothing and return None.

    ``estimator_state`` is ignored.
    """
    # Imported lazily; at module level the name is only available under
    # TYPE_CHECKING.
    from apache_beam.io.iobase import WatermarkEstimator

    class _NoOpWatermarkEstimator(WatermarkEstimator):
      """A No-op WatermarkEstimator which is provided for the framework if there
      is no custom one.
      """
      def observe_timestamp(self, timestamp):
        pass

      def current_watermark(self):
        return None

      def get_estimator_state(self):
        return None

    return _NoOpWatermarkEstimator()