# Source: github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/direct/watermark_manager.py
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Manages watermarks of PCollections and AppliedPTransforms."""

# pytype: skip-file

import threading
from typing import TYPE_CHECKING
from typing import Dict
from typing import Iterable
from typing import List
from typing import Set
from typing import Tuple

from apache_beam import pipeline
from apache_beam import pvalue
from apache_beam.runners.direct.util import TimerFiring
from apache_beam.utils.timestamp import MAX_TIMESTAMP
from apache_beam.utils.timestamp import MIN_TIMESTAMP
from apache_beam.utils.timestamp import TIME_GRANULARITY

if TYPE_CHECKING:
  from apache_beam.pipeline import AppliedPTransform
  from apache_beam.runners.direct.bundle_factory import _Bundle
  from apache_beam.utils.timestamp import Timestamp


class WatermarkManager(object):
  """For internal use only; no backwards-compatibility guarantees.

  Tracks and updates watermarks for all AppliedPTransforms."""

  WATERMARK_POS_INF = MAX_TIMESTAMP
  WATERMARK_NEG_INF = MIN_TIMESTAMP

  def __init__(
      self, clock, root_transforms, value_to_consumers, transform_keyed_states):
    """Creates one _TransformWatermarks per transform and wires their inputs.

    Args:
      clock: object exposing ``time()``; handed to every _TransformWatermarks.
      root_transforms: iterable of AppliedPTransforms registered as roots.
      value_to_consumers: mapping from a PValue to the AppliedPTransforms that
        consume it.
      transform_keyed_states: mapping from an AppliedPTransform to its keyed
        states (indexed here by every root and every consumer).
    """
    self._clock = clock
    self._root_transforms = root_transforms
    self._value_to_consumers = value_to_consumers
    self._transform_keyed_states = transform_keyed_states
    # AppliedPTransform -> TransformWatermarks
    self._transform_to_watermarks = {
    }  # type: Dict[AppliedPTransform, _TransformWatermarks]

    for root_transform in root_transforms:
      self._transform_to_watermarks[root_transform] = _TransformWatermarks(
          self._clock, transform_keyed_states[root_transform], root_transform)

    for consumers in value_to_consumers.values():
      for consumer in consumers:
        self._transform_to_watermarks[consumer] = _TransformWatermarks(
            self._clock, transform_keyed_states[consumer], consumer)

    # Wiring happens in a separate pass: it looks up the producers' entries,
    # so every _TransformWatermarks must already exist.
    for consumers in value_to_consumers.values():
      for consumer in consumers:
        self._update_input_transform_watermarks(consumer)

  def _update_input_transform_watermarks(self, applied_ptransform):
    # type: (AppliedPTransform) -> None

    """Points a transform's watermarks at those of its input producers.

    Inputs without a producer must be PBegin (asserted) and contribute no
    upstream watermark.
    """
    assert isinstance(applied_ptransform, pipeline.AppliedPTransform)
    input_transform_watermarks = []
    for input_pvalue in applied_ptransform.inputs:
      assert input_pvalue.producer or isinstance(input_pvalue, pvalue.PBegin)
      if input_pvalue.producer:
        input_transform_watermarks.append(
            self.get_watermarks(input_pvalue.producer))
    self._transform_to_watermarks[
        applied_ptransform].update_input_transform_watermarks(
            input_transform_watermarks)

  def get_watermarks(self, applied_ptransform):
    # type: (AppliedPTransform) -> _TransformWatermarks

    """Gets the input and output watermarks for an AppliedPTransform.

    If the applied_ptransform has not processed any elements, return a
    watermark with minimum value.

    Args:
      applied_ptransform: AppliedPTransform to get the watermarks for.

    Returns:
      The _TransformWatermarks object holding the input watermark and output
      watermark for the provided transform. This is the tracked object itself,
      not a copy.
    """

    # TODO(altay): Composite transforms should have a composite watermark. Until
    # then they are represented by their last transform.
    while applied_ptransform.parts:
      applied_ptransform = applied_ptransform.parts[-1]

    return self._transform_to_watermarks[applied_ptransform]

  def update_watermarks(self,
                        completed_committed_bundle,  # type: _Bundle
                        applied_ptransform,  # type: AppliedPTransform
                        completed_timers,
                        outputs,
                        unprocessed_bundles,
                        keyed_earliest_holds,
                        side_inputs_container
                       ):
    """Records the result of a completed bundle and refreshes watermarks.

    Updates the pending bundles and fired timers, applies the new per-key
    holds to the transform, then refreshes its watermarks (and those of
    downstream consumers when they advance).

    Args:
      completed_committed_bundle: the input bundle that was processed; may be
        falsy for root transforms.
      applied_ptransform: the AppliedPTransform that processed the bundle.
      completed_timers: timer firings that were delivered in this bundle.
      outputs: committed output bundles produced by the transform.
      unprocessed_bundles: bundles to re-queue as pending for this transform.
      keyed_earliest_holds: mapping of key -> earliest hold; a value of None
        or WATERMARK_POS_INF clears the hold for that key.
      side_inputs_container: notified of refreshed watermarks; returns tasks
        that became unblocked.

    Returns:
      List of tasks unblocked by the watermark refresh.
    """
    assert isinstance(applied_ptransform, pipeline.AppliedPTransform)
    self._update_pending(
        completed_committed_bundle,
        applied_ptransform,
        completed_timers,
        outputs,
        unprocessed_bundles)
    tw = self.get_watermarks(applied_ptransform)
    tw.hold(keyed_earliest_holds)
    return self._refresh_watermarks(applied_ptransform, side_inputs_container)

  def _update_pending(self,
                      input_committed_bundle,
                      applied_ptransform,  # type: AppliedPTransform
                      completed_timers,
                      output_committed_bundles,  # type: Iterable[_Bundle]
                      unprocessed_bundles  # type: Iterable[_Bundle]
                     ):
    """Updates the list of pending bundles for the given AppliedPTransform."""

    # Update pending elements. Filter out empty bundles. They do not impact
    # watermarks and should not trigger downstream execution.
    for output in output_committed_bundles:
      if not output.has_elements():
        continue
      # PCollections without registered consumers have nothing to schedule.
      for consumer in self._value_to_consumers.get(output.pcollection, ()):
        self._transform_to_watermarks[consumer].add_pending(output)

    completed_tw = self._transform_to_watermarks[applied_ptransform]
    completed_tw.update_timers(completed_timers)

    for unprocessed_bundle in unprocessed_bundles:
      completed_tw.add_pending(unprocessed_bundle)

    # Only root transforms may complete without an input bundle.
    assert input_committed_bundle or applied_ptransform in self._root_transforms
    if input_committed_bundle and input_committed_bundle.has_elements():
      completed_tw.remove_pending(input_committed_bundle)

  def _refresh_watermarks(self, applied_ptransform, side_inputs_container):
    """Refreshes a transform's watermarks, recursing downstream on advance.

    Returns:
      List of tasks reported as unblocked by side_inputs_container, collected
      over this transform and every downstream transform that was refreshed.
    """
    assert isinstance(applied_ptransform, pipeline.AppliedPTransform)
    unblocked_tasks = []
    tw = self.get_watermarks(applied_ptransform)
    if tw.refresh():
      for pval in applied_ptransform.outputs.values():
        # A DoOutputsTuple is expanded by iterating over it; anything else is
        # treated as a single output value.
        pvals = pval if isinstance(pval, pvalue.DoOutputsTuple) else (pval, )
        for v in pvals:
          # Only values with downstream consumers need propagation.
          for consumer in self._value_to_consumers.get(v, ()):
            unblocked_tasks.extend(
                self._refresh_watermarks(consumer, side_inputs_container))
      # Notify the side_inputs_container.
      # NOTE(review): nesting was reconstructed from a flattened listing; this
      # notification is placed inside the "watermark advanced" branch --
      # confirm against the upstream file.
      unblocked_tasks.extend(
          side_inputs_container.
          update_watermarks_for_transform_and_unblock_tasks(
              applied_ptransform, tw))
    return unblocked_tasks

  def extract_all_timers(self):
    # type: () -> Tuple[List[Tuple[AppliedPTransform, List[TimerFiring]]], bool]

    """Extracts fired timers for all transforms
    and reports if there are any timers set."""
    all_timers = []  # type: List[Tuple[AppliedPTransform, List[TimerFiring]]]
    has_realtime_timer = False
    for applied_ptransform, tw in self._transform_to_watermarks.items():
      fired_timers, had_realtime_timer = tw.extract_transform_timers()
      if fired_timers:
        # We should sort the timer firings, so they are fired in order.
        fired_timers.sort(key=lambda ft: ft.timestamp)
        all_timers.append((applied_ptransform, fired_timers))
      # A realtime timer is only reported while the transform's output
      # watermark is still below positive infinity.
      if (had_realtime_timer and
          tw.output_watermark < WatermarkManager.WATERMARK_POS_INF):
        has_realtime_timer = True
    return all_timers, has_realtime_timer


class _TransformWatermarks(object):
  """Tracks input and output watermarks for an AppliedPTransform."""
  def __init__(self, clock, keyed_states, transform):
    """Starts both watermarks at WATERMARK_NEG_INF with nothing pending.

    Args:
      clock: object exposing ``time()``, used as the processing time.
      keyed_states: mapping of encoded key -> state object exposing
        ``get_timers(watermark=..., processing_time=...)``.
      transform: the tracked transform; only its ``str()`` is kept, as a label.
    """
    self._clock = clock
    self._keyed_states = keyed_states
    self._input_transform_watermarks = []  # type: List[_TransformWatermarks]
    self._input_watermark = WatermarkManager.WATERMARK_NEG_INF
    self._output_watermark = WatermarkManager.WATERMARK_NEG_INF
    self._keyed_earliest_holds = {}
    # Scheduled bundles targeted for this transform.
    self._pending = set()  # type: Set[_Bundle]
    self._fired_timers = set()
    self._lock = threading.Lock()

    self._label = str(transform)

  def update_input_transform_watermarks(self, input_transform_watermarks):
    # type: (List[_TransformWatermarks]) -> None

    """Replaces the list of upstream watermarks this transform depends on."""
    with self._lock:
      self._input_transform_watermarks = input_transform_watermarks

  def update_timers(self, completed_timers):
    """Forgets timer firings that have been delivered.

    Raises:
      KeyError: if a firing is not currently recorded as fired.
    """
    with self._lock:
      for timer_firing in completed_timers:
        self._fired_timers.remove(timer_firing)

  @property
  def input_watermark(self):
    # type: () -> Timestamp
    with self._lock:
      return self._input_watermark

  @property
  def output_watermark(self):
    # type: () -> Timestamp
    with self._lock:
      return self._output_watermark

  def hold(self, keyed_earliest_holds):
    """Sets or clears the per-key watermark holds.

    Args:
      keyed_earliest_holds: mapping of key -> hold. A hold of None or
        WATERMARK_POS_INF removes any hold stored for that key; any other
        value replaces it.
    """
    with self._lock:
      for key, hold_value in keyed_earliest_holds.items():
        if (hold_value is None or
            hold_value == WatermarkManager.WATERMARK_POS_INF):
          # No effective hold for this key; drop a previous one if present.
          self._keyed_earliest_holds.pop(key, None)
        else:
          self._keyed_earliest_holds[key] = hold_value

  def add_pending(self, pending):
    # type: (_Bundle) -> None

    """Registers a bundle that is scheduled for this transform."""
    with self._lock:
      self._pending.add(pending)

  def remove_pending(self, completed):
    # type: (_Bundle) -> None

    """Unregisters a processed bundle; unknown bundles are ignored."""
    with self._lock:
      # Ignore repeated removes. This will happen if a transform has a repeated
      # input.
      self._pending.discard(completed)

  def refresh(self):
    # type: () -> bool

    """Refresh the watermark for a given transform.

    This method looks at the watermark coming from all input PTransforms, and
    the timestamp of the minimum element, as well as any watermark holds.

    Returns:
      True if the watermark has advanced, and False if it has not.
    """
    with self._lock:
      min_pending_timestamp = WatermarkManager.WATERMARK_POS_INF
      has_pending_elements = False
      for input_bundle in self._pending:
        # TODO(ccy): we can have the Bundle class keep track of the minimum
        # timestamp so we don't have to do an iteration here.
        for wv in input_bundle.get_elements_iterable():
          has_pending_elements = True
          if wv.timestamp < min_pending_timestamp:
            min_pending_timestamp = wv.timestamp

      # If there is a pending element with a certain timestamp, we can at most
      # advance our watermark to the maximum timestamp less than that
      # timestamp.
      pending_holder = WatermarkManager.WATERMARK_POS_INF
      if has_pending_elements:
        pending_holder = min_pending_timestamp - TIME_GRANULARITY

      # Seeding with +inf keeps min() defined when there are no inputs.
      input_watermarks = [
          tw.output_watermark for tw in self._input_transform_watermarks
      ]
      input_watermarks.append(WatermarkManager.WATERMARK_POS_INF)
      producer_watermark = min(input_watermarks)

      # The input watermark never moves backwards.
      self._input_watermark = max(
          self._input_watermark, min(pending_holder, producer_watermark))

      # Same +inf seeding for the holds: no holds means no restriction.
      earliest_hold = min(
          [WatermarkManager.WATERMARK_POS_INF,
           *self._keyed_earliest_holds.values()])
      new_output_watermark = min(self._input_watermark, earliest_hold)

      # NOTE(review): unlike the input watermark, the output watermark is
      # assigned without a max() against its previous value, so a hold earlier
      # than the current output watermark moves it backwards -- confirm this
      # is intended.
      advanced = new_output_watermark > self._output_watermark
      self._output_watermark = new_output_watermark
      return advanced

  @property
  def synchronized_processing_output_time(self):
    return self._clock.time()

  def extract_transform_timers(self):
    # type: () -> Tuple[List[TimerFiring], bool]

    """Extracts fired timers and reports of any timers set per transform."""
    with self._lock:
      fired_timers = []
      has_realtime_timer = False
      for encoded_key, state in self._keyed_states.items():
        timers, had_realtime_timer = state.get_timers(
            watermark=self._input_watermark,
            processing_time=self._clock.time())
        if had_realtime_timer:
          has_realtime_timer = True
        for expired in timers:
          window, (name, time_domain, timestamp, dynamic_timer_tag) = expired
          fired_timers.append(
              TimerFiring(
                  encoded_key,
                  window,
                  name,
                  time_domain,
                  timestamp,
                  dynamic_timer_tag=dynamic_timer_tag))
      # Remember what was handed out so update_timers() can clear it later.
      self._fired_timers.update(fired_timers)
      return fired_timers, has_realtime_timer