github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/direct/watermark_manager.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """Manages watermarks of PCollections and AppliedPTransforms."""
    19  
    20  # pytype: skip-file
    21  
    22  import threading
    23  from typing import TYPE_CHECKING
    24  from typing import Dict
    25  from typing import Iterable
    26  from typing import List
    27  from typing import Set
    28  from typing import Tuple
    29  
    30  from apache_beam import pipeline
    31  from apache_beam import pvalue
    32  from apache_beam.runners.direct.util import TimerFiring
    33  from apache_beam.utils.timestamp import MAX_TIMESTAMP
    34  from apache_beam.utils.timestamp import MIN_TIMESTAMP
    35  from apache_beam.utils.timestamp import TIME_GRANULARITY
    36  
    37  if TYPE_CHECKING:
    38    from apache_beam.pipeline import AppliedPTransform
    39    from apache_beam.runners.direct.bundle_factory import _Bundle
    40    from apache_beam.utils.timestamp import Timestamp
    41  
    42  
    43  class WatermarkManager(object):
    44    """For internal use only; no backwards-compatibility guarantees.
    45  
    46    Tracks and updates watermarks for all AppliedPTransforms."""
    47  
    48    WATERMARK_POS_INF = MAX_TIMESTAMP
    49    WATERMARK_NEG_INF = MIN_TIMESTAMP
    50  
    51    def __init__(
    52        self, clock, root_transforms, value_to_consumers, transform_keyed_states):
    53      self._clock = clock
    54      self._root_transforms = root_transforms
    55      self._value_to_consumers = value_to_consumers
    56      self._transform_keyed_states = transform_keyed_states
    57      # AppliedPTransform -> TransformWatermarks
    58      self._transform_to_watermarks = {
    59      }  # type: Dict[AppliedPTransform, _TransformWatermarks]
    60  
    61      for root_transform in root_transforms:
    62        self._transform_to_watermarks[root_transform] = _TransformWatermarks(
    63            self._clock, transform_keyed_states[root_transform], root_transform)
    64  
    65      for consumers in value_to_consumers.values():
    66        for consumer in consumers:
    67          self._transform_to_watermarks[consumer] = _TransformWatermarks(
    68              self._clock, transform_keyed_states[consumer], consumer)
    69  
    70      for consumers in value_to_consumers.values():
    71        for consumer in consumers:
    72          self._update_input_transform_watermarks(consumer)
    73  
    74    def _update_input_transform_watermarks(self, applied_ptransform):
    75      # type: (AppliedPTransform) -> None
    76      assert isinstance(applied_ptransform, pipeline.AppliedPTransform)
    77      input_transform_watermarks = []
    78      for input_pvalue in applied_ptransform.inputs:
    79        assert input_pvalue.producer or isinstance(input_pvalue, pvalue.PBegin)
    80        if input_pvalue.producer:
    81          input_transform_watermarks.append(
    82              self.get_watermarks(input_pvalue.producer))
    83      self._transform_to_watermarks[
    84          applied_ptransform].update_input_transform_watermarks(
    85              input_transform_watermarks)
    86  
    87    def get_watermarks(self, applied_ptransform):
    88      # type: (AppliedPTransform) -> _TransformWatermarks
    89  
    90      """Gets the input and output watermarks for an AppliedPTransform.
    91  
    92      If the applied_ptransform has not processed any elements, return a
    93      watermark with minimum value.
    94  
    95      Args:
    96        applied_ptransform: AppliedPTransform to get the watermarks for.
    97  
    98      Returns:
    99        A snapshot (TransformWatermarks) of the input watermark and output
   100        watermark for the provided transform.
   101      """
   102  
   103      # TODO(altay): Composite transforms should have a composite watermark. Until
   104      # then they are represented by their last transform.
   105      while applied_ptransform.parts:
   106        applied_ptransform = applied_ptransform.parts[-1]
   107  
   108      return self._transform_to_watermarks[applied_ptransform]
   109  
   110    def update_watermarks(self,
   111                          completed_committed_bundle,  # type: _Bundle
   112                          applied_ptransform,  # type: AppliedPTransform
   113                          completed_timers,
   114                          outputs,
   115                          unprocessed_bundles,
   116                          keyed_earliest_holds,
   117                          side_inputs_container
   118                         ):
   119      assert isinstance(applied_ptransform, pipeline.AppliedPTransform)
   120      self._update_pending(
   121          completed_committed_bundle,
   122          applied_ptransform,
   123          completed_timers,
   124          outputs,
   125          unprocessed_bundles)
   126      tw = self.get_watermarks(applied_ptransform)
   127      tw.hold(keyed_earliest_holds)
   128      return self._refresh_watermarks(applied_ptransform, side_inputs_container)
   129  
   130    def _update_pending(self,
   131                        input_committed_bundle,
   132                        applied_ptransform,  # type: AppliedPTransform
   133                        completed_timers,
   134                        output_committed_bundles,  # type: Iterable[_Bundle]
   135                        unprocessed_bundles  # type: Iterable[_Bundle]
   136                       ):
   137      """Updated list of pending bundles for the given AppliedPTransform."""
   138  
   139      # Update pending elements. Filter out empty bundles. They do not impact
   140      # watermarks and should not trigger downstream execution.
   141      for output in output_committed_bundles:
   142        if output.has_elements():
   143          if output.pcollection in self._value_to_consumers:
   144            consumers = self._value_to_consumers[output.pcollection]
   145            for consumer in consumers:
   146              consumer_tw = self._transform_to_watermarks[consumer]
   147              consumer_tw.add_pending(output)
   148  
   149      completed_tw = self._transform_to_watermarks[applied_ptransform]
   150      completed_tw.update_timers(completed_timers)
   151  
   152      for unprocessed_bundle in unprocessed_bundles:
   153        completed_tw.add_pending(unprocessed_bundle)
   154  
   155      assert input_committed_bundle or applied_ptransform in self._root_transforms
   156      if input_committed_bundle and input_committed_bundle.has_elements():
   157        completed_tw.remove_pending(input_committed_bundle)
   158  
   159    def _refresh_watermarks(self, applied_ptransform, side_inputs_container):
   160      assert isinstance(applied_ptransform, pipeline.AppliedPTransform)
   161      unblocked_tasks = []
   162      tw = self.get_watermarks(applied_ptransform)
   163      if tw.refresh():
   164        for pval in applied_ptransform.outputs.values():
   165          if isinstance(pval, pvalue.DoOutputsTuple):
   166            pvals = (v for v in pval)
   167          else:
   168            pvals = (pval, )
   169          for v in pvals:
   170            if v in self._value_to_consumers:  # If there are downstream consumers
   171              consumers = self._value_to_consumers[v]
   172              for consumer in consumers:
   173                unblocked_tasks.extend(
   174                    self._refresh_watermarks(consumer, side_inputs_container))
   175        # Notify the side_inputs_container.
   176        unblocked_tasks.extend(
   177            side_inputs_container.
   178            update_watermarks_for_transform_and_unblock_tasks(
   179                applied_ptransform, tw))
   180      return unblocked_tasks
   181  
   182    def extract_all_timers(self):
   183      # type: () -> Tuple[List[Tuple[AppliedPTransform, List[TimerFiring]]], bool]
   184  
   185      """Extracts fired timers for all transforms
   186      and reports if there are any timers set."""
   187      all_timers = []  # type: List[Tuple[AppliedPTransform, List[TimerFiring]]]
   188      has_realtime_timer = False
   189      for applied_ptransform, tw in self._transform_to_watermarks.items():
   190        fired_timers, had_realtime_timer = tw.extract_transform_timers()
   191        if fired_timers:
   192          # We should sort the timer firings, so they are fired in order.
   193          fired_timers.sort(key=lambda ft: ft.timestamp)
   194          all_timers.append((applied_ptransform, fired_timers))
   195        if (had_realtime_timer and
   196            tw.output_watermark < WatermarkManager.WATERMARK_POS_INF):
   197          has_realtime_timer = True
   198      return all_timers, has_realtime_timer
   199  
   200  
   201  class _TransformWatermarks(object):
   202    """Tracks input and output watermarks for an AppliedPTransform."""
   203    def __init__(self, clock, keyed_states, transform):
   204      self._clock = clock
   205      self._keyed_states = keyed_states
   206      self._input_transform_watermarks = []  # type: List[_TransformWatermarks]
   207      self._input_watermark = WatermarkManager.WATERMARK_NEG_INF
   208      self._output_watermark = WatermarkManager.WATERMARK_NEG_INF
   209      self._keyed_earliest_holds = {}
   210      # Scheduled bundles targeted for this transform.
   211      self._pending = set()  # type: Set[_Bundle]
   212      self._fired_timers = set()
   213      self._lock = threading.Lock()
   214  
   215      self._label = str(transform)
   216  
   217    def update_input_transform_watermarks(self, input_transform_watermarks):
   218      # type: (List[_TransformWatermarks]) -> None
   219      with self._lock:
   220        self._input_transform_watermarks = input_transform_watermarks
   221  
   222    def update_timers(self, completed_timers):
   223      with self._lock:
   224        for timer_firing in completed_timers:
   225          self._fired_timers.remove(timer_firing)
   226  
   227    @property
   228    def input_watermark(self):
   229      # type: () -> Timestamp
   230      with self._lock:
   231        return self._input_watermark
   232  
   233    @property
   234    def output_watermark(self):
   235      # type: () -> Timestamp
   236      with self._lock:
   237        return self._output_watermark
   238  
   239    def hold(self, keyed_earliest_holds):
   240      with self._lock:
   241        for key, hold_value in keyed_earliest_holds.items():
   242          self._keyed_earliest_holds[key] = hold_value
   243          if (hold_value is None or
   244              hold_value == WatermarkManager.WATERMARK_POS_INF):
   245            del self._keyed_earliest_holds[key]
   246  
   247    def add_pending(self, pending):
   248      # type: (_Bundle) -> None
   249      with self._lock:
   250        self._pending.add(pending)
   251  
   252    def remove_pending(self, completed):
   253      # type: (_Bundle) -> None
   254      with self._lock:
   255        # Ignore repeated removes. This will happen if a transform has a repeated
   256        # input.
   257        if completed in self._pending:
   258          self._pending.remove(completed)
   259  
   260    def refresh(self):
   261      # type: () -> bool
   262  
   263      """Refresh the watermark for a given transform.
   264  
   265      This method looks at the watermark coming from all input PTransforms, and
   266      the timestamp of the minimum element, as well as any watermark holds.
   267  
   268      Returns:
   269        True if the watermark has advanced, and False if it has not.
   270      """
   271      with self._lock:
   272        min_pending_timestamp = WatermarkManager.WATERMARK_POS_INF
   273        has_pending_elements = False
   274        for input_bundle in self._pending:
   275          # TODO(ccy): we can have the Bundle class keep track of the minimum
   276          # timestamp so we don't have to do an iteration here.
   277          for wv in input_bundle.get_elements_iterable():
   278            has_pending_elements = True
   279            if wv.timestamp < min_pending_timestamp:
   280              min_pending_timestamp = wv.timestamp
   281  
   282        # If there is a pending element with a certain timestamp, we can at most
   283        # advance our watermark to the maximum timestamp less than that
   284        # timestamp.
   285        pending_holder = WatermarkManager.WATERMARK_POS_INF
   286        if has_pending_elements:
   287          pending_holder = min_pending_timestamp - TIME_GRANULARITY
   288  
   289        input_watermarks = [
   290            tw.output_watermark for tw in self._input_transform_watermarks
   291        ]
   292        input_watermarks.append(WatermarkManager.WATERMARK_POS_INF)
   293        producer_watermark = min(input_watermarks)
   294  
   295        self._input_watermark = max(
   296            self._input_watermark, min(pending_holder, producer_watermark))
   297        earliest_hold = WatermarkManager.WATERMARK_POS_INF
   298        for hold in self._keyed_earliest_holds.values():
   299          if hold < earliest_hold:
   300            earliest_hold = hold
   301        new_output_watermark = min(self._input_watermark, earliest_hold)
   302  
   303        advanced = new_output_watermark > self._output_watermark
   304        self._output_watermark = new_output_watermark
   305        return advanced
   306  
   307    @property
   308    def synchronized_processing_output_time(self):
   309      return self._clock.time()
   310  
   311    def extract_transform_timers(self):
   312      # type: () -> Tuple[List[TimerFiring], bool]
   313  
   314      """Extracts fired timers and reports of any timers set per transform."""
   315      with self._lock:
   316        fired_timers = []
   317        has_realtime_timer = False
   318        for encoded_key, state in self._keyed_states.items():
   319          timers, had_realtime_timer = state.get_timers(
   320              watermark=self._input_watermark,
   321              processing_time=self._clock.time())
   322          if had_realtime_timer:
   323            has_realtime_timer = True
   324          for expired in timers:
   325            window, (name, time_domain, timestamp, dynamic_timer_tag) = expired
   326            fired_timers.append(
   327                TimerFiring(
   328                    encoded_key,
   329                    window,
   330                    name,
   331                    time_domain,
   332                    timestamp,
   333                    dynamic_timer_tag=dynamic_timer_tag))
   334        self._fired_timers.update(fired_timers)
   335        return fired_timers, has_realtime_timer