github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/watermark_estimators.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """A collection of WatermarkEstimator implementations that SplittableDoFns
    19  can use."""
    20  
    21  # pytype: skip-file
    22  
    23  from apache_beam.io.iobase import WatermarkEstimator
    24  from apache_beam.transforms.core import WatermarkEstimatorProvider
    25  from apache_beam.utils.timestamp import Timestamp
    26  
    27  
    28  class MonotonicWatermarkEstimator(WatermarkEstimator):
    29    """A WatermarkEstimator which assumes that timestamps of all ouput records
    30    are increasing monotonically.
    31    """
    32    def __init__(self, timestamp):
    33      """For a new <element, restriction> pair, the initial value is None. When
    34      resuming processing, the initial timestamp will be the last reported
    35      watermark.
    36      """
    37      self._watermark = timestamp
    38      self._last_observed_timestamp = timestamp
    39  
    40    def observe_timestamp(self, timestamp):
    41      self._last_observed_timestamp = timestamp
    42  
    43    def current_watermark(self):
    44      if self._last_observed_timestamp is not None \
    45          and self._last_observed_timestamp >= self._watermark:
    46        self._watermark = self._last_observed_timestamp
    47      return self._watermark
    48  
    49    def get_estimator_state(self):
    50      return self._watermark
    51  
    52    @staticmethod
    53    def default_provider():
    54      """Provide a default WatermarkEstimatorProvider for
    55      MonotonicWatermarkEstimator.
    56      """
    57      class DefaultMonotonicWatermarkEstimator(WatermarkEstimatorProvider):
    58        def initial_estimator_state(self, element, restriction):
    59          return None
    60  
    61        def create_watermark_estimator(self, estimator_state):
    62          return MonotonicWatermarkEstimator(estimator_state)
    63  
    64      return DefaultMonotonicWatermarkEstimator()
    65  
    66  
    67  class WalltimeWatermarkEstimator(WatermarkEstimator):
    68    """A WatermarkEstimator which uses processing time as the estimated watermark.
    69    """
    70    def __init__(self, timestamp=None):
    71      self._timestamp = timestamp or Timestamp.now()
    72  
    73    def observe_timestamp(self, timestamp):
    74      pass
    75  
    76    def current_watermark(self):
    77      self._timestamp = max(self._timestamp, Timestamp.now())
    78      return self._timestamp
    79  
    80    def get_estimator_state(self):
    81      return self._timestamp
    82  
    83    @staticmethod
    84    def default_provider():
    85      """Provide a default WatermarkEstimatorProvider for
    86      WalltimeWatermarkEstimator.
    87      """
    88      class DefaultWalltimeWatermarkEstimator(WatermarkEstimatorProvider):
    89        def initial_estimator_state(self, element, restriction):
    90          return None
    91  
    92        def create_watermark_estimator(self, estimator_state):
    93          return WalltimeWatermarkEstimator(estimator_state)
    94  
    95      return DefaultWalltimeWatermarkEstimator()
    96  
    97  
    98  class ManualWatermarkEstimator(WatermarkEstimator):
    99    """A WatermarkEstimator which is controlled manually from within a DoFn.
   100  
   101    The DoFn must invoke set_watermark to advance the watermark.
   102    """
   103    def __init__(self, watermark):
   104      self._watermark = watermark
   105  
   106    def observe_timestamp(self, timestamp):
   107      pass
   108  
   109    def current_watermark(self):
   110      return self._watermark
   111  
   112    def get_estimator_state(self):
   113      return self._watermark
   114  
   115    def set_watermark(self, timestamp):
   116      # pylint: disable=line-too-long
   117  
   118      """Sets a timestamp before or at the timestamps of all future elements
   119      produced by the associated DoFn.
   120  
   121      This can be approximate. If records are output that violate this guarantee,
   122      they will be considered late, which will affect how they will be processed.
   123      See https://beam.apache.org/documentation/programming-guide/#watermarks-and-late-data
   124      for more information on late data and how to handle it.
   125  
   126      However, this value should be as late as possible. Downstream windows may
   127      not be able to close until this watermark passes their end.
   128      """
   129      if not isinstance(timestamp, Timestamp):
   130        raise ValueError('set_watermark expects a Timestamp as input')
   131      if self._watermark and self._watermark > timestamp:
   132        raise ValueError(
   133            'Watermark must be monotonically increasing.'
   134            'Provided watermark %s is less than '
   135            'current watermark %s',
   136            timestamp,
   137            self._watermark)
   138      self._watermark = timestamp
   139  
   140    @staticmethod
   141    def default_provider():
   142      """Provide a default WatermarkEstimatorProvider for
   143      WalltimeWatermarkEstimator.
   144      """
   145      class DefaultManualWatermarkEstimatorProvider(WatermarkEstimatorProvider):
   146        def initial_estimator_state(self, element, restriction):
   147          return None
   148  
   149        def create_watermark_estimator(self, estimator_state):
   150          return ManualWatermarkEstimator(estimator_state)
   151  
   152      return DefaultManualWatermarkEstimatorProvider()