#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A collection of WatermarkEstimator implementations that SplittableDoFns
can use."""

# pytype: skip-file

from apache_beam.io.iobase import WatermarkEstimator
from apache_beam.transforms.core import WatermarkEstimatorProvider
from apache_beam.utils.timestamp import Timestamp


class MonotonicWatermarkEstimator(WatermarkEstimator):
  """A WatermarkEstimator which assumes that timestamps of all output records
  are increasing monotonically.

  The reported watermark never moves backwards: an observed timestamp that is
  older than the current watermark is ignored.
  """
  def __init__(self, timestamp):
    """For a new <element, restriction> pair, the initial value is None. When
    resuming processing, the initial timestamp will be the last reported
    watermark.

    Args:
      timestamp: the initial watermark, or None when nothing has been
        reported yet.
    """
    self._watermark = timestamp
    self._last_observed_timestamp = timestamp

  def observe_timestamp(self, timestamp):
    """Records the timestamp of the most recently produced output.

    The watermark itself is only advanced lazily, in current_watermark().
    """
    self._last_observed_timestamp = timestamp

  def current_watermark(self):
    """Returns the current watermark, advancing it to the last observed
    timestamp when that timestamp is not older than the watermark.

    Returns:
      The watermark, or None if the estimator started from None and no
      timestamp has been observed yet.
    """
    observed = self._last_observed_timestamp
    if observed is not None:
      # A fresh estimator starts with a None watermark. Adopt the first
      # observed timestamp directly instead of ordering it against None.
      if self._watermark is None or observed >= self._watermark:
        self._watermark = observed
    return self._watermark

  def get_estimator_state(self):
    """Returns the state from which this estimator can be re-created, i.e.
    the current watermark value."""
    return self._watermark

  @staticmethod
  def default_provider():
    """Provide a default WatermarkEstimatorProvider for
    MonotonicWatermarkEstimator.
    """
    class DefaultMonotonicWatermarkEstimator(WatermarkEstimatorProvider):
      def initial_estimator_state(self, element, restriction):
        # No watermark is known for a new <element, restriction> pair.
        return None

      def create_watermark_estimator(self, estimator_state):
        return MonotonicWatermarkEstimator(estimator_state)

    return DefaultMonotonicWatermarkEstimator()


class WalltimeWatermarkEstimator(WatermarkEstimator):
  """A WatermarkEstimator which uses processing time as the estimated watermark.
  """
  def __init__(self, timestamp=None):
    """Initializes the estimator.

    Args:
      timestamp: the initial watermark. When omitted (or otherwise falsy),
        the value of Timestamp.now() at construction time is used.
    """
    self._timestamp = timestamp or Timestamp.now()

  def observe_timestamp(self, timestamp):
    """Ignores the observed timestamp; this estimator does not use output
    timestamps."""
    pass

  def current_watermark(self):
    """Returns the later of the stored timestamp and Timestamp.now(), and
    stores that value so the reported watermark does not move backwards."""
    self._timestamp = max(self._timestamp, Timestamp.now())
    return self._timestamp

  def get_estimator_state(self):
    """Returns the state from which this estimator can be re-created, i.e.
    the stored timestamp."""
    return self._timestamp

  @staticmethod
  def default_provider():
    """Provide a default WatermarkEstimatorProvider for
    WalltimeWatermarkEstimator.
    """
    class DefaultWalltimeWatermarkEstimator(WatermarkEstimatorProvider):
      def initial_estimator_state(self, element, restriction):
        # None makes the estimator start from Timestamp.now().
        return None

      def create_watermark_estimator(self, estimator_state):
        return WalltimeWatermarkEstimator(estimator_state)

    return DefaultWalltimeWatermarkEstimator()


class ManualWatermarkEstimator(WatermarkEstimator):
  """A WatermarkEstimator which is controlled manually from within a DoFn.

  The DoFn must invoke set_watermark to advance the watermark.
  """
  def __init__(self, watermark):
    """Initializes the estimator.

    Args:
      watermark: the initial watermark; the default provider passes None.
    """
    self._watermark = watermark

  def observe_timestamp(self, timestamp):
    """Ignores the observed timestamp; the watermark only changes through
    set_watermark()."""
    pass

  def current_watermark(self):
    """Returns the watermark most recently set, or the initial value."""
    return self._watermark

  def get_estimator_state(self):
    """Returns the state from which this estimator can be re-created, i.e.
    the current watermark value."""
    return self._watermark

  def set_watermark(self, timestamp):
    # pylint: disable=line-too-long

    """Sets a timestamp before or at the timestamps of all future elements
    produced by the associated DoFn.

    This can be approximate. If records are output that violate this guarantee,
    they will be considered late, which will affect how they will be processed.
    See https://beam.apache.org/documentation/programming-guide/#watermarks-and-late-data
    for more information on late data and how to handle it.

    However, this value should be as late as possible. Downstream windows may
    not be able to close until this watermark passes their end.

    Args:
      timestamp: the new watermark; must be a Timestamp.

    Raises:
      ValueError: if timestamp is not a Timestamp, or if it is earlier than
        the watermark that is currently set.
    """
    if not isinstance(timestamp, Timestamp):
      raise ValueError('set_watermark expects a Timestamp as input')
    # Compare against None explicitly so that "no watermark set yet" is the
    # only case that skips the monotonicity check.
    if self._watermark is not None and self._watermark > timestamp:
      raise ValueError(
          'Watermark must be monotonically increasing. '
          'Provided watermark %s is less than '
          'current watermark %s' % (timestamp, self._watermark))
    self._watermark = timestamp

  @staticmethod
  def default_provider():
    """Provide a default WatermarkEstimatorProvider for
    ManualWatermarkEstimator.
    """
    class DefaultManualWatermarkEstimatorProvider(WatermarkEstimatorProvider):
      def initial_estimator_state(self, element, restriction):
        # No watermark is set until the DoFn calls set_watermark().
        return None

      def create_watermark_estimator(self, estimator_state):
        return ManualWatermarkEstimator(estimator_state)

    return DefaultManualWatermarkEstimatorProvider()