#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import math
import time

import apache_beam as beam
from apache_beam.io.restriction_trackers import OffsetRange
from apache_beam.io.restriction_trackers import OffsetRestrictionTracker
from apache_beam.io.watermark_estimators import ManualWatermarkEstimator
from apache_beam.runners import sdf_utils
from apache_beam.transforms import core
from apache_beam.transforms import window
from apache_beam.transforms.ptransform import PTransform
from apache_beam.transforms.window import TimestampedValue
from apache_beam.utils import timestamp
from apache_beam.utils.timestamp import MAX_TIMESTAMP
from apache_beam.utils.timestamp import Timestamp


class ImpulseSeqGenRestrictionProvider(core.RestrictionProvider):
  '''
  Restriction provider used by ImpulseSeqGenDoFn.

  The restriction is an OffsetRange over output indices: index ``i`` stands
  for the output whose timestamp is ``start + i * interval``.
  '''
  def initial_restriction(self, element):
    '''
    Builds the range of output indices for one input element.

    :param element: (start, end, interval) tuple. ``start`` and ``end`` may
      be Timestamp instances, in which case they are converted to seconds
      (``micros / 1000000``) before use.
    :return: OffsetRange(0, n) where n = ceil((end - start) / interval).
    '''
    start, end, interval = element
    if isinstance(start, Timestamp):
      start = start.micros / 1000000
    if isinstance(end, Timestamp):
      end = end.micros / 1000000

    assert start <= end
    assert interval > 0
    total_outputs = math.ceil((end - start) / interval)
    return OffsetRange(0, total_outputs)

  def create_tracker(self, restriction):
    '''Wraps the given restriction in an OffsetRestrictionTracker.'''
    return OffsetRestrictionTracker(restriction)

  def restriction_size(self, unused_element, restriction):
    '''Returns the size reported by the restriction itself.'''
    return restriction.size()

  # On drain, immediately stop emitting new elements
  def truncate(self, unused_element, unused_restriction):
    return None


class ImpulseSeqGenDoFn(beam.DoFn):
  '''
  ImpulseSeqGenDoFn fn receives tuple elements with three parts:

  * first_timestamp = first timestamp to output element for.
  * last_timestamp = last timestamp/time to output element for.
  * fire_interval = how often to fire an element.

  For each input element received, ImpulseSeqGenDoFn fn will start
  generating output elements in following pattern:

  * if element timestamp is less than current runtime then output element.
  * if element timestamp is greater than current runtime, wait until next
    element timestamp.

  ImpulseSeqGenDoFn can't guarantee that each element is output at exact time.
  ImpulseSeqGenDoFn guarantees that elements would not be output prior to
  given runtime timestamp.
  '''
  @beam.DoFn.unbounded_per_element()
  def process(
      self,
      element,
      restriction_tracker=beam.DoFn.RestrictionParam(
          ImpulseSeqGenRestrictionProvider()),
      watermark_estimator=beam.DoFn.WatermarkEstimatorParam(
          ManualWatermarkEstimator.default_provider())):
    '''
    :param element: (start_timestamp, end_timestamp, interval)
    :param restriction_tracker: tracker over the output indices still to be
      emitted for this element.
    :param watermark_estimator: manual estimator; its watermark is set to
      the timestamp of the next output that has not been emitted yet.
    :return: yields elements at processing real-time intervals with value of
      target output timestamp for the element.
    '''
    start, _, interval = element

    if isinstance(start, Timestamp):
      start = start.micros / 1000000

    assert isinstance(restriction_tracker, sdf_utils.RestrictionTrackerView)

    # Resume from the first index of the current restriction.
    current_output_index = restriction_tracker.current_restriction().start
    current_output_timestamp = start + interval * current_output_index
    current_time = time.time()
    watermark_estimator.set_watermark(
        timestamp.Timestamp(current_output_timestamp))

    # Emit every output whose target time has already passed.
    while current_output_timestamp <= current_time:
      if restriction_tracker.try_claim(current_output_index):
        yield current_output_timestamp
        current_output_index += 1
        current_output_timestamp = start + interval * current_output_index
        current_time = time.time()
        watermark_estimator.set_watermark(
            timestamp.Timestamp(current_output_timestamp))
      else:
        # The claim was rejected, so stop without deferring anything.
        return

    # The next output is still in the future: defer the remainder until its
    # target timestamp.
    restriction_tracker.defer_remainder(
        timestamp.Timestamp(current_output_timestamp))


class PeriodicSequence(PTransform):
  '''
  PeriodicSequence transform receives tuple elements with three parts:

  * first_timestamp = first timestamp to output element for.
  * last_timestamp = last timestamp/time to output element for.
  * fire_interval = how often to fire an element.

  For each input element received, PeriodicSequence transform will start
  generating output elements in following pattern:

  * if element timestamp is less than current runtime then output element.
  * if element timestamp is greater than current runtime, wait until next
    element timestamp.

  PeriodicSequence can't guarantee that each element is output at exact time.
  PeriodicSequence guarantees that elements would not be output prior to given
  runtime timestamp.
  The PCollection generated by PeriodicSequence is unbounded.
  '''
  def __init__(self):
    pass

  def expand(self, pcoll):
    # Each generated value is its own target timestamp, so it is used both as
    # the element value and as the element timestamp.
    return (
        pcoll
        | 'GenSequence' >> beam.ParDo(ImpulseSeqGenDoFn())
        | 'MapToTimestamped' >> beam.Map(lambda tt: TimestampedValue(tt, tt)))


class PeriodicImpulse(PTransform):
  '''
  PeriodicImpulse transform generates an infinite sequence of elements with
  given runtime interval.

  PeriodicImpulse transform behaves same as PeriodicSequence transform,
  but can be used as first transform in pipeline.
  The PCollection generated by PeriodicImpulse is unbounded.
  '''
  def __init__(
      self,
      start_timestamp=None,
      stop_timestamp=MAX_TIMESTAMP,
      fire_interval=360.0,
      apply_windowing=False):
    '''
    :param start_timestamp: Timestamp for first element. If None (the
      default), the time at which this transform is constructed is used.
    :param stop_timestamp: Timestamp after which no elements will be output.
    :param fire_interval: Interval at which to output elements.
    :param apply_windowing: Whether each element should be assigned to
      individual window. If false, all elements will reside in global window.
    '''
    # A default of Timestamp.now() in the signature would be evaluated only
    # once, when the module is imported, leaving later instances with a stale
    # start time. Resolve "now" per instance instead.
    if start_timestamp is None:
      start_timestamp = Timestamp.now()
    self.start_ts = start_timestamp
    self.stop_ts = stop_timestamp
    self.interval = fire_interval
    self.apply_windowing = apply_windowing

  def expand(self, pbegin):
    # Seed the sequence generator with a single (start, stop, interval)
    # element.
    result = (
        pbegin
        | 'ImpulseElement' >> beam.Create(
            [(self.start_ts, self.stop_ts, self.interval)])
        | 'GenSequence' >> beam.ParDo(ImpulseSeqGenDoFn())
        | 'MapToTimestamped' >> beam.Map(lambda tt: TimestampedValue(tt, tt)))
    if self.apply_windowing:
      result = result | 'ApplyWindowing' >> beam.WindowInto(
          window.FixedWindows(self.interval))
    return result