github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/transforms/periodicsequence.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  import math
    19  import time
    20  
    21  import apache_beam as beam
    22  from apache_beam.io.restriction_trackers import OffsetRange
    23  from apache_beam.io.restriction_trackers import OffsetRestrictionTracker
    24  from apache_beam.io.watermark_estimators import ManualWatermarkEstimator
    25  from apache_beam.runners import sdf_utils
    26  from apache_beam.transforms import core
    27  from apache_beam.transforms import window
    28  from apache_beam.transforms.ptransform import PTransform
    29  from apache_beam.transforms.window import TimestampedValue
    30  from apache_beam.utils import timestamp
    31  from apache_beam.utils.timestamp import MAX_TIMESTAMP
    32  from apache_beam.utils.timestamp import Timestamp
    33  
    34  
    35  class ImpulseSeqGenRestrictionProvider(core.RestrictionProvider):
    36    def initial_restriction(self, element):
    37      start, end, interval = element
    38      if isinstance(start, Timestamp):
    39        start = start.micros / 1000000
    40      if isinstance(end, Timestamp):
    41        end = end.micros / 1000000
    42  
    43      assert start <= end
    44      assert interval > 0
    45      total_outputs = math.ceil((end - start) / interval)
    46      return OffsetRange(0, total_outputs)
    47  
    48    def create_tracker(self, restriction):
    49      return OffsetRestrictionTracker(restriction)
    50  
    51    def restriction_size(self, unused_element, restriction):
    52      return restriction.size()
    53  
    54    # On drain, immediately stop emitting new elements
    55    def truncate(self, unused_element, unused_restriction):
    56      return None
    57  
    58  
    59  class ImpulseSeqGenDoFn(beam.DoFn):
    60    '''
    61    ImpulseSeqGenDoFn fn receives tuple elements with three parts:
    62  
    63    * first_timestamp = first timestamp to output element for.
    64    * last_timestamp = last timestamp/time to output element for.
    65    * fire_interval = how often to fire an element.
    66  
    67    For each input element received, ImpulseSeqGenDoFn fn will start
    68    generating output elements in following pattern:
    69  
    70    * if element timestamp is less than current runtime then output element.
    71    * if element timestamp is greater than current runtime, wait until next
    72      element timestamp.
    73  
    74    ImpulseSeqGenDoFn can't guarantee that each element is output at exact time.
    75    ImpulseSeqGenDoFn guarantees that elements would not be output prior to
    76    given runtime timestamp.
    77    '''
    78    @beam.DoFn.unbounded_per_element()
    79    def process(
    80        self,
    81        element,
    82        restriction_tracker=beam.DoFn.RestrictionParam(
    83            ImpulseSeqGenRestrictionProvider()),
    84        watermark_estimator=beam.DoFn.WatermarkEstimatorParam(
    85            ManualWatermarkEstimator.default_provider())):
    86      '''
    87      :param element: (start_timestamp, end_timestamp, interval)
    88      :param restriction_tracker:
    89      :return: yields elements at processing real-time intervals with value of
    90        target output timestamp for the element.
    91      '''
    92      start, _, interval = element
    93  
    94      if isinstance(start, Timestamp):
    95        start = start.micros / 1000000
    96  
    97      assert isinstance(restriction_tracker, sdf_utils.RestrictionTrackerView)
    98  
    99      current_output_index = restriction_tracker.current_restriction().start
   100      current_output_timestamp = start + interval * current_output_index
   101      current_time = time.time()
   102      watermark_estimator.set_watermark(
   103          timestamp.Timestamp(current_output_timestamp))
   104  
   105      while current_output_timestamp <= current_time:
   106        if restriction_tracker.try_claim(current_output_index):
   107          yield current_output_timestamp
   108          current_output_index += 1
   109          current_output_timestamp = start + interval * current_output_index
   110          current_time = time.time()
   111          watermark_estimator.set_watermark(
   112              timestamp.Timestamp(current_output_timestamp))
   113        else:
   114          return
   115  
   116      restriction_tracker.defer_remainder(
   117          timestamp.Timestamp(current_output_timestamp))
   118  
   119  
   120  class PeriodicSequence(PTransform):
   121    '''
   122    PeriodicSequence transform receives tuple elements with three parts:
   123  
   124    * first_timestamp = first timestamp to output element for.
   125    * last_timestamp = last timestamp/time to output element for.
   126    * fire_interval = how often to fire an element.
   127  
   128    For each input element received, PeriodicSequence transform will start
   129    generating output elements in following pattern:
   130  
   131    * if element timestamp is less than current runtime then output element.
   132    * if element timestamp is greater than current runtime, wait until next
   133      element timestamp.
   134  
   135    PeriodicSequence can't guarantee that each element is output at exact time.
   136    PeriodicSequence guarantees that elements would not be output prior to given
   137    runtime timestamp.
   138    The PCollection generated by PeriodicSequence is unbounded.
   139    '''
   140    def __init__(self):
   141      pass
   142  
   143    def expand(self, pcoll):
   144      return (
   145          pcoll
   146          | 'GenSequence' >> beam.ParDo(ImpulseSeqGenDoFn())
   147          | 'MapToTimestamped' >> beam.Map(lambda tt: TimestampedValue(tt, tt)))
   148  
   149  
   150  class PeriodicImpulse(PTransform):
   151    '''
   152    PeriodicImpulse transform generates an infinite sequence of elements with
   153    given runtime interval.
   154  
   155    PeriodicImpulse transform behaves same as {@link PeriodicSequence} transform,
   156    but can be used as first transform in pipeline.
   157    The PCollection generated by PeriodicImpulse is unbounded.
   158    '''
   159    def __init__(
   160        self,
   161        start_timestamp=Timestamp.now(),
   162        stop_timestamp=MAX_TIMESTAMP,
   163        fire_interval=360.0,
   164        apply_windowing=False):
   165      '''
   166      :param start_timestamp: Timestamp for first element.
   167      :param stop_timestamp: Timestamp after which no elements will be output.
   168      :param fire_interval: Interval at which to output elements.
   169      :param apply_windowing: Whether each element should be assigned to
   170        individual window. If false, all elements will reside in global window.
   171      '''
   172      self.start_ts = start_timestamp
   173      self.stop_ts = stop_timestamp
   174      self.interval = fire_interval
   175      self.apply_windowing = apply_windowing
   176  
   177    def expand(self, pbegin):
   178      result = (
   179          pbegin
   180          | 'ImpulseElement' >> beam.Create(
   181              [(self.start_ts, self.stop_ts, self.interval)])
   182          | 'GenSequence' >> beam.ParDo(ImpulseSeqGenDoFn())
   183          | 'MapToTimestamped' >> beam.Map(lambda tt: TimestampedValue(tt, tt)))
   184      if self.apply_windowing:
   185        result = result | 'ApplyWindowing' >> beam.WindowInto(
   186            window.FixedWindows(self.interval))
   187      return result