github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/transforms/window.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """Windowing concepts.
    19  
    20  A WindowInto transform logically divides up or groups the elements of a
    21  PCollection into finite windows according to a windowing function (derived from
    22  WindowFn).
    23  
    24  The output of WindowInto contains the same elements as input, but they have been
    25  logically assigned to windows. The next GroupByKey(s) transforms, including one
    26  within a composite transform, will group by the combination of keys and windows.
    27  
    28  Windowing a PCollection allows chunks of it to be processed individually, before
    29  the entire PCollection is available.  This is especially important for
PCollection(s) with unbounded size: the full PCollection is never
available at once because more data is continually arriving. For PCollection(s)
with a bounded size (a.k.a. conventional batch mode), by default, all data is
    33  implicitly in a single window (see GlobalWindows), unless WindowInto is
    34  applied.
    35  
    36  For example, a simple form of windowing divides up the data into fixed-width
    37  time intervals, using FixedWindows.
    38  
    39  Seconds are used as the time unit for the built-in windowing primitives here.
    40  Integer or floating point seconds can be passed to these primitives.
    41  
    42  Internally, seconds, with microsecond granularity, are stored as
    43  timeutil.Timestamp and timeutil.Duration objects. This is done to avoid
    44  precision errors that would occur with floating point representations.
    45  
    46  Custom windowing function classes can be created, by subclassing from
    47  WindowFn.
    48  """
    49  
    50  # pytype: skip-file
    51  
    52  import abc
    53  from functools import total_ordering
    54  from typing import Any
    55  from typing import Iterable
    56  from typing import List
    57  from typing import Optional
    58  
    59  from google.protobuf import duration_pb2
    60  from google.protobuf import timestamp_pb2
    61  
    62  from apache_beam.coders import coders
    63  from apache_beam.portability import common_urns
    64  from apache_beam.portability import python_urns
    65  from apache_beam.portability.api import beam_runner_api_pb2
    66  from apache_beam.portability.api import standard_window_fns_pb2
    67  from apache_beam.transforms import timeutil
    68  from apache_beam.utils import proto_utils
    69  from apache_beam.utils import urns
    70  from apache_beam.utils import windowed_value
    71  from apache_beam.utils.timestamp import MIN_TIMESTAMP
    72  from apache_beam.utils.timestamp import Duration
    73  from apache_beam.utils.timestamp import DurationTypes  # pylint: disable=unused-import
    74  from apache_beam.utils.timestamp import Timestamp
    75  from apache_beam.utils.timestamp import TimestampTypes  # pylint: disable=unused-import
    76  from apache_beam.utils.windowed_value import WindowedValue
    77  
# Names exported by ``from <this module> import *``; every entry is a class
# defined further down in this module.
__all__ = [
    'TimestampCombiner',
    'WindowFn',
    'BoundedWindow',
    'IntervalWindow',
    'TimestampedValue',
    'GlobalWindow',
    'NonMergingWindowFn',
    'GlobalWindows',
    'FixedWindows',
    'SlidingWindows',
    'Sessions',
]
    91  
    92  
    93  # TODO(ccy): revisit naming and semantics once Java Apache Beam finalizes their
    94  # behavior.
    95  class TimestampCombiner(object):
    96    """Determines how output timestamps of grouping operations are assigned."""
    97  
    98    OUTPUT_AT_EOW = beam_runner_api_pb2.OutputTime.END_OF_WINDOW
    99    OUTPUT_AT_EARLIEST = beam_runner_api_pb2.OutputTime.EARLIEST_IN_PANE
   100    OUTPUT_AT_LATEST = beam_runner_api_pb2.OutputTime.LATEST_IN_PANE
   101    # TODO(robertwb): Add this to the runner API or remove it.
   102    OUTPUT_AT_EARLIEST_TRANSFORMED = 'OUTPUT_AT_EARLIEST_TRANSFORMED'
   103  
   104    @staticmethod
   105    def get_impl(timestamp_combiner, window_fn):
   106      # type: (beam_runner_api_pb2.OutputTime.Enum, WindowFn) -> timeutil.TimestampCombinerImpl
   107      if timestamp_combiner == TimestampCombiner.OUTPUT_AT_EOW:
   108        return timeutil.OutputAtEndOfWindowImpl()
   109      elif timestamp_combiner == TimestampCombiner.OUTPUT_AT_EARLIEST:
   110        return timeutil.OutputAtEarliestInputTimestampImpl()
   111      elif timestamp_combiner == TimestampCombiner.OUTPUT_AT_LATEST:
   112        return timeutil.OutputAtLatestInputTimestampImpl()
   113      elif timestamp_combiner == TimestampCombiner.OUTPUT_AT_EARLIEST_TRANSFORMED:
   114        return timeutil.OutputAtEarliestTransformedInputTimestampImpl(window_fn)
   115      else:
   116        raise ValueError('Invalid TimestampCombiner: %s.' % timestamp_combiner)
   117  
   118  
   119  class WindowFn(urns.RunnerApiFn, metaclass=abc.ABCMeta):
   120    """An abstract windowing function defining a basic assign and merge."""
   121    class AssignContext(object):
   122      """Context passed to WindowFn.assign()."""
   123      def __init__(
   124          self,
   125          timestamp,  # type: TimestampTypes
   126          element=None,  # type: Optional[Any]
   127          window=None  # type: Optional[BoundedWindow]
   128      ):
   129        # type: (...) -> None
   130        self.timestamp = Timestamp.of(timestamp)
   131        self.element = element
   132        self.window = window
   133  
   134    @abc.abstractmethod
   135    def assign(self, assign_context):
   136      # type: (AssignContext) -> Iterable[BoundedWindow] # noqa: F821
   137  
   138      """Associates windows to an element.
   139  
   140      Arguments:
   141        assign_context: Instance of AssignContext.
   142  
   143      Returns:
   144        An iterable of BoundedWindow.
   145      """
   146      raise NotImplementedError
   147  
   148    class MergeContext(object):
   149      """Context passed to WindowFn.merge() to perform merging, if any."""
   150      def __init__(self, windows):
   151        # type: (Iterable[BoundedWindow]) -> None
   152        self.windows = list(windows)
   153  
   154      def merge(self, to_be_merged, merge_result):
   155        # type: (Iterable[BoundedWindow], BoundedWindow) -> None
   156        raise NotImplementedError
   157  
   158    @abc.abstractmethod
   159    def merge(self, merge_context):
   160      # type: (WindowFn.MergeContext) -> None
   161  
   162      """Returns a window that is the result of merging a set of windows."""
   163      raise NotImplementedError
   164  
   165    def is_merging(self):
   166      # type: () -> bool
   167  
   168      """Returns whether this WindowFn merges windows."""
   169      return True
   170  
   171    @abc.abstractmethod
   172    def get_window_coder(self):
   173      # type: () -> coders.Coder
   174      raise NotImplementedError
   175  
   176    def get_transformed_output_time(self, window, input_timestamp):  # pylint: disable=unused-argument
   177      # type: (BoundedWindow, Timestamp) -> Timestamp
   178  
   179      """Given input time and output window, returns output time for window.
   180  
   181      If TimestampCombiner.OUTPUT_AT_EARLIEST_TRANSFORMED is used in the
   182      Windowing, the output timestamp for the given window will be the earliest
   183      of the timestamps returned by get_transformed_output_time() for elements
   184      of the window.
   185  
   186      Arguments:
   187        window: Output window of element.
   188        input_timestamp: Input timestamp of element as a timeutil.Timestamp
   189          object.
   190  
   191      Returns:
   192        Transformed timestamp.
   193      """
   194      # By default, just return the input timestamp.
   195      return input_timestamp
   196  
   197    urns.RunnerApiFn.register_pickle_urn(python_urns.PICKLED_WINDOWFN)
   198  
   199  
   200  class BoundedWindow(object):
   201    """A window for timestamps in range (-infinity, end).
   202  
   203    Attributes:
   204      end: End of window.
   205    """
   206    def __init__(self, end):
   207      # type: (TimestampTypes) -> None
   208      self._end = Timestamp.of(end)
   209  
   210    @property
   211    def start(self):
   212      # type: () -> Timestamp
   213      raise NotImplementedError
   214  
   215    @property
   216    def end(self):
   217      # type: () -> Timestamp
   218      return self._end
   219  
   220    def max_timestamp(self):
   221      # type: () -> Timestamp
   222      return self.end.predecessor()
   223  
   224    def __eq__(self, other):
   225      raise NotImplementedError
   226  
   227    def __ne__(self, other):
   228      #  Order first by endpoint, then arbitrarily
   229      return self.end != other.end or hash(self) != hash(other)
   230  
   231    def __lt__(self, other):
   232      if self.end != other.end:
   233        return self.end < other.end
   234      return hash(self) < hash(other)
   235  
   236    def __le__(self, other):
   237      if self.end != other.end:
   238        return self.end <= other.end
   239      return hash(self) <= hash(other)
   240  
   241    def __gt__(self, other):
   242      if self.end != other.end:
   243        return self.end > other.end
   244      return hash(self) > hash(other)
   245  
   246    def __ge__(self, other):
   247      if self.end != other.end:
   248        return self.end >= other.end
   249      return hash(self) >= hash(other)
   250  
   251    def __hash__(self):
   252      raise NotImplementedError
   253  
   254    def __repr__(self):
   255      return '[?, %s)' % float(self.end)
   256  
   257  
   258  @total_ordering
   259  class IntervalWindow(windowed_value._IntervalWindowBase, BoundedWindow):
   260    """A window for timestamps in range [start, end).
   261  
   262    Attributes:
   263      start: Start of window as seconds since Unix epoch.
   264      end: End of window as seconds since Unix epoch.
   265    """
   266    def __lt__(self, other):
   267      if self.end != other.end:
   268        return self.end < other.end
   269      return hash(self) < hash(other)
   270  
   271    def intersects(self, other):
   272      # type: (IntervalWindow) -> bool
   273      return other.start < self.end or self.start < other.end
   274  
   275    def union(self, other):
   276      # type: (IntervalWindow) -> IntervalWindow
   277      return IntervalWindow(
   278          min(self.start, other.start), max(self.end, other.end))
   279  
   280  
   281  @total_ordering
   282  class TimestampedValue(object):
   283    """A timestamped value having a value and a timestamp.
   284  
   285    Attributes:
   286      value: The underlying value.
   287      timestamp: Timestamp associated with the value as seconds since Unix epoch.
   288    """
   289    def __init__(self, value, timestamp):
   290      # type: (Any, TimestampTypes) -> None
   291      self.value = value
   292      self.timestamp = Timestamp.of(timestamp)
   293  
   294    def __eq__(self, other):
   295      return (
   296          type(self) == type(other) and self.value == other.value and
   297          self.timestamp == other.timestamp)
   298  
   299    def __hash__(self):
   300      return hash((self.value, self.timestamp))
   301  
   302    def __lt__(self, other):
   303      if type(self) != type(other):
   304        return type(self).__name__ < type(other).__name__
   305      if self.value != other.value:
   306        return self.value < other.value
   307      return self.timestamp < other.timestamp
   308  
   309  
   310  class GlobalWindow(BoundedWindow):
   311    """The default window into which all data is placed (via GlobalWindows)."""
   312    _instance = None  # type: GlobalWindow
   313  
   314    def __new__(cls):
   315      if cls._instance is None:
   316        cls._instance = super(GlobalWindow, cls).__new__(cls)
   317      return cls._instance
   318  
   319    def __init__(self):
   320      # type: () -> None
   321      super().__init__(GlobalWindow._getTimestampFromProto())
   322  
   323    def __repr__(self):
   324      return 'GlobalWindow'
   325  
   326    def __hash__(self):
   327      return hash(type(self))
   328  
   329    def __eq__(self, other):
   330      # Global windows are always and only equal to each other.
   331      return self is other or type(self) is type(other)
   332  
   333    @property
   334    def start(self):
   335      # type: () -> Timestamp
   336      return MIN_TIMESTAMP
   337  
   338    @staticmethod
   339    def _getTimestampFromProto():
   340      # type: () -> Timestamp
   341      ts_millis = int(
   342          common_urns.constants.GLOBAL_WINDOW_MAX_TIMESTAMP_MILLIS.constant)
   343      return Timestamp(micros=ts_millis * 1000)
   344  
   345  
   346  class NonMergingWindowFn(WindowFn):
   347    def is_merging(self):
   348      # type: () -> bool
   349      return False
   350  
   351    def merge(self, merge_context):
   352      # type: (WindowFn.MergeContext) -> None
   353      pass  # No merging.
   354  
   355  
   356  class GlobalWindows(NonMergingWindowFn):
   357    """A windowing function that assigns everything to one global window."""
   358    @classmethod
   359    def windowed_batch(
   360        cls,
   361        batch,  # type: Any
   362        timestamp=MIN_TIMESTAMP,  # type: Timestamp
   363        pane_info=windowed_value.PANE_INFO_UNKNOWN  # type: windowed_value.PaneInfo
   364    ):
   365      # type: (...) -> windowed_value.WindowedBatch
   366      return windowed_value.HomogeneousWindowedBatch.of(
   367          batch, timestamp, (GlobalWindow(), ), pane_info)
   368  
   369    @classmethod
   370    def windowed_value(
   371        cls,
   372        value,  # type: Any
   373        timestamp=MIN_TIMESTAMP,  # type: Timestamp
   374        pane_info=windowed_value.PANE_INFO_UNKNOWN  # type: windowed_value.PaneInfo
   375    ):
   376      # type: (...) -> WindowedValue
   377      return WindowedValue(value, timestamp, (GlobalWindow(), ), pane_info)
   378  
   379    @classmethod
   380    def windowed_value_at_end_of_window(cls, value):
   381      return cls.windowed_value(value, GlobalWindow().max_timestamp())
   382  
   383    def assign(self, assign_context):
   384      # type: (WindowFn.AssignContext) -> List[GlobalWindow]
   385      return [GlobalWindow()]
   386  
   387    def get_window_coder(self):
   388      # type: () -> coders.GlobalWindowCoder
   389      return coders.GlobalWindowCoder()
   390  
   391    def __hash__(self):
   392      return hash(type(self))
   393  
   394    def __eq__(self, other):
   395      # Global windowfn is always and only equal to each other.
   396      return self is other or type(self) is type(other)
   397  
   398    def to_runner_api_parameter(self, context):
   399      return common_urns.global_windows.urn, None
   400  
   401    @staticmethod
   402    @urns.RunnerApiFn.register_urn(common_urns.global_windows.urn, None)
   403    def from_runner_api_parameter(unused_fn_parameter, unused_context):
   404      # type: (...) -> GlobalWindows
   405      return GlobalWindows()
   406  
   407  
   408  class FixedWindows(NonMergingWindowFn):
   409    """A windowing function that assigns each element to one time interval.
   410  
   411    The attributes size and offset determine in what time interval a timestamp
   412    will be slotted. The time intervals have the following formula:
   413    [N * size + offset, (N + 1) * size + offset)
   414  
   415    Attributes:
   416      size: Size of the window as seconds.
   417      offset: Offset of this window as seconds. Windows start at
   418        t=N * size + offset where t=0 is the UNIX epoch. The offset must be a
   419        value in range [0, size). If it is not it will be normalized to this
   420        range.
   421    """
   422    def __init__(
   423        self,
   424        size,  # type: DurationTypes
   425        offset=0  # type: TimestampTypes
   426    ):
   427      """Initialize a ``FixedWindows`` function for a given size and offset.
   428  
   429      Args:
   430        size (int): Size of the window in seconds.
   431        offset(int): Offset of this window as seconds. Windows start at
   432          t=N * size + offset where t=0 is the UNIX epoch. The offset must be a
   433          value in range [0, size). If it is not it will be normalized to this
   434          range.
   435      """
   436      if size <= 0:
   437        raise ValueError('The size parameter must be strictly positive.')
   438      self.size = Duration.of(size)
   439      self.offset = Timestamp.of(offset) % self.size
   440  
   441    def assign(self, context):
   442      # type: (WindowFn.AssignContext) -> List[IntervalWindow]
   443      timestamp = context.timestamp
   444      start = timestamp - (timestamp - self.offset) % self.size
   445      return [IntervalWindow(start, start + self.size)]
   446  
   447    def get_window_coder(self):
   448      # type: () -> coders.IntervalWindowCoder
   449      return coders.IntervalWindowCoder()
   450  
   451    def __eq__(self, other):
   452      if type(self) == type(other) == FixedWindows:
   453        return self.size == other.size and self.offset == other.offset
   454  
   455    def __hash__(self):
   456      return hash((self.size, self.offset))
   457  
   458    def to_runner_api_parameter(self, context):
   459      return (
   460          common_urns.fixed_windows.urn,
   461          standard_window_fns_pb2.FixedWindowsPayload(
   462              size=proto_utils.from_micros(
   463                  duration_pb2.Duration, self.size.micros),
   464              offset=proto_utils.from_micros(
   465                  timestamp_pb2.Timestamp, self.offset.micros)))
   466  
   467    @staticmethod
   468    @urns.RunnerApiFn.register_urn(
   469        common_urns.fixed_windows.urn,
   470        standard_window_fns_pb2.FixedWindowsPayload)
   471    def from_runner_api_parameter(fn_parameter, unused_context):
   472      # type: (...) -> FixedWindows
   473      return FixedWindows(
   474          size=Duration(micros=fn_parameter.size.ToMicroseconds()),
   475          offset=Timestamp(micros=fn_parameter.offset.ToMicroseconds()))
   476  
   477  
   478  class SlidingWindows(NonMergingWindowFn):
   479    """A windowing function that assigns each element to a set of sliding windows.
   480  
   481    The attributes size and offset determine in what time interval a timestamp
   482    will be slotted. The time intervals have the following formula:
   483    [N * period + offset, N * period + offset + size)
   484  
   485    Attributes:
   486      size: Size of the window as seconds.
   487      period: Period of the windows as seconds.
   488      offset: Offset of this window as seconds since Unix epoch. Windows start at
   489        t=N * period + offset where t=0 is the epoch. The offset must be a value
   490        in range [0, period). If it is not it will be normalized to this range.
   491    """
   492  
   493    def __init__(self,
   494                 size,  # type: DurationTypes
   495                 period,  # type: DurationTypes
   496                 offset=0,  # type: TimestampTypes
   497                ):
   498      if size <= 0:
   499        raise ValueError('The size parameter must be strictly positive.')
   500      self.size = Duration.of(size)
   501      self.period = Duration.of(period)
   502      self.offset = Timestamp.of(offset) % period
   503  
   504    def assign(self, context):
   505      # type: (WindowFn.AssignContext) -> List[IntervalWindow]
   506      timestamp = context.timestamp
   507      start = timestamp - ((timestamp - self.offset) % self.period)
   508      return [
   509          IntervalWindow(Timestamp(micros=s), Timestamp(micros=s) + self.size)
   510          for s in range(
   511              start.micros,
   512              timestamp.micros - self.size.micros,
   513              -self.period.micros)
   514      ]
   515  
   516    def get_window_coder(self):
   517      # type: () -> coders.IntervalWindowCoder
   518      return coders.IntervalWindowCoder()
   519  
   520    def __eq__(self, other):
   521      if type(self) == type(other) == SlidingWindows:
   522        return (
   523            self.size == other.size and self.offset == other.offset and
   524            self.period == other.period)
   525  
   526    def __hash__(self):
   527      return hash((self.offset, self.period))
   528  
   529    def to_runner_api_parameter(self, context):
   530      return (
   531          common_urns.sliding_windows.urn,
   532          standard_window_fns_pb2.SlidingWindowsPayload(
   533              size=proto_utils.from_micros(
   534                  duration_pb2.Duration, self.size.micros),
   535              offset=proto_utils.from_micros(
   536                  timestamp_pb2.Timestamp, self.offset.micros),
   537              period=proto_utils.from_micros(
   538                  duration_pb2.Duration, self.period.micros)))
   539  
   540    @staticmethod
   541    @urns.RunnerApiFn.register_urn(
   542        common_urns.sliding_windows.urn,
   543        standard_window_fns_pb2.SlidingWindowsPayload)
   544    def from_runner_api_parameter(fn_parameter, unused_context):
   545      # type: (...) -> SlidingWindows
   546      return SlidingWindows(
   547          size=Duration(micros=fn_parameter.size.ToMicroseconds()),
   548          offset=Timestamp(micros=fn_parameter.offset.ToMicroseconds()),
   549          period=Duration(micros=fn_parameter.period.ToMicroseconds()))
   550  
   551  
   552  class Sessions(WindowFn):
   553    """A windowing function that groups elements into sessions.
   554  
   555    A session is defined as a series of consecutive events
   556    separated by a specified gap size.
   557  
   558    Attributes:
   559      gap_size: Size of the gap between windows as floating-point seconds.
   560    """
   561    def __init__(self, gap_size):
   562      # type: (DurationTypes) -> None
   563      if gap_size <= 0:
   564        raise ValueError('The size parameter must be strictly positive.')
   565      self.gap_size = Duration.of(gap_size)
   566  
   567    def assign(self, context):
   568      # type: (WindowFn.AssignContext) -> List[IntervalWindow]
   569      timestamp = context.timestamp
   570      return [IntervalWindow(timestamp, timestamp + self.gap_size)]
   571  
   572    def get_window_coder(self):
   573      # type: () -> coders.IntervalWindowCoder
   574      return coders.IntervalWindowCoder()
   575  
   576    def merge(self, merge_context):
   577      # type: (WindowFn.MergeContext) -> None
   578      to_merge = []  # type: List[BoundedWindow]
   579      end = MIN_TIMESTAMP
   580      for w in sorted(merge_context.windows, key=lambda w: w.start):
   581        if to_merge:
   582          if end > w.start:
   583            to_merge.append(w)
   584            if w.end > end:
   585              end = w.end
   586          else:
   587            if len(to_merge) > 1:
   588              merge_context.merge(
   589                  to_merge, IntervalWindow(to_merge[0].start, end))
   590            to_merge = [w]
   591            end = w.end
   592        else:
   593          to_merge = [w]
   594          end = w.end
   595      if len(to_merge) > 1:
   596        merge_context.merge(to_merge, IntervalWindow(to_merge[0].start, end))
   597  
   598    def __eq__(self, other):
   599      if type(self) == type(other) == Sessions:
   600        return self.gap_size == other.gap_size
   601  
   602    def __hash__(self):
   603      return hash(self.gap_size)
   604  
   605    def to_runner_api_parameter(self, context):
   606      return (
   607          common_urns.session_windows.urn,
   608          standard_window_fns_pb2.SessionWindowsPayload(
   609              gap_size=proto_utils.from_micros(
   610                  duration_pb2.Duration, self.gap_size.micros)))
   611  
   612    @staticmethod
   613    @urns.RunnerApiFn.register_urn(
   614        common_urns.session_windows.urn,
   615        standard_window_fns_pb2.SessionWindowsPayload)
   616    def from_runner_api_parameter(fn_parameter, unused_context):
   617      # type: (...) -> Sessions
   618      return Sessions(
   619          gap_size=Duration(micros=fn_parameter.gap_size.ToMicroseconds()))