#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Windowing concepts.

A WindowInto transform logically divides up or groups the elements of a
PCollection into finite windows according to a windowing function (derived from
WindowFn).

The output of WindowInto contains the same elements as input, but they have been
logically assigned to windows. The next GroupByKey(s) transforms, including one
within a composite transform, will group by the combination of keys and windows.

Windowing a PCollection allows chunks of it to be processed individually, before
the entire PCollection is available.  This is especially important for
PCollection(s) with unbounded size: the full PCollection is never available at
once, because more data is continually arriving. For PCollection(s) with a
bounded size (a.k.a. conventional batch mode), by default, all data is
implicitly in a single window (see GlobalWindows), unless WindowInto is
applied.

For example, a simple form of windowing divides up the data into fixed-width
time intervals, using FixedWindows.

Seconds are used as the time unit for the built-in windowing primitives here.
Integer or floating point seconds can be passed to these primitives.

Internally, seconds, with microsecond granularity, are stored as Timestamp and
Duration objects (imported in this module from apache_beam.utils.timestamp).
This is done to avoid precision errors that would occur with floating point
representations.

Custom windowing function classes can be created by subclassing WindowFn.
"""

# pytype: skip-file

import abc
from functools import total_ordering
from typing import Any
from typing import Iterable
from typing import List
from typing import Optional

from google.protobuf import duration_pb2
from google.protobuf import timestamp_pb2

from apache_beam.coders import coders
from apache_beam.portability import common_urns
from apache_beam.portability import python_urns
from apache_beam.portability.api import beam_runner_api_pb2
from apache_beam.portability.api import standard_window_fns_pb2
from apache_beam.transforms import timeutil
from apache_beam.utils import proto_utils
from apache_beam.utils import urns
from apache_beam.utils import windowed_value
from apache_beam.utils.timestamp import MIN_TIMESTAMP
from apache_beam.utils.timestamp import Duration
from apache_beam.utils.timestamp import DurationTypes  # pylint: disable=unused-import
from apache_beam.utils.timestamp import Timestamp
from apache_beam.utils.timestamp import TimestampTypes  # pylint: disable=unused-import
from apache_beam.utils.windowed_value import WindowedValue

# Public API of this module.
__all__ = [
    'TimestampCombiner',
    'WindowFn',
    'BoundedWindow',
    'IntervalWindow',
    'TimestampedValue',
    'GlobalWindow',
    'NonMergingWindowFn',
    'GlobalWindows',
    'FixedWindows',
    'SlidingWindows',
    'Sessions',
]


# TODO(ccy): revisit naming and semantics once Java Apache Beam finalizes their
# behavior.
class TimestampCombiner(object):
  """Determines how output timestamps of grouping operations are assigned."""

  OUTPUT_AT_EOW = beam_runner_api_pb2.OutputTime.END_OF_WINDOW
  OUTPUT_AT_EARLIEST = beam_runner_api_pb2.OutputTime.EARLIEST_IN_PANE
  OUTPUT_AT_LATEST = beam_runner_api_pb2.OutputTime.LATEST_IN_PANE
  # TODO(robertwb): Add this to the runner API or remove it.
  OUTPUT_AT_EARLIEST_TRANSFORMED = 'OUTPUT_AT_EARLIEST_TRANSFORMED'

  @staticmethod
  def get_impl(timestamp_combiner, window_fn):
    # type: (beam_runner_api_pb2.OutputTime.Enum, WindowFn) -> timeutil.TimestampCombinerImpl

    """Returns the timeutil implementation for a TimestampCombiner constant.

    Arguments:
      timestamp_combiner: One of the OUTPUT_AT_* constants of this class.
      window_fn: The WindowFn in use. It is only passed on for
        OUTPUT_AT_EARLIEST_TRANSFORMED.

    Returns:
      A new instance of the matching timeutil implementation class.

    Raises:
      ValueError: If timestamp_combiner is not one of the known constants.
    """
    if timestamp_combiner == TimestampCombiner.OUTPUT_AT_EOW:
      return timeutil.OutputAtEndOfWindowImpl()
    elif timestamp_combiner == TimestampCombiner.OUTPUT_AT_EARLIEST:
      return timeutil.OutputAtEarliestInputTimestampImpl()
    elif timestamp_combiner == TimestampCombiner.OUTPUT_AT_LATEST:
      return timeutil.OutputAtLatestInputTimestampImpl()
    elif timestamp_combiner == TimestampCombiner.OUTPUT_AT_EARLIEST_TRANSFORMED:
      return timeutil.OutputAtEarliestTransformedInputTimestampImpl(window_fn)
    else:
      raise ValueError('Invalid TimestampCombiner: %s.' % timestamp_combiner)


class WindowFn(urns.RunnerApiFn, metaclass=abc.ABCMeta):
  """An abstract windowing function defining a basic assign and merge."""
  class AssignContext(object):
    """Context passed to WindowFn.assign()."""
    def __init__(
        self,
        timestamp,  # type: TimestampTypes
        element=None,  # type: Optional[Any]
        window=None  # type: Optional[BoundedWindow]
    ):
      # type: (...) -> None
      # The timestamp is normalized to a Timestamp; element and window are
      # stored as given.
      self.timestamp = Timestamp.of(timestamp)
      self.element = element
      self.window = window

  @abc.abstractmethod
  def assign(self, assign_context):
    # type: (AssignContext) -> Iterable[BoundedWindow] # noqa: F821

    """Associates windows to an element.

    Arguments:
      assign_context: Instance of AssignContext.

    Returns:
      An iterable of BoundedWindow.
    """
    raise NotImplementedError

  class MergeContext(object):
    """Context passed to WindowFn.merge() to perform merging, if any."""
    def __init__(self, windows):
      # type: (Iterable[BoundedWindow]) -> None
      # Materialize the iterable so it can be traversed more than once.
      self.windows = list(windows)

    def merge(self, to_be_merged, merge_result):
      # type: (Iterable[BoundedWindow], BoundedWindow) -> None

      """Records that to_be_merged should be replaced by merge_result.

      This base implementation always raises NotImplementedError.
      """
      raise NotImplementedError

  @abc.abstractmethod
  def merge(self, merge_context):
    # type: (WindowFn.MergeContext) -> None

    """Merges windows, reporting each merge through the merge context.

    Nothing is returned. In this module, Sessions.merge reports a merge by
    calling merge_context.merge(to_be_merged, merge_result), and
    NonMergingWindowFn.merge does nothing.

    Arguments:
      merge_context: Instance of MergeContext; merge_context.windows holds
        the candidate windows.
    """
    raise NotImplementedError

  def is_merging(self):
    # type: () -> bool

    """Returns whether this WindowFn merges windows."""
    return True

  @abc.abstractmethod
  def get_window_coder(self):
    # type: () -> coders.Coder

    """Returns the coder for the windows produced by this WindowFn."""
    raise NotImplementedError

  def get_transformed_output_time(self, window, input_timestamp):  # pylint: disable=unused-argument
    # type: (BoundedWindow, Timestamp) -> Timestamp

    """Given input time and output window, returns output time for window.

    If TimestampCombiner.OUTPUT_AT_EARLIEST_TRANSFORMED is used in the
    Windowing, the output timestamp for the given window will be the earliest
    of the timestamps returned by get_transformed_output_time() for elements
    of the window.

    Arguments:
      window: Output window of element.
      input_timestamp: Input timestamp of element as a timeutil.Timestamp
        object.

    Returns:
      Transformed timestamp.
    """
    # By default, just return the input timestamp.
    return input_timestamp

  urns.RunnerApiFn.register_pickle_urn(python_urns.PICKLED_WINDOWFN)


class BoundedWindow(object):
  """A window for timestamps in range (-infinity, end).

  Subclasses must provide start, __eq__ and __hash__; the versions defined
  here raise NotImplementedError.

  Attributes:
    end: End of window.
  """
  def __init__(self, end):
    # type: (TimestampTypes) -> None
    self._end = Timestamp.of(end)

  @property
  def start(self):
    # type: () -> Timestamp
    raise NotImplementedError

  @property
  def end(self):
    # type: () -> Timestamp
    return self._end

  def max_timestamp(self):
    # type: () -> Timestamp

    """Returns the predecessor of end, since end itself is excluded."""
    return self.end.predecessor()

  def __eq__(self, other):
    raise NotImplementedError

  def __ne__(self, other):
    # Windows differ when their endpoints or their hashes differ.
    return self.end != other.end or hash(self) != hash(other)

  # The ordering methods below order first by endpoint, then arbitrarily
  # (by hash) among windows that share an endpoint.

  def __lt__(self, other):
    if self.end != other.end:
      return self.end < other.end
    return hash(self) < hash(other)

  def __le__(self, other):
    if self.end != other.end:
      return self.end <= other.end
    return hash(self) <= hash(other)

  def __gt__(self, other):
    if self.end != other.end:
      return self.end > other.end
    return hash(self) > hash(other)

  def __ge__(self, other):
    if self.end != other.end:
      return self.end >= other.end
    return hash(self) >= hash(other)

  def __hash__(self):
    raise NotImplementedError

  def __repr__(self):
    # The start is unknown at this level, hence the '?'.
    return '[?, %s)' % float(self.end)


@total_ordering
class IntervalWindow(windowed_value._IntervalWindowBase, BoundedWindow):
  """A window for timestamps in range [start, end).

  Attributes:
    start: Start of window as seconds since Unix epoch.
    end: End of window as seconds since Unix epoch.
  """
  def __lt__(self, other):
    # Order first by endpoint, then arbitrarily (by hash).
    if self.end != other.end:
      return self.end < other.end
    return hash(self) < hash(other)

  def intersects(self, other):
    # type: (IntervalWindow) -> bool

    """Returns whether this window and other share at least one timestamp.

    Both windows are half-open ranges [start, end), so they overlap only when
    each one starts before the other one ends. Windows that merely touch
    (one's end equals the other's start) do not intersect.
    """
    # Both conditions must hold. With `or`, any two disjoint windows would
    # be reported as intersecting.
    return other.start < self.end and self.start < other.end

  def union(self, other):
    # type: (IntervalWindow) -> IntervalWindow

    """Returns the smallest IntervalWindow covering both self and other."""
    return IntervalWindow(
        min(self.start, other.start), max(self.end, other.end))


@total_ordering
class TimestampedValue(object):
  """A timestamped value having a value and a timestamp.

  Attributes:
    value: The underlying value.
    timestamp: Timestamp associated with the value as seconds since Unix epoch.
  """
  def __init__(self, value, timestamp):
    # type: (Any, TimestampTypes) -> None
    self.value = value
    self.timestamp = Timestamp.of(timestamp)

  def __eq__(self, other):
    return (
        type(self) == type(other) and self.value == other.value and
        self.timestamp == other.timestamp)

  def __hash__(self):
    return hash((self.value, self.timestamp))

  def __lt__(self, other):
    # Different types are ordered by type name so that the comparison is
    # still defined; same-type instances are ordered by value, then timestamp.
    if type(self) != type(other):
      return type(self).__name__ < type(other).__name__
    if self.value != other.value:
      return self.value < other.value
    return self.timestamp < other.timestamp


class GlobalWindow(BoundedWindow):
  """The default window into which all data is placed (via GlobalWindows)."""
  _instance = None  # type: GlobalWindow

  def __new__(cls):
    # Create the instance on first use and hand back the same object after.
    if cls._instance is None:
      cls._instance = super(GlobalWindow, cls).__new__(cls)
    return cls._instance

  def __init__(self):
    # type: () -> None
    super().__init__(GlobalWindow._getTimestampFromProto())

  def __repr__(self):
    return 'GlobalWindow'

  def __hash__(self):
    return hash(type(self))

  def __eq__(self, other):
    # Global windows are always and only equal to each other.
    return self is other or type(self) is type(other)

  @property
  def start(self):
    # type: () -> Timestamp
    return MIN_TIMESTAMP

  @staticmethod
  def _getTimestampFromProto():
    # type: () -> Timestamp

    """Returns the window end read from the common_urns constant."""
    ts_millis = int(
        common_urns.constants.GLOBAL_WINDOW_MAX_TIMESTAMP_MILLIS.constant)
    # The constant is in milliseconds; Timestamp takes microseconds.
    return Timestamp(micros=ts_millis * 1000)


class NonMergingWindowFn(WindowFn):
  """Base class for WindowFns that never merge windows."""
  def is_merging(self):
    # type: () -> bool
    return False

  def merge(self, merge_context):
    # type: (WindowFn.MergeContext) -> None
    pass  # No merging.


class GlobalWindows(NonMergingWindowFn):
  """A windowing function that assigns everything to one global window."""
  @classmethod
  def windowed_batch(
      cls,
      batch,  # type: Any
      timestamp=MIN_TIMESTAMP,  # type: Timestamp
      pane_info=windowed_value.PANE_INFO_UNKNOWN  # type: windowed_value.PaneInfo
  ):
    # type: (...) -> windowed_value.WindowedBatch

    """Wraps batch as a windowed batch in the global window."""
    return windowed_value.HomogeneousWindowedBatch.of(
        batch, timestamp, (GlobalWindow(), ), pane_info)

  @classmethod
  def windowed_value(
      cls,
      value,  # type: Any
      timestamp=MIN_TIMESTAMP,  # type: Timestamp
      pane_info=windowed_value.PANE_INFO_UNKNOWN  # type: windowed_value.PaneInfo
  ):
    # type: (...) -> WindowedValue

    """Wraps value as a WindowedValue in the global window."""
    return WindowedValue(value, timestamp, (GlobalWindow(), ), pane_info)

  @classmethod
  def windowed_value_at_end_of_window(cls, value):
    """Wraps value in the global window at the window's max_timestamp()."""
    return cls.windowed_value(value, GlobalWindow().max_timestamp())

  def assign(self, assign_context):
    # type: (WindowFn.AssignContext) -> List[GlobalWindow]
    return [GlobalWindow()]

  def get_window_coder(self):
    # type: () -> coders.GlobalWindowCoder
    return coders.GlobalWindowCoder()

  def __hash__(self):
    return hash(type(self))

  def __eq__(self, other):
    # Global windowfn is always and only equal to each other.
    return self is other or type(self) is type(other)

  def to_runner_api_parameter(self, context):
    # There is no payload for global windows, only the URN.
    return common_urns.global_windows.urn, None

  @staticmethod
  @urns.RunnerApiFn.register_urn(common_urns.global_windows.urn, None)
  def from_runner_api_parameter(unused_fn_parameter, unused_context):
    # type: (...) -> GlobalWindows
    return GlobalWindows()


class FixedWindows(NonMergingWindowFn):
  """A windowing function that assigns each element to one time interval.

  The attributes size and offset determine in what time interval a timestamp
  will be slotted. The time intervals have the following formula:
  [N * size + offset, (N + 1) * size + offset)

  Attributes:
    size: Size of the window as seconds.
    offset: Offset of this window as seconds. Windows start at
      t=N * size + offset where t=0 is the UNIX epoch. The offset must be a
      value in range [0, size). If it is not it will be normalized to this
      range.
  """
  def __init__(
      self,
      size,  # type: DurationTypes
      offset=0  # type: TimestampTypes
  ):
    """Initialize a ``FixedWindows`` function for a given size and offset.

    Args:
      size (int): Size of the window in seconds.
      offset(int): Offset of this window as seconds. Windows start at
        t=N * size + offset where t=0 is the UNIX epoch. The offset must be a
        value in range [0, size). If it is not it will be normalized to this
        range.

    Raises:
      ValueError: If size is not strictly positive.
    """
    if size <= 0:
      raise ValueError('The size parameter must be strictly positive.')
    self.size = Duration.of(size)
    self.offset = Timestamp.of(offset) % self.size

  def assign(self, context):
    # type: (WindowFn.AssignContext) -> List[IntervalWindow]

    """Returns the single window [start, start + size) for the timestamp."""
    timestamp = context.timestamp
    # Step back from the timestamp by its distance past the previous window
    # boundary (boundaries lie at N * size + offset).
    start = timestamp - (timestamp - self.offset) % self.size
    return [IntervalWindow(start, start + self.size)]

  def get_window_coder(self):
    # type: () -> coders.IntervalWindowCoder
    return coders.IntervalWindowCoder()

  def __eq__(self, other):
    # Only exact FixedWindows instances are compared field by field.
    if type(self) == type(other) == FixedWindows:
      return self.size == other.size and self.offset == other.offset
    # Let Python try the reflected comparison instead of returning None.
    return NotImplemented

  def __hash__(self):
    return hash((self.size, self.offset))

  def to_runner_api_parameter(self, context):
    return (
        common_urns.fixed_windows.urn,
        standard_window_fns_pb2.FixedWindowsPayload(
            size=proto_utils.from_micros(
                duration_pb2.Duration, self.size.micros),
            offset=proto_utils.from_micros(
                timestamp_pb2.Timestamp, self.offset.micros)))

  @staticmethod
  @urns.RunnerApiFn.register_urn(
      common_urns.fixed_windows.urn,
      standard_window_fns_pb2.FixedWindowsPayload)
  def from_runner_api_parameter(fn_parameter, unused_context):
    # type: (...) -> FixedWindows
    return FixedWindows(
        size=Duration(micros=fn_parameter.size.ToMicroseconds()),
        offset=Timestamp(micros=fn_parameter.offset.ToMicroseconds()))


class SlidingWindows(NonMergingWindowFn):
  """A windowing function that assigns each element to a set of sliding windows.

  The attributes size and offset determine in what time interval a timestamp
  will be slotted. The time intervals have the following formula:
  [N * period + offset, N * period + offset + size)

  Attributes:
    size: Size of the window as seconds.
    period: Period of the windows as seconds.
    offset: Offset of this window as seconds since Unix epoch. Windows start at
      t=N * period + offset where t=0 is the epoch. The offset must be a value
      in range [0, period). If it is not it will be normalized to this range.
  """

  def __init__(self,
               size,  # type: DurationTypes
               period,  # type: DurationTypes
               offset=0,  # type: TimestampTypes
              ):
    """Initialize a ``SlidingWindows`` function.

    Args:
      size: Size of each window in seconds.
      period: Interval in seconds between the starts of consecutive windows.
      offset: Offset of the window starts in seconds; normalized into
        [0, period).

    Raises:
      ValueError: If size or period is not strictly positive.
    """
    if size <= 0:
      raise ValueError('The size parameter must be strictly positive.')
    # A zero period would divide by zero below and give range() a zero step
    # in assign(); a negative one would make assign() return no windows.
    if period <= 0:
      raise ValueError('The period parameter must be strictly positive.')
    self.size = Duration.of(size)
    self.period = Duration.of(period)
    # Use the normalized Duration, matching FixedWindows.
    self.offset = Timestamp.of(offset) % self.period

  def assign(self, context):
    # type: (WindowFn.AssignContext) -> List[IntervalWindow]

    """Returns every window [s, s + size) that contains the timestamp.

    Window starts s lie at N * period + offset. The latest start at or before
    the timestamp comes first, followed by earlier starts for as long as
    s > timestamp - size.
    """
    timestamp = context.timestamp
    # Latest window start that is not after the timestamp.
    start = timestamp - ((timestamp - self.offset) % self.period)
    return [
        IntervalWindow(Timestamp(micros=s), Timestamp(micros=s) + self.size)
        for s in range(
            start.micros,
            timestamp.micros - self.size.micros,
            -self.period.micros)
    ]

  def get_window_coder(self):
    # type: () -> coders.IntervalWindowCoder
    return coders.IntervalWindowCoder()

  def __eq__(self, other):
    # Only exact SlidingWindows instances are compared field by field.
    if type(self) == type(other) == SlidingWindows:
      return (
          self.size == other.size and self.offset == other.offset and
          self.period == other.period)
    # Let Python try the reflected comparison instead of returning None.
    return NotImplemented

  def __hash__(self):
    # Hash every field that __eq__ compares.
    return hash((self.size, self.offset, self.period))

  def to_runner_api_parameter(self, context):
    return (
        common_urns.sliding_windows.urn,
        standard_window_fns_pb2.SlidingWindowsPayload(
            size=proto_utils.from_micros(
                duration_pb2.Duration, self.size.micros),
            offset=proto_utils.from_micros(
                timestamp_pb2.Timestamp, self.offset.micros),
            period=proto_utils.from_micros(
                duration_pb2.Duration, self.period.micros)))

  @staticmethod
  @urns.RunnerApiFn.register_urn(
      common_urns.sliding_windows.urn,
      standard_window_fns_pb2.SlidingWindowsPayload)
  def from_runner_api_parameter(fn_parameter, unused_context):
    # type: (...) -> SlidingWindows
    return SlidingWindows(
        size=Duration(micros=fn_parameter.size.ToMicroseconds()),
        offset=Timestamp(micros=fn_parameter.offset.ToMicroseconds()),
        period=Duration(micros=fn_parameter.period.ToMicroseconds()))


class Sessions(WindowFn):
  """A windowing function that groups elements into sessions.

  A session is defined as a series of consecutive events
  separated by a specified gap size.

  Attributes:
    gap_size: Size of the gap between windows as floating-point seconds.
  """
  def __init__(self, gap_size):
    # type: (DurationTypes) -> None

    """Initialize a ``Sessions`` function for a given gap size.

    Raises:
      ValueError: If gap_size is not strictly positive.
    """
    if gap_size <= 0:
      raise ValueError('The size parameter must be strictly positive.')
    self.gap_size = Duration.of(gap_size)

  def assign(self, context):
    # type: (WindowFn.AssignContext) -> List[IntervalWindow]

    """Returns the single window [timestamp, timestamp + gap_size)."""
    timestamp = context.timestamp
    return [IntervalWindow(timestamp, timestamp + self.gap_size)]

  def get_window_coder(self):
    # type: () -> coders.IntervalWindowCoder
    return coders.IntervalWindowCoder()

  def merge(self, merge_context):
    # type: (WindowFn.MergeContext) -> None

    """Merges each run of overlapping windows into one covering window.

    Windows are visited in order of start. A window joins the current run
    when it starts strictly before the run's current end. For every run of
    two or more windows, merge_context.merge() is called with the run and an
    IntervalWindow spanning it.
    """
    to_merge = []  # type: List[BoundedWindow]
    end = MIN_TIMESTAMP
    for w in sorted(merge_context.windows, key=lambda w: w.start):
      if to_merge:
        if end > w.start:
          # w overlaps the current run: add it and extend the end if needed.
          to_merge.append(w)
          if w.end > end:
            end = w.end
        else:
          # w starts at or after the run's end: emit the finished run (only
          # if it actually merges something) and start a new one at w.
          if len(to_merge) > 1:
            merge_context.merge(
                to_merge, IntervalWindow(to_merge[0].start, end))
          to_merge = [w]
          end = w.end
      else:
        to_merge = [w]
        end = w.end
    # Emit the final run.
    if len(to_merge) > 1:
      merge_context.merge(to_merge, IntervalWindow(to_merge[0].start, end))

  def __eq__(self, other):
    # Only exact Sessions instances are compared field by field.
    if type(self) == type(other) == Sessions:
      return self.gap_size == other.gap_size
    # Let Python try the reflected comparison instead of returning None.
    return NotImplemented

  def __hash__(self):
    return hash(self.gap_size)

  def to_runner_api_parameter(self, context):
    return (
        common_urns.session_windows.urn,
        standard_window_fns_pb2.SessionWindowsPayload(
            gap_size=proto_utils.from_micros(
                duration_pb2.Duration, self.gap_size.micros)))

  @staticmethod
  @urns.RunnerApiFn.register_urn(
      common_urns.session_windows.urn,
      standard_window_fns_pb2.SessionWindowsPayload)
  def from_runner_api_parameter(fn_parameter, unused_context):
    # type: (...) -> Sessions
    return Sessions(
        gap_size=Duration(micros=fn_parameter.gap_size.ToMicroseconds()))