#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Core windowing data structures."""

# This module is carefully crafted to have optimal performance when
# compiled while still being valid Python.  Care needs to be taken when
# editing this file as WindowedValues are created for every element for
# every step in a Beam pipeline.

# cython: profile=True
# cython: language_level=3

# pytype: skip-file

import collections
from typing import TYPE_CHECKING
from typing import Any
from typing import Callable
from typing import Iterable
from typing import List
from typing import Optional
from typing import Sequence
from typing import Tuple

from apache_beam.utils.timestamp import MAX_TIMESTAMP
from apache_beam.utils.timestamp import MIN_TIMESTAMP
from apache_beam.utils.timestamp import Timestamp
from apache_beam.utils.timestamp import TimestampTypes  # pylint: disable=unused-import

if TYPE_CHECKING:
  from apache_beam.transforms.window import BoundedWindow


class PaneInfoTiming(object):
  """The timing of a PaneInfo."""

  EARLY = 0
  ON_TIME = 1
  LATE = 2
  UNKNOWN = 3

  @classmethod
  def to_string(cls, value):
    """Returns the name of a timing constant.

    Raises:
      KeyError: if ``value`` is not one of the timing constants above.
    """
    return {
        cls.EARLY: 'EARLY',
        cls.ON_TIME: 'ON_TIME',
        cls.LATE: 'LATE',
        cls.UNKNOWN: 'UNKNOWN',
    }[value]

  @classmethod
  def from_string(cls, value):
    """Returns the timing constant for a name produced by ``to_string``.

    Raises:
      KeyError: if ``value`` is not one of the known timing names.
    """
    return {
        'EARLY': cls.EARLY,
        'ON_TIME': cls.ON_TIME,
        'LATE': cls.LATE,
        'UNKNOWN': cls.UNKNOWN
    }[value]


class PaneInfo(object):
  """Describes the trigger firing information for a given WindowedValue.

  "Panes" represent individual firings on a single window. ``PaneInfo``s are
  passed downstream after trigger firings. They contain information about
  whether it's an early/on time/late firing, if it's the last or first firing
  from a window, and the index of the firing.
  """
  def __init__(self, is_first, is_last, timing, index, nonspeculative_index):
    self._is_first = is_first
    self._is_last = is_last
    self._timing = timing
    self._index = index
    self._nonspeculative_index = nonspeculative_index
    # Computed once up front; the fields above are never reassigned.
    self._encoded_byte = self._get_encoded_byte()

  def _get_encoded_byte(self):
    """Packs is_first (bit 0), is_last (bit 1) and timing (bits 2+)."""
    byte = 0
    if self._is_first:
      byte |= 1
    if self._is_last:
      byte |= 2
    byte |= self._timing << 2
    return byte

  @staticmethod
  def from_encoded_byte(encoded_byte):
    """Returns the cached well-known PaneInfo for ``encoded_byte``.

    Args:
      encoded_byte: an integer index into the well-known PaneInfo cache, laid
        out as produced by ``_get_encoded_byte``.

    Returns:
      The shared PaneInfo instance stored at that index.

    Raises:
      AssertionError: if ``encoded_byte`` is outside the cache's index range
        (only when assertions are enabled).
    """
    # _BYTE_TO_PANE_INFO is a list indexed by encoded byte, so validity is an
    # index-range check.  A membership test (``in``) would compare the integer
    # against the PaneInfo elements themselves and could never succeed.
    assert 0 <= encoded_byte < len(_BYTE_TO_PANE_INFO)
    return _BYTE_TO_PANE_INFO[encoded_byte]

  # Because common PaneInfo objects are cached, it is important that the value
  # is immutable. We therefore explicitly enforce this here with read-only
  # properties.

  @property
  def is_first(self):
    return self._is_first

  @property
  def is_last(self):
    return self._is_last

  @property
  def timing(self):
    return self._timing

  @property
  def index(self):
    # type: () -> int
    return self._index

  @property
  def nonspeculative_index(self):
    # type: () -> int
    return self._nonspeculative_index

  @property
  def encoded_byte(self):
    # type: () -> int
    return self._encoded_byte

  def __repr__(self):
    return (
        'PaneInfo(first: %r, last: %r, timing: %s, index: %d, '
        'nonspeculative_index: %d)') % (
            self.is_first,
            self.is_last,
            PaneInfoTiming.to_string(self.timing),
            self.index,
            self.nonspeculative_index)

  def __eq__(self, other):
    if self is other:
      return True

    if isinstance(other, PaneInfo):
      return (
          self.is_first == other.is_first and self.is_last == other.is_last and
          self.timing == other.timing and self.index == other.index and
          self.nonspeculative_index == other.nonspeculative_index)

    return NotImplemented

  def __hash__(self):
    # Hashes the same fields that __eq__ compares.
    return hash((
        self.is_first,
        self.is_last,
        self.timing,
        self.index,
        self.nonspeculative_index))

  def __reduce__(self):
    # Pickle by constructor arguments; _encoded_byte is recomputed on load.
    return PaneInfo, (self._is_first, self._is_last, self._timing, self._index,
                      self._nonspeculative_index)


def _construct_well_known_pane_infos():
  # type: () -> List[PaneInfo]

  """Builds the list of well-known PaneInfos, indexed by encoded byte.

  One PaneInfo is created for every combination of the four timings with
  is_first/is_last, and each is stored at the position given by its
  ``encoded_byte``.
  """
  pane_infos = []
  for timing in (PaneInfoTiming.EARLY,
                 PaneInfoTiming.ON_TIME,
                 PaneInfoTiming.LATE,
                 PaneInfoTiming.UNKNOWN):
    nonspeculative_index = -1 if timing == PaneInfoTiming.EARLY else 0
    pane_infos.append(PaneInfo(True, True, timing, 0, nonspeculative_index))
    pane_infos.append(PaneInfo(True, False, timing, 0, nonspeculative_index))
    pane_infos.append(PaneInfo(False, True, timing, -1, nonspeculative_index))
    pane_infos.append(PaneInfo(False, False, timing, -1, nonspeculative_index))
  result = [None] * (
      max(p.encoded_byte for p in pane_infos) + 1
  )  # type: List[PaneInfo]  # type: ignore[list-item]
  for pane_info in pane_infos:
    result[pane_info.encoded_byte] = pane_info
  return result


# Cache of well-known PaneInfo objects.
_BYTE_TO_PANE_INFO = _construct_well_known_pane_infos()

# Default PaneInfo descriptor for when a value is not the output of triggering.
PANE_INFO_UNKNOWN = _BYTE_TO_PANE_INFO[0xF]


class WindowedValue(object):
  """A windowed value having a value, a timestamp and set of windows.

  Attributes:
    value: The underlying value of a windowed value.
    timestamp: Timestamp associated with the value as seconds since Unix epoch.
    windows: A set (iterable) of window objects for the value. The window
      objects are descendants of the BoundedWindow class.
    pane_info: A PaneInfo descriptor describing the triggering information for
      the pane that contained this value.  If None, will be set to
      PANE_INFO_UNKNOWN.
  """
  def __init__(
      self,
      value,
      timestamp,  # type: TimestampTypes
      windows,  # type: Tuple[BoundedWindow, ...]
      pane_info=PANE_INFO_UNKNOWN  # type: PaneInfo
  ):
    # type: (...) -> None
    # For performance reasons, only timestamp_micros is stored by default
    # (as a C int).  The Timestamp object is created on demand below.
    self.value = value
    if isinstance(timestamp, int):
      # Integer timestamps are whole seconds; convert to microseconds.
      self.timestamp_micros = timestamp * 1000000
      if TYPE_CHECKING:
        self.timestamp_object = None  # type: Optional[Timestamp]
    else:
      self.timestamp_object = (
          timestamp
          if isinstance(timestamp, Timestamp) else Timestamp.of(timestamp))
      self.timestamp_micros = self.timestamp_object.micros
    self.windows = windows
    self.pane_info = pane_info

  @property
  def timestamp(self):
    # type: () -> Timestamp
    # Built lazily from timestamp_micros and then cached on the instance.
    if self.timestamp_object is None:
      self.timestamp_object = Timestamp(0, self.timestamp_micros)
    return self.timestamp_object

  def __repr__(self):
    return '(%s, %s, %s, %s)' % (
        repr(self.value),
        'MIN_TIMESTAMP' if self.timestamp == MIN_TIMESTAMP else 'MAX_TIMESTAMP'
        if self.timestamp == MAX_TIMESTAMP else float(self.timestamp),
        self.windows,
        self.pane_info)

  def __eq__(self, other):
    if isinstance(other, WindowedValue):
      return (
          type(self) == type(other) and
          self.timestamp_micros == other.timestamp_micros and
          self.value == other.value and self.windows == other.windows and
          self.pane_info == other.pane_info)
    return NotImplemented

  def __hash__(self):
    return ((hash(self.value) & 0xFFFFFFFFFFFFFFF) + 3 *
            (self.timestamp_micros & 0xFFFFFFFFFFFFFF) + 7 *
            (hash(tuple(self.windows)) & 0xFFFFFFFFFFFFF) + 11 *
            (hash(self.pane_info) & 0xFFFFFFFFFFFFF))

  def with_value(self, new_value):
    # type: (Any) -> WindowedValue

    """Creates a new WindowedValue with the same timestamps and windows as this.

    This is the fastest way to create a new WindowedValue.
    """
    return create(
        new_value, self.timestamp_micros, self.windows, self.pane_info)

  def __reduce__(self):
    return WindowedValue, (
        self.value, self.timestamp, self.windows, self.pane_info)


# TODO(robertwb): Move this to a static method.


def create(value, timestamp_micros, windows, pane_info=PANE_INFO_UNKNOWN):
  """Builds a WindowedValue directly from microseconds, bypassing __init__.

  ``timestamp_object`` is not assigned here; the ``timestamp`` property fills
  it in from ``timestamp_micros`` on first access.
  """
  wv = WindowedValue.__new__(WindowedValue)
  wv.value = value
  wv.timestamp_micros = timestamp_micros
  wv.windows = windows
  wv.pane_info = pane_info
  return wv


class WindowedBatch(object):
  """A batch of N windowed values, each having a value, a timestamp and set of
  windows."""
  def with_values(self, new_values):
    # type: (Any) -> WindowedBatch

    """Creates a new WindowedBatch with the same timestamps and windows as this.

    This is the fastest way to create a new WindowedBatch.
    """
    raise NotImplementedError

  def as_windowed_values(self, explode_fn: Callable) -> Iterable[WindowedValue]:
    """Yields the batch's contents as individual WindowedValues."""
    raise NotImplementedError

  @staticmethod
  def from_windowed_values(
      windowed_values: Sequence[WindowedValue], *,
      produce_fn: Callable) -> Iterable['WindowedBatch']:
    """Delegates to HomogeneousWindowedBatch.from_windowed_values."""
    return HomogeneousWindowedBatch.from_windowed_values(
        windowed_values, produce_fn=produce_fn)


class HomogeneousWindowedBatch(WindowedBatch):
  """A WindowedBatch with Homogeneous event-time information, represented
  internally as a WindowedValue.
  """
  def __init__(self, wv):
    self._wv = wv

  @staticmethod
  def of(values, timestamp, windows, pane_info):
    """Wraps a new WindowedValue built from the given fields."""
    return HomogeneousWindowedBatch(
        WindowedValue(values, timestamp, windows, pane_info))

  @property
  def values(self):
    return self._wv.value

  @property
  def timestamp(self):
    return self._wv.timestamp

  @property
  def pane_info(self):
    return self._wv.pane_info

  @property
  def windows(self):
    return self._wv.windows

  @windows.setter
  def windows(self, value):
    self._wv.windows = value

  def with_values(self, new_values):
    # type: (Any) -> WindowedBatch
    return HomogeneousWindowedBatch(self._wv.with_value(new_values))

  def as_windowed_values(self, explode_fn: Callable) -> Iterable[WindowedValue]:
    """Yields one WindowedValue per element returned by ``explode_fn``."""
    for value in explode_fn(self._wv.value):
      yield self._wv.with_value(value)

  def as_empty_windowed_value(self):
    """Get a single WindowedValue with identical windowing information to this
    HomogeneousWindowedBatch, but with value=None. Useful for re-using APIs that
    pull windowing information from a WindowedValue."""
    return self._wv.with_value(None)

  def __eq__(self, other):
    if isinstance(other, HomogeneousWindowedBatch):
      return self._wv == other._wv
    return NotImplemented

  def __hash__(self):
    return hash(self._wv)

  @staticmethod
  def from_batch_and_windowed_value(
      *, batch, windowed_value: WindowedValue) -> 'WindowedBatch':
    """Wraps ``batch`` with the windowing information of ``windowed_value``."""
    return HomogeneousWindowedBatch(windowed_value.with_value(batch))

  @staticmethod
  def from_windowed_values(
      windowed_values: Sequence[WindowedValue], *,
      produce_fn: Callable) -> Iterable['WindowedBatch']:
    """Groups values by windowing information and yields one batch per group.

    Args:
      windowed_values: the WindowedValues to group.
      produce_fn: called with the list of values in a group; its result
        becomes the value of that group's batch.
    """
    # A value-less copy of each WindowedValue serves as the grouping key, so
    # elements with equal timestamp, windows and pane_info share a batch.
    grouped = collections.defaultdict(list)
    for wv in windowed_values:
      grouped[wv.with_value(None)].append(wv.value)

    for key, values in grouped.items():
      yield HomogeneousWindowedBatch(key.with_value(produce_fn(values)))


try:
  WindowedValue.timestamp_object = None
except TypeError:
  # When we're compiled, we can't dynamically add attributes to
  # the cdef class, but in this case it's OK as it's already present
  # on each instance.
  pass


class _IntervalWindowBase(object):
  """Optimized form of IntervalWindow storing only microseconds for endpoints.
  """
  def __init__(self, start, end):
    # type: (TimestampTypes, TimestampTypes) -> None
    if start is not None:
      self._start_object = Timestamp.of(start)  # type: Optional[Timestamp]
      try:
        self._start_micros = self._start_object.micros
      except OverflowError:
        # NOTE(review): presumably raised by the compiled form when the value
        # does not fit the micros field -- confirm.  The value is clamped to
        # the nearest timestamp bound.
        self._start_micros = (
            MIN_TIMESTAMP.micros
            if self._start_object.micros < 0 else MAX_TIMESTAMP.micros)
    else:
      # Micros must be populated elsewhere.
      self._start_object = None

    if end is not None:
      self._end_object = Timestamp.of(end)  # type: Optional[Timestamp]
      try:
        self._end_micros = self._end_object.micros
      except OverflowError:
        # Same clamping as for the start endpoint above.
        self._end_micros = (
            MIN_TIMESTAMP.micros
            if self._end_object.micros < 0 else MAX_TIMESTAMP.micros)
    else:
      # Micros must be populated elsewhere.
      self._end_object = None

  @property
  def start(self):
    # type: () -> Timestamp
    # Built lazily from _start_micros and then cached.
    if self._start_object is None:
      self._start_object = Timestamp(0, self._start_micros)
    return self._start_object

  @property
  def end(self):
    # type: () -> Timestamp
    # Built lazily from _end_micros and then cached.
    if self._end_object is None:
      self._end_object = Timestamp(0, self._end_micros)
    return self._end_object

  def __hash__(self):
    return hash((self._start_micros, self._end_micros))

  def __eq__(self, other):
    return (
        type(self) == type(other) and
        self._start_micros == other._start_micros and
        self._end_micros == other._end_micros)

  def __repr__(self):
    return '[%s, %s)' % (float(self.start), float(self.end))