#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""iobase.RangeTracker implementations provided with Apache Beam.
"""
# pytype: skip-file

import codecs
import logging
import math
import threading
from typing import Optional
from typing import Union

from apache_beam.io import iobase

__all__ = [
    'OffsetRangeTracker',
    'LexicographicKeyRangeTracker',
    'OrderedPositionRangeTracker',
    'UnsplittableRangeTracker'
]

_LOGGER = logging.getLogger(__name__)


class OffsetRangeTracker(iobase.RangeTracker):
  """A 'RangeTracker' for non-negative integer positions."""

  # Offset corresponding to infinity. This can only be used as the upper-bound
  # of a range, and indicates reading all of the records until the end without
  # specifying exactly what the end is.
  # Infinite ranges cannot be split because it is impossible to estimate
  # progress within them.
  OFFSET_INFINITY = float('inf')

  def __init__(self, start, end):
    """Initializes the tracker for the range ``[start, end)``.

    Args:
      start: integer start offset of the range.
      end: integer end offset of the range, or ``OFFSET_INFINITY``.

    Raises:
      ValueError: if ``start`` or ``end`` is ``None``.
      AssertionError: if an offset is not an ``int`` (``end`` may also be
        ``OFFSET_INFINITY``) or if ``start > end``.
    """
    super().__init__()

    if start is None:
      raise ValueError('Start offset must not be \'None\'')
    if end is None:
      raise ValueError('End offset must not be \'None\'')
    assert isinstance(start, int)
    if end != self.OFFSET_INFINITY:
      assert isinstance(end, int)

    assert start <= end

    self._start_offset = start
    self._stop_offset = end

    # -1 means "nothing claimed/attempted yet" for the three markers below.
    self._last_record_start = -1
    self._last_attempted_record_start = -1
    self._offset_of_last_split_point = -1
    self._lock = threading.Lock()

    self._split_points_seen = 0
    self._split_points_unclaimed_callback = None

  def start_position(self):
    """Returns the start offset of the range."""
    return self._start_offset

  def stop_position(self):
    """Returns the current stop offset of the range."""
    return self._stop_offset

  @property
  def last_record_start(self):
    """Returns the start offset of the last record accepted, or -1."""
    return self._last_record_start

  @property
  def last_attempted_record_start(self):
    """Return current value of last_attempted_record_start.

    last_attempted_record_start records a valid position that tried to be
    claimed by calling try_claim(). This value is only updated by `try_claim()`
    no matter `try_claim()` returns `True` or `False`.
    """
    return self._last_attempted_record_start

  def _validate_record_start(self, record_start, split_point):
    """Checks that ``record_start`` is a legal next record position.

    Args:
      record_start: start offset of the record about to be returned.
      split_point: whether the record is at a split point.

    Raises:
      ValueError: if the lock is not held, if ``record_start`` precedes the
        last returned record, if a split point repeats the previous split
        point's offset, or if the very first record is not at a split point.
    """
    # This function must only be called under the lock self.lock.
    if not self._lock.locked():
      raise ValueError(
          'This function must only be called under the lock self.lock.')

    if record_start < self._last_record_start:
      raise ValueError(
          'Trying to return a record [starting at %d] which is before the '
          'last-returned record [starting at %d]' %
          (record_start, self._last_record_start))

    if (split_point and self._offset_of_last_split_point != -1 and
        record_start == self._offset_of_last_split_point):
      raise ValueError(
          'Record at a split point has same offset as the previous split '
          'point: %d' % record_start)

    if not split_point and self._last_record_start == -1:
      raise ValueError(
          'The first record [starting at %d] must be at a split point' %
          record_start)

  def try_claim(self, record_start):
    """Attempts to claim the split point starting at ``record_start``.

    Returns:
      ``True`` if the position is before the stop offset and was claimed,
      ``False`` otherwise. The attempt is recorded in either case.

    Raises:
      ValueError: if ``record_start`` is not strictly greater than the last
        attempted position, or fails ``_validate_record_start``.
    """
    with self._lock:
      # Attempted claim should be monotonous.
      if record_start <= self._last_attempted_record_start:
        # The leading space on the second literal keeps the message readable
        # ("greater than" rather than "greaterthan").
        raise ValueError(
            'Trying to return a record [starting at %d] which is not greater'
            ' than the last-attempted record [starting at %d]' %
            (record_start, self._last_attempted_record_start))
      self._validate_record_start(record_start, True)
      self._last_attempted_record_start = record_start
      if record_start >= self.stop_position():
        return False
      self._offset_of_last_split_point = record_start
      self._last_record_start = record_start
      self._split_points_seen += 1
      return True

  def set_current_position(self, record_start):
    """Records ``record_start`` as the current (non split point) position.

    Raises:
      ValueError: if ``record_start`` fails ``_validate_record_start``.
    """
    with self._lock:
      self._validate_record_start(record_start, False)
      self._last_record_start = record_start

  def try_split(self, split_offset):
    """Attempts to shrink the range to ``[start, split_offset)``.

    Args:
      split_offset: integer offset at which to split.

    Returns:
      ``(split_offset, fraction)`` where ``fraction`` is the portion of the
      original range kept by this tracker, or ``None`` if the split was
      refused (unbounded range, nothing claimed yet, position already passed
      or out of range).
    """
    assert isinstance(split_offset, int)
    with self._lock:
      if self._stop_offset == OffsetRangeTracker.OFFSET_INFINITY:
        _LOGGER.debug(
            'refusing to split %r at %d: stop position unspecified',
            self,
            split_offset)
        return
      if self._last_record_start == -1:
        _LOGGER.debug(
            'Refusing to split %r at %d: unstarted', self, split_offset)
        return

      if split_offset <= self._last_record_start:
        _LOGGER.debug(
            'Refusing to split %r at %d: already past proposed stop offset',
            self,
            split_offset)
        return
      if (split_offset < self.start_position() or
          split_offset >= self.stop_position()):
        _LOGGER.debug(
            'Refusing to split %r at %d: proposed split position out of range',
            self,
            split_offset)
        return

      _LOGGER.debug('Agreeing to split %r at %d', self, split_offset)

      # start <= split_offset < stop holds here, so the divisor is positive.
      split_fraction = (
          float(split_offset - self._start_offset) /
          (self._stop_offset - self._start_offset))
      self._stop_offset = split_offset

      return self._stop_offset, split_fraction

  def fraction_consumed(self):
    """Returns the fraction of the range consumed so far, within [0, 1]."""
    with self._lock:
      # self.last_record_start may become larger than self.end_offset when
      # reading the records since any record that starts before the first 'split
      # point' at or after the defined 'stop offset' is considered to be within
      # the range of the OffsetRangeTracker. Hence fraction could be > 1.
      # self.last_record_start is initialized to -1, hence fraction may be < 0.
      # Bounding the result to the range [0, 1].
      return self.position_to_fraction(
          self._last_record_start, self.start_position(), self.stop_position())

  def position_to_fraction(self, pos, start, stop):
    """Maps ``pos`` to its relative location in ``[start, stop]``.

    Returns:
      A float clamped to ``[0.0, 1.0]``; ``0.0`` when ``start == stop``.
    """
    fraction = 1.0 * (pos - start) / (stop - start) if start != stop else 0.0
    return max(0.0, min(1.0, fraction))

  def position_at_fraction(self, fraction):
    """Returns the offset located ``fraction`` of the way through the range.

    The result is rounded up to the next integer.

    Raises:
      Exception: if the stop position is ``OFFSET_INFINITY``.
    """
    if self.stop_position() == OffsetRangeTracker.OFFSET_INFINITY:
      raise Exception(
          'get_position_for_fraction_consumed is not applicable for an '
          'unbounded range')
    return int(
        math.ceil(
            self.start_position() + fraction *
            (self.stop_position() - self.start_position())))

  def split_points(self):
    """Returns ``(split_points_consumed, split_points_remaining)``.

    The split point currently being processed is counted as remaining, not as
    consumed. ``split_points_remaining`` is
    ``iobase.RangeTracker.SPLIT_POINTS_UNKNOWN`` when no callback is set or
    the callback itself reports that value.
    """
    with self._lock:
      split_points_consumed = (
          0 if self._split_points_seen == 0 else self._split_points_seen - 1)
      split_points_unclaimed = (
          self._split_points_unclaimed_callback(self.stop_position())
          if self._split_points_unclaimed_callback else
          iobase.RangeTracker.SPLIT_POINTS_UNKNOWN)
      split_points_remaining = (
          iobase.RangeTracker.SPLIT_POINTS_UNKNOWN
          if split_points_unclaimed == iobase.RangeTracker.SPLIT_POINTS_UNKNOWN
          else (split_points_unclaimed + 1))

      return (split_points_consumed, split_points_remaining)

  def set_split_points_unclaimed_callback(self, callback):
    """Sets the callable used by ``split_points`` to count unclaimed points.

    ``callback`` is invoked with the current stop position.
    """
    self._split_points_unclaimed_callback = callback


class OrderedPositionRangeTracker(iobase.RangeTracker):
  """
  An abstract base class for range trackers whose positions are comparable.

  Subclasses only need to implement the mapping from position ranges
  to and from the closed interval [0, 1].
  """

  # Sentinel marking that no position has been claimed yet.
  UNSTARTED = object()

  def __init__(self, start_position=None, stop_position=None):
    """Initializes the tracker.

    Args:
      start_position: inclusive start of the range; ``None`` means no lower
        bound is enforced.
      stop_position: exclusive end of the range; ``None`` means no upper
        bound is enforced.
    """
    super().__init__()
    self._start_position = start_position
    self._stop_position = stop_position
    self._lock = threading.Lock()
    self._last_claim = self.UNSTARTED

  def start_position(self):
    """Returns the start position of the range."""
    return self._start_position

  def stop_position(self):
    """Returns the current stop position of the range."""
    with self._lock:
      return self._stop_position

  def try_claim(self, position):
    """Attempts to claim ``position``.

    Returns:
      ``True`` (recording the claim) if there is no stop position or
      ``position`` is before it, ``False`` otherwise.

    Raises:
      ValueError: if ``position`` is before the last claimed position or
        before the start position.
    """
    with self._lock:
      if self._last_claim is not self.UNSTARTED and position < self._last_claim:
        raise ValueError(
            "Positions must be claimed in order: "
            "claim '%s' attempted after claim '%s'" %
            (position, self._last_claim))
      elif self._start_position is not None and position < self._start_position:
        raise ValueError(
            "Claim '%s' is before start '%s'" %
            (position, self._start_position))
      if self._stop_position is None or position < self._stop_position:
        self._last_claim = position
        return True
      else:
        return False

  def position_at_fraction(self, fraction):
    """Returns the position ``fraction`` of the way through the range."""
    return self.fraction_to_position(
        fraction, self._start_position, self._stop_position)

  def try_split(self, position):
    """Attempts to shrink the range so that it stops at ``position``.

    Returns:
      ``(position, fraction)`` where ``fraction`` is
      ``position_to_fraction(position)`` over the range before the split, or
      ``None`` if ``position`` is out of range or not after the last claim.
    """
    with self._lock:
      if ((self._stop_position is not None and position >= self._stop_position)
          or (self._start_position is not None and
              position <= self._start_position)):
        # Positions are arbitrary comparable objects (e.g. bytes or str keys),
        # so they must be logged with %r; %d only works for numbers.
        _LOGGER.debug(
            'Refusing to split %r at %r: proposed split position out of range',
            self,
            position)
        return

      if self._last_claim is self.UNSTARTED or self._last_claim < position:
        fraction = self.position_to_fraction(
            position, start=self._start_position, end=self._stop_position)
        self._stop_position = position
        return position, fraction

  def fraction_consumed(self):
    """Returns the fraction of the range before the last claimed position.

    Returns ``0.0`` when nothing has been claimed yet.
    """
    if self._last_claim is self.UNSTARTED:
      return 0.0
    else:
      return self.position_to_fraction(
          self._last_claim, self._start_position, self._stop_position)

  def fraction_to_position(self, fraction, start, end):
    """
    Converts a fraction between 0 and 1 to a position between start and end.
    """
    raise NotImplementedError

  def position_to_fraction(self, position, start, end):
    """Returns the fraction of keys in the range [start, end) that
    are less than the given key.
    """
    raise NotImplementedError


class UnsplittableRangeTracker(iobase.RangeTracker):
  """A RangeTracker that always ignores split requests.

  This can be used to make a given
  :class:`~apache_beam.io.iobase.RangeTracker` object unsplittable by
  ignoring all calls to :meth:`.try_split()`. All other calls will be delegated
  to the given :class:`~apache_beam.io.iobase.RangeTracker`.
  """
  def __init__(self, range_tracker):
    """Initializes UnsplittableRangeTracker.

    Args:
      range_tracker (~apache_beam.io.iobase.RangeTracker): a
        :class:`~apache_beam.io.iobase.RangeTracker` to which all method
        calls except calls to :meth:`.try_split()` will be delegated.
    """
    super().__init__()
    assert isinstance(range_tracker, iobase.RangeTracker)
    self._range_tracker = range_tracker

  def start_position(self):
    """Returns the wrapped tracker's start position."""
    return self._range_tracker.start_position()

  def stop_position(self):
    """Returns the wrapped tracker's stop position."""
    return self._range_tracker.stop_position()

  def position_at_fraction(self, fraction):
    """Delegates to the wrapped tracker's ``position_at_fraction``."""
    return self._range_tracker.position_at_fraction(fraction)

  def try_claim(self, position):
    """Delegates to the wrapped tracker's ``try_claim``."""
    return self._range_tracker.try_claim(position)

  def try_split(self, position):
    """Always refuses to split by returning ``None``."""
    return None

  def set_current_position(self, position):
    """Delegates to the wrapped tracker's ``set_current_position``."""
    self._range_tracker.set_current_position(position)

  def fraction_consumed(self):
    """Delegates to the wrapped tracker's ``fraction_consumed``."""
    return self._range_tracker.fraction_consumed()

  def split_points(self):
    """Returns ``(0, 1)``: nothing consumed, one split point remaining."""
    # An unsplittable range only contains a single split point.
    return (0, 1)

  def set_split_points_unclaimed_callback(self, callback):
    """Delegates to the wrapped tracker's callback setter."""
    self._range_tracker.set_split_points_unclaimed_callback(callback)


class LexicographicKeyRangeTracker(OrderedPositionRangeTracker):
  """A range tracker that tracks progress through a lexicographically
  ordered keyspace of strings.
  """
  @classmethod
  def fraction_to_position(
      cls,
      fraction: float,
      start: Optional[Union[bytes, str]] = None,
      end: Optional[Union[bytes, str]] = None,
  ) -> Optional[Union[bytes, str]]:
    """Linearly interpolates a key that is lexicographically
    fraction of the way between start and end.

    Args:
      fraction: value in ``[0, 1]``.
      start: first key of the range; ``None`` is treated as ``b''``.
      end: end key of the range; empty or ``None`` means the end of the
        keyspace.

    Returns:
      ``start`` when ``fraction == 0`` and ``end`` (possibly ``None``) when
      ``fraction == 1``. Otherwise an interpolated key, returned as ``bytes``
      when ``start`` is ``bytes`` (or ``None``) and as ``str`` otherwise.
    """
    assert 0 <= fraction <= 1, fraction

    # NOTE(review): a missing start always becomes bytes, even when `end` is a
    # str, so the result is bytes in that case - confirm callers expect this.
    if start is None:
      start = b''

    if fraction == 0:
      return start

    if fraction == 1:
      return end

    if not end:
      # The open end behaves like an infinite run of 0xFF bytes, so only the
      # leading 0xFF bytes of start are shared with it. Count them on the
      # encoded form, because bytes.lstrip/str.lstrip reject a mismatched
      # argument type (a str start used to raise TypeError here).
      start_bytes = start.encode() if isinstance(start, str) else start
      common_prefix_len = len(start_bytes) - len(start_bytes.lstrip(b'\xFF'))
    else:
      for ix, (s, e) in enumerate(zip(start, end)):
        if s != e:
          common_prefix_len = ix
          break
      else:
        common_prefix_len = min(len(start), len(end))

    # Convert the relative precision of fraction (~53 bits) to an absolute
    # precision needed to represent values between start and end distinctly.
    prec = common_prefix_len + int(-math.log(fraction, 256)) + 7
    istart = cls._bytestring_to_int(start, prec)
    iend = cls._bytestring_to_int(end, prec) if end else 1 << (prec * 8)
    ikey = istart + int((iend - istart) * fraction)

    # Could be equal due to rounding.
    # Adjust to ensure we never return the actual start and end
    # unless fraction is exactly 0 or 1.
    if ikey == istart:
      ikey += 1
    elif ikey == iend:
      ikey -= 1

    position: bytes = cls._bytestring_from_int(ikey, prec).rstrip(b'\0')

    if isinstance(start, bytes):
      return position

    return position.decode(encoding='unicode_escape', errors='replace')

  @classmethod
  def position_to_fraction(
      cls,
      key: Optional[Union[bytes, str]] = None,
      start: Optional[Union[bytes, str]] = None,
      end: Optional[Union[bytes, str]] = None,
  ) -> float:
    """Returns the fraction of keys in the range [start, end) that
    are less than the given key.

    Args:
      key: the key to locate; empty or ``None`` yields ``0.0``.
      start: first key of the range; ``None`` is treated as the empty key of
        the same type as ``key``.
      end: end key of the range; empty or ``None`` means the end of the
        keyspace.

    Returns:
      The relative location of ``key`` as a float; ``0.0`` when ``start`` and
      ``end`` coincide at the working precision.
    """
    if not key:
      return 0.0

    if start is None:
      start = '' if isinstance(key, str) else b''

    prec = len(start) + 7
    if key.startswith(start):
      # Higher absolute precision needed for very small values of fixed
      # relative position.
      trailing_symbol = '\0' if isinstance(key, str) else b'\0'
      prec = max(
          prec, len(key) - len(key[len(start):].strip(trailing_symbol)) + 7)
    istart = cls._bytestring_to_int(start, prec)
    ikey = cls._bytestring_to_int(key, prec)
    iend = cls._bytestring_to_int(end, prec) if end else 1 << (prec * 8)
    if iend == istart:
      # Empty range: avoid ZeroDivisionError and report 0.0, matching
      # OffsetRangeTracker.position_to_fraction.
      return 0.0
    # True division of ints stays exact until the final rounding. Converting
    # an operand to float first raises OverflowError once the integers exceed
    # the float range, which happens for keys longer than roughly 120 bytes.
    return (ikey - istart) / (iend - istart)

  @staticmethod
  def _bytestring_to_int(s: Union[bytes, str], prec: int) -> int:
    """Returns int(256**prec * f) where f is the fraction
    represented by interpreting '.' + s as a base-256
    floating point number.
    """
    if not s:
      return 0

    if isinstance(s, str):
      s = s.encode()  # str -> bytes

    # Zero-pad or truncate to exactly prec bytes.
    if len(s) < prec:
      s += b'\0' * (prec - len(s))
    else:
      s = s[:prec]

    h = codecs.encode(s, encoding='hex')
    return int(h, base=16)

  @staticmethod
  def _bytestring_from_int(i: int, prec: int) -> bytes:
    """Inverse of _bytestring_to_int."""
    h = '%x' % i
    # Left-pad with zeros to 2 * prec hex digits, i.e. prec bytes.
    return codecs.decode('0' * (2 * prec - len(h)) + h, encoding='hex')