github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/io/range_trackers.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  """iobase.RangeTracker implementations provided with Apache Beam.
    19  """
    20  # pytype: skip-file
    21  
    22  import codecs
    23  import logging
    24  import math
    25  import threading
    26  from typing import Union
    27  
    28  from apache_beam.io import iobase
    29  
# Public names exported by a star-import of this module.
__all__ = [
    'OffsetRangeTracker',
    'LexicographicKeyRangeTracker',
    'OrderedPositionRangeTracker',
    'UnsplittableRangeTracker'
]

# Module-level logger shared by the range trackers below; named after this
# module via __name__.
_LOGGER = logging.getLogger(__name__)
    38  
    39  
    40  class OffsetRangeTracker(iobase.RangeTracker):
    41    """A 'RangeTracker' for non-negative positions of type 'long'."""
    42  
    43    # Offset corresponding to infinity. This can only be used as the upper-bound
    44    # of a range, and indicates reading all of the records until the end without
    45    # specifying exactly what the end is.
    46    # Infinite ranges cannot be split because it is impossible to estimate
    47    # progress within them.
    48    OFFSET_INFINITY = float('inf')
    49  
    50    def __init__(self, start, end):
    51      super().__init__()
    52  
    53      if start is None:
    54        raise ValueError('Start offset must not be \'None\'')
    55      if end is None:
    56        raise ValueError('End offset must not be \'None\'')
    57      assert isinstance(start, int)
    58      if end != self.OFFSET_INFINITY:
    59        assert isinstance(end, int)
    60  
    61      assert start <= end
    62  
    63      self._start_offset = start
    64      self._stop_offset = end
    65  
    66      self._last_record_start = -1
    67      self._last_attempted_record_start = -1
    68      self._offset_of_last_split_point = -1
    69      self._lock = threading.Lock()
    70  
    71      self._split_points_seen = 0
    72      self._split_points_unclaimed_callback = None
    73  
    74    def start_position(self):
    75      return self._start_offset
    76  
    77    def stop_position(self):
    78      return self._stop_offset
    79  
    80    @property
    81    def last_record_start(self):
    82      return self._last_record_start
    83  
    84    @property
    85    def last_attempted_record_start(self):
    86      """Return current value of last_attempted_record_start.
    87  
    88      last_attempted_record_start records a valid position that tried to be
    89      claimed by calling try_claim(). This value is only updated by `try_claim()`
    90      no matter `try_claim()` returns `True` or `False`.
    91      """
    92      return self._last_attempted_record_start
    93  
    94    def _validate_record_start(self, record_start, split_point):
    95      # This function must only be called under the lock self.lock.
    96      if not self._lock.locked():
    97        raise ValueError(
    98            'This function must only be called under the lock self.lock.')
    99  
   100      if record_start < self._last_record_start:
   101        raise ValueError(
   102            'Trying to return a record [starting at %d] which is before the '
   103            'last-returned record [starting at %d]' %
   104            (record_start, self._last_record_start))
   105  
   106      if (split_point and self._offset_of_last_split_point != -1 and
   107          record_start == self._offset_of_last_split_point):
   108        raise ValueError(
   109            'Record at a split point has same offset as the previous split '
   110            'point: %d' % record_start)
   111  
   112      if not split_point and self._last_record_start == -1:
   113        raise ValueError(
   114            'The first record [starting at %d] must be at a split point' %
   115            record_start)
   116  
   117    def try_claim(self, record_start):
   118      with self._lock:
   119        # Attempted claim should be monotonous.
   120        if record_start <= self._last_attempted_record_start:
   121          raise ValueError(
   122              'Trying to return a record [starting at %d] which is not greater'
   123              'than the last-attempted record [starting at %d]' %
   124              (record_start, self._last_attempted_record_start))
   125        self._validate_record_start(record_start, True)
   126        self._last_attempted_record_start = record_start
   127        if record_start >= self.stop_position():
   128          return False
   129        self._offset_of_last_split_point = record_start
   130        self._last_record_start = record_start
   131        self._split_points_seen += 1
   132        return True
   133  
   134    def set_current_position(self, record_start):
   135      with self._lock:
   136        self._validate_record_start(record_start, False)
   137        self._last_record_start = record_start
   138  
   139    def try_split(self, split_offset):
   140      assert isinstance(split_offset, int)
   141      with self._lock:
   142        if self._stop_offset == OffsetRangeTracker.OFFSET_INFINITY:
   143          _LOGGER.debug(
   144              'refusing to split %r at %d: stop position unspecified',
   145              self,
   146              split_offset)
   147          return
   148        if self._last_record_start == -1:
   149          _LOGGER.debug(
   150              'Refusing to split %r at %d: unstarted', self, split_offset)
   151          return
   152  
   153        if split_offset <= self._last_record_start:
   154          _LOGGER.debug(
   155              'Refusing to split %r at %d: already past proposed stop offset',
   156              self,
   157              split_offset)
   158          return
   159        if (split_offset < self.start_position() or
   160            split_offset >= self.stop_position()):
   161          _LOGGER.debug(
   162              'Refusing to split %r at %d: proposed split position out of range',
   163              self,
   164              split_offset)
   165          return
   166  
   167        _LOGGER.debug('Agreeing to split %r at %d', self, split_offset)
   168  
   169        split_fraction = (
   170            float(split_offset - self._start_offset) /
   171            (self._stop_offset - self._start_offset))
   172        self._stop_offset = split_offset
   173  
   174        return self._stop_offset, split_fraction
   175  
   176    def fraction_consumed(self):
   177      with self._lock:
   178        # self.last_record_start may become larger than self.end_offset when
   179        # reading the records since any record that starts before the first 'split
   180        # point' at or after the defined 'stop offset' is considered to be within
   181        # the range of the OffsetRangeTracker. Hence fraction could be > 1.
   182        # self.last_record_start is initialized to -1, hence fraction may be < 0.
   183        # Bounding the to range [0, 1].
   184        return self.position_to_fraction(
   185            self._last_record_start, self.start_position(), self.stop_position())
   186  
   187    def position_to_fraction(self, pos, start, stop):
   188      fraction = 1.0 * (pos - start) / (stop - start) if start != stop else 0.0
   189      return max(0.0, min(1.0, fraction))
   190  
   191    def position_at_fraction(self, fraction):
   192      if self.stop_position() == OffsetRangeTracker.OFFSET_INFINITY:
   193        raise Exception(
   194            'get_position_for_fraction_consumed is not applicable for an '
   195            'unbounded range')
   196      return int(
   197          math.ceil(
   198              self.start_position() + fraction *
   199              (self.stop_position() - self.start_position())))
   200  
   201    def split_points(self):
   202      with self._lock:
   203        split_points_consumed = (
   204            0 if self._split_points_seen == 0 else self._split_points_seen - 1)
   205        split_points_unclaimed = (
   206            self._split_points_unclaimed_callback(self.stop_position())
   207            if self._split_points_unclaimed_callback else
   208            iobase.RangeTracker.SPLIT_POINTS_UNKNOWN)
   209        split_points_remaining = (
   210            iobase.RangeTracker.SPLIT_POINTS_UNKNOWN
   211            if split_points_unclaimed == iobase.RangeTracker.SPLIT_POINTS_UNKNOWN
   212            else (split_points_unclaimed + 1))
   213  
   214        return (split_points_consumed, split_points_remaining)
   215  
   216    def set_split_points_unclaimed_callback(self, callback):
   217      self._split_points_unclaimed_callback = callback
   218  
   219  
   220  class OrderedPositionRangeTracker(iobase.RangeTracker):
   221    """
   222    An abstract base class for range trackers whose positions are comparable.
   223  
   224    Subclasses only need to implement the mapping from position ranges
   225    to and from the closed interval [0, 1].
   226    """
   227  
   228    UNSTARTED = object()
   229  
   230    def __init__(self, start_position=None, stop_position=None):
   231      self._start_position = start_position
   232      self._stop_position = stop_position
   233      self._lock = threading.Lock()
   234      self._last_claim = self.UNSTARTED
   235  
   236    def start_position(self):
   237      return self._start_position
   238  
   239    def stop_position(self):
   240      with self._lock:
   241        return self._stop_position
   242  
   243    def try_claim(self, position):
   244      with self._lock:
   245        if self._last_claim is not self.UNSTARTED and position < self._last_claim:
   246          raise ValueError(
   247              "Positions must be claimed in order: "
   248              "claim '%s' attempted after claim '%s'" %
   249              (position, self._last_claim))
   250        elif self._start_position is not None and position < self._start_position:
   251          raise ValueError(
   252              "Claim '%s' is before start '%s'" %
   253              (position, self._start_position))
   254        if self._stop_position is None or position < self._stop_position:
   255          self._last_claim = position
   256          return True
   257        else:
   258          return False
   259  
   260    def position_at_fraction(self, fraction):
   261      return self.fraction_to_position(
   262          fraction, self._start_position, self._stop_position)
   263  
   264    def try_split(self, position):
   265      with self._lock:
   266        if ((self._stop_position is not None and position >= self._stop_position)
   267            or (self._start_position is not None and
   268                position <= self._start_position)):
   269          _LOGGER.debug(
   270              'Refusing to split %r at %d: proposed split position out of range',
   271              self,
   272              position)
   273          return
   274  
   275        if self._last_claim is self.UNSTARTED or self._last_claim < position:
   276          fraction = self.position_to_fraction(
   277              position, start=self._start_position, end=self._stop_position)
   278          self._stop_position = position
   279          return position, fraction
   280  
   281    def fraction_consumed(self):
   282      if self._last_claim is self.UNSTARTED:
   283        return 0
   284      else:
   285        return self.position_to_fraction(
   286            self._last_claim, self._start_position, self._stop_position)
   287  
   288    def fraction_to_position(self, fraction, start, end):
   289      """
   290      Converts a fraction between 0 and 1 to a position between start and end.
   291      """
   292      raise NotImplementedError
   293  
   294    def position_to_fraction(self, position, start, end):
   295      """Returns the fraction of keys in the range [start, end) that
   296      are less than the given key.
   297      """
   298      raise NotImplementedError
   299  
   300  
   301  class UnsplittableRangeTracker(iobase.RangeTracker):
   302    """A RangeTracker that always ignores split requests.
   303  
   304    This can be used to make a given
   305    :class:`~apache_beam.io.iobase.RangeTracker` object unsplittable by
   306    ignoring all calls to :meth:`.try_split()`. All other calls will be delegated
   307    to the given :class:`~apache_beam.io.iobase.RangeTracker`.
   308    """
   309    def __init__(self, range_tracker):
   310      """Initializes UnsplittableRangeTracker.
   311  
   312      Args:
   313        range_tracker (~apache_beam.io.iobase.RangeTracker): a
   314          :class:`~apache_beam.io.iobase.RangeTracker` to which all method
   315          calls expect calls to :meth:`.try_split()` will be delegated.
   316      """
   317      assert isinstance(range_tracker, iobase.RangeTracker)
   318      self._range_tracker = range_tracker
   319  
   320    def start_position(self):
   321      return self._range_tracker.start_position()
   322  
   323    def stop_position(self):
   324      return self._range_tracker.stop_position()
   325  
   326    def position_at_fraction(self, fraction):
   327      return self._range_tracker.position_at_fraction(fraction)
   328  
   329    def try_claim(self, position):
   330      return self._range_tracker.try_claim(position)
   331  
   332    def try_split(self, position):
   333      return None
   334  
   335    def set_current_position(self, position):
   336      self._range_tracker.set_current_position(position)
   337  
   338    def fraction_consumed(self):
   339      return self._range_tracker.fraction_consumed()
   340  
   341    def split_points(self):
   342      # An unsplittable range only contains a single split point.
   343      return (0, 1)
   344  
   345    def set_split_points_unclaimed_callback(self, callback):
   346      self._range_tracker.set_split_points_unclaimed_callback(callback)
   347  
   348  
   349  class LexicographicKeyRangeTracker(OrderedPositionRangeTracker):
   350    """A range tracker that tracks progress through a lexicographically
   351    ordered keyspace of strings.
   352    """
   353    @classmethod
   354    def fraction_to_position(
   355        cls,
   356        fraction: float,
   357        start: Union[bytes, str] = None,
   358        end: Union[bytes, str] = None,
   359    ) -> Union[bytes, str]:
   360      """Linearly interpolates a key that is lexicographically
   361      fraction of the way between start and end.
   362      """
   363      assert 0 <= fraction <= 1, fraction
   364  
   365      if start is None:
   366        start = b''
   367  
   368      if fraction == 0:
   369        return start
   370  
   371      if fraction == 1:
   372        return end
   373  
   374      if not end:
   375        common_prefix_len = len(start) - len(start.lstrip(b'\xFF'))
   376      else:
   377        for ix, (s, e) in enumerate(zip(start, end)):
   378          if s != e:
   379            common_prefix_len = ix
   380            break
   381        else:
   382          common_prefix_len = min(len(start), len(end))
   383  
   384      # Convert the relative precision of fraction (~53 bits) to an absolute
   385      # precision needed to represent values between start and end distinctly.
   386      prec = common_prefix_len + int(-math.log(fraction, 256)) + 7
   387      istart = cls._bytestring_to_int(start, prec)
   388      iend = cls._bytestring_to_int(end, prec) if end else 1 << (prec * 8)
   389      ikey = istart + int((iend - istart) * fraction)
   390  
   391      # Could be equal due to rounding.
   392      # Adjust to ensure we never return the actual start and end
   393      # unless fraction is exatly 0 or 1.
   394      if ikey == istart:
   395        ikey += 1
   396      elif ikey == iend:
   397        ikey -= 1
   398  
   399      position: bytes = cls._bytestring_from_int(ikey, prec).rstrip(b'\0')
   400  
   401      if isinstance(start, bytes):
   402        return position
   403  
   404      return position.decode(encoding='unicode_escape', errors='replace')
   405  
   406    @classmethod
   407    def position_to_fraction(
   408        cls,
   409        key: Union[bytes, str] = None,
   410        start: Union[bytes, str] = None,
   411        end: Union[bytes, str] = None,
   412    ) -> float:
   413      """Returns the fraction of keys in the range [start, end) that
   414      are less than the given key.
   415      """
   416      if not key:
   417        return 0
   418  
   419      if start is None:
   420        start = '' if isinstance(key, str) else b''
   421  
   422      prec = len(start) + 7
   423      if key.startswith(start):
   424        # Higher absolute precision needed for very small values of fixed
   425        # relative position.
   426        trailing_symbol = '\0' if isinstance(key, str) else b'\0'
   427        prec = max(
   428            prec, len(key) - len(key[len(start):].strip(trailing_symbol)) + 7)
   429      istart = cls._bytestring_to_int(start, prec)
   430      ikey = cls._bytestring_to_int(key, prec)
   431      iend = cls._bytestring_to_int(end, prec) if end else 1 << (prec * 8)
   432      return float(ikey - istart) / (iend - istart)
   433  
   434    @staticmethod
   435    def _bytestring_to_int(s: Union[bytes, str], prec: int) -> int:
   436      """Returns int(256**prec * f) where f is the fraction
   437      represented by interpreting '.' + s as a base-256
   438      floating point number.
   439      """
   440      if not s:
   441        return 0
   442  
   443      if isinstance(s, str):
   444        s = s.encode()  # str -> bytes
   445  
   446      if len(s) < prec:
   447        s += b'\0' * (prec - len(s))
   448      else:
   449        s = s[:prec]
   450  
   451      h = codecs.encode(s, encoding='hex')
   452      return int(h, base=16)
   453  
   454    @staticmethod
   455    def _bytestring_from_int(i: int, prec: int) -> bytes:
   456      """Inverse of _bytestring_to_int."""
   457      h = '%x' % i
   458      return codecs.decode('0' * (2 * prec - len(h)) + h, encoding='hex')