#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""`iobase.RestrictionTracker` implementations provided with Apache Beam."""
# pytype: skip-file

from typing import Tuple

from apache_beam.io.iobase import RestrictionProgress
from apache_beam.io.iobase import RestrictionTracker
from apache_beam.io.range_trackers import OffsetRangeTracker


class OffsetRange(object):
  """A half-open range of offsets ``[start, stop)``.

  Instances compare equal (and hash equal) when both ``start`` and ``stop``
  match.
  """
  def __init__(self, start, stop):
    """Creates the range.

    Args:
      start: first offset of the range (inclusive).
      stop: end offset of the range (exclusive); must not be smaller than
        ``start``.

    Raises:
      ValueError: if ``start`` is larger than ``stop``.
    """
    if start > stop:
      # %s rather than %d so that non-integer offsets are reported verbatim
      # instead of being silently truncated in the message.
      raise ValueError(
          'Start offset must not be larger than the stop offset. '
          'Received %s and %s respectively.' % (start, stop))
    self.start = start
    self.stop = stop

  def __eq__(self, other):
    if not isinstance(other, OffsetRange):
      return False

    return self.start == other.start and self.stop == other.stop

  def __hash__(self):
    return hash((type(self), self.start, self.stop))

  def __repr__(self):
    return 'OffsetRange(start=%s, stop=%s)' % (self.start, self.stop)

  def split(self, desired_num_offsets_per_split, min_num_offsets_per_split=1):
    """Yields consecutive sub-ranges that together cover this range.

    Each sub-range spans at most
    ``max(desired_num_offsets_per_split, min_num_offsets_per_split)`` offsets,
    except the last one, which absorbs a small trailing remainder.

    Args:
      desired_num_offsets_per_split: preferred number of offsets per sub-range.
      min_num_offsets_per_split: lower bound on the number of offsets per
        sub-range.

    Yields:
      ``OffsetRange`` objects in increasing offset order. Nothing is yielded
      for an empty range.

    Raises:
      ValueError: (on first iteration, since this is a generator) if the
        range is non-empty and the effective split size is not positive.
    """
    current_split_start = self.start
    max_split_size = max(
        desired_num_offsets_per_split, min_num_offsets_per_split)
    # A zero split size would never advance current_split_start and the loop
    # below would yield empty ranges forever, so reject it up front.
    if max_split_size <= 0 and current_split_start < self.stop:
      raise ValueError(
          'Split size must be positive. Received '
          'desired_num_offsets_per_split=%s and '
          'min_num_offsets_per_split=%s.' %
          (desired_num_offsets_per_split, min_num_offsets_per_split))
    while current_split_start < self.stop:
      current_split_stop = min(current_split_start + max_split_size, self.stop)
      remaining = self.stop - current_split_stop

      # Avoiding a small split at the end.
      if (remaining < desired_num_offsets_per_split // 4 or
          remaining < min_num_offsets_per_split):
        current_split_stop = self.stop

      yield OffsetRange(current_split_start, current_split_stop)
      current_split_start = current_split_stop

  def split_at(self, split_pos):
    # type: (...) -> Tuple[OffsetRange, OffsetRange]

    """Returns the pair ``([start, split_pos), [split_pos, stop))``."""
    return OffsetRange(self.start, split_pos), OffsetRange(split_pos, self.stop)

  def new_tracker(self):
    """Returns an ``OffsetRangeTracker`` built from this range's bounds."""
    return OffsetRangeTracker(self.start, self.stop)

  def size(self):
    """Returns the number of offsets in the range (``stop - start``)."""
    return self.stop - self.start


class OffsetRestrictionTracker(RestrictionTracker):
  """An `iobase.RestrictionTracker` implementation for an offset range.

  Offset range is represented as OffsetRange.
  """
  def __init__(self, offset_range):
    # type: (OffsetRange) -> None
    assert isinstance(offset_range, OffsetRange), offset_range
    self._range = offset_range
    # Last position that was successfully claimed (inside the range).
    self._current_position = None
    # Last position passed to try_claim(), whether or not it was granted.
    self._last_claim_attempt = None
    # Set once a split with fraction 0 succeeded; blocks further splits.
    self._checkpointed = False

  def check_done(self):
    """Raises ValueError if part of a non-empty range was never attempted.

    The range counts as done when it is empty or when the last claim attempt
    was at ``stop - 1`` or beyond.
    """
    if (self._range.start != self._range.stop and
        (self._last_claim_attempt is None or
         self._last_claim_attempt < self._range.stop - 1)):
      raise ValueError(
          'OffsetRestrictionTracker is not done since work in range [%s, %s) '
          'has not been claimed.' % (
              self._last_claim_attempt
              if self._last_claim_attempt is not None else self._range.start,
              self._range.stop))

  def current_restriction(self):
    """Returns the ``OffsetRange`` currently being tracked."""
    return self._range

  def current_progress(self):
    # type: () -> RestrictionProgress

    """Returns progress as the fraction of the range before the last claim."""
    if self._current_position is None:
      fraction = 0.0
    elif self._range.stop == self._range.start:
      # If self._current_position is not None, we must be done.
      fraction = 1.0
    else:
      fraction = (
          float(self._current_position - self._range.start) /
          (self._range.stop - self._range.start))
    return RestrictionProgress(fraction=fraction)

  def start_position(self):
    """Returns the start offset of the tracked range."""
    return self._range.start

  def stop_position(self):
    """Returns the (exclusive) stop offset of the tracked range."""
    return self._range.stop

  def try_claim(self, position):
    """Attempts to claim ``position``.

    Args:
      position: offset to claim; must be strictly larger than any previously
        attempted position and not smaller than the range start.

    Returns:
      True if ``position`` lies inside ``[start, stop)``, False if it is at or
      beyond ``stop``.

    Raises:
      ValueError: if ``position`` does not strictly increase, or is smaller
        than the start of the range.
    """
    if (self._last_claim_attempt is not None and
        position <= self._last_claim_attempt):
      raise ValueError(
          'Positions claimed should strictly increase. Trying to claim '
          'position %d while last claim attempt was %d.' %
          (position, self._last_claim_attempt))

    # The attempt is recorded before the range checks, so failed attempts
    # also count towards the "strictly increasing" requirement.
    self._last_claim_attempt = position
    if position < self._range.start:
      raise ValueError(
          'Position to be claimed cannot be smaller than the start position '
          'of the range. Tried to claim position %r for the range [%r, %r)' %
          (position, self._range.start, self._range.stop))

    if self._range.start <= position < self._range.stop:
      self._current_position = position
      return True

    return False

  def try_split(self, fraction_of_remainder):
    """Attempts to split off the tail of the unprocessed part of the range.

    Args:
      fraction_of_remainder: fraction of the not-yet-attempted work that the
        primary (this tracker) should keep.

    Returns:
      A ``(primary, residual)`` pair of ``OffsetRange`` objects if a split
      happened (this tracker then tracks ``primary``), otherwise None.
    """
    if not self._checkpointed:
      if self._last_claim_attempt is None:
        # Nothing attempted yet: start one before the range so that the
        # earliest possible split point (cur + 1) is the range start itself.
        cur = self._range.start - 1
      else:
        cur = self._last_claim_attempt
      # Always move at least one position past `cur`.
      split_point = (
          cur + int(max(1, (self._range.stop - cur) * fraction_of_remainder)))
      if split_point < self._range.stop:
        if fraction_of_remainder == 0:
          self._checkpointed = True
        self._range, residual_range = self._range.split_at(split_point)
        return self._range, residual_range
    # Already checkpointed, or nothing left to give away.
    return None

  def is_bounded(self):
    return True


class UnsplittableRestrictionTracker(RestrictionTracker):
  """An `iobase.RestrictionTracker` that wraps another but does not split."""
  def __init__(self, underling_tracker):
    self._underling_tracker = underling_tracker

  def try_split(self, fraction_of_remainder):
    """Always declines to split."""
    return False

  # __getattribute__ is used rather than __getattr__ to override the
  # stubs in the baseclass.
  def __getattribute__(self, name):
    # Private names and try_split are resolved on this wrapper; every other
    # attribute is forwarded to the wrapped tracker.
    if name.startswith('_') or name in ('try_split', ):
      return super().__getattribute__(name)
    else:
      return getattr(self._underling_tracker, name)