github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/worker/opcounters.py (about) 1 # 2 # Licensed to the Apache Software Foundation (ASF) under one or more 3 # contributor license agreements. See the NOTICE file distributed with 4 # this work for additional information regarding copyright ownership. 5 # The ASF licenses this file to You under the Apache License, Version 2.0 6 # (the "License"); you may not use this file except in compliance with 7 # the License. You may obtain a copy of the License at 8 # 9 # http://www.apache.org/licenses/LICENSE-2.0 10 # 11 # Unless required by applicable law or agreed to in writing, software 12 # distributed under the License is distributed on an "AS IS" BASIS, 13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 # See the License for the specific language governing permissions and 15 # limitations under the License. 16 # 17 18 # cython: language_level=3 19 # cython: profile=True 20 21 """Counters collect the progress of the Worker for reporting to the service.""" 22 23 # pytype: skip-file 24 25 import math 26 import random 27 import sys 28 from typing import TYPE_CHECKING 29 from typing import Any 30 from typing import Optional 31 32 from apache_beam.typehints import TypeCheckError 33 from apache_beam.typehints.decorators import _check_instance_type 34 from apache_beam.utils import counters 35 from apache_beam.utils import windowed_value 36 from apache_beam.utils.counters import Counter 37 from apache_beam.utils.counters import CounterName 38 39 if TYPE_CHECKING: 40 from apache_beam.runners.worker.statesampler import StateSampler 41 from apache_beam.typehints.batch import BatchConverter 42 43 44 class TransformIOCounter(object): 45 """Class to track time and bytes consumed while reading from IO. 46 47 Subclasses should be able to track consumption of IO across steps 48 in the same stage - for instance, if a Shuffle or Side Input iterable 49 is passed down to a next step. 50 51 Some examples of IO can be side inputs, shuffle, or streaming state. 52 """ 53 def __init__(self, counter_factory, state_sampler): 54 """Create a new IO read counter. 55 56 Args: 57 counter_factory: A counters.CounterFactory to create byte counters. 58 state_sampler: A statesampler.StateSampler to transition into read states. 59 """ 60 self._counter_factory = counter_factory 61 self._state_sampler = state_sampler 62 self._latest_step = None 63 self.bytes_read_counter = None 64 self.scoped_state = None 65 66 def update_current_step(self): 67 """Update the current running step. 68 69 Due to the fusion optimization, user code may choose to emit the data 70 structure that holds side inputs (Iterable, Dict, or others). This call 71 updates the current step, to attribute the data consumption to the step 72 that is responsible for actual consumption. 73 74 CounterName uses the io_target field for information pertinent to the 75 consumption of IO. 76 """ 77 current_state = self._state_sampler.current_state() 78 current_step_name = current_state.name.step_name 79 if current_step_name != self._latest_step: 80 self._latest_step = current_step_name 81 self._update_counters_for_requesting_step(current_step_name) 82 83 def _update_counters_for_requesting_step(self, step_name): 84 pass 85 86 def add_bytes_read(self, count): 87 if count > 0 and self.bytes_read_counter: 88 self.bytes_read_counter.update(count) 89 90 def __enter__(self): 91 self.scoped_state.__enter__() 92 93 def __exit__(self, exception_type, exception_value, traceback): 94 self.scoped_state.__exit__(exception_type, exception_value, traceback) 95 96 97 class NoOpTransformIOCounter(TransformIOCounter): 98 """All operations for IO tracking are no-ops.""" 99 def __init__(self): 100 super().__init__(None, None) 101 102 def update_current_step(self): 103 pass 104 105 def __enter__(self): 106 pass 107 108 def __exit__(self, exception_type, exception_value, traceback): 109 pass 110 111 def add_bytes_read(self, count): 112 pass 113 114 115 class SideInputReadCounter(TransformIOCounter): 116 """Tracks time and bytes consumed while reading from side inputs. 117 118 This class is designed to track consumption of side inputs across fused steps. 119 We represent a side input as a declaring step, and an input index. 120 121 The declaring step is the step that originally receives the side input for 122 consumption, and the input index in which the declaring step receives the side 123 input that we want to identify. 124 125 Note that the declaring step originally receives the side input, but it may 126 not be the only step that spends time reading from this side input. 127 """ 128 129 def __init__(self, 130 counter_factory, 131 state_sampler, # type: StateSampler 132 declaring_step, 133 input_index 134 ): 135 """Create a side input read counter. 136 137 Args: 138 counter_factory: A counters.CounterFactory to create byte counters. 139 state_sampler: A statesampler.StateSampler to transition into read states. 140 declaring_step: A string with the step name of the step that directly 141 receives the side input initially. 142 input_index: The index of the side input in the list of inputs of the 143 declaring step. 144 145 The side input is uniquely identified by (declaring_step, input_index); 146 where declaring_step is the step that receives the PCollectionView as a 147 side input, and input_index is the index of the PCollectionView within 148 the list of inputs. 149 """ 150 super().__init__(counter_factory, state_sampler) 151 self.declaring_step = declaring_step 152 self.input_index = input_index 153 154 # Side inputs are set up within the start state of the first receiving 155 # step. We check the current state to create the internal counters. 156 self.update_current_step() 157 158 def _update_counters_for_requesting_step(self, step_name): 159 side_input_id = counters.side_input_id(step_name, self.input_index) 160 self.scoped_state = self._state_sampler.scoped_state( 161 self.declaring_step, 'read-sideinput', io_target=side_input_id) 162 self.bytes_read_counter = self._counter_factory.get_counter( 163 CounterName( 164 'read-sideinput-byte-count', 165 step_name=self.declaring_step, 166 io_target=side_input_id), 167 Counter.SUM) 168 169 170 class SumAccumulator(object): 171 """Accumulator for collecting byte counts.""" 172 def __init__(self): 173 self._value = 0 174 175 def update(self, value): 176 self._value += value 177 178 def value(self): 179 return self._value 180 181 182 class OperationCounters(object): 183 """The set of basic counters to attach to an Operation.""" 184 def __init__( 185 self, 186 counter_factory, 187 step_name, # type: str 188 coder, 189 index, 190 suffix='out', 191 producer_type_hints=None, 192 producer_batch_converter=None, # type: Optional[BatchConverter] 193 ): 194 self._counter_factory = counter_factory 195 self.element_counter = counter_factory.get_counter( 196 '%s-%s%s-ElementCount' % (step_name, suffix, index), Counter.SUM) 197 self.mean_byte_counter = counter_factory.get_counter( 198 '%s-%s%s-MeanByteCount' % (step_name, suffix, index), 199 Counter.BEAM_DISTRIBUTION) 200 self.coder_impl = coder.get_impl() if coder else None 201 self.active_accumulator = None # type: Optional[SumAccumulator] 202 self.current_size = None # type: Optional[int] 203 self._sample_counter = 0 204 self._next_sample = 0 205 self.output_type_constraints = producer_type_hints or {} 206 self.producer_batch_converter = producer_batch_converter 207 208 def update_from(self, windowed_value): 209 # type: (windowed_value.WindowedValue) -> None 210 211 """Add one value to this counter.""" 212 if self._should_sample(): 213 self.do_sample(windowed_value) 214 215 def update_from_batch(self, windowed_batch): 216 # type: (windowed_value.WindowedBatch) -> None 217 assert self.producer_batch_converter is not None 218 assert isinstance(windowed_batch, windowed_value.HomogeneousWindowedBatch) 219 220 batch_length = self.producer_batch_converter.get_length( 221 windowed_batch.values) 222 self.element_counter.update(batch_length) 223 224 mean_element_size = self.producer_batch_converter.estimate_byte_size( 225 windowed_batch.values) / batch_length 226 self.mean_byte_counter.update_n(mean_element_size, batch_length) 227 228 def _observable_callback(self, inner_coder_impl, accumulator): 229 def _observable_callback_inner(value, is_encoded=False): 230 # TODO(ccy): If this stream is large, sample it as well. 231 # To do this, we'll need to compute the average size of elements 232 # in this stream to add the *total* size of this stream to accumulator. 233 # We'll also want make sure we sample at least some of this stream 234 # (as self.should_sample() may be sampling very sparsely by now). 235 if is_encoded: 236 size = len(value) 237 accumulator.update(size) 238 else: 239 accumulator.update(inner_coder_impl.estimate_size(value)) 240 241 return _observable_callback_inner 242 243 def type_check(self, value): 244 # type: (Any, bool) -> None 245 for transform_label, type_constraint_tuple in ( 246 self.output_type_constraints.items()): 247 parameter_name, constraint = type_constraint_tuple 248 try: 249 _check_instance_type(constraint, value, parameter_name, verbose=True) 250 except TypeCheckError as e: 251 # TODO: Remove the 'ParDo' prefix for the label name (BEAM-10710) 252 if not transform_label.startswith('ParDo'): 253 transform_label = 'ParDo(%s)' % transform_label 254 error_msg = ( 255 'Runtime type violation detected within %s: ' 256 '%s' % (transform_label, e)) 257 _, _, traceback = sys.exc_info() 258 raise TypeCheckError(error_msg).with_traceback(traceback) 259 260 def do_sample(self, windowed_value): 261 # type: (windowed_value.WindowedValue) -> None 262 self.type_check(windowed_value.value) 263 264 size, observables = ( 265 self.coder_impl.get_estimated_size_and_observables(windowed_value)) 266 if not observables: 267 self.current_size = size 268 else: 269 self.active_accumulator = SumAccumulator() 270 self.active_accumulator.update(size) 271 for observable, inner_coder_impl in observables: 272 observable.register_observer( 273 self._observable_callback( 274 inner_coder_impl, self.active_accumulator)) 275 276 def update_collect(self): 277 """Collects the accumulated size estimates. 278 279 Now that the element has been processed, we ask our accumulator 280 for the total and store the result in a counter. 281 """ 282 self.element_counter.update(1) 283 if self.current_size is not None: 284 self.mean_byte_counter.update(self.current_size) 285 self.current_size = None 286 elif self.active_accumulator is not None: 287 self.mean_byte_counter.update(self.active_accumulator.value()) 288 self.active_accumulator = None 289 290 def _compute_next_sample(self, i): 291 # https://en.wikipedia.org/wiki/Reservoir_sampling#Fast_Approximation 292 gap = math.log(1.0 - random.random()) / math.log(1.0 - (10.0 / i)) 293 return i + math.floor(gap) 294 295 def _should_sample(self): 296 """Determines whether to sample the next element. 297 298 Size calculation can be expensive, so we don't do it for each element. 299 Because we need only an estimate of average size, we sample. 300 301 We always sample the first 10 elements, then the sampling rate 302 is approximately 10/N. After reading N elements, of the next N, 303 we will sample approximately 10*ln(2) (about 7) elements. 304 305 This algorithm samples at the same rate as Reservoir Sampling, but 306 it never throws away early results. (Because we keep only a 307 running accumulation, storage is not a problem, so there is no 308 need to discard earlier calculations.) 309 310 Because we accumulate and do not replace, our statistics are 311 biased toward early data. If the data are distributed uniformly, 312 this is not a problem. If the data change over time (i.e., the 313 element size tends to grow or shrink over time), our estimate will 314 show the bias. We could correct this by giving weight N to each 315 sample, since each sample is a stand-in for the N/(10*ln(2)) 316 samples around it, which is proportional to N. Since we do not 317 expect biased data, for efficiency we omit the extra multiplication. 318 We could reduce the early-data bias by putting a lower bound on 319 the sampling rate. 320 321 Computing random.randint(1, self._sample_counter) for each element 322 is too slow, so when the sample size is big enough (we estimate 30 323 is big enough), we estimate the size of the gap after each sample. 324 This estimation allows us to call random much less often. 325 326 Returns: 327 True if it is time to compute another element's size. 328 """ 329 if self.coder_impl is None: 330 return False 331 self._sample_counter += 1 332 if self._next_sample == 0: 333 if random.randint(1, self._sample_counter) <= 10: 334 if self._sample_counter > 30: 335 self._next_sample = self._compute_next_sample(self._sample_counter) 336 return True 337 return False 338 elif self._sample_counter >= self._next_sample: 339 self._next_sample = self._compute_next_sample(self._sample_counter) 340 return True 341 return False 342 343 def should_sample(self): 344 # We create this separate method because the above "_should_sample()" method 345 # is marked as inline in Cython and thus can't be exposed to Python code. 346 return self._should_sample() 347 348 def restart_sampling(self): 349 self._sample_counter = 0 350 351 def __str__(self): 352 return '<%s [%s]>' % ( 353 self.__class__.__name__, ', '.join([str(x) for x in self.__iter__()])) 354 355 def __repr__(self): 356 return '<%s %s at %s>' % ( 357 self.__class__.__name__, [x for x in self.__iter__()], hex(id(self)))