github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/runners/worker/opcounters.py (about)

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# cython: language_level=3
# cython: profile=True

"""Counters collect the progress of the Worker for reporting to the service."""

# pytype: skip-file

import math
import random
import sys
from typing import TYPE_CHECKING
from typing import Any
from typing import Optional

from apache_beam.typehints import TypeCheckError
from apache_beam.typehints.decorators import _check_instance_type
from apache_beam.utils import counters
from apache_beam.utils import windowed_value
from apache_beam.utils.counters import Counter
from apache_beam.utils.counters import CounterName

if TYPE_CHECKING:
  from apache_beam.runners.worker.statesampler import StateSampler
  from apache_beam.typehints.batch import BatchConverter


class TransformIOCounter(object):
  """Class to track time and bytes consumed while reading from IO.

  Subclasses should be able to track consumption of IO across steps
  in the same stage - for instance, when a Shuffle or Side Input iterable
  is passed down to a subsequent step.

  Examples of such IO include side inputs, shuffle, and streaming state.
  """
  def __init__(self, counter_factory, state_sampler):
    """Create a new IO read counter.

    Args:
      counter_factory: A counters.CounterFactory to create byte counters.
      state_sampler: A statesampler.StateSampler to transition into read states.
    """
    self._counter_factory = counter_factory
    self._state_sampler = state_sampler
    self._latest_step = None
    self.bytes_read_counter = None
    self.scoped_state = None

  def update_current_step(self):
    """Update the current running step.

    Due to the fusion optimization, user code may emit the data structure
    that holds side inputs (an Iterable, Dict, or other), so a step other
    than the declaring one may end up consuming them. This call updates the
    current step so that data consumption is attributed to the step
    responsible for the actual consumption.

    CounterName uses the io_target field for information pertinent to the
    consumption of IO.
    """
    current_state = self._state_sampler.current_state()
    current_step_name = current_state.name.step_name
    if current_step_name != self._latest_step:
      self._latest_step = current_step_name
      self._update_counters_for_requesting_step(current_step_name)

  def _update_counters_for_requesting_step(self, step_name):
    pass

  def add_bytes_read(self, count):
    if count > 0 and self.bytes_read_counter:
      self.bytes_read_counter.update(count)

  def __enter__(self):
    self.scoped_state.__enter__()

  def __exit__(self, exception_type, exception_value, traceback):
    self.scoped_state.__exit__(exception_type, exception_value, traceback)

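# Usage sketch (editorial illustration; `read_counter` and `source` are
# hypothetical names, not part of this module): a TransformIOCounter is
# entered as a context manager so that time spent reading is attributed to
# the proper scoped state, while add_bytes_read() accounts for the data
# consumed.
#
#   read_counter.update_current_step()
#   with read_counter:
#     chunk = source.read()
#     read_counter.add_bytes_read(len(chunk))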

class NoOpTransformIOCounter(TransformIOCounter):
  """All operations for IO tracking are no-ops."""
  def __init__(self):
    super().__init__(None, None)

  def update_current_step(self):
    pass

  def __enter__(self):
    pass

  def __exit__(self, exception_type, exception_value, traceback):
    pass

  def add_bytes_read(self, count):
    pass


class SideInputReadCounter(TransformIOCounter):
  """Tracks time and bytes consumed while reading from side inputs.

  This class is designed to track consumption of side inputs across fused
  steps. We represent a side input by a declaring step and an input index.

  The declaring step is the step that originally receives the side input for
  consumption, and the input index is the position at which the declaring
  step receives the side input that we want to identify.

  Note that the declaring step originally receives the side input, but it may
  not be the only step that spends time reading from this side input.
  """

  def __init__(self,
               counter_factory,
               state_sampler,  # type: StateSampler
               declaring_step,
               input_index
              ):
    """Create a side input read counter.

    Args:
      counter_factory: A counters.CounterFactory to create byte counters.
      state_sampler: A statesampler.StateSampler to transition into read states.
      declaring_step: A string with the step name of the step that directly
        receives the side input initially.
      input_index: The index of the side input in the list of inputs of the
        declaring step.

    The side input is uniquely identified by (declaring_step, input_index),
    where declaring_step is the step that receives the PCollectionView as a
    side input, and input_index is the index of the PCollectionView within
    the list of inputs.
    """
    super().__init__(counter_factory, state_sampler)
    self.declaring_step = declaring_step
    self.input_index = input_index

    # Side inputs are set up within the start state of the first receiving
    # step. We check the current state to create the internal counters.
    self.update_current_step()

  def _update_counters_for_requesting_step(self, step_name):
    side_input_id = counters.side_input_id(step_name, self.input_index)
    self.scoped_state = self._state_sampler.scoped_state(
        self.declaring_step, 'read-sideinput', io_target=side_input_id)
    self.bytes_read_counter = self._counter_factory.get_counter(
        CounterName(
            'read-sideinput-byte-count',
            step_name=self.declaring_step,
            io_target=side_input_id),
        Counter.SUM)

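# Cross-step attribution sketch (editorial illustration; the step names and
# sampler setup are assumed). The counter is created while the declaring step
# runs, but when a fused downstream step later consumes the side input,
# update_current_step() recomputes the io_target so that consumption is
# attributed to the requesting step:
#
#   si_counter = SideInputReadCounter(
#       counter_factory, sampler, declaring_step='step1', input_index=0)
#   # ... later, while the sampler's current state belongs to 'step2':
#   si_counter.update_current_step()
#   si_counter.add_bytes_read(1024)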

class SumAccumulator(object):
  """Accumulator for collecting byte counts."""
  def __init__(self):
    self._value = 0

  def update(self, value):
    self._value += value

  def value(self):
    return self._value


class OperationCounters(object):
  """The set of basic counters to attach to an Operation."""
  def __init__(
      self,
      counter_factory,
      step_name,  # type: str
      coder,
      index,
      suffix='out',
      producer_type_hints=None,
      producer_batch_converter=None,  # type: Optional[BatchConverter]
  ):
    self._counter_factory = counter_factory
    self.element_counter = counter_factory.get_counter(
        '%s-%s%s-ElementCount' % (step_name, suffix, index), Counter.SUM)
    self.mean_byte_counter = counter_factory.get_counter(
        '%s-%s%s-MeanByteCount' % (step_name, suffix, index),
        Counter.BEAM_DISTRIBUTION)
    self.coder_impl = coder.get_impl() if coder else None
    self.active_accumulator = None  # type: Optional[SumAccumulator]
    self.current_size = None  # type: Optional[int]
    self._sample_counter = 0
    self._next_sample = 0
    self.output_type_constraints = producer_type_hints or {}
    self.producer_batch_converter = producer_batch_converter

  def update_from(self, windowed_value):
    # type: (windowed_value.WindowedValue) -> None

    """Add one value to this counter."""
    if self._should_sample():
      self.do_sample(windowed_value)

  def update_from_batch(self, windowed_batch):
    # type: (windowed_value.WindowedBatch) -> None
    assert self.producer_batch_converter is not None
    assert isinstance(windowed_batch, windowed_value.HomogeneousWindowedBatch)

    batch_length = self.producer_batch_converter.get_length(
        windowed_batch.values)
    self.element_counter.update(batch_length)

    mean_element_size = self.producer_batch_converter.estimate_byte_size(
        windowed_batch.values) / batch_length
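    # Updating the distribution batch_length times with the per-element mean
    # preserves the batch's total byte count and element count without
    # recording each element's size individually.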
    self.mean_byte_counter.update_n(mean_element_size, batch_length)

  def _observable_callback(self, inner_coder_impl, accumulator):
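    """Returns a callback that accumulates the sizes of observed elements.

    Some coders expose nested streams as observables whose sizes are only
    known as they are iterated; each observed element's (estimated or
    encoded) size is added to the given accumulator as it is produced.
    """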
    def _observable_callback_inner(value, is_encoded=False):
      # TODO(ccy): If this stream is large, sample it as well.
      # To do this, we'll need to compute the average size of elements
      # in this stream to add the *total* size of this stream to accumulator.
      # We'll also want to make sure we sample at least some of this stream
      # (as self.should_sample() may be sampling very sparsely by now).
      if is_encoded:
        size = len(value)
        accumulator.update(size)
      else:
        accumulator.update(inner_coder_impl.estimate_size(value))

    return _observable_callback_inner

  def type_check(self, value):
    # type: (Any) -> None
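    # output_type_constraints maps a transform label to a
    # (parameter_name, type_constraint) pair to check the value against.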
    for transform_label, type_constraint_tuple in (
            self.output_type_constraints.items()):
      parameter_name, constraint = type_constraint_tuple
      try:
        _check_instance_type(constraint, value, parameter_name, verbose=True)
      except TypeCheckError as e:
        # TODO: Remove the 'ParDo' prefix for the label name (BEAM-10710)
        if not transform_label.startswith('ParDo'):
          transform_label = 'ParDo(%s)' % transform_label
        error_msg = (
            'Runtime type violation detected within %s: '
            '%s' % (transform_label, e))
        _, _, traceback = sys.exc_info()
        raise TypeCheckError(error_msg).with_traceback(traceback)

  def do_sample(self, windowed_value):
    # type: (windowed_value.WindowedValue) -> None
    self.type_check(windowed_value.value)

    size, observables = (
        self.coder_impl.get_estimated_size_and_observables(windowed_value))
    if not observables:
      self.current_size = size
    else:
      self.active_accumulator = SumAccumulator()
      self.active_accumulator.update(size)
      for observable, inner_coder_impl in observables:
        observable.register_observer(
            self._observable_callback(
                inner_coder_impl, self.active_accumulator))

  def update_collect(self):
    """Collects the accumulated size estimates.

    Now that the element has been processed, we ask our accumulator
    for the total and store the result in a counter.
    """
    self.element_counter.update(1)
    if self.current_size is not None:
      self.mean_byte_counter.update(self.current_size)
      self.current_size = None
    elif self.active_accumulator is not None:
      self.mean_byte_counter.update(self.active_accumulator.value())
      self.active_accumulator = None

  def _compute_next_sample(self, i):
    # https://en.wikipedia.org/wiki/Reservoir_sampling#Fast_Approximation
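    # Derivation sketch (added for clarity): with per-element sampling
    # probability p = 10/i, the distance to the next sampled element is
    # geometrically distributed, and inverse-CDF sampling with u drawn
    # uniformly from [0, 1) gives gap = log(1 - u) / log(1 - p).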
    gap = math.log(1.0 - random.random()) / math.log(1.0 - (10.0 / i))
    return i + math.floor(gap)

  def _should_sample(self):
    """Determines whether to sample the next element.

    Size calculation can be expensive, so we don't do it for each element.
    Because we need only an estimate of average size, we sample.

    We always sample the first 10 elements, then the sampling rate
    is approximately 10/N.  After reading N elements, of the next N,
    we will sample approximately 10*ln(2) (about 7) elements.

    This algorithm samples at the same rate as Reservoir Sampling, but
    it never throws away early results.  (Because we keep only a
    running accumulation, storage is not a problem, so there is no
    need to discard earlier calculations.)

    Because we accumulate and do not replace, our statistics are
    biased toward early data.  If the data are distributed uniformly,
    this is not a problem.  If the data change over time (i.e., the
    element size tends to grow or shrink over time), our estimate will
    show the bias.  We could correct this by giving weight N to each
    sample, since each sample is a stand-in for the N/(10*ln(2))
    samples around it, which is proportional to N.  Since we do not
    expect biased data, for efficiency we omit the extra multiplication.
    We could reduce the early-data bias by putting a lower bound on
    the sampling rate.

    Computing random.randint(1, self._sample_counter) for each element
    is too slow, so when the sample size is big enough (we estimate 30
    is big enough), we estimate the size of the gap after each sample.
    This estimation allows us to call random much less often.

    Returns:
      True if it is time to compute another element's size.
    """
    if self.coder_impl is None:
      return False
    self._sample_counter += 1
    if self._next_sample == 0:
      if random.randint(1, self._sample_counter) <= 10:
        if self._sample_counter > 30:
          self._next_sample = self._compute_next_sample(self._sample_counter)
        return True
      return False
    elif self._sample_counter >= self._next_sample:
      self._next_sample = self._compute_next_sample(self._sample_counter)
      return True
    return False

  def should_sample(self):
    # We create this separate method because the above "_should_sample()" method
    # is marked as inline in Cython and thus can't be exposed to Python code.
    return self._should_sample()

  def restart_sampling(self):
    self._sample_counter = 0

  def __iter__(self):
    # Iterate over the counters owned by this object; __str__ and __repr__
    # below rely on this, and no __iter__ was defined in this module.
    yield self.element_counter
    yield self.mean_byte_counter

  def __str__(self):
    return '<%s [%s]>' % (
        self.__class__.__name__, ', '.join([str(x) for x in self.__iter__()]))

  def __repr__(self):
    return '<%s %s at %s>' % (
        self.__class__.__name__, [x for x in self.__iter__()], hex(id(self)))
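
# A minimal end-to-end sketch (editorial illustration, not part of the
# module; the imports below exist elsewhere in the SDK, but this exact flow
# is an assumption modeled on how the worker's operations drive counters):
#
#   from apache_beam.coders import coders
#   from apache_beam.transforms.window import GlobalWindows
#   from apache_beam.utils.counters import CounterFactory
#
#   opcounts = OperationCounters(
#       CounterFactory(), 'step1', coders.PickleCoder(), 0)
#   wv = GlobalWindows.windowed_value('some-element')
#   opcounts.update_from(wv)     # may sample this element's encoded size
#   opcounts.update_collect()    # commits the element count and size estimate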