github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/utils/counters.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  # cython: profile=False
    19  # cython: overflowcheck=True
    20  # cython: language_level=3
    21  
    22  """Counters collect the progress of the Worker for reporting to the service.
    23  
    24  For internal use only; no backwards-compatibility guarantees.
    25  """
    26  
    27  # pytype: skip-file
    28  
    29  import threading
    30  from collections import namedtuple
    31  from typing import TYPE_CHECKING
    32  from typing import Dict
    33  
    34  from apache_beam.transforms import cy_combiners
    35  
    36  if TYPE_CHECKING:
    37    from apache_beam.transforms import core
    38  
    39  # Information identifying the IO being measured by a counter.
    40  #
    41  # A CounterName with IOTarget helps identify the IO being measured by a
    42  # counter.
    43  #
    44  # It may represent the consumption of Shuffle IO, or the consumption of
    45  # side inputs. The way in which each is represented is explained in the
    46  # documentation of the side_input_id, and shuffle_id functions.
    47  IOTargetName = namedtuple(
    48      'IOTargetName', ['requesting_step_name', 'input_index'])
    49  
    50  
    51  def side_input_id(step_name, input_index):
    52    # type: (str, int) -> IOTargetName
    53  
    54    """Create an IOTargetName that identifies the reading of a side input.
    55  
    56    Given a step "s4" that receives two side inputs, then the CounterName
    57    that represents the consumption of side input number 2 is:
    58    * step_name: s4    <---|
    59    * input_index: 2   <---|-- Identifying the side input itself
    60    * requesting_step_name: s4   <-- Identifying the step that reads from it.
    61  
    62    If "s4" emits the whole AsIter of the side input, down to a step, say "s5",
    63    then the requesting_step_name of the subsequent consumption will be "s5".
    64    """
    65    return IOTargetName(step_name, input_index)
    66  
    67  
    68  def shuffle_id(step_name):
    69    # type: (str) -> IOTargetName
    70  
    71    """Create an IOTargetName that identifies a GBK step.
    72  
    73    Given a step "s6" that is downstream from a GBK "s5", then "s6" will read
    74    from shuffle. The CounterName that quantifies the consumption of data from
    75    shuffle has:
    76    * step_name: s5
    77    * requesting_step_name: s6
    78  
    79    If "s6" emits the whole iterable down to a step, say "s7", and "s7" continues
    80    to consume data from the iterable, then a new CounterName will be:
    81    * step_name: s5    <--- Identifying the GBK
    82    * requesting_step_name: s6
    83    """
    84    return IOTargetName(step_name, None)
    85  
    86  
    87  _CounterName = namedtuple(
    88      '_CounterName',
    89      [
    90          'name',
    91          'stage_name',
    92          'step_name',
    93          'system_name',
    94          'namespace',
    95          'origin',
    96          'output_index',
    97          'io_target'
    98      ])
    99  
   100  
   101  class CounterName(_CounterName):
   102    """Naming information for a counter."""
   103    SYSTEM = object()
   104    USER = object()
   105  
   106    def __new__(
   107        cls,
   108        name,
   109        stage_name=None,
   110        step_name=None,
   111        system_name=None,
   112        namespace=None,
   113        origin=None,
   114        output_index=None,
   115        io_target=None):
   116      origin = origin or CounterName.SYSTEM
   117      return super().__new__(
   118          cls,
   119          name,
   120          stage_name,
   121          step_name,
   122          system_name,
   123          namespace,
   124          origin,
   125          output_index,
   126          io_target)
   127  
   128    def __repr__(self):
   129      return '<CounterName<%s> at %s>' % (self._str_internal(), hex(id(self)))
   130  
   131    def __str__(self):
   132      return self._str_internal()
   133  
   134    def _str_internal(self):
   135      if self.origin == CounterName.USER:
   136        return 'user-%s-%s' % (self.step_name, self.name)
   137      elif self.origin == CounterName.SYSTEM and self.output_index:
   138        return '%s-out%s-%s' % (self.step_name, self.output_index, self.name)
   139      else:
   140        return '%s-%s-%s' % (self.stage_name, self.step_name, self.name)
   141  
   142  
   143  class Counter(object):
   144    """A counter aggregates a series of values.
   145  
   146    The aggregation kind of the Counter is specified when the Counter
   147    is created.  The values aggregated must be of an appropriate for the
   148    aggregation used.  Aggregations supported are listed in the code.
   149  
   150    (The aggregated value will be reported to the Dataflow service.)
   151  
   152    Do not create directly; call CounterFactory.get_counter instead.
   153  
   154    Attributes:
   155      name: the name of the counter, a string
   156      combine_fn: the CombineFn to use for aggregation
   157      accumulator: the accumulator created for the combine_fn
   158    """
   159  
   160    # Handy references to common counters.
   161    SUM = cy_combiners.SumInt64Fn()
   162    MEAN = cy_combiners.MeanInt64Fn()
   163    BEAM_DISTRIBUTION = cy_combiners.DistributionInt64Fn()
   164  
   165    # Dataflow Distribution Accumulator Fn.
   166    # TODO(https://github.com/apache/beam/issues/18843): Generalize distribution
   167    # counter if necessary.
   168    DATAFLOW_DISTRIBUTION = cy_combiners.DataflowDistributionCounterFn()
   169  
   170    def __init__(self, name, combine_fn):
   171      # type: (CounterName, core.CombineFn) -> None
   172  
   173      """Creates a Counter object.
   174  
   175      Args:
   176        name: the name of this counter. It may be a string,
   177              or a CounterName object.
   178        combine_fn: the CombineFn to use for aggregation
   179      """
   180      self.name = name
   181      self.combine_fn = combine_fn
   182      self.accumulator = combine_fn.create_accumulator()
   183      self._add_input = self.combine_fn.add_input
   184  
   185    def update(self, value):
   186      self.accumulator = self._add_input(self.accumulator, value)
   187  
   188    def update_n(self, value, n):
   189      """Update the counter with the same value N times"""
   190      for _ in range(n):
   191        self.accumulator = self._add_input(self, value)
   192  
   193    def reset(self, value):
   194      self.accumulator = self.combine_fn.create_accumulator()
   195  
   196    def value(self):
   197      return self.combine_fn.extract_output(self.accumulator)
   198  
   199    def __str__(self):
   200      return '<%s>' % self._str_internal()
   201  
   202    def __repr__(self):
   203      return '<%s at %s>' % (self._str_internal(), hex(id(self)))
   204  
   205    def _str_internal(self):
   206      return '%s %s %s' % (
   207          self.name, self.combine_fn.__class__.__name__, self.value())
   208  
   209  
   210  class AccumulatorCombineFnCounter(Counter):
   211    """Counter optimized for a mutating accumulator that holds all the logic."""
   212    def __init__(self, name, combine_fn):
   213      # type: (CounterName, cy_combiners.AccumulatorCombineFn) -> None
   214      assert isinstance(combine_fn, cy_combiners.AccumulatorCombineFn)
   215      super().__init__(name, combine_fn)
   216      self.reset()
   217  
   218    def update(self, value):
   219      self._fast_add_input(value)
   220  
   221    def update_n(self, value, n):
   222      self._fast_add_input_n(value, n)
   223  
   224    def reset(self):
   225      self.accumulator = self.combine_fn.create_accumulator()
   226      self._fast_add_input = self.accumulator.add_input
   227      self._fast_add_input_n = self.accumulator.add_input_n
   228  
   229  
   230  class CounterFactory(object):
   231    """Keeps track of unique counters."""
   232    def __init__(self):
   233      self.counters = {}  # type: Dict[CounterName, Counter]
   234  
   235      # Lock to be acquired when accessing the counters map.
   236      self._lock = threading.Lock()
   237  
   238    def get_counter(self, name, combine_fn):
   239      # type: (CounterName, core.CombineFn) -> Counter
   240  
   241      """Returns a counter with the requested name.
   242  
   243      Passing in the same name will return the same counter; the
   244      combine_fn must agree.
   245  
   246      Args:
   247        name: the name of this counter.  Typically has three parts:
   248          "step-output-counter".
   249        combine_fn: the CombineFn to use for aggregation
   250      Returns:
   251        A new or existing counter with the requested name.
   252      """
   253      with self._lock:
   254        counter = self.counters.get(name, None)
   255        if counter:
   256          assert counter.combine_fn == combine_fn
   257        else:
   258          if isinstance(combine_fn, cy_combiners.AccumulatorCombineFn):
   259            counter = AccumulatorCombineFnCounter(name, combine_fn)
   260          else:
   261            counter = Counter(name, combine_fn)
   262          self.counters[name] = counter
   263        return counter
   264  
   265    def reset(self):
   266      # Counters are cached in state sampler states.
   267      with self._lock:
   268        for counter in self.counters.values():
   269          counter.reset()
   270  
   271    def get_counters(self):
   272      """Returns the current set of counters.
   273  
   274      Returns:
   275        An iterable that contains the current set of counters. To make sure that
   276        multiple threads can iterate over the set of counters, we return a new
   277        iterable here. Note that the actual set of counters may get modified after
   278        this method returns hence the returned iterable may be stale.
   279      """
   280      with self._lock:
   281        return self.counters.values()  # pylint: disable=bad-option-value