github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/utils/histogram.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  
    18  import logging
    19  import math
    20  import threading
    21  from collections import Counter
    22  
    23  _LOGGER = logging.getLogger(__name__)
    24  
    25  
    26  class Histogram(object):
    27    """A histogram that supports estimated percentile with linear interpolation.
    28    """
    29    def __init__(self, bucket_type):
    30      self._lock = threading.Lock()
    31      self._bucket_type = bucket_type
    32      self._buckets = Counter()
    33      self._num_records = 0
    34      self._num_top_records = 0
    35      self._num_bot_records = 0
    36  
    37    def clear(self):
    38      with self._lock:
    39        self._buckets = Counter()
    40        self._num_records = 0
    41        self._num_top_records = 0
    42        self._num_bot_records = 0
    43  
    44    def copy(self):
    45      with self._lock:
    46        histogram = Histogram(self._bucket_type)
    47        histogram._num_records = self._num_records
    48        histogram._num_top_records = self._num_top_records
    49        histogram._num_bot_records = self._num_bot_records
    50        histogram._buckets = self._buckets.copy()
    51        return histogram
    52  
    53    def combine(self, other):
    54      if not isinstance(other,
    55                        Histogram) or self._bucket_type != other._bucket_type:
    56        raise RuntimeError('failed to combine histogram.')
    57      other_histogram = other.copy()
    58      with self._lock:
    59        histogram = Histogram(self._bucket_type)
    60        histogram._num_records = self._num_records + other_histogram._num_records
    61        histogram._num_top_records = (
    62            self._num_top_records + other_histogram._num_top_records)
    63        histogram._num_bot_records = (
    64            self._num_bot_records + other_histogram._num_bot_records)
    65        histogram._buckets = self._buckets + other_histogram._buckets
    66        return histogram
    67  
    68    def record(self, *args):
    69      for arg in args:
    70        self._record(arg)
    71  
    72    def _record(self, value):
    73      range_from = self._bucket_type.range_from()
    74      range_to = self._bucket_type.range_to()
    75      with self._lock:
    76        if value >= range_to:
    77          _LOGGER.warning('record is out of upper bound %s: %s', range_to, value)
    78          self._num_top_records += 1
    79        elif value < range_from:
    80          _LOGGER.warning(
    81              'record is out of lower bound %s: %s', range_from, value)
    82          self._num_bot_records += 1
    83        else:
    84          index = self._bucket_type.bucket_index(value)
    85          self._buckets[index] = self._buckets.get(index, 0) + 1
    86          self._num_records += 1
    87  
    88    def total_count(self):
    89      return self._num_records + self._num_top_records + self._num_bot_records
    90  
    91    def p99(self):
    92      return self.get_linear_interpolation(0.99)
    93  
    94    def p90(self):
    95      return self.get_linear_interpolation(0.90)
    96  
    97    def p50(self):
    98      return self.get_linear_interpolation(0.50)
    99  
   100    def get_percentile_info(self):
   101      def _format(f):
   102        if f == float('-inf'):
   103          return '<%s' % self._bucket_type.range_from()
   104        elif f == float('inf'):
   105          return '>=%s' % self._bucket_type.range_to()
   106        else:
   107          return str(int(round(f)))  # pylint: disable=bad-option-value
   108  
   109      with self._lock:
   110        return (
   111            'Total count: %s, '
   112            'P99: %s, P90: %s, P50: %s' % (
   113                self.total_count(),
   114                _format(self._get_linear_interpolation(0.99)),
   115                _format(self._get_linear_interpolation(0.90)),
   116                _format(self._get_linear_interpolation(0.50))))
   117  
   118    def get_linear_interpolation(self, percentile):
   119      """Calculate percentile estimation based on linear interpolation.
   120  
   121      It first finds the bucket which includes the target percentile and
   122      projects the estimated point in the bucket by assuming all the elements
   123      in the bucket are uniformly distributed.
   124  
   125      Args:
   126        percentile: The target percentile of the value returning from this
   127          method. Should be a floating point number greater than 0 and less
   128          than 1.
   129      """
   130      with self._lock:
   131        return self._get_linear_interpolation(percentile)
   132  
   133    def _get_linear_interpolation(self, percentile):
   134      total_num_records = self.total_count()
   135      if total_num_records == 0:
   136        raise RuntimeError('histogram has no record.')
   137  
   138      index = 0
   139      record_sum = self._num_bot_records
   140      if record_sum / total_num_records >= percentile:
   141        return float('-inf')
   142      while index < self._bucket_type.num_buckets():
   143        record_sum += self._buckets.get(index, 0)
   144        if record_sum / total_num_records >= percentile:
   145          break
   146        index += 1
   147      if index == self._bucket_type.num_buckets():
   148        return float('inf')
   149  
   150      frac_percentile = percentile - (
   151          record_sum - self._buckets[index]) / total_num_records
   152      bucket_percentile = self._buckets[index] / total_num_records
   153      frac_bucket_size = frac_percentile * self._bucket_type.bucket_size(
   154          index) / bucket_percentile
   155      return (
   156          self._bucket_type.range_from() +
   157          self._bucket_type.accumulated_bucket_size(index) + frac_bucket_size)
   158  
   159    def __eq__(self, other):
   160      if not isinstance(other, Histogram):
   161        return False
   162      return (
   163          self._bucket_type == other._bucket_type and
   164          self._num_records == other._num_records and
   165          self._num_top_records == other._num_top_records and
   166          self._num_bot_records == other._num_bot_records and
   167          self._buckets == other._buckets)
   168  
   169    def __hash__(self):
   170      return hash((
   171          self._bucket_type,
   172          self._num_records,
   173          self._num_top_records,
   174          self._num_bot_records,
   175          frozenset(self._buckets.items())))
   176  
   177  
   178  class BucketType(object):
   179    def range_from(self):
   180      """Lower bound of a starting bucket."""
   181      raise NotImplementedError
   182  
   183    def range_to(self):
   184      """Upper bound of an ending bucket."""
   185      raise NotImplementedError
   186  
   187    def num_buckets(self):
   188      """The number of buckets."""
   189      raise NotImplementedError
   190  
   191    def bucket_index(self, value):
   192      """Get the bucket array index for the given value."""
   193      raise NotImplementedError
   194  
   195    def bucket_size(self, index):
   196      """Get the bucket size for the given bucket array index."""
   197      raise NotImplementedError
   198  
   199    def accumulated_bucket_size(self, end_index):
   200      """Get the accumulated bucket size from bucket index 0 until endIndex.
   201  
   202      Generally, this can be calculated as
   203      `sigma(0 <= i < endIndex) getBucketSize(i)`. However, a child class could
   204      provide better optimized calculation.
   205      """
   206      raise NotImplementedError
   207  
   208  
   209  class LinearBucket(BucketType):
   210    def __init__(self, start, width, num_buckets):
   211      """Create a histogram with linear buckets.
   212  
   213      Args:
   214        start: Lower bound of a starting bucket.
   215        width: Bucket width. Smaller width implies a better resolution for
   216          percentile estimation.
   217        num_buckets: The number of buckets. Upper bound of an ending bucket is
   218          defined by start + width * numBuckets.
   219      """
   220      self._start = start
   221      self._width = width
   222      self._num_buckets = num_buckets
   223  
   224    def range_from(self):
   225      return self._start
   226  
   227    def range_to(self):
   228      return self._start + self._width * self._num_buckets
   229  
   230    def num_buckets(self):
   231      return self._num_buckets
   232  
   233    def bucket_index(self, value):
   234      return math.floor((value - self._start) / self._width)
   235  
   236    def bucket_size(self, index):
   237      return self._width
   238  
   239    def accumulated_bucket_size(self, end_index):
   240      return self._width * end_index
   241  
   242    def __eq__(self, other):
   243      if not isinstance(other, LinearBucket):
   244        return False
   245      return (
   246          self._start == other._start and self._width == other._width and
   247          self._num_buckets == other._num_buckets)
   248  
   249    def __hash__(self):
   250      return hash((self._start, self._width, self._num_buckets))