# Source: github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/utils/histogram.py
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import logging
import math
import threading
from collections import Counter

_LOGGER = logging.getLogger(__name__)


class Histogram(object):
  """A histogram that supports estimated percentile with linear interpolation.

  Records that fall inside ``[range_from, range_to)`` of the bucket type are
  counted per bucket; records below or above that range are only counted in
  two overflow counters. Mutations and percentile queries are guarded by an
  internal lock.
  """
  def __init__(self, bucket_type):
    """Create an empty histogram.

    Args:
      bucket_type: A ``BucketType`` describing the range and the bucket layout.
    """
    self._lock = threading.Lock()
    self._bucket_type = bucket_type
    # Maps bucket index -> number of in-range records in that bucket.
    self._buckets = Counter()
    # Number of in-range records (sum of all bucket counts).
    self._num_records = 0
    # Number of records >= range_to.
    self._num_top_records = 0
    # Number of records < range_from.
    self._num_bot_records = 0

  def clear(self):
    """Drop every recorded value, keeping the bucket type."""
    with self._lock:
      self._buckets = Counter()
      self._num_records = 0
      self._num_top_records = 0
      self._num_bot_records = 0

  def copy(self):
    """Return a new ``Histogram`` with the same bucket type and counts."""
    with self._lock:
      histogram = Histogram(self._bucket_type)
      histogram._num_records = self._num_records
      histogram._num_top_records = self._num_top_records
      histogram._num_bot_records = self._num_bot_records
      histogram._buckets = self._buckets.copy()
      return histogram

  def combine(self, other):
    """Return a new histogram holding the records of both histograms.

    Neither ``self`` nor ``other`` is modified.

    Args:
      other: A ``Histogram`` with a bucket type equal to this one's.

    Raises:
      RuntimeError: If ``other`` is not a ``Histogram`` or its bucket type
        differs from this histogram's bucket type.
    """
    if not isinstance(other,
                      Histogram) or self._bucket_type != other._bucket_type:
      raise RuntimeError('failed to combine histogram.')
    # Snapshot ``other`` under its own lock first so that the two locks are
    # never held at the same time.
    other_histogram = other.copy()
    with self._lock:
      histogram = Histogram(self._bucket_type)
      histogram._num_records = self._num_records + other_histogram._num_records
      histogram._num_top_records = (
          self._num_top_records + other_histogram._num_top_records)
      histogram._num_bot_records = (
          self._num_bot_records + other_histogram._num_bot_records)
      histogram._buckets = self._buckets + other_histogram._buckets
      return histogram

  def record(self, *args):
    """Record every value passed as a positional argument."""
    for arg in args:
      self._record(arg)

  def _record(self, value):
    """Record a single value.

    Values outside ``[range_from, range_to)`` only bump the matching overflow
    counter and trigger a warning log.
    """
    bucket_type = self._bucket_type
    range_from = bucket_type.range_from()
    range_to = bucket_type.range_to()
    # (message, bound) to log once the lock has been released, if any.
    out_of_bound = None
    with self._lock:
      if value >= range_to:
        self._num_top_records += 1
        out_of_bound = ('record is out of upper bound %s: %s', range_to)
      elif value < range_from:
        self._num_bot_records += 1
        out_of_bound = ('record is out of lower bound %s: %s', range_from)
      else:
        # The value is inside the range, so it belongs to a valid bucket.
        # Clamp the computed index because floating-point rounding (or a
        # custom bucket type) may yield an index just outside
        # [0, num_buckets); such a record would be counted in the total but
        # never be visited by the percentile estimation.
        last_index = bucket_type.num_buckets() - 1
        index = max(0, min(bucket_type.bucket_index(value), last_index))
        self._buckets[index] += 1
        self._num_records += 1
    if out_of_bound is not None:
      # Logging happens outside the lock: handlers may perform I/O and must
      # be free to read this histogram without deadlocking.
      message, bound = out_of_bound
      _LOGGER.warning(message, bound, value)

  def total_count(self):
    """Return the number of recorded values, including out-of-range ones."""
    return self._num_records + self._num_top_records + self._num_bot_records

  def p99(self):
    """Return the estimated 99th percentile."""
    return self.get_linear_interpolation(0.99)

  def p90(self):
    """Return the estimated 90th percentile."""
    return self.get_linear_interpolation(0.90)

  def p50(self):
    """Return the estimated 50th percentile."""
    return self.get_linear_interpolation(0.50)

  def get_percentile_info(self):
    """Return a one-line summary with the total count and P99/P90/P50.

    Percentiles that fall into the out-of-range records are rendered as
    ``<range_from`` or ``>=range_to``; the others are rounded to an integer.

    Raises:
      RuntimeError: If the histogram has no record.
    """
    def _format(f):
      if f == float('-inf'):
        return '<%s' % self._bucket_type.range_from()
      elif f == float('inf'):
        return '>=%s' % self._bucket_type.range_to()
      else:
        return str(int(round(f)))  # pylint: disable=bad-option-value

    # Hold the lock for the whole summary so that all figures come from the
    # same state of the histogram.
    with self._lock:
      return (
          'Total count: %s, '
          'P99: %s, P90: %s, P50: %s' % (
              self.total_count(),
              _format(self._get_linear_interpolation(0.99)),
              _format(self._get_linear_interpolation(0.90)),
              _format(self._get_linear_interpolation(0.50))))

  def get_linear_interpolation(self, percentile):
    """Calculate percentile estimation based on linear interpolation.

    It first finds the bucket which includes the target percentile and
    projects the estimated point in the bucket by assuming all the elements
    in the bucket are uniformly distributed.

    Args:
      percentile: The target percentile of the value returning from this
        method. Should be a floating point number greater than 0 and less
        than 1.

    Returns:
      The estimated value, ``float('-inf')`` when the percentile falls into
      the records below the range, or ``float('inf')`` when it falls into the
      records above the range.

    Raises:
      RuntimeError: If the histogram has no record.
    """
    with self._lock:
      return self._get_linear_interpolation(percentile)

  def _get_linear_interpolation(self, percentile):
    """Lock-free body of ``get_linear_interpolation``.

    Callers are expected to hold ``self._lock``.
    """
    total_num_records = self.total_count()
    if total_num_records == 0:
      raise RuntimeError('histogram has no record.')

    num_buckets = self._bucket_type.num_buckets()
    index = 0
    # Running count of records at or below the current bucket, starting with
    # the records that fell below the range.
    record_sum = self._num_bot_records
    if record_sum / total_num_records >= percentile:
      return float('-inf')
    while index < num_buckets:
      record_sum += self._buckets.get(index, 0)
      if record_sum / total_num_records >= percentile:
        break
      index += 1
    if index == num_buckets:
      # The percentile is only reached by the records above the range.
      return float('inf')

    # Share of the target percentile that lies inside the selected bucket.
    frac_percentile = percentile - (
        record_sum - self._buckets[index]) / total_num_records
    # Share of all records held by the selected bucket. It is never zero
    # here: the loop can only stop on a bucket that raised the running sum.
    bucket_percentile = self._buckets[index] / total_num_records
    frac_bucket_size = frac_percentile * self._bucket_type.bucket_size(
        index) / bucket_percentile
    return (
        self._bucket_type.range_from() +
        self._bucket_type.accumulated_bucket_size(index) + frac_bucket_size)

  def __eq__(self, other):
    if not isinstance(other, Histogram):
      return False
    return (
        self._bucket_type == other._bucket_type and
        self._num_records == other._num_records and
        self._num_top_records == other._num_top_records and
        self._num_bot_records == other._num_bot_records and
        self._buckets == other._buckets)

  def __hash__(self):
    # Hashes the current counts, so the value changes when records are added.
    return hash((
        self._bucket_type,
        self._num_records,
        self._num_top_records,
        self._num_bot_records,
        frozenset(self._buckets.items())))


class BucketType(object):
  """Interface describing the value range and bucket layout of a histogram."""
  def range_from(self):
    """Lower bound of a starting bucket."""
    raise NotImplementedError

  def range_to(self):
    """Upper bound of an ending bucket."""
    raise NotImplementedError

  def num_buckets(self):
    """The number of buckets."""
    raise NotImplementedError

  def bucket_index(self, value):
    """Get the bucket array index for the given value."""
    raise NotImplementedError

  def bucket_size(self, index):
    """Get the bucket size for the given bucket array index."""
    raise NotImplementedError

  def accumulated_bucket_size(self, end_index):
    """Get the accumulated bucket size from bucket index 0 until end_index.

    Generally, this can be calculated as
    `sigma(0 <= i < end_index) bucket_size(i)`. However, a child class could
    provide better optimized calculation.
    """
    raise NotImplementedError


class LinearBucket(BucketType):
  """Bucket type whose buckets all have the same width."""
  def __init__(self, start, width, num_buckets):
    """Create a histogram with linear buckets.

    Args:
      start: Lower bound of a starting bucket.
      width: Bucket width. Smaller width implies a better resolution for
        percentile estimation.
      num_buckets: The number of buckets. Upper bound of an ending bucket is
        defined by start + width * num_buckets.
    """
    self._start = start
    self._width = width
    self._num_buckets = num_buckets

  def range_from(self):
    """Return the lower bound of the first bucket."""
    return self._start

  def range_to(self):
    """Return the upper bound of the last bucket."""
    return self._start + self._width * self._num_buckets

  def num_buckets(self):
    """Return the number of buckets."""
    return self._num_buckets

  def bucket_index(self, value):
    """Return the index of the bucket that contains ``value``."""
    return math.floor((value - self._start) / self._width)

  def bucket_size(self, index):
    """Return the bucket width; it is the same for every index."""
    return self._width

  def accumulated_bucket_size(self, end_index):
    """Return the total width of the buckets before ``end_index``."""
    return self._width * end_index

  def __eq__(self, other):
    if not isinstance(other, LinearBucket):
      return False
    return (
        self._start == other._start and self._width == other._width and
        self._num_buckets == other._num_buckets)

  def __hash__(self):
    return hash((self._start, self._width, self._num_buckets))