github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/stats/histogram.go (about) 1 // Copyright 2017 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package stats 12 13 import ( 14 "math" 15 "sort" 16 17 "github.com/cockroachdb/cockroach/pkg/settings" 18 "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" 19 "github.com/cockroachdb/cockroach/pkg/sql/sqlbase" 20 "github.com/cockroachdb/cockroach/pkg/sql/types" 21 "github.com/cockroachdb/cockroach/pkg/util/encoding" 22 "github.com/cockroachdb/errors" 23 ) 24 25 // HistogramClusterMode controls the cluster setting for enabling 26 // histogram collection. 27 var HistogramClusterMode = settings.RegisterPublicBoolSetting( 28 "sql.stats.histogram_collection.enabled", 29 "histogram collection mode", 30 true, 31 ) 32 33 // EquiDepthHistogram creates a histogram where each bucket contains roughly 34 // the same number of samples (though it can vary when a boundary value has 35 // high frequency). 36 // 37 // numRows is the total number of rows from which values were sampled 38 // (excluding rows that have NULL values on the histogram column). 39 // 40 // In addition to building the histogram buckets, EquiDepthHistogram also 41 // estimates the number of distinct values in each bucket. It distributes the 42 // known number of distinct values (distinctCount) among the buckets, in 43 // proportion with the number of rows in each bucket. 44 func EquiDepthHistogram( 45 evalCtx *tree.EvalContext, samples tree.Datums, numRows, distinctCount int64, maxBuckets int, 46 ) (HistogramData, error) { 47 numSamples := len(samples) 48 if numSamples == 0 { 49 return HistogramData{}, nil 50 } 51 if maxBuckets < 2 { 52 return HistogramData{}, errors.Errorf("histogram requires at least two buckets") 53 } 54 if numRows < int64(numSamples) { 55 return HistogramData{}, errors.Errorf("more samples than rows") 56 } 57 for _, d := range samples { 58 if d == tree.DNull { 59 return HistogramData{}, errors.Errorf("NULL values not allowed in histogram") 60 } 61 } 62 sort.Slice(samples, func(i, j int) bool { 63 return samples[i].Compare(evalCtx, samples[j]) < 0 64 }) 65 numBuckets := maxBuckets 66 if maxBuckets > numSamples { 67 numBuckets = numSamples 68 } 69 h := HistogramData{ 70 Buckets: make([]HistogramData_Bucket, 0, numBuckets), 71 } 72 lowerBound := samples[0] 73 h.ColumnType = lowerBound.ResolvedType() 74 var distinctCountRange, distinctCountEq float64 75 76 // i keeps track of the current sample and advances as we form buckets. 77 for i, b := 0, 0; b < numBuckets && i < numSamples; b++ { 78 // num is the number of samples in this bucket. The first bucket has 79 // num=1 so the histogram has a clear lower bound. 80 num := (numSamples - i) / (numBuckets - b) 81 if i == 0 || num < 1 { 82 num = 1 83 } 84 upper := samples[i+num-1] 85 // numLess is the number of samples less than upper (in this bucket). 86 numLess := 0 87 for ; numLess < num-1; numLess++ { 88 if c := samples[i+numLess].Compare(evalCtx, upper); c == 0 { 89 break 90 } else if c > 0 { 91 return HistogramData{}, errors.AssertionFailedf("%+v", "samples not sorted") 92 } 93 } 94 // Advance the boundary of the bucket to cover all samples equal to upper. 95 for ; i+num < numSamples; num++ { 96 if samples[i+num].Compare(evalCtx, upper) != 0 { 97 break 98 } 99 } 100 101 numEq := int64(num-numLess) * numRows / int64(numSamples) 102 numRange := int64(numLess) * numRows / int64(numSamples) 103 distinctRange := estimatedDistinctValuesInRange(float64(numRange), lowerBound, upper) 104 encoded, err := sqlbase.EncodeTableKey(nil, upper, encoding.Ascending) 105 if err != nil { 106 return HistogramData{}, err 107 } 108 109 i += num 110 h.Buckets = append(h.Buckets, HistogramData_Bucket{ 111 NumEq: numEq, 112 NumRange: numRange, 113 DistinctRange: distinctRange, 114 UpperBound: encoded, 115 }) 116 117 // Keep track of the total number of estimated distinct values. This will 118 // be used to adjust the distinct count below. 119 distinctCountRange += distinctRange 120 if numEq > 0 { 121 distinctCountEq++ 122 } 123 124 lowerBound = getNextLowerBound(evalCtx, upper) 125 } 126 h.adjustDistinctCount(float64(distinctCount), distinctCountRange, distinctCountEq) 127 return h, nil 128 } 129 130 // adjustDistinctCount adjusts the number of distinct values per bucket based 131 // on the total number of distinct values. 132 func (h *HistogramData) adjustDistinctCount( 133 distinctCountTotal, distinctCountRange, distinctCountEq float64, 134 ) { 135 if distinctCountRange == 0 { 136 return 137 } 138 139 adjustmentFactor := (distinctCountTotal - distinctCountEq) / distinctCountRange 140 if adjustmentFactor < 0 { 141 adjustmentFactor = 0 142 } 143 for i := range h.Buckets { 144 h.Buckets[i].DistinctRange *= adjustmentFactor 145 } 146 } 147 148 // estimatedDistinctValuesInRange returns the estimated number of distinct 149 // values in the range [lowerBound, upperBound), given that the total number 150 // of values is numRange. 151 func estimatedDistinctValuesInRange(numRange float64, lowerBound, upperBound tree.Datum) float64 { 152 if maxDistinct, ok := maxDistinctValuesInRange(lowerBound, upperBound); ok { 153 return expectedDistinctCount(numRange, maxDistinct) 154 } 155 return numRange 156 } 157 158 // maxDistinctValuesInRange returns the maximum number of distinct values in 159 // the range [lowerBound, upperBound). It returns ok=false when it is not 160 // possible to determine a finite value (which is the case for all types other 161 // than integers and dates). 162 func maxDistinctValuesInRange(lowerBound, upperBound tree.Datum) (_ float64, ok bool) { 163 switch lowerBound.ResolvedType().Family() { 164 case types.IntFamily: 165 return float64(*upperBound.(*tree.DInt)) - float64(*lowerBound.(*tree.DInt)), true 166 167 case types.DateFamily: 168 lower := lowerBound.(*tree.DDate) 169 upper := upperBound.(*tree.DDate) 170 if lower.IsFinite() && upper.IsFinite() { 171 return float64(upper.PGEpochDays()) - float64(lower.PGEpochDays()), true 172 } 173 return 0, false 174 175 default: 176 return 0, false 177 } 178 } 179 180 func getNextLowerBound(evalCtx *tree.EvalContext, currentUpperBound tree.Datum) tree.Datum { 181 nextLowerBound, ok := currentUpperBound.Next(evalCtx) 182 if !ok { 183 nextLowerBound = currentUpperBound 184 } 185 return nextLowerBound 186 } 187 188 // expectedDistinctCount returns the expected number of distinct values 189 // among k random numbers selected from n possible values. We assume the 190 // values are chosen using uniform random sampling with replacement. 191 func expectedDistinctCount(k, n float64) float64 { 192 if n == 0 || k == 0 { 193 return 0 194 } 195 // The probability that one specific value (out of the n possible values) 196 // does not appear in any of the k selections is: 197 // 198 // ⎛ n-1 ⎞ k 199 // p = ⎜-----⎟ 200 // ⎝ n ⎠ 201 // 202 // Therefore, the probability that a specific value appears at least once is 203 // 1-p. Over all n values, the expected number that appear at least once is 204 // n * (1-p). In other words, the expected distinct count is: 205 // 206 // ⎛ ⎛ n-1 ⎞ k ⎞ 207 // E[distinct count] = n * ⎜ 1 - ⎜-----⎟ ⎟ 208 // ⎝ ⎝ n ⎠ ⎠ 209 // 210 // See https://math.stackexchange.com/questions/72223/finding-expected- 211 // number-of-distinct-values-selected-from-a-set-of-integers for more info. 212 count := n * (1 - math.Pow((n-1)/n, k)) 213 214 // It's possible that if n is very large, floating point precision errors 215 // will cause count to be 0. In that case, just return min(n, k). 216 if count == 0 { 217 count = k 218 if n < k { 219 count = n 220 } 221 } 222 return count 223 }