github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/stats/histogram.go (about)

     1  // Copyright 2017 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package stats
    12  
    13  import (
    14  	"math"
    15  	"sort"
    16  
    17  	"github.com/cockroachdb/cockroach/pkg/settings"
    18  	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
    19  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    20  	"github.com/cockroachdb/cockroach/pkg/sql/types"
    21  	"github.com/cockroachdb/cockroach/pkg/util/encoding"
    22  	"github.com/cockroachdb/errors"
    23  )
    24  
    25  // HistogramClusterMode controls the cluster setting for enabling
    26  // histogram collection.
    27  var HistogramClusterMode = settings.RegisterPublicBoolSetting(
    28  	"sql.stats.histogram_collection.enabled",
    29  	"histogram collection mode",
    30  	true,
    31  )
    32  
    33  // EquiDepthHistogram creates a histogram where each bucket contains roughly
    34  // the same number of samples (though it can vary when a boundary value has
    35  // high frequency).
    36  //
    37  // numRows is the total number of rows from which values were sampled
    38  // (excluding rows that have NULL values on the histogram column).
    39  //
    40  // In addition to building the histogram buckets, EquiDepthHistogram also
    41  // estimates the number of distinct values in each bucket. It distributes the
    42  // known number of distinct values (distinctCount) among the buckets, in
    43  // proportion with the number of rows in each bucket.
    44  func EquiDepthHistogram(
    45  	evalCtx *tree.EvalContext, samples tree.Datums, numRows, distinctCount int64, maxBuckets int,
    46  ) (HistogramData, error) {
    47  	numSamples := len(samples)
    48  	if numSamples == 0 {
    49  		return HistogramData{}, nil
    50  	}
    51  	if maxBuckets < 2 {
    52  		return HistogramData{}, errors.Errorf("histogram requires at least two buckets")
    53  	}
    54  	if numRows < int64(numSamples) {
    55  		return HistogramData{}, errors.Errorf("more samples than rows")
    56  	}
    57  	for _, d := range samples {
    58  		if d == tree.DNull {
    59  			return HistogramData{}, errors.Errorf("NULL values not allowed in histogram")
    60  		}
    61  	}
    62  	sort.Slice(samples, func(i, j int) bool {
    63  		return samples[i].Compare(evalCtx, samples[j]) < 0
    64  	})
    65  	numBuckets := maxBuckets
    66  	if maxBuckets > numSamples {
    67  		numBuckets = numSamples
    68  	}
    69  	h := HistogramData{
    70  		Buckets: make([]HistogramData_Bucket, 0, numBuckets),
    71  	}
    72  	lowerBound := samples[0]
    73  	h.ColumnType = lowerBound.ResolvedType()
    74  	var distinctCountRange, distinctCountEq float64
    75  
    76  	// i keeps track of the current sample and advances as we form buckets.
    77  	for i, b := 0, 0; b < numBuckets && i < numSamples; b++ {
    78  		// num is the number of samples in this bucket. The first bucket has
    79  		// num=1 so the histogram has a clear lower bound.
    80  		num := (numSamples - i) / (numBuckets - b)
    81  		if i == 0 || num < 1 {
    82  			num = 1
    83  		}
    84  		upper := samples[i+num-1]
    85  		// numLess is the number of samples less than upper (in this bucket).
    86  		numLess := 0
    87  		for ; numLess < num-1; numLess++ {
    88  			if c := samples[i+numLess].Compare(evalCtx, upper); c == 0 {
    89  				break
    90  			} else if c > 0 {
    91  				return HistogramData{}, errors.AssertionFailedf("%+v", "samples not sorted")
    92  			}
    93  		}
    94  		// Advance the boundary of the bucket to cover all samples equal to upper.
    95  		for ; i+num < numSamples; num++ {
    96  			if samples[i+num].Compare(evalCtx, upper) != 0 {
    97  				break
    98  			}
    99  		}
   100  
   101  		numEq := int64(num-numLess) * numRows / int64(numSamples)
   102  		numRange := int64(numLess) * numRows / int64(numSamples)
   103  		distinctRange := estimatedDistinctValuesInRange(float64(numRange), lowerBound, upper)
   104  		encoded, err := sqlbase.EncodeTableKey(nil, upper, encoding.Ascending)
   105  		if err != nil {
   106  			return HistogramData{}, err
   107  		}
   108  
   109  		i += num
   110  		h.Buckets = append(h.Buckets, HistogramData_Bucket{
   111  			NumEq:         numEq,
   112  			NumRange:      numRange,
   113  			DistinctRange: distinctRange,
   114  			UpperBound:    encoded,
   115  		})
   116  
   117  		// Keep track of the total number of estimated distinct values. This will
   118  		// be used to adjust the distinct count below.
   119  		distinctCountRange += distinctRange
   120  		if numEq > 0 {
   121  			distinctCountEq++
   122  		}
   123  
   124  		lowerBound = getNextLowerBound(evalCtx, upper)
   125  	}
   126  	h.adjustDistinctCount(float64(distinctCount), distinctCountRange, distinctCountEq)
   127  	return h, nil
   128  }
   129  
   130  // adjustDistinctCount adjusts the number of distinct values per bucket based
   131  // on the total number of distinct values.
   132  func (h *HistogramData) adjustDistinctCount(
   133  	distinctCountTotal, distinctCountRange, distinctCountEq float64,
   134  ) {
   135  	if distinctCountRange == 0 {
   136  		return
   137  	}
   138  
   139  	adjustmentFactor := (distinctCountTotal - distinctCountEq) / distinctCountRange
   140  	if adjustmentFactor < 0 {
   141  		adjustmentFactor = 0
   142  	}
   143  	for i := range h.Buckets {
   144  		h.Buckets[i].DistinctRange *= adjustmentFactor
   145  	}
   146  }
   147  
   148  // estimatedDistinctValuesInRange returns the estimated number of distinct
   149  // values in the range [lowerBound, upperBound), given that the total number
   150  // of values is numRange.
   151  func estimatedDistinctValuesInRange(numRange float64, lowerBound, upperBound tree.Datum) float64 {
   152  	if maxDistinct, ok := maxDistinctValuesInRange(lowerBound, upperBound); ok {
   153  		return expectedDistinctCount(numRange, maxDistinct)
   154  	}
   155  	return numRange
   156  }
   157  
   158  // maxDistinctValuesInRange returns the maximum number of distinct values in
   159  // the range [lowerBound, upperBound). It returns ok=false when it is not
   160  // possible to determine a finite value (which is the case for all types other
   161  // than integers and dates).
   162  func maxDistinctValuesInRange(lowerBound, upperBound tree.Datum) (_ float64, ok bool) {
   163  	switch lowerBound.ResolvedType().Family() {
   164  	case types.IntFamily:
   165  		return float64(*upperBound.(*tree.DInt)) - float64(*lowerBound.(*tree.DInt)), true
   166  
   167  	case types.DateFamily:
   168  		lower := lowerBound.(*tree.DDate)
   169  		upper := upperBound.(*tree.DDate)
   170  		if lower.IsFinite() && upper.IsFinite() {
   171  			return float64(upper.PGEpochDays()) - float64(lower.PGEpochDays()), true
   172  		}
   173  		return 0, false
   174  
   175  	default:
   176  		return 0, false
   177  	}
   178  }
   179  
   180  func getNextLowerBound(evalCtx *tree.EvalContext, currentUpperBound tree.Datum) tree.Datum {
   181  	nextLowerBound, ok := currentUpperBound.Next(evalCtx)
   182  	if !ok {
   183  		nextLowerBound = currentUpperBound
   184  	}
   185  	return nextLowerBound
   186  }
   187  
   188  // expectedDistinctCount returns the expected number of distinct values
   189  // among k random numbers selected from n possible values. We assume the
   190  // values are chosen using uniform random sampling with replacement.
   191  func expectedDistinctCount(k, n float64) float64 {
   192  	if n == 0 || k == 0 {
   193  		return 0
   194  	}
   195  	// The probability that one specific value (out of the n possible values)
   196  	// does not appear in any of the k selections is:
   197  	//
   198  	//         ⎛ n-1 ⎞ k
   199  	//     p = ⎜-----⎟
   200  	//         ⎝  n  ⎠
   201  	//
   202  	// Therefore, the probability that a specific value appears at least once is
   203  	// 1-p. Over all n values, the expected number that appear at least once is
   204  	// n * (1-p). In other words, the expected distinct count is:
   205  	//
   206  	//                             ⎛     ⎛ n-1 ⎞ k ⎞
   207  	//     E[distinct count] = n * ⎜ 1 - ⎜-----⎟   ⎟
   208  	//                             ⎝     ⎝  n  ⎠   ⎠
   209  	//
   210  	// See https://math.stackexchange.com/questions/72223/finding-expected-
   211  	//   number-of-distinct-values-selected-from-a-set-of-integers for more info.
   212  	count := n * (1 - math.Pow((n-1)/n, k))
   213  
   214  	// It's possible that if n is very large, floating point precision errors
   215  	// will cause count to be 0. In that case, just return min(n, k).
   216  	if count == 0 {
   217  		count = k
   218  		if n < k {
   219  			count = n
   220  		}
   221  	}
   222  	return count
   223  }