github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/opt/props/histogram.go (about)

     1  // Copyright 2019 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package props
    12  
    13  import (
    14  	"bytes"
    15  	"fmt"
    16  	"io"
    17  	"math"
    18  	"sort"
    19  
    20  	"github.com/cockroachdb/cockroach/pkg/sql/opt"
    21  	"github.com/cockroachdb/cockroach/pkg/sql/opt/cat"
    22  	"github.com/cockroachdb/cockroach/pkg/sql/opt/constraint"
    23  	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
    24  	"github.com/cockroachdb/cockroach/pkg/sql/types"
    25  	"github.com/cockroachdb/errors"
    26  	"github.com/olekukonko/tablewriter"
    27  )
    28  
// Histogram captures the distribution of values for a particular column within
// a relational expression.
// Histograms are immutable.
//
// Fields:
//   - evalCtx is the evaluation context passed to datum comparisons, to
//     Datum.Next/Prev, and to the constraint library.
//   - col is the ID of the column described by the histogram; CanFilter
//     compares it against the column IDs of a constraint.
//   - buckets holds the histogram buckets. maxDistinctValuesCount and Filter
//     assert that the first bucket has NumRange == 0, so the upper bound of
//     the first bucket acts as the lower bound of the whole histogram.
type Histogram struct {
	evalCtx *tree.EvalContext
	col     opt.ColumnID
	buckets []cat.HistogramBucket
}
    37  
    38  func (h *Histogram) String() string {
    39  	w := histogramWriter{}
    40  	w.init(h.buckets)
    41  	var buf bytes.Buffer
    42  	w.write(&buf)
    43  	return buf.String()
    44  }
    45  
    46  // Init initializes the histogram with data from the catalog.
    47  func (h *Histogram) Init(
    48  	evalCtx *tree.EvalContext, col opt.ColumnID, buckets []cat.HistogramBucket,
    49  ) {
    50  	h.evalCtx = evalCtx
    51  	h.col = col
    52  	h.buckets = buckets
    53  }
    54  
    55  // copy returns a deep copy of the histogram.
    56  func (h *Histogram) copy() *Histogram {
    57  	buckets := make([]cat.HistogramBucket, len(h.buckets))
    58  	copy(buckets, h.buckets)
    59  	return &Histogram{
    60  		evalCtx: h.evalCtx,
    61  		col:     h.col,
    62  		buckets: buckets,
    63  	}
    64  }
    65  
    66  // BucketCount returns the number of buckets in the histogram.
    67  func (h *Histogram) BucketCount() int {
    68  	return len(h.buckets)
    69  }
    70  
    71  // Bucket returns a pointer to the ith bucket in the histogram.
    72  // i must be greater than or equal to 0 and less than BucketCount.
    73  func (h *Histogram) Bucket(i int) *cat.HistogramBucket {
    74  	return &h.buckets[i]
    75  }
    76  
    77  // ValuesCount returns the total number of values in the histogram. It can
    78  // be used to estimate the selectivity of a predicate by comparing the values
    79  // count before and after calling Filter on the histogram.
    80  func (h *Histogram) ValuesCount() float64 {
    81  	var count float64
    82  	for i := range h.buckets {
    83  		count += h.buckets[i].NumRange
    84  		count += h.buckets[i].NumEq
    85  	}
    86  	return count
    87  }
    88  
    89  // DistinctValuesCount returns the estimated number of distinct values in the
    90  // histogram.
    91  func (h *Histogram) DistinctValuesCount() float64 {
    92  	var count float64
    93  	for i := range h.buckets {
    94  		b := &h.buckets[i]
    95  		count += b.DistinctRange
    96  		if b.NumEq > 1 {
    97  			count++
    98  		} else {
    99  			count += b.NumEq
   100  		}
   101  	}
   102  	if maxCount := h.maxDistinctValuesCount(); maxCount < count {
   103  		count = maxCount
   104  	}
   105  	return count
   106  }
   107  
   108  // maxDistinctValuesCount estimates the maximum number of distinct values in
   109  // the histogram.
   110  func (h *Histogram) maxDistinctValuesCount() float64 {
   111  	if len(h.buckets) == 0 {
   112  		return 0
   113  	}
   114  
   115  	// The first bucket always has a zero value for NumRange, so the lower bound
   116  	// of the histogram is the upper bound of the first bucket.
   117  	if h.Bucket(0).NumRange != 0 {
   118  		panic(errors.AssertionFailedf("the first bucket should have NumRange=0"))
   119  	}
   120  	lowerBound := h.Bucket(0).UpperBound
   121  
   122  	var count float64
   123  	for i := range h.buckets {
   124  		b := &h.buckets[i]
   125  		rng, ok := maxDistinctValuesInRange(lowerBound, b.UpperBound)
   126  
   127  		if ok && b.NumRange > rng {
   128  			count += rng
   129  		} else {
   130  			count += b.NumRange
   131  		}
   132  
   133  		if b.NumEq > 1 {
   134  			count++
   135  		} else {
   136  			count += b.NumEq
   137  		}
   138  		lowerBound = h.getNextLowerBound(b.UpperBound)
   139  	}
   140  	return count
   141  }
   142  
   143  // maxDistinctValuesInRange returns the maximum number of distinct values in
   144  // the range [lowerBound, upperBound). It returns ok=false when it is not
   145  // possible to determine a finite value (which is the case for all types other
   146  // than integers and dates).
   147  func maxDistinctValuesInRange(lowerBound, upperBound tree.Datum) (_ float64, ok bool) {
   148  	switch lowerBound.ResolvedType().Family() {
   149  	case types.IntFamily:
   150  		return float64(*upperBound.(*tree.DInt)) - float64(*lowerBound.(*tree.DInt)), true
   151  
   152  	case types.DateFamily:
   153  		lower := lowerBound.(*tree.DDate)
   154  		upper := upperBound.(*tree.DDate)
   155  		if lower.IsFinite() && upper.IsFinite() {
   156  			return float64(upper.PGEpochDays()) - float64(lower.PGEpochDays()), true
   157  		}
   158  		return 0, false
   159  
   160  	default:
   161  		return 0, false
   162  	}
   163  }
   164  
   165  // CanFilter returns true if the given constraint can filter the histogram.
   166  // This is the case if the histogram column matches one of the columns in
   167  // the exact prefix of c or the next column immediately after the exact prefix.
   168  // Returns the offset of the matching column in the constraint if found, as
   169  // well as the exact prefix.
   170  func (h *Histogram) CanFilter(c *constraint.Constraint) (colOffset, exactPrefix int, ok bool) {
   171  	exactPrefix = c.ExactPrefix(h.evalCtx)
   172  	constrainedCols := c.ConstrainedColumns(h.evalCtx)
   173  	for i := 0; i < constrainedCols && i <= exactPrefix; i++ {
   174  		if c.Columns.Get(i).ID() == h.col {
   175  			return i, exactPrefix, true
   176  		}
   177  	}
   178  	return 0, exactPrefix, false
   179  }
   180  
// Filter filters the histogram according to the given constraint, and returns
// a new histogram with the results. CanFilter should be called first to
// validate that c can filter the histogram.
//
// Filter panics with an assertion error if CanFilter reports that c cannot
// filter the histogram, or if the first bucket has a non-zero NumRange. The
// receiver is not modified; the returned histogram has its own bucket slice
// and its buckets are always in ascending order, even when the constrained
// column is descending.
func (h *Histogram) Filter(c *constraint.Constraint) *Histogram {
	colOffset, exactPrefix, ok := h.CanFilter(c)
	if !ok {
		panic(errors.AssertionFailedf("column mismatch"))
	}

	bucketCount := h.BucketCount()
	filtered := &Histogram{
		evalCtx: h.evalCtx,
		col:     h.col,
		buckets: make([]cat.HistogramBucket, 0, bucketCount),
	}
	if bucketCount == 0 {
		// Nothing to filter.
		return filtered
	}

	// The first bucket always has a zero value for NumRange, so the lower bound
	// of the histogram is the upper bound of the first bucket.
	if h.Bucket(0).NumRange != 0 {
		panic(errors.AssertionFailedf("the first bucket should have NumRange=0"))
	}

	// Collect the values of the constraint columns that precede the histogram
	// column, taken from the start key of the first span. They are prepended to
	// the bucket bounds so that buckets can be compared against the
	// (multi-column) spans of the constraint.
	prefix := make([]tree.Datum, colOffset)
	for i := range prefix {
		prefix[i] = c.Spans.Get(0).StartKey().Value(i)
	}
	// If the constrained column is descending, the buckets are visited from
	// the last one to the first one.
	desc := c.Columns.Get(colOffset).Descending()
	var iter histogramIter
	iter.init(h, desc)
	spanIndex := 0
	keyCtx := constraint.KeyContext{EvalCtx: h.evalCtx, Columns: c.Columns}

	// Find the first span that may overlap with the histogram.
	firstBucket := makeSpanFromBucket(&iter, prefix)
	spanCount := c.Spans.Count()
	for spanIndex < spanCount {
		span := c.Spans.Get(spanIndex)
		if firstBucket.StartsAfter(&keyCtx, span) {
			spanIndex++
			continue
		}
		break
	}
	if spanIndex == spanCount {
		// Every span ends before the first visited bucket starts.
		return filtered
	}

	// Use binary search to find the first bucket that overlaps with the span.
	span := c.Spans.Get(spanIndex)
	bucIndex := sort.Search(bucketCount, func(i int) bool {
		iter.setIdx(i)
		bucket := makeSpanFromBucket(&iter, prefix)
		if desc {
			return span.StartsAfter(&keyCtx, &bucket)
		}
		return !span.StartsAfter(&keyCtx, &bucket)
	})
	if desc {
		// In descending mode the search predicate is inverted, so the bucket of
		// interest is the one just before the index returned by the search.
		bucIndex--
		if bucIndex == -1 {
			return filtered
		}
	} else if bucIndex == bucketCount {
		return filtered
	}
	iter.setIdx(bucIndex)
	if !desc && bucIndex > 0 {
		// Buckets before bucIndex are dropped; record the upper bound of the
		// last dropped bucket as an empty bucket.
		// NOTE(review): presumably this keeps the lower bound of the first
		// retained bucket intact - confirm.
		prevUpperBound := h.Bucket(bucIndex - 1).UpperBound
		filtered.addEmptyBucket(prevUpperBound, desc)
	}

	// For the remaining buckets and spans, use a variation on merge sort.
	for spanIndex < spanCount {
		if spanIndex > 0 && colOffset < exactPrefix {
			// If this column is part of the exact prefix, we don't need to look at
			// the rest of the spans.
			break
		}

		// Convert the bucket to a span in order to take advantage of the
		// constraint library.
		left := makeSpanFromBucket(&iter, prefix)
		right := c.Spans.Get(spanIndex)

		if left.StartsAfter(&keyCtx, right) {
			// The span ends before the current bucket starts; try the next span.
			spanIndex++
			continue
		}

		filteredSpan := left
		if !filteredSpan.TryIntersectWith(&keyCtx, right) {
			// The bucket does not overlap the span at all: keep only its upper
			// bound (as an empty bucket) and move on to the next bucket.
			filtered.addEmptyBucket(iter.b.UpperBound, desc)
			if ok := iter.next(); !ok {
				break
			}
			continue
		}

		filteredBucket := iter.b
		if filteredSpan.Compare(&keyCtx, &left) != 0 {
			// The bucket was cut off in the middle. Get the resulting filtered
			// bucket.
			filteredBucket = getFilteredBucket(&iter, &keyCtx, &filteredSpan, colOffset)
			if !desc && filteredSpan.CompareStarts(&keyCtx, &left) != 0 {
				// We need to add an empty bucket before the new bucket.
				ub := h.getPrevUpperBound(filteredSpan.StartKey(), filteredSpan.StartBoundary(), colOffset)
				filtered.addEmptyBucket(ub, desc)
			}
		}
		filtered.addBucket(filteredBucket, desc)

		if desc && filteredSpan.CompareEnds(&keyCtx, &left) != 0 {
			// We need to add an empty bucket after the new bucket.
			ub := h.getPrevUpperBound(filteredSpan.EndKey(), filteredSpan.EndBoundary(), colOffset)
			filtered.addEmptyBucket(ub, desc)
		}

		// Skip past whichever span ends first, or skip past both if they have
		// the same endpoint.
		cmp := left.CompareEnds(&keyCtx, right)
		if cmp <= 0 {
			if ok := iter.next(); !ok {
				break
			}
		}
		if cmp >= 0 {
			spanIndex++
		}
	}

	if desc {
		// After we reverse the buckets below, the last bucket will become the
		// first bucket. NumRange of the first bucket must be 0, so add an empty
		// bucket if needed.
		if iter.next() {
			// The remaining buckets from the original histogram have been removed.
			// In descending mode iter.lb holds the upper bound of the bucket the
			// iterator now points to.
			filtered.addEmptyBucket(iter.lb, desc)
		} else if lastBucket := filtered.buckets[len(filtered.buckets)-1]; lastBucket.NumRange != 0 {
			// All buckets were consumed; use the original histogram's first
			// bucket to derive the bound for the leading empty bucket.
			iter.setIdx(0)
			span := makeSpanFromBucket(&iter, prefix)
			ub := h.getPrevUpperBound(span.EndKey(), span.EndBoundary(), colOffset)
			filtered.addEmptyBucket(ub, desc)
		}

		// Reverse the buckets so they are in ascending order.
		for i := 0; i < len(filtered.buckets)/2; i++ {
			j := len(filtered.buckets) - 1 - i
			filtered.buckets[i], filtered.buckets[j] = filtered.buckets[j], filtered.buckets[i]
		}
	}

	return filtered
}
   337  
   338  func (h *Histogram) getNextLowerBound(currentUpperBound tree.Datum) tree.Datum {
   339  	nextLowerBound, ok := currentUpperBound.Next(h.evalCtx)
   340  	if !ok {
   341  		nextLowerBound = currentUpperBound
   342  	}
   343  	return nextLowerBound
   344  }
   345  
   346  func (h *Histogram) getPrevUpperBound(
   347  	currentLowerBound constraint.Key, boundary constraint.SpanBoundary, colOffset int,
   348  ) tree.Datum {
   349  	prevUpperBound := currentLowerBound.Value(colOffset)
   350  	if boundary == constraint.IncludeBoundary {
   351  		if prev, ok := prevUpperBound.Prev(h.evalCtx); ok {
   352  			prevUpperBound = prev
   353  		}
   354  	}
   355  	return prevUpperBound
   356  }
   357  
   358  func (h *Histogram) addEmptyBucket(upperBound tree.Datum, desc bool) {
   359  	h.addBucket(&cat.HistogramBucket{UpperBound: upperBound}, desc)
   360  }
   361  
   362  func (h *Histogram) addBucket(bucket *cat.HistogramBucket, desc bool) {
   363  	// Check whether we can combine this bucket with the previous bucket.
   364  	if len(h.buckets) != 0 {
   365  		lastBucket := &h.buckets[len(h.buckets)-1]
   366  		lower, higher := lastBucket, bucket
   367  		if desc {
   368  			lower, higher = bucket, lastBucket
   369  		}
   370  		if lower.NumRange == 0 && lower.NumEq == 0 && higher.NumRange == 0 {
   371  			lastBucket.NumEq = higher.NumEq
   372  			lastBucket.UpperBound = higher.UpperBound
   373  			return
   374  		}
   375  		if lastBucket.UpperBound.Compare(h.evalCtx, bucket.UpperBound) == 0 {
   376  			lastBucket.NumEq = lower.NumEq + higher.NumRange + higher.NumEq
   377  			lastBucket.NumRange = lower.NumRange
   378  			return
   379  		}
   380  	}
   381  	h.buckets = append(h.buckets, *bucket)
   382  }
   383  
   384  // ApplySelectivity reduces the size of each histogram bucket according to
   385  // the given selectivity, and returns a new histogram with the results.
   386  func (h *Histogram) ApplySelectivity(selectivity float64) *Histogram {
   387  	res := h.copy()
   388  	for i := range res.buckets {
   389  		b := &res.buckets[i]
   390  
   391  		// Save n and d for the distinct count formula below.
   392  		n := b.NumRange
   393  		d := b.DistinctRange
   394  
   395  		b.NumEq *= selectivity
   396  		b.NumRange *= selectivity
   397  
   398  		if d == 0 {
   399  			continue
   400  		}
   401  		// If each distinct value appears n/d times, and the probability of a
   402  		// row being filtered out is (1 - selectivity), the probability that all
   403  		// n/d rows are filtered out is (1 - selectivity)^(n/d). So the expected
   404  		// number of values that are filtered out is d*(1 - selectivity)^(n/d).
   405  		//
   406  		// This formula returns d * selectivity when d=n but is closer to d
   407  		// when d << n.
   408  		b.DistinctRange = d - d*math.Pow(1-selectivity, n/d)
   409  	}
   410  	return res
   411  }
   412  
// histogramIter is a helper struct for iterating through the buckets in a
// histogram. It enables iterating both forward and backward through the
// buckets.
//
// Fields:
//   - h is the histogram being iterated.
//   - desc is true when iterating from the last bucket to the first.
//   - idx is the index in h of the current bucket.
//   - b points to the current bucket.
//   - lb and ub are the bounds of the current bucket in iteration order. When
//     iterating forward, lb is the bucket's lower bound and ub is its upper
//     bound. When desc is true the two are swapped: lb holds the bucket's
//     upper bound and ub holds its lower bound (see next).
type histogramIter struct {
	h    *Histogram
	desc bool
	idx  int
	b    *cat.HistogramBucket
	lb   tree.Datum
	ub   tree.Datum
}
   424  
   425  // init initializes a histogramIter to point to the first bucket of the given
   426  // histogram. If desc is true, the iterator starts from the end of the
   427  // histogram and moves backwards.
   428  func (hi *histogramIter) init(h *Histogram, desc bool) {
   429  	hi.idx = -1
   430  	if desc {
   431  		hi.idx = h.BucketCount()
   432  	}
   433  	hi.h = h
   434  	hi.desc = desc
   435  	hi.next()
   436  }
   437  
   438  // setIdx updates the histogramIter to point to the ith bucket in the
   439  // histogram.
   440  func (hi *histogramIter) setIdx(i int) {
   441  	hi.idx = i - 1
   442  	if hi.desc {
   443  		hi.idx = i + 1
   444  	}
   445  	hi.next()
   446  }
   447  
   448  // next sets the histogramIter to point to the next bucket. If hi.desc is true
   449  // the "next" bucket is actually the previous bucket in the histogram. Returns
   450  // false if there are no more buckets.
   451  func (hi *histogramIter) next() (ok bool) {
   452  	getBounds := func() (lb, ub tree.Datum) {
   453  		hi.b = hi.h.Bucket(hi.idx)
   454  		ub = hi.b.UpperBound
   455  		if hi.idx == 0 {
   456  			lb = ub
   457  		} else {
   458  			lb = hi.h.getNextLowerBound(hi.h.Bucket(hi.idx - 1).UpperBound)
   459  		}
   460  		return lb, ub
   461  	}
   462  
   463  	if hi.desc {
   464  		hi.idx--
   465  		if hi.idx < 0 {
   466  			return false
   467  		}
   468  		hi.ub, hi.lb = getBounds()
   469  	} else {
   470  		hi.idx++
   471  		if hi.idx >= hi.h.BucketCount() {
   472  			return false
   473  		}
   474  		hi.lb, hi.ub = getBounds()
   475  	}
   476  
   477  	return true
   478  }
   479  
   480  func makeSpanFromBucket(iter *histogramIter, prefix []tree.Datum) (span constraint.Span) {
   481  	span.Init(
   482  		constraint.MakeCompositeKey(append(prefix[:len(prefix):len(prefix)], iter.lb)...),
   483  		constraint.IncludeBoundary,
   484  		constraint.MakeCompositeKey(append(prefix[:len(prefix):len(prefix)], iter.ub)...),
   485  		constraint.IncludeBoundary,
   486  	)
   487  	return span
   488  }
   489  
   490  // getFilteredBucket filters the histogram bucket according to the given span,
   491  // and returns a new bucket with the results. The span represents the maximum
   492  // range of values that remain in the bucket after filtering. The span must
   493  // be fully contained within the bucket, or else getFilteredBucket will throw
   494  // an error.
   495  //
   496  // For example, suppose a bucket initially has lower bound 0 (inclusive) and
   497  // contains the following data: {NumEq: 5, NumRange: 10, UpperBound: 10} (all
   498  // values are integers).
   499  //
   500  // The following spans will filter the bucket as shown:
   501  //   [/0 - /5]   => {NumEq: 1, NumRange: 5, UpperBound: 5}
   502  //   [/2 - /10]  => {NumEq: 5, NumRange: 8, UpperBound: 10}
   503  //   [/20 - /30] => error
   504  //
   505  // Note that the calculations for NumEq and NumRange depend on the data type.
   506  // For discrete data types such as integers and dates, it is always possible
   507  // to assign a non-zero value for NumEq as long as NumEq and NumRange were
   508  // non-zero in the original bucket. For continuous types such as floats,
   509  // NumEq will be zero unless the filtered bucket includes the original upper
   510  // bound. For example, given the same bucket as in the above example, but with
   511  // floating point values instead of integers:
   512  //
   513  //   [/0 - /5]   => {NumEq: 0, NumRange: 5, UpperBound: 5.0}
   514  //   [/2 - /10]  => {NumEq: 5, NumRange: 8, UpperBound: 10.0}
   515  //   [/20 - /30] => error
   516  //
   517  // For non-numeric types such as strings, it is not possible to estimate
   518  // the size of NumRange if the bucket is cut off in the middle. In this case,
   519  // we use the heuristic that NumRange is reduced by half.
   520  //
   521  func getFilteredBucket(
   522  	iter *histogramIter, keyCtx *constraint.KeyContext, filteredSpan *constraint.Span, colOffset int,
   523  ) *cat.HistogramBucket {
   524  	spanLowerBound := filteredSpan.StartKey().Value(colOffset)
   525  	spanUpperBound := filteredSpan.EndKey().Value(colOffset)
   526  	bucketLowerBound := iter.lb
   527  	bucketUpperBound := iter.ub
   528  	b := iter.b
   529  
   530  	// Check that the given span is contained in the bucket.
   531  	cmpSpanStartBucketStart := spanLowerBound.Compare(keyCtx.EvalCtx, bucketLowerBound)
   532  	cmpSpanEndBucketEnd := spanUpperBound.Compare(keyCtx.EvalCtx, bucketUpperBound)
   533  	contained := cmpSpanStartBucketStart >= 0 && cmpSpanEndBucketEnd <= 0
   534  	if iter.desc {
   535  		contained = cmpSpanStartBucketStart <= 0 && cmpSpanEndBucketEnd >= 0
   536  	}
   537  	if !contained {
   538  		panic(errors.AssertionFailedf("span must be fully contained in the bucket"))
   539  	}
   540  
   541  	// Extract the range sizes before and after filtering. Only numeric and
   542  	// date-time types will have ok=true, since these are the only types for
   543  	// which we can accurately calculate the range size of a non-equality span.
   544  	rangeBefore, rangeAfter, ok := getRangesBeforeAndAfter(
   545  		bucketLowerBound, bucketUpperBound, spanLowerBound, spanUpperBound, iter.desc,
   546  	)
   547  
   548  	// Determine whether this span represents an equality condition.
   549  	isEqualityCondition := spanLowerBound.Compare(keyCtx.EvalCtx, spanUpperBound) == 0
   550  
   551  	// Determine whether this span includes the original upper bound of the
   552  	// bucket.
   553  	isSpanEndBoundaryInclusive := filteredSpan.EndBoundary() == constraint.IncludeBoundary
   554  	includesOriginalUpperBound := isSpanEndBoundaryInclusive && cmpSpanEndBucketEnd == 0
   555  	if iter.desc {
   556  		isSpanStartBoundaryInclusive := filteredSpan.StartBoundary() == constraint.IncludeBoundary
   557  		includesOriginalUpperBound = isSpanStartBoundaryInclusive && cmpSpanStartBucketStart == 0
   558  	}
   559  
   560  	// Calculate the new value for numEq.
   561  	var numEq float64
   562  	if includesOriginalUpperBound {
   563  		numEq = b.NumEq
   564  	} else {
   565  		if isEqualityCondition {
   566  			// This span represents an equality condition with a value in the range
   567  			// of this bucket. Use the distinct count of the bucket to estimate the
   568  			// selectivity of the equality condition.
   569  			selectivity := 1.0
   570  			if b.DistinctRange > 1 {
   571  				selectivity = 1 / b.DistinctRange
   572  			}
   573  			numEq = selectivity * b.NumRange
   574  		} else if ok && rangeBefore > 0 && isDiscrete(bucketLowerBound.ResolvedType()) {
   575  			// If we were successful in finding the ranges before and after filtering
   576  			// and the data type is discrete (e.g., integer, date, or timestamp), we
   577  			// can assign some of the old NumRange to the new NumEq.
   578  			numEq = b.NumRange / rangeBefore
   579  		}
   580  	}
   581  
   582  	// Calculate the new value for numRange.
   583  	var numRange float64
   584  	if isEqualityCondition {
   585  		numRange = 0
   586  	} else if ok && rangeBefore > 0 {
   587  		// If we were successful in finding the ranges before and after filtering,
   588  		// calculate the fraction of values that should be assigned to the new
   589  		// bucket.
   590  		numRange = b.NumRange * rangeAfter / rangeBefore
   591  	} else {
   592  		// In the absence of any information, assume we reduced the size of the
   593  		// bucket by half.
   594  		numRange = 0.5 * b.NumRange
   595  	}
   596  
   597  	// Calculate the new value for distinctCountRange.
   598  	var distinctCountRange float64
   599  	if b.NumRange > 0 {
   600  		distinctCountRange = b.DistinctRange * numRange / b.NumRange
   601  	}
   602  
   603  	upperBound := spanUpperBound
   604  	if iter.desc {
   605  		upperBound = spanLowerBound
   606  	}
   607  	return &cat.HistogramBucket{
   608  		NumEq:         numEq,
   609  		NumRange:      numRange,
   610  		DistinctRange: distinctCountRange,
   611  		UpperBound:    upperBound,
   612  	}
   613  }
   614  
   615  // getRangesBeforeAndAfter returns the size of the ranges before and after the
   616  // given bucket is filtered by the given span. If swap is true, the upper and
   617  // lower bounds should be swapped for the bucket and the span. Returns ok=true
   618  // if these range sizes are calculated successfully, and false otherwise.
   619  func getRangesBeforeAndAfter(
   620  	bucketLowerBound, bucketUpperBound, spanLowerBound, spanUpperBound tree.Datum, swap bool,
   621  ) (rangeBefore, rangeAfter float64, ok bool) {
   622  	// If the data types don't match, don't bother trying to calculate the range
   623  	// sizes. This should almost never happen, but we want to avoid type
   624  	// assertion errors below.
   625  	typesMatch :=
   626  		bucketLowerBound.ResolvedType().Equivalent(bucketUpperBound.ResolvedType()) &&
   627  			bucketUpperBound.ResolvedType().Equivalent(spanLowerBound.ResolvedType()) &&
   628  			spanLowerBound.ResolvedType().Equivalent(spanUpperBound.ResolvedType())
   629  	if !typesMatch {
   630  		return 0, 0, false
   631  	}
   632  
   633  	if swap {
   634  		bucketLowerBound, bucketUpperBound = bucketUpperBound, bucketLowerBound
   635  		spanLowerBound, spanUpperBound = spanUpperBound, spanLowerBound
   636  	}
   637  
   638  	// TODO(rytaft): handle more types here.
   639  	// Note: the calculations below assume that bucketLowerBound is inclusive and
   640  	// Span.PreferInclusive() has been called on the span.
   641  
   642  	getRange := func(lowerBound, upperBound tree.Datum) (rng float64, ok bool) {
   643  		switch lowerBound.ResolvedType().Family() {
   644  		case types.IntFamily:
   645  			rng = float64(*upperBound.(*tree.DInt)) - float64(*lowerBound.(*tree.DInt))
   646  			return rng, true
   647  
   648  		case types.DateFamily:
   649  			lower := lowerBound.(*tree.DDate)
   650  			upper := upperBound.(*tree.DDate)
   651  			if lower.IsFinite() && upper.IsFinite() {
   652  				rng = float64(upper.PGEpochDays()) - float64(lower.PGEpochDays())
   653  				return rng, true
   654  			}
   655  			return 0, false
   656  
   657  		case types.DecimalFamily:
   658  			lower, err := lowerBound.(*tree.DDecimal).Float64()
   659  			if err != nil {
   660  				return 0, false
   661  			}
   662  			upper, err := upperBound.(*tree.DDecimal).Float64()
   663  			if err != nil {
   664  				return 0, false
   665  			}
   666  			rng = upper - lower
   667  			return rng, true
   668  
   669  		case types.FloatFamily:
   670  			rng = float64(*upperBound.(*tree.DFloat)) - float64(*lowerBound.(*tree.DFloat))
   671  			return rng, true
   672  
   673  		case types.TimestampFamily:
   674  			lower := lowerBound.(*tree.DTimestamp).Time
   675  			upper := upperBound.(*tree.DTimestamp).Time
   676  			rng = float64(upper.Sub(lower))
   677  			return rng, true
   678  
   679  		case types.TimestampTZFamily:
   680  			lower := lowerBound.(*tree.DTimestampTZ).Time
   681  			upper := upperBound.(*tree.DTimestampTZ).Time
   682  			rng = float64(upper.Sub(lower))
   683  			return rng, true
   684  
   685  		default:
   686  			return 0, false
   687  		}
   688  	}
   689  
   690  	rangeBefore, okBefore := getRange(bucketLowerBound, bucketUpperBound)
   691  	rangeAfter, okAfter := getRange(spanLowerBound, spanUpperBound)
   692  	ok = okBefore && okAfter
   693  
   694  	return rangeBefore, rangeAfter, ok
   695  }
   696  
   697  // isDiscrete returns true if the given data type is discrete.
   698  func isDiscrete(typ *types.T) bool {
   699  	switch typ.Family() {
   700  	case types.IntFamily, types.DateFamily, types.TimestampFamily, types.TimestampTZFamily:
   701  		return true
   702  	}
   703  	return false
   704  }
   705  
// histogramWriter prints histograms with the following formatting:
//   NumRange1    NumEq1     NumRange2    NumEq2    ....
// <----------- UpperBound1 ----------- UpperBound2 ....
//
// For example:
//   0  1  90  10   0  20
// <--- 0 ---- 100 --- 200
//
// This describes a histogram with 3 buckets. The first bucket contains 1 value
// equal to 0. The second bucket contains 90 values between 0 and 100 and
// 10 values equal to 100. Finally, the third bucket contains 20 values equal
// to 200.
//
// cells holds the two printed rows, indexed by the counts and boundaries
// constants. Each row has two columns per bucket: one for the range and one
// for the upper bound. colWidths holds the display width of each column, i.e.
// the width of the widest cell in that column across both rows.
type histogramWriter struct {
	cells     [][]string
	colWidths []int
}
   722  
const (
	// These constants describe the two rows that are printed. They are used
	// as indexes into histogramWriter.cells: counts is the row with the
	// NumRange and NumEq values, and boundaries is the row with the upper
	// bounds.
	counts = iota
	boundaries
)
   728  
   729  func (w *histogramWriter) init(buckets []cat.HistogramBucket) {
   730  	w.cells = [][]string{
   731  		make([]string, len(buckets)*2),
   732  		make([]string, len(buckets)*2),
   733  	}
   734  	w.colWidths = make([]int, len(buckets)*2)
   735  
   736  	for i, b := range buckets {
   737  		w.cells[counts][i*2] = fmt.Sprintf(" %.5g ", b.NumRange)
   738  		w.cells[counts][i*2+1] = fmt.Sprintf("%.5g", b.NumEq)
   739  		// TODO(rytaft): truncate large strings.
   740  		w.cells[boundaries][i*2+1] = fmt.Sprintf(" %s ", b.UpperBound.String())
   741  		if width := tablewriter.DisplayWidth(w.cells[counts][i*2]); width > w.colWidths[i*2] {
   742  			w.colWidths[i*2] = width
   743  		}
   744  		if width := tablewriter.DisplayWidth(w.cells[counts][i*2+1]); width > w.colWidths[i*2+1] {
   745  			w.colWidths[i*2+1] = width
   746  		}
   747  		if width := tablewriter.DisplayWidth(w.cells[boundaries][i*2+1]); width > w.colWidths[i*2+1] {
   748  			w.colWidths[i*2+1] = width
   749  		}
   750  	}
   751  }
   752  
   753  func (w *histogramWriter) write(out io.Writer) {
   754  	if len(w.cells[counts]) == 0 {
   755  		return
   756  	}
   757  
   758  	// Print a space to match up with the "<" character below.
   759  	fmt.Fprint(out, " ")
   760  	for i := range w.cells[counts] {
   761  		fmt.Fprintf(out, "%s", tablewriter.Pad(w.cells[counts][i], " ", w.colWidths[i]))
   762  	}
   763  	fmt.Fprint(out, "\n")
   764  	fmt.Fprint(out, "<")
   765  	for i := range w.cells[boundaries] {
   766  		fmt.Fprintf(out, "%s", tablewriter.Pad(w.cells[boundaries][i], "-", w.colWidths[i]))
   767  	}
   768  }