github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/opt/props/statistics.go

github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/opt/props/statistics.go (about)

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package props
    12  
    13  import (
    14  	"bytes"
    15  	"fmt"
    16  	"math"
    17  	"sort"
    18  	"strings"
    19  
    20  	"github.com/cockroachdb/cockroach/pkg/sql/opt"
    21  	"github.com/olekukonko/tablewriter"
    22  )
    23  
    24  // Statistics is a collection of measurements and statistics that is used by
    25  // the coster to estimate the cost of expressions. Statistics are collected
    26  // for tables and indexes and are exposed to the optimizer via cat.Catalog
    27  // interfaces.
    28  //
    29  // As logical properties are derived bottom-up for each expression, the
    30  // estimated row count is derived bottom-up for each relational expression.
    31  // The column statistics (stored in ColStats and MultiColStats) are derived
    32  // lazily, and only as needed to determine the row count for the current
    33  // expression or a parent expression. For example:
    34  //
    35  //   SELECT y FROM a WHERE x=1
    36  //
    37  // The only column that affects the row count of this query is x, since the
    38  // distribution of values in x is what determines the selectivity of the
    39  // predicate. As a result, column statistics will be derived for column x but
    40  // not for column y.
    41  //
    42  // See memo/statistics_builder.go for more information about how statistics are
    43  // calculated.
    44  type Statistics struct {
    45  	// Available indicates whether the underlying table statistics for this
    46  	// expression were available. If true, RowCount contains a real estimate.
    47  	// If false, RowCount does not represent reality, and should only be used
    48  	// for relative cost comparison.
    49  	Available bool
    50  
    51  	// RowCount is the estimated number of rows returned by the expression.
    52  	// Note that - especially when there are no stats available - the scaling of
    53  	// the row counts can be unpredictable; thus, a row count of 0.001 should be
    54  	// considered 1000 times better than a row count of 1, even though if this was
    55  	// a true row count they would be pretty much the same thing.
    56  	RowCount float64
    57  
    58  	// ColStats is a collection of statistics that pertain to columns in an
    59  	// expression or table. It is keyed by a set of one or more columns over which
    60  	// the statistic is defined.
    61  	ColStats ColStatsMap
    62  
    63  	// Selectivity is a value between 0 and 1 representing the estimated
    64  	// reduction in number of rows for the top-level operator in this
    65  	// expression.
    66  	Selectivity float64
    67  }
    68  
    69  // Init initializes the data members of Statistics.
    70  func (s *Statistics) Init(relProps *Relational) (zeroCardinality bool) {
    71  	if relProps.Cardinality.IsZero() {
    72  		s.RowCount = 0
    73  		s.Selectivity = 0
    74  		s.Available = true
    75  		return true
    76  	}
    77  	s.Selectivity = 1
    78  	return false
    79  }
    80  
    81  // CopyFrom copies a Statistics object which can then be modified independently.
    82  func (s *Statistics) CopyFrom(other *Statistics) {
    83  	s.Available = other.Available
    84  	s.RowCount = other.RowCount
    85  	s.ColStats.CopyFrom(&other.ColStats)
    86  	s.Selectivity = other.Selectivity
    87  }
    88  
    89  // ApplySelectivity applies a given selectivity to the statistics. RowCount and
    90  // Selectivity are updated. Note that DistinctCounts and NullCounts are not
    91  // updated.
    92  // See ColumnStatistic.ApplySelectivity for updating distinct counts and null
    93  // counts.
    94  func (s *Statistics) ApplySelectivity(selectivity float64) {
    95  	s.RowCount *= selectivity
    96  	s.Selectivity *= selectivity
    97  }
    98  
    99  func (s *Statistics) String() string {
   100  	var buf bytes.Buffer
   101  
   102  	fmt.Fprintf(&buf, "[rows=%.9g", s.RowCount)
   103  	colStats := make(ColumnStatistics, s.ColStats.Count())
   104  	for i := 0; i < s.ColStats.Count(); i++ {
   105  		colStats[i] = s.ColStats.Get(i)
   106  	}
   107  	sort.Sort(colStats)
   108  	for _, col := range colStats {
   109  		fmt.Fprintf(&buf, ", distinct%s=%.9g", col.Cols.String(), col.DistinctCount)
   110  		fmt.Fprintf(&buf, ", null%s=%.9g", col.Cols.String(), col.NullCount)
   111  	}
   112  	buf.WriteString("]")
   113  	for _, col := range colStats {
   114  		if col.Histogram != nil {
   115  			label := fmt.Sprintf("histogram%s=", col.Cols.String())
   116  			indent := strings.Repeat(" ", tablewriter.DisplayWidth(label))
   117  			fmt.Fprintf(&buf, "\n%s", label)
   118  			histLines := strings.Split(strings.TrimRight(col.Histogram.String(), "\n"), "\n")
   119  			for i, line := range histLines {
   120  				if i != 0 {
   121  					fmt.Fprintf(&buf, "\n%s", indent)
   122  				}
   123  				fmt.Fprintf(&buf, "%s", strings.TrimRight(line, " "))
   124  			}
   125  		}
   126  	}
   127  
   128  	return buf.String()
   129  }
   130  
   131  // ColumnStatistic is a collection of statistics that applies to a particular
   132  // set of columns. In theory, a table could have a ColumnStatistic object
   133  // for every possible subset of columns. In practice, it is only worth
   134  // maintaining statistics on a few columns and column sets that are frequently
   135  // used in predicates, group by columns, etc.
   136  //
   137  // ColumnStatistiscs can be copied by value.
   138  type ColumnStatistic struct {
   139  	// Cols is the set of columns whose data are summarized by this
   140  	// ColumnStatistic struct. The ColSet is never modified in-place.
   141  	Cols opt.ColSet
   142  
   143  	// DistinctCount is the estimated number of distinct values of this
   144  	// set of columns for this expression. Includes null values.
   145  	DistinctCount float64
   146  
   147  	// NullCount is the estimated number of null values of this set of
   148  	// columns for this expression. For multi-column stats, this null
   149  	// count tracks only the rows in which all columns in the set are null.
   150  	NullCount float64
   151  
   152  	// Histogram is only used when the size of Cols is one. It contains
   153  	// the approximate distribution of values for that column, represented
   154  	// by a slice of histogram buckets.
   155  	Histogram *Histogram
   156  }
   157  
   158  // ApplySelectivity updates the distinct count, null count, and histogram
   159  // according to a given selectivity.
   160  func (c *ColumnStatistic) ApplySelectivity(selectivity, inputRows float64) {
   161  	// Since the null count is a simple count of all null rows, we can
   162  	// just multiply the selectivity with it.
   163  	c.NullCount *= selectivity
   164  
   165  	if c.Histogram != nil {
   166  		c.Histogram = c.Histogram.ApplySelectivity(selectivity)
   167  	}
   168  
   169  	if selectivity == 1 || c.DistinctCount == 0 {
   170  		return
   171  	}
   172  	if selectivity == 0 {
   173  		c.DistinctCount = 0
   174  		return
   175  	}
   176  
   177  	n := inputRows
   178  	d := c.DistinctCount
   179  
   180  	// If each distinct value appears n/d times, and the probability of a
   181  	// row being filtered out is (1 - selectivity), the probability that all
   182  	// n/d rows are filtered out is (1 - selectivity)^(n/d). So the expected
   183  	// number of values that are filtered out is d*(1 - selectivity)^(n/d).
   184  	//
   185  	// This formula returns d * selectivity when d=n but is closer to d
   186  	// when d << n.
   187  	c.DistinctCount = d - d*math.Pow(1-selectivity, n/d)
   188  	const epsilon = 1e-10
   189  	if c.DistinctCount < epsilon {
   190  		// Avoid setting the distinct count to 0 (since the row count is
   191  		// non-zero).
   192  		c.DistinctCount = epsilon
   193  	}
   194  
   195  }
   196  
   197  // ColumnStatistics is a slice of pointers to ColumnStatistic values.
   198  type ColumnStatistics []*ColumnStatistic
   199  
   200  // Len returns the number of ColumnStatistic values.
   201  func (c ColumnStatistics) Len() int { return len(c) }
   202  
   203  // Less is part of the Sorter interface.
   204  func (c ColumnStatistics) Less(i, j int) bool {
   205  	if c[i].Cols.Len() != c[j].Cols.Len() {
   206  		return c[i].Cols.Len() < c[j].Cols.Len()
   207  	}
   208  
   209  	prev := opt.ColumnID(0)
   210  	for {
   211  		nextI, ok := c[i].Cols.Next(prev)
   212  		if !ok {
   213  			return false
   214  		}
   215  
   216  		// No need to check if ok since both ColSets are the same length and
   217  		// so far have had the same elements.
   218  		nextJ, _ := c[j].Cols.Next(prev)
   219  
   220  		if nextI != nextJ {
   221  			return nextI < nextJ
   222  		}
   223  
   224  		prev = nextI
   225  	}
   226  }
   227  
   228  // Swap is part of the Sorter interface.
   229  func (c ColumnStatistics) Swap(i, j int) {
   230  	c[i], c[j] = c[j], c[i]
   231  }