github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/opt/props/statistics.go (about) 1 // Copyright 2018 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package props 12 13 import ( 14 "bytes" 15 "fmt" 16 "math" 17 "sort" 18 "strings" 19 20 "github.com/cockroachdb/cockroach/pkg/sql/opt" 21 "github.com/olekukonko/tablewriter" 22 ) 23 24 // Statistics is a collection of measurements and statistics that is used by 25 // the coster to estimate the cost of expressions. Statistics are collected 26 // for tables and indexes and are exposed to the optimizer via cat.Catalog 27 // interfaces. 28 // 29 // As logical properties are derived bottom-up for each expression, the 30 // estimated row count is derived bottom-up for each relational expression. 31 // The column statistics (stored in ColStats and MultiColStats) are derived 32 // lazily, and only as needed to determine the row count for the current 33 // expression or a parent expression. For example: 34 // 35 // SELECT y FROM a WHERE x=1 36 // 37 // The only column that affects the row count of this query is x, since the 38 // distribution of values in x is what determines the selectivity of the 39 // predicate. As a result, column statistics will be derived for column x but 40 // not for column y. 41 // 42 // See memo/statistics_builder.go for more information about how statistics are 43 // calculated. 44 type Statistics struct { 45 // Available indicates whether the underlying table statistics for this 46 // expression were available. If true, RowCount contains a real estimate. 47 // If false, RowCount does not represent reality, and should only be used 48 // for relative cost comparison. 49 Available bool 50 51 // RowCount is the estimated number of rows returned by the expression. 52 // Note that - especially when there are no stats available - the scaling of 53 // the row counts can be unpredictable; thus, a row count of 0.001 should be 54 // considered 1000 times better than a row count of 1, even though if this was 55 // a true row count they would be pretty much the same thing. 56 RowCount float64 57 58 // ColStats is a collection of statistics that pertain to columns in an 59 // expression or table. It is keyed by a set of one or more columns over which 60 // the statistic is defined. 61 ColStats ColStatsMap 62 63 // Selectivity is a value between 0 and 1 representing the estimated 64 // reduction in number of rows for the top-level operator in this 65 // expression. 66 Selectivity float64 67 } 68 69 // Init initializes the data members of Statistics. 70 func (s *Statistics) Init(relProps *Relational) (zeroCardinality bool) { 71 if relProps.Cardinality.IsZero() { 72 s.RowCount = 0 73 s.Selectivity = 0 74 s.Available = true 75 return true 76 } 77 s.Selectivity = 1 78 return false 79 } 80 81 // CopyFrom copies a Statistics object which can then be modified independently. 82 func (s *Statistics) CopyFrom(other *Statistics) { 83 s.Available = other.Available 84 s.RowCount = other.RowCount 85 s.ColStats.CopyFrom(&other.ColStats) 86 s.Selectivity = other.Selectivity 87 } 88 89 // ApplySelectivity applies a given selectivity to the statistics. RowCount and 90 // Selectivity are updated. Note that DistinctCounts and NullCounts are not 91 // updated. 92 // See ColumnStatistic.ApplySelectivity for updating distinct counts and null 93 // counts. 94 func (s *Statistics) ApplySelectivity(selectivity float64) { 95 s.RowCount *= selectivity 96 s.Selectivity *= selectivity 97 } 98 99 func (s *Statistics) String() string { 100 var buf bytes.Buffer 101 102 fmt.Fprintf(&buf, "[rows=%.9g", s.RowCount) 103 colStats := make(ColumnStatistics, s.ColStats.Count()) 104 for i := 0; i < s.ColStats.Count(); i++ { 105 colStats[i] = s.ColStats.Get(i) 106 } 107 sort.Sort(colStats) 108 for _, col := range colStats { 109 fmt.Fprintf(&buf, ", distinct%s=%.9g", col.Cols.String(), col.DistinctCount) 110 fmt.Fprintf(&buf, ", null%s=%.9g", col.Cols.String(), col.NullCount) 111 } 112 buf.WriteString("]") 113 for _, col := range colStats { 114 if col.Histogram != nil { 115 label := fmt.Sprintf("histogram%s=", col.Cols.String()) 116 indent := strings.Repeat(" ", tablewriter.DisplayWidth(label)) 117 fmt.Fprintf(&buf, "\n%s", label) 118 histLines := strings.Split(strings.TrimRight(col.Histogram.String(), "\n"), "\n") 119 for i, line := range histLines { 120 if i != 0 { 121 fmt.Fprintf(&buf, "\n%s", indent) 122 } 123 fmt.Fprintf(&buf, "%s", strings.TrimRight(line, " ")) 124 } 125 } 126 } 127 128 return buf.String() 129 } 130 131 // ColumnStatistic is a collection of statistics that applies to a particular 132 // set of columns. In theory, a table could have a ColumnStatistic object 133 // for every possible subset of columns. In practice, it is only worth 134 // maintaining statistics on a few columns and column sets that are frequently 135 // used in predicates, group by columns, etc. 136 // 137 // ColumnStatistiscs can be copied by value. 138 type ColumnStatistic struct { 139 // Cols is the set of columns whose data are summarized by this 140 // ColumnStatistic struct. The ColSet is never modified in-place. 141 Cols opt.ColSet 142 143 // DistinctCount is the estimated number of distinct values of this 144 // set of columns for this expression. Includes null values. 145 DistinctCount float64 146 147 // NullCount is the estimated number of null values of this set of 148 // columns for this expression. For multi-column stats, this null 149 // count tracks only the rows in which all columns in the set are null. 150 NullCount float64 151 152 // Histogram is only used when the size of Cols is one. It contains 153 // the approximate distribution of values for that column, represented 154 // by a slice of histogram buckets. 155 Histogram *Histogram 156 } 157 158 // ApplySelectivity updates the distinct count, null count, and histogram 159 // according to a given selectivity. 160 func (c *ColumnStatistic) ApplySelectivity(selectivity, inputRows float64) { 161 // Since the null count is a simple count of all null rows, we can 162 // just multiply the selectivity with it. 163 c.NullCount *= selectivity 164 165 if c.Histogram != nil { 166 c.Histogram = c.Histogram.ApplySelectivity(selectivity) 167 } 168 169 if selectivity == 1 || c.DistinctCount == 0 { 170 return 171 } 172 if selectivity == 0 { 173 c.DistinctCount = 0 174 return 175 } 176 177 n := inputRows 178 d := c.DistinctCount 179 180 // If each distinct value appears n/d times, and the probability of a 181 // row being filtered out is (1 - selectivity), the probability that all 182 // n/d rows are filtered out is (1 - selectivity)^(n/d). So the expected 183 // number of values that are filtered out is d*(1 - selectivity)^(n/d). 184 // 185 // This formula returns d * selectivity when d=n but is closer to d 186 // when d << n. 187 c.DistinctCount = d - d*math.Pow(1-selectivity, n/d) 188 const epsilon = 1e-10 189 if c.DistinctCount < epsilon { 190 // Avoid setting the distinct count to 0 (since the row count is 191 // non-zero). 192 c.DistinctCount = epsilon 193 } 194 195 } 196 197 // ColumnStatistics is a slice of pointers to ColumnStatistic values. 198 type ColumnStatistics []*ColumnStatistic 199 200 // Len returns the number of ColumnStatistic values. 201 func (c ColumnStatistics) Len() int { return len(c) } 202 203 // Less is part of the Sorter interface. 204 func (c ColumnStatistics) Less(i, j int) bool { 205 if c[i].Cols.Len() != c[j].Cols.Len() { 206 return c[i].Cols.Len() < c[j].Cols.Len() 207 } 208 209 prev := opt.ColumnID(0) 210 for { 211 nextI, ok := c[i].Cols.Next(prev) 212 if !ok { 213 return false 214 } 215 216 // No need to check if ok since both ColSets are the same length and 217 // so far have had the same elements. 218 nextJ, _ := c[j].Cols.Next(prev) 219 220 if nextI != nextJ { 221 return nextI < nextJ 222 } 223 224 prev = nextI 225 } 226 } 227 228 // Swap is part of the Sorter interface. 229 func (c ColumnStatistics) Swap(i, j int) { 230 c[i], c[j] = c[j], c[i] 231 }