github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/opt/memo/statistics_builder.go (about) 1 // Copyright 2018 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package memo 12 13 import ( 14 "math" 15 "reflect" 16 17 "github.com/cockroachdb/cockroach/pkg/sql/opt" 18 "github.com/cockroachdb/cockroach/pkg/sql/opt/constraint" 19 "github.com/cockroachdb/cockroach/pkg/sql/opt/props" 20 "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" 21 "github.com/cockroachdb/cockroach/pkg/sql/types" 22 "github.com/cockroachdb/cockroach/pkg/util/json" 23 "github.com/cockroachdb/cockroach/pkg/util/log" 24 "github.com/cockroachdb/errors" 25 ) 26 27 var statsAnnID = opt.NewTableAnnID() 28 29 // statisticsBuilder is responsible for building the statistics that are 30 // used by the coster to estimate the cost of expressions. 31 // 32 // Background 33 // ---------- 34 // 35 // Conceptually, there are two kinds of statistics: table statistics and 36 // relational expression statistics. 37 // 38 // 1. Table statistics 39 // 40 // Table statistics are stats derived from the underlying data in the 41 // database. These stats are calculated either automatically or on-demand for 42 // each table, and include the number of rows in the table as well as 43 // statistics about selected individual columns or sets of columns. The column 44 // statistics include the number of null values, the number of distinct values, 45 // and optionally, a histogram of the data distribution (only applicable for 46 // single columns, not sets of columns). These stats are only collected 47 // periodically to avoid overloading the database, so they may be stale. They 48 // are currently persisted in the system.table_statistics table (see sql/stats 49 // for details). Inside the optimizer, they are cached in a props.Statistics 50 // object as a table annotation in opt.Metadata. 51 // 52 // 2. Relational expression statistics 53 // 54 // Relational expression statistics are derived from table statistics, and are 55 // only valid for a particular memo group. They are used to estimate how the 56 // underlying table statistics change as different relational operators are 57 // applied. The same types of statistics are stored for relational expressions 58 // as for tables (row count, null count, distinct count, etc.). Inside the 59 // optimizer, they are stored in a props.Statistics object in the logical 60 // properties of the relational expression's memo group. 61 // 62 // For example, here is a query plan with corresponding estimated statistics at 63 // each level: 64 // 65 // Query: SELECT y FROM a WHERE x=1 66 // 67 // Plan: Project y Row Count: 10, Distinct(x): 1 68 // | 69 // Select x=1 Row Count: 10, Distinct(x): 1 70 // | 71 // Scan a Row Count: 100, Distinct(x): 10 72 // 73 // The statistics for the Scan operator were presumably retrieved from the 74 // underlying table statistics cached in the metadata. The statistics for 75 // the Select operator are determined as follows: Since the predicate x=1 76 // reduces the number of distinct values of x down to 1, and the previous 77 // distinct count of x was 10, the selectivity of the predicate is 1/10. 78 // Thus, the estimated number of output rows is 1/10 * 100 = 10. Finally, the 79 // Project operator passes through the statistics from its child expression. 80 // 81 // Statistics for expressions high up in the query tree tend to be quite 82 // inaccurate since the estimation errors from lower expressions are 83 // compounded. Still, statistics are useful throughout the query tree to help 84 // the optimizer choose between multiple alternative, logically equivalent 85 // plans. 86 // 87 // How statisticsBuilder works 88 // --------------------------- 89 // 90 // statisticsBuilder is responsible for building the second type of statistics, 91 // relational expression statistics. It builds the statistics lazily, and only 92 // calculates column statistics if needed to estimate the row count of an 93 // expression (currently, the row count is the only statistic used by the 94 // coster). 95 // 96 // Every relational operator has a buildXXX and a colStatXXX function. For 97 // example, Scan has buildScan and colStatScan. buildScan is called when the 98 // logical properties of a Scan expression are built. The goal of each buildXXX 99 // function is to calculate the number of rows output by the expression so that 100 // its cost can be estimated by the coster. 101 // 102 // In order to determine the row count, column statistics may be required for a 103 // subset of the columns of the expression. Column statistics are calculated 104 // recursively from the child expression(s) via calls to the colStatFromInput 105 // function. colStatFromInput finds the child expression that might contain the 106 // requested stats, and calls colStat on the child. colStat checks if the 107 // requested stats are already cached for the child expression, and if not, 108 // calls colStatXXX (where the XXX corresponds to the operator of the child 109 // expression). The child expression may need to calculate column statistics 110 // from its children, and if so, it makes another recursive call to 111 // colStatFromInput. 112 // 113 // The "base case" for colStatFromInput is a Scan, where the "input" is the raw 114 // table itself; the table statistics are retrieved from the metadata (the 115 // metadata may in turn need to fetch the stats from the database if they are 116 // not already cached). If a particular table statistic is not available, a 117 // best-effort guess is made (see colStatLeaf for details). 118 // 119 // To better understand how the statisticsBuilder works, let us consider this 120 // simple query, which consists of a scan followed by an aggregation: 121 // 122 // SELECT count(*), x, y FROM t GROUP BY x, y 123 // 124 // The statistics for the scan of t will be calculated first, since logical 125 // properties are built bottom-up. The estimated row count is retrieved from 126 // the table statistics in the metadata, so no column statistics are needed. 127 // 128 // The statistics for the group by operator are calculated second. The row 129 // count for GROUP BY can be determined by the distinct count of its grouping 130 // columns. Therefore, the statisticsBuilder recursively updates the statistics 131 // for the scan operator to include column stats for x and y, and then uses 132 // these column stats to update the statistics for GROUP BY. 133 // 134 // At each stage where column statistics are requested, the statisticsBuilder 135 // makes a call to colStatFromChild, which in turn calls colStat on the child 136 // to retrieve the cached statistics or calculate them recursively. Assuming 137 // that no statistics are cached, this is the order of function calls for the 138 // above example (somewhat simplified): 139 // 140 // +-------------+ +--------------+ 141 // 1. | buildScan t | 2. | buildGroupBy | 142 // +-------------+ +--------------+ 143 // | | 144 // +-----------------------+ +-------------------------+ 145 // | makeTableStatistics t | | colStatFromChild (x, y) | 146 // +-----------------------+ +-------------------------+ 147 // | 148 // +--------------------+ 149 // | colStatScan (x, y) | 150 // +--------------------+ 151 // | 152 // +---------------------+ 153 // | colStatTable (x, y) | 154 // +---------------------+ 155 // | 156 // +--------------------+ 157 // | colStatLeaf (x, y) | 158 // +--------------------+ 159 // 160 // See props/statistics.go for more details. 161 type statisticsBuilder struct { 162 evalCtx *tree.EvalContext 163 md *opt.Metadata 164 } 165 166 func (sb *statisticsBuilder) init(evalCtx *tree.EvalContext, md *opt.Metadata) { 167 sb.evalCtx = evalCtx 168 sb.md = md 169 } 170 171 func (sb *statisticsBuilder) clear() { 172 sb.evalCtx = nil 173 sb.md = nil 174 } 175 176 // colStatFromChild retrieves a column statistic from a specific child of the 177 // given expression. 178 func (sb *statisticsBuilder) colStatFromChild( 179 colSet opt.ColSet, e RelExpr, childIdx int, 180 ) *props.ColumnStatistic { 181 // Helper function to return the column statistic if the output columns of 182 // the child with the given index intersect colSet. 183 child := e.Child(childIdx).(RelExpr) 184 childProps := child.Relational() 185 if !colSet.SubsetOf(childProps.OutputCols) { 186 colSet = colSet.Intersection(childProps.OutputCols) 187 if colSet.Empty() { 188 // All the columns in colSet are outer columns; therefore, we can treat 189 // them as a constant. 190 return &props.ColumnStatistic{Cols: colSet, DistinctCount: 1} 191 } 192 } 193 return sb.colStat(colSet, child) 194 } 195 196 // statsFromChild retrieves the main statistics struct from a specific child 197 // of the given expression. 198 func (sb *statisticsBuilder) statsFromChild(e RelExpr, childIdx int) *props.Statistics { 199 return &e.Child(childIdx).(RelExpr).Relational().Stats 200 } 201 202 // availabilityFromInput determines the availability of the underlying table 203 // statistics from the children of the expression. 204 func (sb *statisticsBuilder) availabilityFromInput(e RelExpr) bool { 205 switch t := e.(type) { 206 case *ScanExpr: 207 return sb.makeTableStatistics(t.Table).Available 208 209 case *LookupJoinExpr: 210 ensureLookupJoinInputProps(t, sb) 211 return t.lookupProps.Stats.Available && t.Input.Relational().Stats.Available 212 213 case *GeoLookupJoinExpr: 214 ensureGeoLookupJoinInputProps(t, sb) 215 return t.lookupProps.Stats.Available && t.Input.Relational().Stats.Available 216 217 case *ZigzagJoinExpr: 218 ensureZigzagJoinInputProps(t, sb) 219 return t.leftProps.Stats.Available 220 221 case *WithScanExpr: 222 return t.BindingProps.Stats.Available 223 } 224 225 available := true 226 for i, n := 0, e.ChildCount(); i < n; i++ { 227 if child, ok := e.Child(i).(RelExpr); ok { 228 available = available && child.Relational().Stats.Available 229 } 230 } 231 return available 232 } 233 234 // colStatFromInput retrieves a column statistic from the input(s) of a Scan, 235 // Select, or Join. The input to the Scan is the "raw" table. 236 // 237 // colStatFromInput also retrieves a pointer to the full statistics from the 238 // relevant input. 239 func (sb *statisticsBuilder) colStatFromInput( 240 colSet opt.ColSet, e RelExpr, 241 ) (*props.ColumnStatistic, *props.Statistics) { 242 var lookupJoin *LookupJoinExpr 243 var geospatialLookupJoin *GeoLookupJoinExpr 244 var zigzagJoin *ZigzagJoinExpr 245 246 switch t := e.(type) { 247 case *ScanExpr: 248 return sb.colStatTable(t.Table, colSet), sb.makeTableStatistics(t.Table) 249 250 case *SelectExpr: 251 return sb.colStatFromChild(colSet, t, 0 /* childIdx */), sb.statsFromChild(e, 0 /* childIdx */) 252 253 case *LookupJoinExpr: 254 lookupJoin = t 255 ensureLookupJoinInputProps(lookupJoin, sb) 256 257 case *GeoLookupJoinExpr: 258 geospatialLookupJoin = t 259 ensureGeoLookupJoinInputProps(geospatialLookupJoin, sb) 260 261 case *ZigzagJoinExpr: 262 zigzagJoin = t 263 ensureZigzagJoinInputProps(zigzagJoin, sb) 264 } 265 266 if lookupJoin != nil || geospatialLookupJoin != nil || zigzagJoin != nil || 267 opt.IsJoinOp(e) || e.Op() == opt.MergeJoinOp { 268 var leftProps *props.Relational 269 if zigzagJoin != nil { 270 leftProps = &zigzagJoin.leftProps 271 } else { 272 leftProps = e.Child(0).(RelExpr).Relational() 273 } 274 275 intersectsLeft := leftProps.OutputCols.Intersects(colSet) 276 var intersectsRight bool 277 if lookupJoin != nil { 278 intersectsRight = lookupJoin.lookupProps.OutputCols.Intersects(colSet) 279 } else if geospatialLookupJoin != nil { 280 intersectsRight = geospatialLookupJoin.lookupProps.OutputCols.Intersects(colSet) 281 } else if zigzagJoin != nil { 282 intersectsRight = zigzagJoin.rightProps.OutputCols.Intersects(colSet) 283 } else { 284 intersectsRight = e.Child(1).(RelExpr).Relational().OutputCols.Intersects(colSet) 285 } 286 287 // It's possible that colSet intersects both left and right if we have a 288 // lookup join that was converted from an index join, so check the left 289 // side first. 290 if intersectsLeft { 291 if zigzagJoin != nil { 292 return sb.colStatTable(zigzagJoin.LeftTable, colSet), 293 sb.makeTableStatistics(zigzagJoin.LeftTable) 294 } 295 return sb.colStatFromChild(colSet, e, 0 /* childIdx */), 296 sb.statsFromChild(e, 0 /* childIdx */) 297 } 298 if intersectsRight { 299 if lookupJoin != nil { 300 return sb.colStatTable(lookupJoin.Table, colSet), 301 sb.makeTableStatistics(lookupJoin.Table) 302 } 303 if geospatialLookupJoin != nil { 304 // TODO(rytaft): use inverted index stats when available. 305 return sb.colStatTable(geospatialLookupJoin.Table, colSet), 306 sb.makeTableStatistics(geospatialLookupJoin.Table) 307 } 308 if zigzagJoin != nil { 309 return sb.colStatTable(zigzagJoin.RightTable, colSet), 310 sb.makeTableStatistics(zigzagJoin.RightTable) 311 } 312 return sb.colStatFromChild(colSet, e, 1 /* childIdx */), 313 sb.statsFromChild(e, 1 /* childIdx */) 314 } 315 // All columns in colSet are outer columns; therefore, we can treat them 316 // as a constant. Return table stats from the left side. 317 if zigzagJoin != nil { 318 return &props.ColumnStatistic{Cols: colSet, DistinctCount: 1}, 319 sb.makeTableStatistics(zigzagJoin.LeftTable) 320 } 321 return &props.ColumnStatistic{Cols: colSet, DistinctCount: 1}, sb.statsFromChild(e, 0 /* childIdx */) 322 } 323 324 panic(errors.AssertionFailedf("unsupported operator type %s", log.Safe(e.Op()))) 325 } 326 327 // colStat gets a column statistic for the given set of columns if it exists. 328 // If the column statistic is not available in the current expression, colStat 329 // recursively tries to find it in the children of the expression, lazily 330 // populating s.ColStats with the statistic as it gets passed up the expression 331 // tree. 332 func (sb *statisticsBuilder) colStat(colSet opt.ColSet, e RelExpr) *props.ColumnStatistic { 333 if colSet.Empty() { 334 panic(errors.AssertionFailedf("column statistics cannot be determined for empty column set")) 335 } 336 337 // Check if the requested column statistic is already cached. 338 if stat, ok := e.Relational().Stats.ColStats.Lookup(colSet); ok { 339 return stat 340 } 341 342 // We only calculate statistics on the normalized expression in a memo group. 343 e = e.FirstExpr() 344 345 // The statistic was not found in the cache, so calculate it based on the 346 // type of expression. 347 switch e.Op() { 348 case opt.ScanOp: 349 return sb.colStatScan(colSet, e.(*ScanExpr)) 350 351 case opt.SelectOp: 352 return sb.colStatSelect(colSet, e.(*SelectExpr)) 353 354 case opt.ProjectOp: 355 return sb.colStatProject(colSet, e.(*ProjectExpr)) 356 357 case opt.ValuesOp: 358 return sb.colStatValues(colSet, e.(*ValuesExpr)) 359 360 case opt.InnerJoinOp, opt.LeftJoinOp, opt.RightJoinOp, opt.FullJoinOp, 361 opt.SemiJoinOp, opt.AntiJoinOp, opt.InnerJoinApplyOp, opt.LeftJoinApplyOp, 362 opt.SemiJoinApplyOp, opt.AntiJoinApplyOp, opt.MergeJoinOp, opt.LookupJoinOp, 363 opt.GeoLookupJoinOp, opt.ZigzagJoinOp: 364 return sb.colStatJoin(colSet, e) 365 366 case opt.IndexJoinOp: 367 return sb.colStatIndexJoin(colSet, e.(*IndexJoinExpr)) 368 369 case opt.UnionOp, opt.IntersectOp, opt.ExceptOp, 370 opt.UnionAllOp, opt.IntersectAllOp, opt.ExceptAllOp: 371 return sb.colStatSetNode(colSet, e) 372 373 case opt.GroupByOp, opt.ScalarGroupByOp, opt.DistinctOnOp, opt.EnsureDistinctOnOp, 374 opt.UpsertDistinctOnOp, opt.EnsureUpsertDistinctOnOp: 375 return sb.colStatGroupBy(colSet, e) 376 377 case opt.LimitOp: 378 return sb.colStatLimit(colSet, e.(*LimitExpr)) 379 380 case opt.OffsetOp: 381 return sb.colStatOffset(colSet, e.(*OffsetExpr)) 382 383 case opt.Max1RowOp: 384 return sb.colStatMax1Row(colSet, e.(*Max1RowExpr)) 385 386 case opt.OrdinalityOp: 387 return sb.colStatOrdinality(colSet, e.(*OrdinalityExpr)) 388 389 case opt.WindowOp: 390 return sb.colStatWindow(colSet, e.(*WindowExpr)) 391 392 case opt.ProjectSetOp: 393 return sb.colStatProjectSet(colSet, e.(*ProjectSetExpr)) 394 395 case opt.WithScanOp: 396 return sb.colStatWithScan(colSet, e.(*WithScanExpr)) 397 398 case opt.InsertOp, opt.UpdateOp, opt.UpsertOp, opt.DeleteOp: 399 return sb.colStatMutation(colSet, e) 400 401 case opt.SequenceSelectOp: 402 return sb.colStatSequenceSelect(colSet, e.(*SequenceSelectExpr)) 403 404 case opt.ExplainOp, opt.ShowTraceForSessionOp, 405 opt.OpaqueRelOp, opt.OpaqueMutationOp, opt.OpaqueDDLOp, opt.RecursiveCTEOp: 406 return sb.colStatUnknown(colSet, e.Relational()) 407 408 case opt.WithOp: 409 return sb.colStat(colSet, e.Child(1).(RelExpr)) 410 411 case opt.FakeRelOp: 412 panic(errors.AssertionFailedf("FakeRelOp does not contain col stat for %v", colSet)) 413 } 414 415 panic(errors.AssertionFailedf("unrecognized relational expression type: %v", log.Safe(e.Op()))) 416 } 417 418 // colStatLeaf creates a column statistic for a given column set (if it doesn't 419 // already exist in s), by deriving the statistic from the general statistics. 420 // Used when there is no child expression to retrieve statistics from, typically 421 // with the Statistics derived for a table. 422 func (sb *statisticsBuilder) colStatLeaf( 423 colSet opt.ColSet, s *props.Statistics, fd *props.FuncDepSet, notNullCols opt.ColSet, 424 ) *props.ColumnStatistic { 425 // Ensure that the requested column statistic is in the cache. 426 colStat, added := s.ColStats.Add(colSet) 427 if !added { 428 // Already in the cache. 429 return colStat 430 } 431 432 // If some of the columns are a lax key, the distinct count equals the row 433 // count. The null count is 0 if any of these columns are not nullable, 434 // otherwise copy the null count from the nullable columns in colSet. 435 if fd.ColsAreLaxKey(colSet) { 436 if colSet.Intersects(notNullCols) { 437 colStat.NullCount = 0 438 } else { 439 nullableCols := colSet.Difference(notNullCols) 440 if nullableCols.Equals(colSet) { 441 // No column statistics on this colSet - use the unknown 442 // null count ratio. 443 colStat.NullCount = s.RowCount * unknownNullCountRatio 444 } else { 445 colStatLeaf := sb.colStatLeaf(nullableCols, s, fd, notNullCols) 446 // Fetch the colStat again since it may now have a different address. 447 colStat, _ = s.ColStats.Lookup(colSet) 448 colStat.NullCount = colStatLeaf.NullCount 449 } 450 } 451 // Only one of the null values counts towards the distinct count. 452 colStat.DistinctCount = s.RowCount - max(colStat.NullCount-1, 0) 453 return colStat 454 } 455 456 if colSet.Len() == 1 { 457 col, _ := colSet.Next(0) 458 colStat.DistinctCount = unknownDistinctCountRatio * s.RowCount 459 colStat.NullCount = unknownNullCountRatio * s.RowCount 460 if notNullCols.Contains(col) { 461 colStat.NullCount = 0 462 } 463 if sb.md.ColumnMeta(col).Type.Family() == types.BoolFamily { 464 // There are maximum three distinct values: true, false, and null. 465 maxDistinct := float64(2) 466 if colStat.NullCount > 0 { 467 maxDistinct++ 468 } 469 colStat.DistinctCount = min(colStat.DistinctCount, maxDistinct) 470 } 471 } else { 472 distinctCount := 1.0 473 nullCount := s.RowCount 474 colSet.ForEach(func(i opt.ColumnID) { 475 colStatLeaf := sb.colStatLeaf(opt.MakeColSet(i), s, fd, notNullCols) 476 distinctCount *= colStatLeaf.DistinctCount 477 // Multiply by the expected chance of collisions with nulls already 478 // collected. 479 nullCount *= colStatLeaf.NullCount / s.RowCount 480 }) 481 // Fetch the colStat again since it may now have a different address. 482 colStat, _ = s.ColStats.Lookup(colSet) 483 colStat.DistinctCount = min(distinctCount, s.RowCount) 484 colStat.NullCount = min(nullCount, s.RowCount) 485 } 486 487 return colStat 488 } 489 490 // +-------+ 491 // | Table | 492 // +-------+ 493 494 // makeTableStatistics returns the available statistics for the given table. 495 // Statistics are derived lazily and are cached in the metadata, since they may 496 // be accessed multiple times during query optimization. For more details, see 497 // props.Statistics. 498 func (sb *statisticsBuilder) makeTableStatistics(tabID opt.TableID) *props.Statistics { 499 stats, ok := sb.md.TableAnnotation(tabID, statsAnnID).(*props.Statistics) 500 if ok { 501 // Already made. 502 return stats 503 } 504 505 // Make now and annotate the metadata table with it for next time. 506 tab := sb.md.Table(tabID) 507 stats = &props.Statistics{} 508 if tab.StatisticCount() == 0 { 509 // No statistics. 510 stats.Available = false 511 stats.RowCount = unknownRowCount 512 } else { 513 // Get the RowCount from the most recent statistic. Stats are ordered 514 // with most recent first. 515 stats.Available = true 516 stats.RowCount = float64(tab.Statistic(0).RowCount()) 517 518 // Make sure the row count is at least 1. The stats may be stale, and we 519 // can end up with weird and inefficient plans if we estimate 0 rows. 520 stats.RowCount = max(stats.RowCount, 1) 521 522 // Add all the column statistics, using the most recent statistic for each 523 // column set. Stats are ordered with most recent first. 524 for i := 0; i < tab.StatisticCount(); i++ { 525 stat := tab.Statistic(i) 526 if stat.ColumnCount() > 1 && !sb.evalCtx.SessionData.OptimizerUseMultiColStats { 527 continue 528 } 529 530 var cols opt.ColSet 531 for i := 0; i < stat.ColumnCount(); i++ { 532 cols.Add(tabID.ColumnID(stat.ColumnOrdinal(i))) 533 } 534 535 if colStat, ok := stats.ColStats.Add(cols); ok { 536 colStat.DistinctCount = float64(stat.DistinctCount()) 537 colStat.NullCount = float64(stat.NullCount()) 538 if cols.Len() == 1 && stat.Histogram() != nil && 539 sb.evalCtx.SessionData.OptimizerUseHistograms { 540 col, _ := cols.Next(0) 541 colStat.Histogram = &props.Histogram{} 542 colStat.Histogram.Init(sb.evalCtx, col, stat.Histogram()) 543 } 544 545 // Make sure the distinct count is at least 1, for the same reason as 546 // the row count above. 547 colStat.DistinctCount = max(colStat.DistinctCount, 1) 548 549 // Make sure the values are consistent in case some of the column stats 550 // were added at different times (and therefore have a different row 551 // count). 552 sb.finalizeFromRowCountAndDistinctCounts(colStat, stats) 553 } 554 } 555 } 556 sb.md.SetTableAnnotation(tabID, statsAnnID, stats) 557 return stats 558 } 559 560 func (sb *statisticsBuilder) colStatTable( 561 tabID opt.TableID, colSet opt.ColSet, 562 ) *props.ColumnStatistic { 563 tableStats := sb.makeTableStatistics(tabID) 564 tableFD := MakeTableFuncDep(sb.md, tabID) 565 tableNotNullCols := tableNotNullCols(sb.md, tabID) 566 return sb.colStatLeaf(colSet, tableStats, tableFD, tableNotNullCols) 567 } 568 569 // +------+ 570 // | Scan | 571 // +------+ 572 573 func (sb *statisticsBuilder) buildScan(scan *ScanExpr, relProps *props.Relational) { 574 s := &relProps.Stats 575 if zeroCardinality := s.Init(relProps); zeroCardinality { 576 // Short cut if cardinality is 0. 577 return 578 } 579 s.Available = sb.availabilityFromInput(scan) 580 581 inputStats := sb.makeTableStatistics(scan.Table) 582 s.RowCount = inputStats.RowCount 583 584 if scan.Constraint != nil { 585 // Calculate distinct counts and histograms for constrained columns 586 // ---------------------------------------------------------------- 587 var numUnappliedConjuncts float64 588 var constrainedCols, histCols opt.ColSet 589 // Inverted indexes are a special case; a constraint like: 590 // /1: [/'{"a": "b"}' - /'{"a": "b"}'] 591 // does not necessarily mean there is only going to be one distinct 592 // value for column 1, if it is being applied to an inverted index. 593 // This is because inverted index keys could correspond to partial 594 // column values, such as one path-to-a-leaf through a JSON object. 595 // 596 // For now, don't apply constraints on inverted index columns. 597 if sb.md.Table(scan.Table).Index(scan.Index).IsInverted() { 598 for i, n := 0, scan.Constraint.ConstrainedColumns(sb.evalCtx); i < n; i++ { 599 numUnappliedConjuncts += sb.numConjunctsInConstraint(scan.Constraint, i) 600 } 601 } else { 602 constrainedCols, histCols = sb.applyIndexConstraint(scan.Constraint, scan, relProps) 603 } 604 605 // Set null counts to 0 for non-nullable columns 606 // --------------------------------------------- 607 sb.updateNullCountsFromProps(scan, relProps) 608 609 // Calculate row count and selectivity 610 // ----------------------------------- 611 s.ApplySelectivity(sb.selectivityFromHistograms(histCols, scan, s)) 612 s.ApplySelectivity(sb.selectivityFromMultiColDistinctCounts(constrainedCols, scan, s)) 613 s.ApplySelectivity(sb.selectivityFromUnappliedConjuncts(numUnappliedConjuncts)) 614 s.ApplySelectivity(sb.selectivityFromNullsRemoved(scan, relProps, constrainedCols)) 615 616 // Adjust the selectivity so we don't double-count the histogram columns. 617 s.ApplySelectivity(1.0 / sb.selectivityFromSingleColDistinctCounts(histCols, scan, s)) 618 } 619 620 sb.finalizeFromCardinality(relProps) 621 } 622 623 func (sb *statisticsBuilder) colStatScan(colSet opt.ColSet, scan *ScanExpr) *props.ColumnStatistic { 624 relProps := scan.Relational() 625 s := &relProps.Stats 626 627 inputColStat := sb.colStatTable(scan.Table, colSet) 628 colStat := sb.copyColStat(colSet, s, inputColStat) 629 630 if sb.shouldUseHistogram(relProps) { 631 colStat.Histogram = inputColStat.Histogram 632 } 633 634 if s.Selectivity != 1 { 635 tableStats := sb.makeTableStatistics(scan.Table) 636 colStat.ApplySelectivity(s.Selectivity, tableStats.RowCount) 637 } 638 639 if colSet.Intersects(relProps.NotNullCols) { 640 colStat.NullCount = 0 641 } 642 643 sb.finalizeFromRowCountAndDistinctCounts(colStat, s) 644 return colStat 645 } 646 647 // +--------+ 648 // | Select | 649 // +--------+ 650 651 func (sb *statisticsBuilder) buildSelect(sel *SelectExpr, relProps *props.Relational) { 652 s := &relProps.Stats 653 if zeroCardinality := s.Init(relProps); zeroCardinality { 654 // Short cut if cardinality is 0. 655 return 656 } 657 s.Available = sb.availabilityFromInput(sel) 658 659 // Update stats based on equivalencies in the filter conditions. Note that 660 // EquivReps from the Select FD should not be used, as they include 661 // equivalencies derived from input expressions. 662 var equivFD props.FuncDepSet 663 for i := range sel.Filters { 664 equivFD.AddEquivFrom(&sel.Filters[i].ScalarProps().FuncDeps) 665 } 666 equivReps := equivFD.EquivReps() 667 668 // Calculate distinct counts and histograms for constrained columns 669 // ---------------------------------------------------------------- 670 numUnappliedConjuncts, constrainedCols, histCols := sb.applyFilter(sel.Filters, sel, relProps) 671 672 // Try to reduce the number of columns used for selectivity 673 // calculation based on functional dependencies. 674 inputFD := &sel.Input.Relational().FuncDeps 675 constrainedCols = sb.tryReduceCols(constrainedCols, s, inputFD) 676 677 // Set null counts to 0 for non-nullable columns 678 // ------------------------------------------- 679 sb.updateNullCountsFromProps(sel, relProps) 680 681 // Calculate selectivity and row count 682 // ----------------------------------- 683 inputStats := &sel.Input.Relational().Stats 684 s.RowCount = inputStats.RowCount 685 s.ApplySelectivity(sb.selectivityFromHistograms(histCols, sel, s)) 686 s.ApplySelectivity(sb.selectivityFromMultiColDistinctCounts(constrainedCols, sel, s)) 687 s.ApplySelectivity(sb.selectivityFromEquivalencies(equivReps, &relProps.FuncDeps, sel, s)) 688 s.ApplySelectivity(sb.selectivityFromUnappliedConjuncts(numUnappliedConjuncts)) 689 s.ApplySelectivity(sb.selectivityFromNullsRemoved(sel, relProps, constrainedCols)) 690 691 // Adjust the selectivity so we don't double-count the histogram columns. 692 s.ApplySelectivity(1.0 / sb.selectivityFromSingleColDistinctCounts(histCols, sel, s)) 693 694 // Update distinct counts based on equivalencies; this should happen after 695 // selectivityFromMultiColDistinctCounts and selectivityFromEquivalencies. 696 sb.applyEquivalencies(equivReps, &relProps.FuncDeps, sel, relProps) 697 698 sb.finalizeFromCardinality(relProps) 699 } 700 701 func (sb *statisticsBuilder) colStatSelect( 702 colSet opt.ColSet, sel *SelectExpr, 703 ) *props.ColumnStatistic { 704 relProps := sel.Relational() 705 s := &relProps.Stats 706 inputStats := &sel.Input.Relational().Stats 707 colStat := sb.copyColStatFromChild(colSet, sel, s) 708 709 // It's not safe to use s.Selectivity, because it's possible that some of the 710 // filter conditions were pushed down into the input after s.Selectivity 711 // was calculated. For example, an index scan or index join created during 712 // exploration could absorb some of the filter conditions. 713 selectivity := s.RowCount / inputStats.RowCount 714 colStat.ApplySelectivity(selectivity, inputStats.RowCount) 715 if colSet.Intersects(relProps.NotNullCols) { 716 colStat.NullCount = 0 717 } 718 sb.finalizeFromRowCountAndDistinctCounts(colStat, s) 719 return colStat 720 } 721 722 // +---------+ 723 // | Project | 724 // +---------+ 725 726 func (sb *statisticsBuilder) buildProject(prj *ProjectExpr, relProps *props.Relational) { 727 s := &relProps.Stats 728 if zeroCardinality := s.Init(relProps); zeroCardinality { 729 // Short cut if cardinality is 0. 730 return 731 } 732 s.Available = sb.availabilityFromInput(prj) 733 734 inputStats := &prj.Input.Relational().Stats 735 736 s.RowCount = inputStats.RowCount 737 sb.finalizeFromCardinality(relProps) 738 } 739 740 func (sb *statisticsBuilder) colStatProject( 741 colSet opt.ColSet, prj *ProjectExpr, 742 ) *props.ColumnStatistic { 743 relProps := prj.Relational() 744 s := &relProps.Stats 745 746 // Columns may be passed through from the input, or they may reference a 747 // higher scope (in the case of a correlated subquery), or they 748 // may be synthesized by the projection operation. 749 inputCols := prj.Input.Relational().OutputCols 750 reqInputCols := colSet.Intersection(inputCols) 751 nonNullFound := false 752 reqSynthCols := colSet.Difference(inputCols) 753 if !reqSynthCols.Empty() { 754 // Some of the columns in colSet were synthesized or from a higher scope 755 // (in the case of a correlated subquery). We assume that the statistics of 756 // the synthesized columns are the same as the statistics of their input 757 // columns. For example, the distinct count of (x + 2) is the same as the 758 // distinct count of x. 759 // TODO(rytaft): This assumption breaks down for certain types of 760 // expressions, such as (x < y). 761 for i := range prj.Projections { 762 item := &prj.Projections[i] 763 if reqSynthCols.Contains(item.Col) { 764 reqInputCols.UnionWith(item.scalar.OuterCols) 765 766 // If the element is not a null constant, account for that 767 // when calculating null counts. 768 if item.Element.Op() != opt.NullOp { 769 nonNullFound = true 770 } 771 } 772 } 773 774 // Intersect with the input columns one more time to remove any columns 775 // from higher scopes. Columns from higher scopes are effectively constant 776 // in this scope, and therefore have distinct count = 1. 777 reqInputCols.IntersectionWith(inputCols) 778 } 779 780 colStat, _ := s.ColStats.Add(colSet) 781 782 if !reqInputCols.Empty() { 783 // Inherit column statistics from input, using the reqInputCols identified 784 // above. 785 inputColStat := sb.colStatFromChild(reqInputCols, prj, 0 /* childIdx */) 786 colStat.DistinctCount = inputColStat.DistinctCount 787 if nonNullFound { 788 colStat.NullCount = 0 789 } else { 790 colStat.NullCount = inputColStat.NullCount 791 } 792 } else { 793 // There are no columns in this expression, so it must be a constant. 794 colStat.DistinctCount = 1 795 if nonNullFound { 796 colStat.NullCount = 0 797 } else { 798 colStat.NullCount = s.RowCount 799 } 800 } 801 if colSet.Intersects(relProps.NotNullCols) { 802 colStat.NullCount = 0 803 } 804 sb.finalizeFromRowCountAndDistinctCounts(colStat, s) 805 return colStat 806 } 807 808 // +------+ 809 // | Join | 810 // +------+ 811 812 func (sb *statisticsBuilder) buildJoin( 813 join RelExpr, relProps *props.Relational, h *joinPropsHelper, 814 ) { 815 // Zigzag joins have their own stats builder case. 816 if join.Op() == opt.ZigzagJoinOp { 817 sb.buildZigzagJoin(join.(*ZigzagJoinExpr), relProps, h) 818 return 819 } 820 821 s := &relProps.Stats 822 if zeroCardinality := s.Init(relProps); zeroCardinality { 823 // Short cut if cardinality is 0. 824 return 825 } 826 s.Available = sb.availabilityFromInput(join) 827 828 leftStats := &h.leftProps.Stats 829 rightStats := &h.rightProps.Stats 830 leftCols := h.leftProps.OutputCols.Copy() 831 rightCols := h.rightProps.OutputCols.Copy() 832 equivReps := h.filtersFD.EquivReps() 833 834 // Shortcut if there are no ON conditions. Note that for lookup join, there 835 // are implicit equality conditions on KeyCols. 836 if h.filterIsTrue { 837 s.RowCount = leftStats.RowCount * rightStats.RowCount 838 s.Selectivity = 1 839 switch h.joinType { 840 case opt.InnerJoinOp, opt.InnerJoinApplyOp: 841 case opt.LeftJoinOp, opt.LeftJoinApplyOp: 842 // All rows from left side should be in the result. 843 s.RowCount = max(s.RowCount, leftStats.RowCount) 844 845 case opt.RightJoinOp: 846 // All rows from right side should be in the result. 847 s.RowCount = max(s.RowCount, rightStats.RowCount) 848 849 case opt.FullJoinOp: 850 // All rows from both sides should be in the result. 851 s.RowCount = max(s.RowCount, leftStats.RowCount) 852 s.RowCount = max(s.RowCount, rightStats.RowCount) 853 854 case opt.SemiJoinOp, opt.SemiJoinApplyOp: 855 s.RowCount = leftStats.RowCount 856 857 case opt.AntiJoinOp, opt.AntiJoinApplyOp: 858 // Don't set the row count to 0 since we can't guarantee that the 859 // cardinality is 0. 860 s.RowCount = epsilon 861 s.Selectivity = epsilon 862 } 863 return 864 } 865 866 // Shortcut if the ON condition is false or there is a contradiction. 867 if h.filters.IsFalse() { 868 s.Selectivity = 0 869 switch h.joinType { 870 case opt.InnerJoinOp, opt.InnerJoinApplyOp: 871 s.RowCount = 0 872 873 case opt.LeftJoinOp, opt.LeftJoinApplyOp: 874 // All rows from left side should be in the result. 875 s.RowCount = leftStats.RowCount 876 877 case opt.RightJoinOp: 878 // All rows from right side should be in the result. 879 s.RowCount = rightStats.RowCount 880 881 case opt.FullJoinOp: 882 // All rows from both sides should be in the result. 883 s.RowCount = leftStats.RowCount + rightStats.RowCount 884 885 case opt.SemiJoinOp, opt.SemiJoinApplyOp: 886 s.RowCount = 0 887 888 case opt.AntiJoinOp, opt.AntiJoinApplyOp: 889 s.RowCount = leftStats.RowCount 890 s.Selectivity = 1 891 } 892 return 893 } 894 895 // Calculate distinct counts for constrained columns in the ON conditions 896 // ---------------------------------------------------------------------- 897 numUnappliedConjuncts, constrainedCols, histCols := sb.applyFilter(h.filters, join, relProps) 898 899 // Try to reduce the number of columns used for selectivity 900 // calculation based on functional dependencies. 901 constrainedCols = sb.tryReduceJoinCols( 902 constrainedCols, 903 s, 904 h.leftProps.OutputCols, 905 h.rightProps.OutputCols, 906 &h.leftProps.FuncDeps, 907 &h.rightProps.FuncDeps, 908 ) 909 910 // Set null counts to 0 for non-nullable columns 911 // --------------------------------------------- 912 sb.updateNullCountsFromProps(join, relProps) 913 914 // Calculate selectivity and row count 915 // ----------------------------------- 916 switch h.joinType { 917 case opt.SemiJoinOp, opt.SemiJoinApplyOp, opt.AntiJoinOp, opt.AntiJoinApplyOp: 918 // Treat anti join as if it were a semi join for the selectivity 919 // calculations. It will be fixed below. 920 s.RowCount = leftStats.RowCount 921 s.ApplySelectivity(sb.selectivityFromEquivalenciesSemiJoin( 922 equivReps, h.leftProps.OutputCols, h.rightProps.OutputCols, &h.filtersFD, join, s, 923 )) 924 925 default: 926 s.RowCount = leftStats.RowCount * rightStats.RowCount 927 if h.rightProps.FuncDeps.ColsAreStrictKey(h.selfJoinCols) { 928 // This is like an index join, so apply a selectivity that will result 929 // in leftStats.RowCount rows. 930 if rightStats.RowCount != 0 { 931 s.ApplySelectivity(1 / rightStats.RowCount) 932 } 933 } else { 934 // Add the self join columns to equivReps so they are included in the 935 // calculation for selectivityFromEquivalencies below. 936 equivReps.UnionWith(h.selfJoinCols) 937 } 938 939 s.ApplySelectivity(sb.selectivityFromEquivalencies(equivReps, &h.filtersFD, join, s)) 940 } 941 942 if join.Op() == opt.GeoLookupJoinOp { 943 s.ApplySelectivity(sb.selectivityFromGeoRelationship(join, s)) 944 } 945 s.ApplySelectivity(sb.selectivityFromHistograms(histCols, join, s)) 946 s.ApplySelectivity(sb.selectivityFromMultiColDistinctCounts( 947 constrainedCols.Intersection(leftCols), join, s, 948 )) 949 s.ApplySelectivity(sb.selectivityFromMultiColDistinctCounts( 950 constrainedCols.Intersection(rightCols), join, s, 951 )) 952 s.ApplySelectivity(sb.selectivityFromUnappliedConjuncts(numUnappliedConjuncts)) 953 s.ApplySelectivity(sb.selectivityFromNullsRemoved(join, relProps, constrainedCols)) 954 955 // Adjust the selectivity so we don't double-count the histogram columns. 956 s.ApplySelectivity(1.0 / sb.selectivityFromSingleColDistinctCounts(histCols, join, s)) 957 958 // Update distinct counts based on equivalencies; this should happen after 959 // selectivityFromMultiColDistinctCounts and selectivityFromEquivalencies. 960 sb.applyEquivalencies(equivReps, &h.filtersFD, join, relProps) 961 962 switch h.joinType { 963 case opt.SemiJoinOp, opt.SemiJoinApplyOp, opt.AntiJoinOp, opt.AntiJoinApplyOp: 964 // Keep only column stats from the left side. 965 s.ColStats.RemoveIntersecting(h.rightProps.OutputCols) 966 } 967 968 // The above calculation is for inner joins. Other joins need to remove stats 969 // that involve outer columns. 970 switch h.joinType { 971 case opt.LeftJoinOp, opt.LeftJoinApplyOp: 972 // Keep only column stats from the right side. The stats from the left side 973 // are not valid. 974 s.ColStats.RemoveIntersecting(h.leftProps.OutputCols) 975 976 case opt.RightJoinOp: 977 // Keep only column stats from the left side. The stats from the right side 978 // are not valid. 979 s.ColStats.RemoveIntersecting(h.rightProps.OutputCols) 980 981 case opt.FullJoinOp: 982 // Do not keep any column stats. 983 s.ColStats.Clear() 984 } 985 986 // Tweak the row count. 987 innerJoinRowCount := s.RowCount 988 switch h.joinType { 989 case opt.LeftJoinOp, opt.LeftJoinApplyOp: 990 // All rows from left side should be in the result. 991 s.RowCount = max(innerJoinRowCount, leftStats.RowCount) 992 993 case opt.RightJoinOp: 994 // All rows from right side should be in the result. 995 s.RowCount = max(innerJoinRowCount, rightStats.RowCount) 996 997 case opt.FullJoinOp: 998 // All rows from both sides should be in the result. 999 // T(A FOJ B) = T(A LOJ B) + T(A ROJ B) - T(A IJ B) 1000 leftJoinRowCount := max(innerJoinRowCount, leftStats.RowCount) 1001 rightJoinRowCount := max(innerJoinRowCount, rightStats.RowCount) 1002 s.RowCount = leftJoinRowCount + rightJoinRowCount - innerJoinRowCount 1003 } 1004 1005 // Fix the stats for anti join. 1006 switch h.joinType { 1007 case opt.AntiJoinOp, opt.AntiJoinApplyOp: 1008 s.RowCount = max(leftStats.RowCount-s.RowCount, epsilon) 1009 s.Selectivity = max(1-s.Selectivity, epsilon) 1010 1011 // Converting column stats is error-prone. If any column stats are needed, 1012 // colStatJoin will use the selectivity calculated above to estimate the 1013 // column stats from the input. 1014 s.ColStats.Clear() 1015 } 1016 1017 // Loop through all colSets added in this step, and adjust null counts, 1018 // distinct counts, and histograms. 1019 for i := 0; i < s.ColStats.Count(); i++ { 1020 colStat := s.ColStats.Get(i) 1021 leftSideCols := leftCols.Intersection(colStat.Cols) 1022 rightSideCols := rightCols.Intersection(colStat.Cols) 1023 leftNullCount, rightNullCount := sb.leftRightNullCounts( 1024 join, 1025 leftSideCols, 1026 rightSideCols, 1027 leftStats.RowCount, 1028 rightStats.RowCount, 1029 ) 1030 1031 // Update all null counts not zeroed out in updateNullCountsFromProps 1032 // to equal the inner join count, instead of what it currently is (either 1033 // leftNullCount or rightNullCount). 1034 if colStat.NullCount != 0 { 1035 colStat.NullCount = innerJoinNullCount( 1036 s.RowCount, 1037 leftNullCount, 1038 leftStats.RowCount, 1039 rightNullCount, 1040 rightStats.RowCount, 1041 ) 1042 } 1043 1044 switch h.joinType { 1045 case opt.LeftJoinOp, opt.LeftJoinApplyOp, opt.RightJoinOp, opt.FullJoinOp: 1046 if !colStat.Cols.Intersects(relProps.NotNullCols) { 1047 sb.adjustNullCountsForOuterJoins( 1048 colStat, 1049 h.joinType, 1050 leftSideCols, 1051 rightSideCols, 1052 leftNullCount, 1053 leftStats.RowCount, 1054 rightNullCount, 1055 rightStats.RowCount, 1056 s.RowCount, 1057 innerJoinRowCount, 1058 ) 1059 } 1060 1061 // Ensure distinct count is non-zero. 1062 colStat.DistinctCount = max(colStat.DistinctCount, 1) 1063 } 1064 1065 // We don't yet calculate histograms correctly for joins, so remove any 1066 // histograms that have been created above. 1067 colStat.Histogram = nil 1068 } 1069 1070 sb.finalizeFromCardinality(relProps) 1071 } 1072 1073 func (sb *statisticsBuilder) colStatJoin(colSet opt.ColSet, join RelExpr) *props.ColumnStatistic { 1074 relProps := join.Relational() 1075 s := &relProps.Stats 1076 1077 var joinType opt.Operator 1078 var leftProps, rightProps *props.Relational 1079 1080 switch j := join.(type) { 1081 case *LookupJoinExpr: 1082 joinType = j.JoinType 1083 leftProps = j.Input.Relational() 1084 ensureLookupJoinInputProps(j, sb) 1085 rightProps = &j.lookupProps 1086 1087 case *GeoLookupJoinExpr: 1088 joinType = j.JoinType 1089 leftProps = j.Input.Relational() 1090 ensureGeoLookupJoinInputProps(j, sb) 1091 rightProps = &j.lookupProps 1092 1093 case *ZigzagJoinExpr: 1094 joinType = opt.InnerJoinOp 1095 ensureZigzagJoinInputProps(j, sb) 1096 leftProps = &j.leftProps 1097 rightProps = &j.rightProps 1098 1099 default: 1100 joinType = join.Op() 1101 leftProps = join.Child(0).(RelExpr).Relational() 1102 rightProps = join.Child(1).(RelExpr).Relational() 1103 } 1104 1105 switch joinType { 1106 case opt.SemiJoinOp, opt.SemiJoinApplyOp, opt.AntiJoinOp, opt.AntiJoinApplyOp: 1107 // Column stats come from left side of join. 1108 colStat := sb.copyColStat(colSet, s, sb.colStatFromJoinLeft(colSet, join)) 1109 colStat.ApplySelectivity(s.Selectivity, leftProps.Stats.RowCount) 1110 sb.finalizeFromRowCountAndDistinctCounts(colStat, s) 1111 return colStat 1112 1113 default: 1114 // Column stats come from both sides of join. 1115 leftCols := leftProps.OutputCols.Intersection(colSet) 1116 rightCols := rightProps.OutputCols.Intersection(colSet) 1117 1118 // Join selectivity affects the distinct counts for different columns 1119 // in different ways depending on the type of join. 1120 // 1121 // - For FULL OUTER joins, the selectivity has no impact on distinct count; 1122 // all rows from the input are included at least once in the output. 1123 // - For LEFT OUTER joins, the selectivity only impacts the distinct count 1124 // of columns from the right side of the join; all rows from the left 1125 // side are included at least once in the output. 1126 // - For RIGHT OUTER joins, the selectivity only impacts the distinct count 1127 // of columns from the left side of the join; all rows from the right 1128 // side are included at least once in the output. 1129 // - For INNER joins, the selectivity impacts the distinct count of all 1130 // columns. 1131 var colStat *props.ColumnStatistic 1132 inputRowCount := leftProps.Stats.RowCount * rightProps.Stats.RowCount 1133 leftNullCount, rightNullCount := leftProps.Stats.RowCount, rightProps.Stats.RowCount 1134 if rightCols.Empty() { 1135 colStat = sb.copyColStat(colSet, s, sb.colStatFromJoinLeft(colSet, join)) 1136 leftNullCount = colStat.NullCount 1137 switch joinType { 1138 case opt.InnerJoinOp, opt.InnerJoinApplyOp, opt.RightJoinOp: 1139 colStat.ApplySelectivity(s.Selectivity, inputRowCount) 1140 } 1141 } else if leftCols.Empty() { 1142 colStat = sb.copyColStat(colSet, s, sb.colStatFromJoinRight(colSet, join)) 1143 rightNullCount = colStat.NullCount 1144 switch joinType { 1145 case opt.InnerJoinOp, opt.InnerJoinApplyOp, opt.LeftJoinOp, opt.LeftJoinApplyOp: 1146 colStat.ApplySelectivity(s.Selectivity, inputRowCount) 1147 } 1148 } else { 1149 // Make a copy of the input column stats so we don't modify the originals. 1150 leftColStat := *sb.colStatFromJoinLeft(leftCols, join) 1151 rightColStat := *sb.colStatFromJoinRight(rightCols, join) 1152 1153 leftNullCount = leftColStat.NullCount 1154 rightNullCount = rightColStat.NullCount 1155 switch joinType { 1156 case opt.InnerJoinOp, opt.InnerJoinApplyOp: 1157 leftColStat.ApplySelectivity(s.Selectivity, inputRowCount) 1158 rightColStat.ApplySelectivity(s.Selectivity, inputRowCount) 1159 1160 case opt.LeftJoinOp, opt.LeftJoinApplyOp: 1161 rightColStat.ApplySelectivity(s.Selectivity, inputRowCount) 1162 1163 case opt.RightJoinOp: 1164 leftColStat.ApplySelectivity(s.Selectivity, inputRowCount) 1165 } 1166 colStat, _ = s.ColStats.Add(colSet) 1167 colStat.DistinctCount = leftColStat.DistinctCount * rightColStat.DistinctCount 1168 } 1169 1170 // Null count estimation - assume an inner join and then bump the null count later 1171 // based on the type of join. 1172 colStat.NullCount = innerJoinNullCount( 1173 s.RowCount, 1174 leftNullCount, 1175 leftProps.Stats.RowCount, 1176 rightNullCount, 1177 rightProps.Stats.RowCount, 1178 ) 1179 1180 switch joinType { 1181 case opt.LeftJoinOp, opt.LeftJoinApplyOp, opt.RightJoinOp, opt.FullJoinOp: 1182 sb.adjustNullCountsForOuterJoins( 1183 colStat, 1184 joinType, 1185 leftCols, 1186 rightCols, 1187 leftNullCount, 1188 leftProps.Stats.RowCount, 1189 rightNullCount, 1190 rightProps.Stats.RowCount, 1191 s.RowCount, 1192 s.Selectivity*inputRowCount, 1193 ) 1194 1195 // Ensure distinct count is non-zero. 1196 colStat.DistinctCount = max(colStat.DistinctCount, 1) 1197 } 1198 1199 if colSet.Intersects(relProps.NotNullCols) { 1200 colStat.NullCount = 0 1201 } 1202 sb.finalizeFromRowCountAndDistinctCounts(colStat, s) 1203 return colStat 1204 } 1205 } 1206 1207 // leftRightNullCounts returns null counts on the left and right sides of the 1208 // specified join. If either leftCols or rightCols are empty, the corresponding 1209 // side's return value is equal to the number of rows on that side. 1210 func (sb *statisticsBuilder) leftRightNullCounts( 1211 e RelExpr, leftCols, rightCols opt.ColSet, leftRowCount, rightRowCount float64, 1212 ) (leftNullCount, rightNullCount float64) { 1213 leftNullCount, rightNullCount = leftRowCount, rightRowCount 1214 if !leftCols.Empty() { 1215 leftColStat := sb.colStatFromJoinLeft(leftCols, e) 1216 leftNullCount = leftColStat.NullCount 1217 } 1218 if !rightCols.Empty() { 1219 rightColStat := sb.colStatFromJoinRight(rightCols, e) 1220 rightNullCount = rightColStat.NullCount 1221 } 1222 1223 return leftNullCount, rightNullCount 1224 } 1225 1226 // innerJoinNullCount returns an estimate of the number of nulls in an inner 1227 // join with the specified column stats. 1228 // 1229 // The caller should ensure that if leftCols are empty then leftNullCount 1230 // equals leftRowCount, and if rightCols are empty then rightNullCount equals 1231 // rightRowCount. 1232 func innerJoinNullCount( 1233 rowCount, leftNullCount, leftRowCount, rightNullCount, rightRowCount float64, 1234 ) float64 { 1235 // Here, f1 and f2 are probabilities of nulls on either side of the join. 1236 var f1, f2 float64 1237 if leftRowCount != 0 { 1238 f1 = leftNullCount / leftRowCount 1239 } 1240 if rightRowCount != 0 { 1241 f2 = rightNullCount / rightRowCount 1242 } 1243 1244 // Rough estimate of nulls in the combined result set, assuming independence. 1245 return rowCount * f1 * f2 1246 } 1247 1248 // adjustNullCountsForOuterJoins modifies the null counts for the specified 1249 // columns to adjust for additional nulls created in this type of outer join. 1250 // It adds an expected number of nulls created by column extension on 1251 // non-matching rows (such as on right cols for left joins and both for full). 1252 // 1253 // The caller should ensure that if leftCols are empty then leftNullCount 1254 // equals leftRowCount, and if rightCols are empty then rightNullCount equals 1255 // rightRowCount. 1256 func (sb *statisticsBuilder) adjustNullCountsForOuterJoins( 1257 colStat *props.ColumnStatistic, 1258 joinType opt.Operator, 1259 leftCols, rightCols opt.ColSet, 1260 leftNullCount, leftRowCount, rightNullCount, rightRowCount, rowCount, innerJoinRowCount float64, 1261 ) { 1262 // Adjust null counts for non-inner joins, adding nulls created due to column 1263 // extension - such as right columns for non-matching rows in left joins. 1264 switch joinType { 1265 case opt.LeftJoinOp, opt.LeftJoinApplyOp: 1266 if !rightCols.Empty() { 1267 colStat.NullCount += (rowCount - innerJoinRowCount) * leftNullCount / leftRowCount 1268 } 1269 1270 case opt.RightJoinOp: 1271 if !leftCols.Empty() { 1272 colStat.NullCount += (rowCount - innerJoinRowCount) * rightNullCount / rightRowCount 1273 } 1274 1275 case opt.FullJoinOp: 1276 leftJoinRowCount := max(innerJoinRowCount, leftRowCount) 1277 rightJoinRowCount := max(innerJoinRowCount, rightRowCount) 1278 1279 if !leftCols.Empty() { 1280 colStat.NullCount += (rightJoinRowCount - innerJoinRowCount) * rightNullCount / rightRowCount 1281 } 1282 if !rightCols.Empty() { 1283 colStat.NullCount += (leftJoinRowCount - innerJoinRowCount) * leftNullCount / leftRowCount 1284 } 1285 } 1286 } 1287 1288 // colStatfromJoinLeft returns a column statistic from the left input of a join. 1289 func (sb *statisticsBuilder) colStatFromJoinLeft( 1290 cols opt.ColSet, join RelExpr, 1291 ) *props.ColumnStatistic { 1292 if join.Op() == opt.ZigzagJoinOp { 1293 return sb.colStatTable(join.Private().(*ZigzagJoinPrivate).LeftTable, cols) 1294 } 1295 return sb.colStatFromChild(cols, join, 0 /* childIdx */) 1296 } 1297 1298 // colStatfromJoinRight returns a column statistic from the right input of a 1299 // join (or the table for a lookup join). 1300 func (sb *statisticsBuilder) colStatFromJoinRight( 1301 cols opt.ColSet, join RelExpr, 1302 ) *props.ColumnStatistic { 1303 if join.Op() == opt.ZigzagJoinOp { 1304 return sb.colStatTable(join.Private().(*ZigzagJoinPrivate).RightTable, cols) 1305 } else if join.Op() == opt.LookupJoinOp { 1306 lookupPrivate := join.Private().(*LookupJoinPrivate) 1307 return sb.colStatTable(lookupPrivate.Table, cols) 1308 } 1309 return sb.colStatFromChild(cols, join, 1 /* childIdx */) 1310 } 1311 1312 // +------------+ 1313 // | Index Join | 1314 // +------------+ 1315 1316 func (sb *statisticsBuilder) buildIndexJoin(indexJoin *IndexJoinExpr, relProps *props.Relational) { 1317 s := &relProps.Stats 1318 if zeroCardinality := s.Init(relProps); zeroCardinality { 1319 // Short cut if cardinality is 0. 1320 return 1321 } 1322 s.Available = sb.availabilityFromInput(indexJoin) 1323 1324 inputStats := &indexJoin.Input.Relational().Stats 1325 1326 s.RowCount = inputStats.RowCount 1327 sb.finalizeFromCardinality(relProps) 1328 } 1329 1330 func (sb *statisticsBuilder) colStatIndexJoin( 1331 colSet opt.ColSet, join *IndexJoinExpr, 1332 ) *props.ColumnStatistic { 1333 relProps := join.Relational() 1334 s := &relProps.Stats 1335 1336 inputProps := join.Input.Relational() 1337 inputCols := inputProps.OutputCols 1338 1339 colStat, _ := s.ColStats.Add(colSet) 1340 colStat.DistinctCount = 1 1341 colStat.NullCount = s.RowCount 1342 1343 // Some of the requested columns may be from the input index. 1344 reqInputCols := colSet.Intersection(inputCols) 1345 if !reqInputCols.Empty() { 1346 inputColStat := sb.colStatFromChild(reqInputCols, join, 0 /* childIdx */) 1347 colStat.DistinctCount = inputColStat.DistinctCount 1348 colStat.NullCount = inputColStat.NullCount 1349 } 1350 1351 // Other requested columns may be from the primary index. 1352 reqLookupCols := colSet.Difference(inputCols).Intersection(join.Cols) 1353 if !reqLookupCols.Empty() { 1354 // Make a copy of the lookup column stats so we don't modify the originals. 1355 lookupColStat := *sb.colStatTable(join.Table, reqLookupCols) 1356 1357 // Calculate the distinct count of the lookup columns given the selectivity 1358 // of any filters on the input. 1359 inputStats := &inputProps.Stats 1360 tableStats := sb.makeTableStatistics(join.Table) 1361 selectivity := inputStats.RowCount / tableStats.RowCount 1362 lookupColStat.ApplySelectivity(selectivity, tableStats.RowCount) 1363 1364 // Multiply the distinct counts in case colStat.DistinctCount is 1365 // already populated with a statistic from the subset of columns 1366 // provided by the input index. Multiplying the counts gives a worst-case 1367 // estimate of the joint distinct count. 1368 colStat.DistinctCount *= lookupColStat.DistinctCount 1369 1370 // Assuming null columns are completely independent, calculate 1371 // the expected value of having nulls in both column sets. 1372 f1 := lookupColStat.NullCount / inputStats.RowCount 1373 f2 := colStat.NullCount / inputStats.RowCount 1374 colStat.NullCount = inputStats.RowCount * f1 * f2 1375 } 1376 1377 if colSet.Intersects(relProps.NotNullCols) { 1378 colStat.NullCount = 0 1379 } 1380 sb.finalizeFromRowCountAndDistinctCounts(colStat, s) 1381 return colStat 1382 } 1383 1384 // +-------------+ 1385 // | Zigzag Join | 1386 // +-------------+ 1387 1388 // buildZigzagJoin builds the rowCount for a zigzag join. The colStat case 1389 // for ZigzagJoins is shared with Joins, while the builder code is more similar 1390 // to that for a Select. This is to ensure zigzag joins have select/scan-like 1391 // RowCounts, while at an individual column stats level, distinct and null 1392 // counts are handled like they would be for a join with two sides. 1393 func (sb *statisticsBuilder) buildZigzagJoin( 1394 zigzag *ZigzagJoinExpr, relProps *props.Relational, h *joinPropsHelper, 1395 ) { 1396 s := &relProps.Stats 1397 if zeroCardinality := s.Init(relProps); zeroCardinality { 1398 // Short cut if cardinality is 0. 1399 return 1400 } 1401 s.Available = sb.availabilityFromInput(zigzag) 1402 1403 leftStats := zigzag.leftProps.Stats 1404 equivReps := h.filtersFD.EquivReps() 1405 1406 // We assume that we only plan zigzag joins in cases where the result set 1407 // will have a row count smaller than or equal to the left/right index 1408 // row counts, and where the left and right sides are indexes on the same 1409 // table. Their row count should be the same, so use either row count. 1410 s.RowCount = leftStats.RowCount 1411 1412 // Calculate distinct counts for constrained columns 1413 // ------------------------------------------------- 1414 // Note that fixed columns (i.e. columns constrained to constant values) 1415 // specified in zigzag.FixedVals and zigzag.{Left,Right}FixedCols 1416 // still have corresponding filters in zigzag.On. So we don't need 1417 // to iterate through FixedCols here if we are already processing the ON 1418 // clause. 1419 // TODO(rytaft): use histogram for zig zag join. 1420 numUnappliedConjuncts, constrainedCols, _ := sb.applyFilter(zigzag.On, zigzag, relProps) 1421 1422 // Application of constraints on inverted indexes needs to be handled a 1423 // little differently since a constraint on an inverted index key column 1424 // could match multiple distinct values in the actual column. This is 1425 // because inverted index keys could correspond to partial values, like JSON 1426 // paths. 1427 // 1428 // Since filters on inverted index columns cannot be pushed down into the ON 1429 // clause (due to them containing partial values), we need to explicitly 1430 // increment numUnappliedConjuncts here for every constrained inverted index 1431 // column. The multiplication by 2 is to mimic the logic in 1432 // numConjunctsInConstraint, which counts an equality as 2 conjuncts. This 1433 // comes from the intuition that a = 1 is probably more selective than a > 1 1434 // and if we do not mimic a similar selectivity calculation here, the zigzag 1435 // join ends up having a higher row count and therefore higher cost than 1436 // a competing index join + constrained scan. 1437 tab := sb.md.Table(zigzag.LeftTable) 1438 if tab.Index(zigzag.LeftIndex).IsInverted() { 1439 numUnappliedConjuncts += float64(len(zigzag.LeftFixedCols) * 2) 1440 } 1441 if tab.Index(zigzag.RightIndex).IsInverted() { 1442 numUnappliedConjuncts += float64(len(zigzag.RightFixedCols) * 2) 1443 } 1444 1445 // Try to reduce the number of columns used for selectivity 1446 // calculation based on functional dependencies. Note that 1447 // these functional dependencies already include equalities 1448 // inferred by zigzag.{Left,Right}EqCols. 1449 inputFD := &zigzag.Relational().FuncDeps 1450 constrainedCols = sb.tryReduceCols(constrainedCols, s, inputFD) 1451 1452 // Set null counts to 0 for non-nullable columns 1453 // --------------------------------------------- 1454 sb.updateNullCountsFromProps(zigzag, relProps) 1455 1456 // Calculate selectivity and row count 1457 // ----------------------------------- 1458 s.ApplySelectivity(sb.selectivityFromMultiColDistinctCounts(constrainedCols, zigzag, s)) 1459 s.ApplySelectivity(sb.selectivityFromEquivalencies(equivReps, &relProps.FuncDeps, zigzag, s)) 1460 s.ApplySelectivity(sb.selectivityFromUnappliedConjuncts(numUnappliedConjuncts)) 1461 s.ApplySelectivity(sb.selectivityFromNullsRemoved(zigzag, relProps, constrainedCols)) 1462 1463 // Update distinct counts based on equivalencies; this should happen after 1464 // selectivityFromMultiColDistinctCounts and selectivityFromEquivalencies. 1465 sb.applyEquivalencies(equivReps, &relProps.FuncDeps, zigzag, relProps) 1466 1467 sb.finalizeFromCardinality(relProps) 1468 } 1469 1470 // +----------+ 1471 // | Group By | 1472 // +----------+ 1473 1474 func (sb *statisticsBuilder) buildGroupBy(groupNode RelExpr, relProps *props.Relational) { 1475 s := &relProps.Stats 1476 if zeroCardinality := s.Init(relProps); zeroCardinality { 1477 // Short cut if cardinality is 0. 1478 return 1479 } 1480 s.Available = sb.availabilityFromInput(groupNode) 1481 1482 groupingPrivate := groupNode.Private().(*GroupingPrivate) 1483 groupingColSet := groupingPrivate.GroupingCols 1484 1485 if groupingColSet.Empty() { 1486 if groupNode.Op() == opt.ScalarGroupByOp { 1487 // ScalarGroupBy always returns exactly one row. 1488 s.RowCount = 1 1489 } else { 1490 // GroupBy with empty grouping columns returns 0 or 1 rows, depending 1491 // on whether input has rows. If input has < 1 row, use that, as that 1492 // represents the probability of having 0 vs. 1 rows. 1493 inputStats := sb.statsFromChild(groupNode, 0 /* childIdx */) 1494 s.RowCount = min(1, inputStats.RowCount) 1495 } 1496 } else { 1497 inputStats := sb.statsFromChild(groupNode, 0 /* childIdx */) 1498 1499 if groupingPrivate.ErrorOnDup != "" { 1500 // If any input group has more than one row, then the distinct operator 1501 // will raise an error, so in non-error cases it has the same number of 1502 // rows as its input. 1503 s.RowCount = inputStats.RowCount 1504 } else { 1505 // Estimate the row count based on the distinct count of the grouping 1506 // columns. Non-scalar GroupBy should never increase the number of rows. 1507 colStat := sb.copyColStatFromChild(groupingColSet, groupNode, s) 1508 s.RowCount = min(colStat.DistinctCount, inputStats.RowCount) 1509 1510 // Update the null counts for the column statistic. 1511 if groupNode.Op() != opt.UpsertDistinctOnOp { 1512 // UpsertDistinctOp inherits NullCount from child, since it does not 1513 // group NULL values. Other group by operators only have 1 possible 1514 // null value. 1515 colStat.NullCount = min(1, colStat.NullCount) 1516 } 1517 if groupingColSet.Intersects(relProps.NotNullCols) { 1518 colStat.NullCount = 0 1519 } 1520 } 1521 } 1522 1523 sb.finalizeFromCardinality(relProps) 1524 } 1525 1526 func (sb *statisticsBuilder) colStatGroupBy( 1527 colSet opt.ColSet, groupNode RelExpr, 1528 ) *props.ColumnStatistic { 1529 relProps := groupNode.Relational() 1530 s := &relProps.Stats 1531 1532 groupingPrivate := groupNode.Private().(*GroupingPrivate) 1533 groupingColSet := groupingPrivate.GroupingCols 1534 if groupingColSet.Empty() { 1535 // ScalarGroupBy or GroupBy with empty grouping columns. 1536 colStat, _ := s.ColStats.Add(colSet) 1537 colStat.DistinctCount = 1 1538 // TODO(itsbilal): Handle case where the scalar resolves to NULL. 1539 colStat.NullCount = 0 1540 return colStat 1541 } 1542 1543 var colStat *props.ColumnStatistic 1544 var inputColStat *props.ColumnStatistic 1545 if !colSet.SubsetOf(groupingColSet) { 1546 // Some of the requested columns are aggregates. Estimate the distinct 1547 // count to be the same as the grouping columns. 1548 colStat, _ = s.ColStats.Add(colSet) 1549 inputColStat = sb.colStatFromChild(groupingColSet, groupNode, 0 /* childIdx */) 1550 colStat.DistinctCount = inputColStat.DistinctCount 1551 } else { 1552 // Make a copy so we don't modify the original 1553 colStat = sb.copyColStatFromChild(colSet, groupNode, s) 1554 inputColStat = sb.colStatFromChild(colSet, groupNode, 0 /* childIdx */) 1555 1556 if groupingPrivate.ErrorOnDup != "" && colSet.Equals(groupingColSet) { 1557 // If any input group has more than one row, then the distinct operator 1558 // will raise an error, so in non-error cases its distinct count is the 1559 // same as its row count. 1560 colStat.DistinctCount = s.RowCount 1561 } 1562 } 1563 1564 if groupNode.Op() == opt.UpsertDistinctOnOp || groupNode.Op() == opt.EnsureUpsertDistinctOnOp { 1565 // UpsertDistinctOnOp and EnsureUpsertDistinctOnOp inherit NullCount from 1566 // child, since they do not group NULL values. 1567 colStat.NullCount = inputColStat.NullCount 1568 } else { 1569 // For other group by operators, we only have 1 possible null value. 1570 colStat.NullCount = min(1, inputColStat.NullCount) 1571 } 1572 1573 if colSet.Intersects(relProps.NotNullCols) { 1574 colStat.NullCount = 0 1575 } 1576 sb.finalizeFromRowCountAndDistinctCounts(colStat, s) 1577 return colStat 1578 } 1579 1580 // +--------+ 1581 // | Set Op | 1582 // +--------+ 1583 1584 func (sb *statisticsBuilder) buildSetNode(setNode RelExpr, relProps *props.Relational) { 1585 s := &relProps.Stats 1586 if zeroCardinality := s.Init(relProps); zeroCardinality { 1587 // Short cut if cardinality is 0. 1588 return 1589 } 1590 s.Available = sb.availabilityFromInput(setNode) 1591 1592 leftStats := sb.statsFromChild(setNode, 0 /* childIdx */) 1593 rightStats := sb.statsFromChild(setNode, 1 /* childIdx */) 1594 1595 // These calculations are an upper bound on the row count. It's likely that 1596 // there is some overlap between the two sets, but not full overlap. 1597 switch setNode.Op() { 1598 case opt.UnionOp, opt.UnionAllOp: 1599 s.RowCount = leftStats.RowCount + rightStats.RowCount 1600 1601 case opt.IntersectOp, opt.IntersectAllOp: 1602 s.RowCount = min(leftStats.RowCount, rightStats.RowCount) 1603 1604 case opt.ExceptOp, opt.ExceptAllOp: 1605 s.RowCount = leftStats.RowCount 1606 } 1607 1608 switch setNode.Op() { 1609 case opt.UnionOp, opt.IntersectOp, opt.ExceptOp: 1610 // Since UNION, INTERSECT and EXCEPT eliminate duplicate rows, the row 1611 // count will equal the distinct count of the set of output columns. 1612 setPrivate := setNode.Private().(*SetPrivate) 1613 outputCols := setPrivate.OutCols.ToSet() 1614 colStat := sb.colStatSetNodeImpl(outputCols, setNode, relProps) 1615 s.RowCount = colStat.DistinctCount 1616 } 1617 1618 sb.finalizeFromCardinality(relProps) 1619 } 1620 1621 func (sb *statisticsBuilder) colStatSetNode( 1622 colSet opt.ColSet, setNode RelExpr, 1623 ) *props.ColumnStatistic { 1624 return sb.colStatSetNodeImpl(colSet, setNode, setNode.Relational()) 1625 } 1626 1627 func (sb *statisticsBuilder) colStatSetNodeImpl( 1628 outputCols opt.ColSet, setNode RelExpr, relProps *props.Relational, 1629 ) *props.ColumnStatistic { 1630 s := &relProps.Stats 1631 setPrivate := setNode.Private().(*SetPrivate) 1632 1633 leftCols := opt.TranslateColSet(outputCols, setPrivate.OutCols, setPrivate.LeftCols) 1634 rightCols := opt.TranslateColSet(outputCols, setPrivate.OutCols, setPrivate.RightCols) 1635 leftColStat := sb.colStatFromChild(leftCols, setNode, 0 /* childIdx */) 1636 rightColStat := sb.colStatFromChild(rightCols, setNode, 1 /* childIdx */) 1637 1638 colStat, _ := s.ColStats.Add(outputCols) 1639 1640 leftNullCount := leftColStat.NullCount 1641 rightNullCount := rightColStat.NullCount 1642 1643 // These calculations are an upper bound on the distinct count. It's likely 1644 // that there is some overlap between the two sets, but not full overlap. 1645 switch setNode.Op() { 1646 case opt.UnionOp, opt.UnionAllOp: 1647 colStat.DistinctCount = leftColStat.DistinctCount + rightColStat.DistinctCount 1648 colStat.NullCount = leftNullCount + rightNullCount 1649 1650 case opt.IntersectOp, opt.IntersectAllOp: 1651 colStat.DistinctCount = min(leftColStat.DistinctCount, rightColStat.DistinctCount) 1652 colStat.NullCount = min(leftNullCount, rightNullCount) 1653 1654 case opt.ExceptOp, opt.ExceptAllOp: 1655 colStat.DistinctCount = leftColStat.DistinctCount 1656 colStat.NullCount = max(leftNullCount-rightNullCount, 0) 1657 } 1658 1659 // Use the actual null counts for bag operations, and normalize them for set 1660 // operations. 1661 switch setNode.Op() { 1662 case opt.UnionOp, opt.IntersectOp: 1663 colStat.NullCount = min(1, colStat.NullCount) 1664 case opt.ExceptOp: 1665 colStat.NullCount = min(1, colStat.NullCount) 1666 if rightNullCount > 0 { 1667 colStat.NullCount = 0 1668 } 1669 } 1670 1671 if outputCols.Intersects(relProps.NotNullCols) { 1672 colStat.NullCount = 0 1673 } 1674 sb.finalizeFromRowCountAndDistinctCounts(colStat, s) 1675 return colStat 1676 } 1677 1678 // +--------+ 1679 // | Values | 1680 // +--------+ 1681 1682 // buildValues builds the statistics for a VALUES expression. 1683 func (sb *statisticsBuilder) buildValues(values *ValuesExpr, relProps *props.Relational) { 1684 s := &relProps.Stats 1685 if zeroCardinality := s.Init(relProps); zeroCardinality { 1686 // Short cut if cardinality is 0. 1687 return 1688 } 1689 s.Available = sb.availabilityFromInput(values) 1690 1691 s.RowCount = float64(len(values.Rows)) 1692 sb.finalizeFromCardinality(relProps) 1693 } 1694 1695 func (sb *statisticsBuilder) colStatValues( 1696 colSet opt.ColSet, values *ValuesExpr, 1697 ) *props.ColumnStatistic { 1698 s := &values.Relational().Stats 1699 if len(values.Rows) == 0 { 1700 colStat, _ := s.ColStats.Add(colSet) 1701 return colStat 1702 } 1703 1704 // Determine distinct count from the number of distinct memo groups. Use a 1705 // map to find the exact count of distinct values for the columns in colSet. 1706 // Use a hash to combine column values (this does not have to be exact). 1707 distinct := make(map[uint64]struct{}, values.Rows[0].ChildCount()) 1708 // Determine null count by looking at tuples that have only NullOps in them. 1709 nullCount := 0 1710 for _, row := range values.Rows { 1711 // Use the FNV-1a algorithm. See comments for the interner class. 1712 hash := uint64(offset64) 1713 hasNonNull := false 1714 for i, elem := range row.(*TupleExpr).Elems { 1715 if colSet.Contains(values.Cols[i]) { 1716 if elem.Op() != opt.NullOp { 1717 hasNonNull = true 1718 } 1719 // Use the pointer value of the scalar expression, since it's already 1720 // been interned. Therefore, two expressions with the same pointer 1721 // have the same value. 1722 ptr := reflect.ValueOf(elem).Pointer() 1723 hash ^= uint64(ptr) 1724 hash *= prime64 1725 } 1726 } 1727 if !hasNonNull { 1728 nullCount++ 1729 } 1730 distinct[hash] = struct{}{} 1731 } 1732 1733 // Update the column statistics. 1734 colStat, _ := s.ColStats.Add(colSet) 1735 colStat.DistinctCount = float64(len(distinct)) 1736 colStat.NullCount = float64(nullCount) 1737 sb.finalizeFromRowCountAndDistinctCounts(colStat, s) 1738 return colStat 1739 } 1740 1741 // +-------+ 1742 // | Limit | 1743 // +-------+ 1744 1745 func (sb *statisticsBuilder) buildLimit(limit *LimitExpr, relProps *props.Relational) { 1746 s := &relProps.Stats 1747 if zeroCardinality := s.Init(relProps); zeroCardinality { 1748 // Short cut if cardinality is 0. 1749 return 1750 } 1751 s.Available = sb.availabilityFromInput(limit) 1752 1753 inputStats := &limit.Input.Relational().Stats 1754 1755 // Copy row count from input. 1756 s.RowCount = inputStats.RowCount 1757 1758 // Update row count if limit is a constant and row count is non-zero. 1759 if cnst, ok := limit.Limit.(*ConstExpr); ok && inputStats.RowCount > 0 { 1760 hardLimit := *cnst.Value.(*tree.DInt) 1761 if hardLimit > 0 { 1762 s.RowCount = min(float64(hardLimit), inputStats.RowCount) 1763 s.Selectivity = s.RowCount / inputStats.RowCount 1764 } 1765 } 1766 1767 sb.finalizeFromCardinality(relProps) 1768 } 1769 1770 func (sb *statisticsBuilder) colStatLimit( 1771 colSet opt.ColSet, limit *LimitExpr, 1772 ) *props.ColumnStatistic { 1773 relProps := limit.Relational() 1774 s := &relProps.Stats 1775 inputStats := &limit.Input.Relational().Stats 1776 colStat := sb.copyColStatFromChild(colSet, limit, s) 1777 1778 // Scale distinct count based on the selectivity of the limit operation. 1779 colStat.ApplySelectivity(s.Selectivity, inputStats.RowCount) 1780 if colSet.Intersects(relProps.NotNullCols) { 1781 colStat.NullCount = 0 1782 } 1783 sb.finalizeFromRowCountAndDistinctCounts(colStat, s) 1784 return colStat 1785 } 1786 1787 // +--------+ 1788 // | Offset | 1789 // +--------+ 1790 1791 func (sb *statisticsBuilder) buildOffset(offset *OffsetExpr, relProps *props.Relational) { 1792 s := &relProps.Stats 1793 if zeroCardinality := s.Init(relProps); zeroCardinality { 1794 // Short cut if cardinality is 0. 1795 return 1796 } 1797 s.Available = sb.availabilityFromInput(offset) 1798 1799 inputStats := &offset.Input.Relational().Stats 1800 1801 // Copy row count from input. 1802 s.RowCount = inputStats.RowCount 1803 1804 // Update row count if offset is a constant and row count is non-zero. 1805 if cnst, ok := offset.Offset.(*ConstExpr); ok && inputStats.RowCount > 0 { 1806 hardOffset := *cnst.Value.(*tree.DInt) 1807 if float64(hardOffset) >= inputStats.RowCount { 1808 // The correct estimate for row count here is 0, but we don't ever want 1809 // row count to be zero unless the cardinality is zero. This is because 1810 // the stats may be stale, and we can end up with weird and inefficient 1811 // plans if we estimate 0 rows. Use a small number instead. 1812 s.RowCount = min(1, inputStats.RowCount) 1813 } else if hardOffset > 0 { 1814 s.RowCount = inputStats.RowCount - float64(hardOffset) 1815 } 1816 s.Selectivity = s.RowCount / inputStats.RowCount 1817 } 1818 1819 sb.finalizeFromCardinality(relProps) 1820 } 1821 1822 func (sb *statisticsBuilder) colStatOffset( 1823 colSet opt.ColSet, offset *OffsetExpr, 1824 ) *props.ColumnStatistic { 1825 relProps := offset.Relational() 1826 s := &relProps.Stats 1827 inputStats := &offset.Input.Relational().Stats 1828 colStat := sb.copyColStatFromChild(colSet, offset, s) 1829 1830 // Scale distinct count based on the selectivity of the offset operation. 1831 colStat.ApplySelectivity(s.Selectivity, inputStats.RowCount) 1832 if colSet.Intersects(relProps.NotNullCols) { 1833 colStat.NullCount = 0 1834 } 1835 sb.finalizeFromRowCountAndDistinctCounts(colStat, s) 1836 return colStat 1837 } 1838 1839 // +---------+ 1840 // | Max1Row | 1841 // +---------+ 1842 1843 func (sb *statisticsBuilder) buildMax1Row(max1Row *Max1RowExpr, relProps *props.Relational) { 1844 s := &relProps.Stats 1845 if zeroCardinality := s.Init(relProps); zeroCardinality { 1846 // Short cut if cardinality is 0. 1847 return 1848 } 1849 s.Available = true 1850 1851 s.RowCount = 1 1852 sb.finalizeFromCardinality(relProps) 1853 } 1854 1855 func (sb *statisticsBuilder) colStatMax1Row( 1856 colSet opt.ColSet, max1Row *Max1RowExpr, 1857 ) *props.ColumnStatistic { 1858 s := &max1Row.Relational().Stats 1859 colStat, _ := s.ColStats.Add(colSet) 1860 colStat.DistinctCount = 1 1861 colStat.NullCount = s.RowCount * unknownNullCountRatio 1862 if colSet.Intersects(max1Row.Relational().NotNullCols) { 1863 colStat.NullCount = 0 1864 } 1865 sb.finalizeFromRowCountAndDistinctCounts(colStat, s) 1866 return colStat 1867 } 1868 1869 // +------------+ 1870 // | Row Number | 1871 // +------------+ 1872 1873 func (sb *statisticsBuilder) buildOrdinality(ord *OrdinalityExpr, relProps *props.Relational) { 1874 s := &relProps.Stats 1875 if zeroCardinality := s.Init(relProps); zeroCardinality { 1876 // Short cut if cardinality is 0. 1877 return 1878 } 1879 s.Available = sb.availabilityFromInput(ord) 1880 1881 inputStats := &ord.Input.Relational().Stats 1882 1883 s.RowCount = inputStats.RowCount 1884 sb.finalizeFromCardinality(relProps) 1885 } 1886 1887 func (sb *statisticsBuilder) colStatOrdinality( 1888 colSet opt.ColSet, ord *OrdinalityExpr, 1889 ) *props.ColumnStatistic { 1890 relProps := ord.Relational() 1891 s := &relProps.Stats 1892 1893 colStat, _ := s.ColStats.Add(colSet) 1894 1895 if colSet.Contains(ord.ColID) { 1896 // The ordinality column is a key, so every row is distinct. 1897 colStat.DistinctCount = s.RowCount 1898 colStat.NullCount = 0 1899 } else { 1900 inputColStat := sb.colStatFromChild(colSet, ord, 0 /* childIdx */) 1901 colStat.DistinctCount = inputColStat.DistinctCount 1902 colStat.NullCount = inputColStat.NullCount 1903 } 1904 1905 if colSet.Intersects(relProps.NotNullCols) { 1906 colStat.NullCount = 0 1907 } 1908 sb.finalizeFromRowCountAndDistinctCounts(colStat, s) 1909 return colStat 1910 } 1911 1912 // +------------+ 1913 // | Window | 1914 // +------------+ 1915 1916 func (sb *statisticsBuilder) buildWindow(window *WindowExpr, relProps *props.Relational) { 1917 s := &relProps.Stats 1918 if zeroCardinality := s.Init(relProps); zeroCardinality { 1919 // Short cut if cardinality is 0. 1920 return 1921 } 1922 s.Available = sb.availabilityFromInput(window) 1923 1924 inputStats := &window.Input.Relational().Stats 1925 1926 // The row count of a window is equal to the row count of its input. 1927 s.RowCount = inputStats.RowCount 1928 1929 sb.finalizeFromCardinality(relProps) 1930 } 1931 1932 func (sb *statisticsBuilder) colStatWindow( 1933 colSet opt.ColSet, window *WindowExpr, 1934 ) *props.ColumnStatistic { 1935 relProps := window.Relational() 1936 s := &relProps.Stats 1937 1938 colStat, _ := s.ColStats.Add(colSet) 1939 1940 var windowCols opt.ColSet 1941 for _, w := range window.Windows { 1942 windowCols.Add(w.Col) 1943 } 1944 1945 if colSet.Intersects(windowCols) { 1946 // These can be quite complicated and differ dramatically based on which 1947 // window function is being computed. For now, just assume row_number and 1948 // that every row is distinct. 1949 // TODO(justin): make these accurate and take into consideration the window 1950 // function being computed. 1951 colStat.DistinctCount = s.RowCount 1952 1953 // Just assume that no NULLs are output. 1954 // TODO(justin): there are window fns for which this is not true, make 1955 // sure those are handled. 1956 if colSet.SubsetOf(windowCols) { 1957 // The generated columns are the only columns being requested. 1958 colStat.NullCount = 0 1959 } else { 1960 // Copy NullCount from child. 1961 colSetChild := colSet.Difference(windowCols) 1962 inputColStat := sb.colStatFromChild(colSetChild, window, 0 /* childIdx */) 1963 colStat.NullCount = inputColStat.NullCount 1964 } 1965 } else { 1966 inputColStat := sb.colStatFromChild(colSet, window, 0 /* childIdx */) 1967 colStat.DistinctCount = inputColStat.DistinctCount 1968 colStat.NullCount = inputColStat.NullCount 1969 } 1970 1971 if colSet.Intersects(relProps.NotNullCols) { 1972 colStat.NullCount = 0 1973 } 1974 sb.finalizeFromRowCountAndDistinctCounts(colStat, s) 1975 return colStat 1976 } 1977 1978 // +-------------+ 1979 // | Project Set | 1980 // +-------------+ 1981 1982 func (sb *statisticsBuilder) buildProjectSet( 1983 projectSet *ProjectSetExpr, relProps *props.Relational, 1984 ) { 1985 s := &relProps.Stats 1986 if zeroCardinality := s.Init(relProps); zeroCardinality { 1987 // Short cut if cardinality is 0. 1988 return 1989 } 1990 s.Available = sb.availabilityFromInput(projectSet) 1991 1992 // The row count of a zip operation is equal to the maximum row count of its 1993 // children. 1994 var zipRowCount float64 1995 for i := range projectSet.Zip { 1996 if fn, ok := projectSet.Zip[i].Fn.(*FunctionExpr); ok { 1997 if fn.Overload.Generator != nil { 1998 // TODO(rytaft): We may want to estimate the number of rows based on 1999 // the type of generator function and its parameters. 2000 zipRowCount = unknownGeneratorRowCount 2001 break 2002 } 2003 } 2004 2005 // A scalar function generates one row. 2006 zipRowCount = 1 2007 } 2008 2009 // Multiply by the input row count to get the total. 2010 inputStats := &projectSet.Input.Relational().Stats 2011 s.RowCount = zipRowCount * inputStats.RowCount 2012 2013 sb.finalizeFromCardinality(relProps) 2014 } 2015 2016 func (sb *statisticsBuilder) colStatProjectSet( 2017 colSet opt.ColSet, projectSet *ProjectSetExpr, 2018 ) *props.ColumnStatistic { 2019 s := &projectSet.Relational().Stats 2020 if s.RowCount == 0 { 2021 // Short cut if cardinality is 0. 2022 colStat, _ := s.ColStats.Add(colSet) 2023 return colStat 2024 } 2025 2026 inputProps := projectSet.Input.Relational() 2027 inputStats := inputProps.Stats 2028 inputCols := inputProps.OutputCols 2029 2030 colStat, _ := s.ColStats.Add(colSet) 2031 colStat.DistinctCount = 1 2032 colStat.NullCount = s.RowCount 2033 2034 // Some of the requested columns may be from the input. 2035 reqInputCols := colSet.Intersection(inputCols) 2036 if !reqInputCols.Empty() { 2037 inputColStat := sb.colStatFromChild(reqInputCols, projectSet, 0 /* childIdx */) 2038 colStat.DistinctCount = inputColStat.DistinctCount 2039 colStat.NullCount = inputColStat.NullCount * (s.RowCount / inputStats.RowCount) 2040 } 2041 2042 // Other requested columns may be from the output columns of the zip. 2043 zipCols := projectSet.Zip.OutputCols() 2044 reqZipCols := colSet.Difference(inputCols).Intersection(zipCols) 2045 if !reqZipCols.Empty() { 2046 // Calculate the the distinct count and null count for the zip columns 2047 // after the cross join has been applied. 2048 zipColsDistinctCount := float64(1) 2049 zipColsNullCount := s.RowCount 2050 for i := range projectSet.Zip { 2051 item := &projectSet.Zip[i] 2052 if item.Cols.ToSet().Intersects(reqZipCols) { 2053 if fn, ok := item.Fn.(*FunctionExpr); ok && fn.Overload.Generator != nil { 2054 // The columns(s) contain a generator function. 2055 // TODO(rytaft): We may want to determine which generator function the 2056 // requested columns correspond to, and estimate the distinct count and 2057 // null count based on the type of generator function and its parameters. 2058 zipColsDistinctCount *= unknownGeneratorRowCount * unknownGeneratorDistinctCountRatio 2059 zipColsNullCount *= unknownNullCountRatio 2060 } else { 2061 // The columns(s) contain a scalar function or expression. 2062 // These columns can contain many null values if the zip also 2063 // contains a generator function. For example: 2064 // 2065 // SELECT * FROM ROWS FROM (generate_series(0, 3), upper('abc')); 2066 // 2067 // Produces: 2068 // 2069 // generate_series | upper 2070 // -----------------+------- 2071 // 0 | ABC 2072 // 1 | NULL 2073 // 2 | NULL 2074 // 3 | NULL 2075 // 2076 // After the cross product with the input, the total number of nulls 2077 // for the column(s) equals (output row count - input row count). 2078 // (Also multiply by the expected chance of collisions with nulls 2079 // already collected.) 2080 zipColsNullCount *= (s.RowCount - inputStats.RowCount) / s.RowCount 2081 } 2082 2083 if item.ScalarProps().OuterCols.Intersects(inputProps.OutputCols) { 2084 // The column(s) are correlated with the input, so they may have a 2085 // distinct value for each distinct row of the input. 2086 zipColsDistinctCount *= inputStats.RowCount * unknownDistinctCountRatio 2087 } 2088 } 2089 } 2090 2091 // Multiply the distinct counts in case colStat.DistinctCount is 2092 // already populated with a statistic from the subset of columns 2093 // provided by the input. Multiplying the counts gives a worst-case 2094 // estimate of the joint distinct count. 2095 colStat.DistinctCount *= zipColsDistinctCount 2096 2097 // Assuming null columns are completely independent, calculate 2098 // the expected value of having nulls in both column sets. 2099 f1 := zipColsNullCount / s.RowCount 2100 f2 := colStat.NullCount / s.RowCount 2101 colStat.NullCount = s.RowCount * f1 * f2 2102 } 2103 2104 if colSet.Intersects(projectSet.Relational().NotNullCols) { 2105 colStat.NullCount = 0 2106 } 2107 sb.finalizeFromRowCountAndDistinctCounts(colStat, s) 2108 return colStat 2109 } 2110 2111 // +----------+ 2112 // | WithScan | 2113 // +----------+ 2114 2115 func (sb *statisticsBuilder) buildWithScan(withScan *WithScanExpr, relProps *props.Relational) { 2116 s := &relProps.Stats 2117 if zeroCardinality := s.Init(relProps); zeroCardinality { 2118 // Short cut if cardinality is 0. 2119 return 2120 } 2121 s.Available = sb.availabilityFromInput(withScan) 2122 2123 inputStats := withScan.BindingProps.Stats 2124 2125 s.RowCount = inputStats.RowCount 2126 sb.finalizeFromCardinality(relProps) 2127 } 2128 2129 func (sb *statisticsBuilder) colStatWithScan( 2130 colSet opt.ColSet, withScan *WithScanExpr, 2131 ) *props.ColumnStatistic { 2132 s := &withScan.Relational().Stats 2133 withProps := withScan.BindingProps 2134 inColSet := opt.TranslateColSet(colSet, withScan.OutCols, withScan.InCols) 2135 2136 // We cannot call colStatLeaf on &withProps.Stats directly because it can 2137 // modify it. 2138 var statsCopy props.Statistics 2139 statsCopy.CopyFrom(&withProps.Stats) 2140 2141 // TODO(rytaft): This would be more accurate if we could access the WithExpr 2142 // itself. 2143 inColStat := sb.colStatLeaf(inColSet, &statsCopy, &withProps.FuncDeps, withProps.NotNullCols) 2144 2145 colStat, _ := s.ColStats.Add(colSet) 2146 colStat.DistinctCount = inColStat.DistinctCount 2147 colStat.NullCount = inColStat.NullCount 2148 sb.finalizeFromRowCountAndDistinctCounts(colStat, s) 2149 return colStat 2150 } 2151 2152 // +--------------------------------+ 2153 // | Insert, Update, Upsert, Delete | 2154 // +--------------------------------+ 2155 2156 func (sb *statisticsBuilder) buildMutation(mutation RelExpr, relProps *props.Relational) { 2157 s := &relProps.Stats 2158 if zeroCardinality := s.Init(relProps); zeroCardinality { 2159 // Short cut if cardinality is 0. 2160 return 2161 } 2162 s.Available = sb.availabilityFromInput(mutation) 2163 2164 inputStats := sb.statsFromChild(mutation, 0 /* childIdx */) 2165 2166 s.RowCount = inputStats.RowCount 2167 sb.finalizeFromCardinality(relProps) 2168 } 2169 2170 func (sb *statisticsBuilder) colStatMutation( 2171 colSet opt.ColSet, mutation RelExpr, 2172 ) *props.ColumnStatistic { 2173 s := &mutation.Relational().Stats 2174 private := mutation.Private().(*MutationPrivate) 2175 2176 // Get colstat from child by mapping requested columns to corresponding 2177 // input columns. 2178 inColSet := private.MapToInputCols(colSet) 2179 inColStat := sb.colStatFromChild(inColSet, mutation, 0 /* childIdx */) 2180 2181 // Construct mutation colstat using the corresponding input stats. 2182 colStat, _ := s.ColStats.Add(colSet) 2183 colStat.DistinctCount = inColStat.DistinctCount 2184 colStat.NullCount = inColStat.NullCount 2185 sb.finalizeFromRowCountAndDistinctCounts(colStat, s) 2186 return colStat 2187 } 2188 2189 // +-----------------+ 2190 // | Sequence Select | 2191 // +-----------------+ 2192 2193 func (sb *statisticsBuilder) buildSequenceSelect(relProps *props.Relational) { 2194 s := &relProps.Stats 2195 s.Available = true 2196 s.RowCount = 1 2197 sb.finalizeFromCardinality(relProps) 2198 } 2199 2200 func (sb *statisticsBuilder) colStatSequenceSelect( 2201 colSet opt.ColSet, seq *SequenceSelectExpr, 2202 ) *props.ColumnStatistic { 2203 s := &seq.Relational().Stats 2204 2205 colStat, _ := s.ColStats.Add(colSet) 2206 colStat.DistinctCount = 1 2207 colStat.NullCount = 0 2208 sb.finalizeFromRowCountAndDistinctCounts(colStat, s) 2209 return colStat 2210 } 2211 2212 // +---------+ 2213 // | Unknown | 2214 // +---------+ 2215 2216 func (sb *statisticsBuilder) buildUnknown(relProps *props.Relational) { 2217 s := &relProps.Stats 2218 s.Available = false 2219 s.RowCount = unknownGeneratorRowCount 2220 sb.finalizeFromCardinality(relProps) 2221 } 2222 2223 func (sb *statisticsBuilder) colStatUnknown( 2224 colSet opt.ColSet, relProps *props.Relational, 2225 ) *props.ColumnStatistic { 2226 s := &relProps.Stats 2227 2228 colStat, _ := s.ColStats.Add(colSet) 2229 colStat.DistinctCount = s.RowCount 2230 colStat.NullCount = 0 2231 sb.finalizeFromRowCountAndDistinctCounts(colStat, s) 2232 return colStat 2233 } 2234 2235 ///////////////////////////////////////////////// 2236 // General helper functions for building stats // 2237 ///////////////////////////////////////////////// 2238 2239 // copyColStatFromChild copies the column statistic for the given colSet from 2240 // the first child of ev into ev. colStatFromChild may trigger recursive 2241 // calls if the requested statistic is not already cached in the child. 2242 func (sb *statisticsBuilder) copyColStatFromChild( 2243 colSet opt.ColSet, e RelExpr, s *props.Statistics, 2244 ) *props.ColumnStatistic { 2245 childColStat := sb.colStatFromChild(colSet, e, 0 /* childIdx */) 2246 return sb.copyColStat(colSet, s, childColStat) 2247 } 2248 2249 // ensureColStat creates a column statistic for column "col" if it doesn't 2250 // already exist in s.ColStats, copying the statistic from a child. 2251 // Then, ensureColStat sets the distinct count to the minimum of the existing 2252 // value and the new value. 2253 func (sb *statisticsBuilder) ensureColStat( 2254 colSet opt.ColSet, maxDistinctCount float64, e RelExpr, relProps *props.Relational, 2255 ) *props.ColumnStatistic { 2256 s := &relProps.Stats 2257 2258 colStat, ok := s.ColStats.Lookup(colSet) 2259 if !ok { 2260 colStat, _ = sb.colStatFromInput(colSet, e) 2261 colStat = sb.copyColStat(colSet, s, colStat) 2262 } 2263 2264 colStat.DistinctCount = min(colStat.DistinctCount, maxDistinctCount) 2265 return colStat 2266 } 2267 2268 // copyColStat creates a column statistic and copies the data from an existing 2269 // column statistic. Does not copy the histogram. 2270 func (sb *statisticsBuilder) copyColStat( 2271 colSet opt.ColSet, s *props.Statistics, inputColStat *props.ColumnStatistic, 2272 ) *props.ColumnStatistic { 2273 if !inputColStat.Cols.SubsetOf(colSet) { 2274 panic(errors.AssertionFailedf( 2275 "copyColStat colSet: %v inputColSet: %v\n", log.Safe(colSet), log.Safe(inputColStat.Cols), 2276 )) 2277 } 2278 colStat, _ := s.ColStats.Add(colSet) 2279 colStat.DistinctCount = inputColStat.DistinctCount 2280 colStat.NullCount = inputColStat.NullCount 2281 return colStat 2282 } 2283 2284 func (sb *statisticsBuilder) finalizeFromCardinality(relProps *props.Relational) { 2285 s := &relProps.Stats 2286 2287 // We don't ever want row count to be zero unless the cardinality is zero. 2288 // This is because the stats may be stale, and we can end up with weird and 2289 // inefficient plans if we estimate 0 rows. 2290 if s.RowCount <= 0 && relProps.Cardinality.Max > 0 { 2291 panic(errors.AssertionFailedf("estimated row count must be non-zero")) 2292 } 2293 2294 // The row count should be between the min and max cardinality. 2295 if s.RowCount > float64(relProps.Cardinality.Max) && relProps.Cardinality.Max != math.MaxUint32 { 2296 s.RowCount = float64(relProps.Cardinality.Max) 2297 } else if s.RowCount < float64(relProps.Cardinality.Min) { 2298 s.RowCount = float64(relProps.Cardinality.Min) 2299 } 2300 2301 for i, n := 0, s.ColStats.Count(); i < n; i++ { 2302 colStat := s.ColStats.Get(i) 2303 sb.finalizeFromRowCountAndDistinctCounts(colStat, s) 2304 } 2305 } 2306 2307 func (sb *statisticsBuilder) finalizeFromRowCountAndDistinctCounts( 2308 colStat *props.ColumnStatistic, s *props.Statistics, 2309 ) { 2310 rowCount := s.RowCount 2311 2312 // We should always have at least one distinct value if row count > 0. 2313 if rowCount > 0 && colStat.DistinctCount == 0 { 2314 panic(errors.AssertionFailedf("estimated distinct count must be non-zero")) 2315 } 2316 2317 // If this is a multi-column statistic, the distinct count should be no 2318 // larger than the product of all the distinct counts of its individual 2319 // columns, and no smaller than the distinct count of any single column. 2320 if colStat.Cols.Len() > 1 && rowCount > 1 { 2321 product := 1.0 2322 maxDistinct := 0.0 2323 colStat.Cols.ForEach(func(col opt.ColumnID) { 2324 if singleColStat, ok := s.ColStats.Lookup(opt.MakeColSet(col)); ok { 2325 if singleColStat.DistinctCount > 1 { 2326 product *= singleColStat.DistinctCount 2327 } 2328 if singleColStat.DistinctCount > maxDistinct { 2329 maxDistinct = singleColStat.DistinctCount 2330 } 2331 } else { 2332 // This is just a best-effort check, so if we don't have one of the 2333 // column stats, assume its distinct count equals the row count. 2334 product *= rowCount 2335 } 2336 }) 2337 colStat.DistinctCount = min(colStat.DistinctCount, product) 2338 colStat.DistinctCount = max(colStat.DistinctCount, maxDistinct) 2339 } 2340 2341 // The distinct and null counts should be no larger than the row count. 2342 colStat.DistinctCount = min(colStat.DistinctCount, rowCount) 2343 colStat.NullCount = min(colStat.NullCount, rowCount) 2344 2345 // Uniformly reduce the size of each histogram bucket so the number of values 2346 // is no larger than the row count. 2347 if colStat.Histogram != nil { 2348 valuesCount := colStat.Histogram.ValuesCount() 2349 if valuesCount > rowCount { 2350 colStat.Histogram = colStat.Histogram.ApplySelectivity(rowCount / valuesCount) 2351 } 2352 } 2353 } 2354 2355 func (sb *statisticsBuilder) shouldUseHistogram(relProps *props.Relational) bool { 2356 // If we know that the cardinality is below a certain threshold (e.g., due to 2357 // a constraint on a key column), don't bother adding the overhead of 2358 // creating a histogram. 2359 return relProps.Cardinality.Max >= minCardinalityForHistogram 2360 } 2361 2362 // rowsProcessed calculates and returns the number of rows processed by the 2363 // relational expression. It is currently only supported for joins. 2364 func (sb *statisticsBuilder) rowsProcessed(e RelExpr) float64 { 2365 semiAntiJoinToInnerJoin := func(joinType opt.Operator) opt.Operator { 2366 switch joinType { 2367 case opt.SemiJoinOp, opt.AntiJoinOp: 2368 return opt.InnerJoinOp 2369 case opt.SemiJoinApplyOp, opt.AntiJoinApplyOp: 2370 return opt.InnerJoinApplyOp 2371 default: 2372 return joinType 2373 } 2374 } 2375 2376 switch t := e.(type) { 2377 case *LookupJoinExpr: 2378 var lookupJoinPrivate *LookupJoinPrivate 2379 switch t.JoinType { 2380 case opt.SemiJoinOp, opt.SemiJoinApplyOp, opt.AntiJoinOp, opt.AntiJoinApplyOp: 2381 // The number of rows processed for semi and anti joins is closer to the 2382 // number of output rows for an equivalent inner join. 2383 copy := t.LookupJoinPrivate 2384 copy.JoinType = semiAntiJoinToInnerJoin(t.JoinType) 2385 lookupJoinPrivate = © 2386 2387 default: 2388 if t.On.IsTrue() { 2389 // If there are no additional ON filters, the number of rows processed 2390 // equals the number of output rows. 2391 return e.Relational().Stats.RowCount 2392 } 2393 lookupJoinPrivate = &t.LookupJoinPrivate 2394 } 2395 2396 // We need to determine the row count of the join before the 2397 // ON conditions are applied. 2398 withoutOn := e.Memo().MemoizeLookupJoin(t.Input, nil /* on */, lookupJoinPrivate) 2399 return withoutOn.Relational().Stats.RowCount 2400 2401 case *GeoLookupJoinExpr: 2402 var lookupJoinPrivate *GeoLookupJoinPrivate 2403 switch t.JoinType { 2404 case opt.SemiJoinOp, opt.SemiJoinApplyOp, opt.AntiJoinOp, opt.AntiJoinApplyOp: 2405 // The number of rows processed for semi and anti joins is closer to the 2406 // number of output rows for an equivalent inner join. 2407 copy := t.GeoLookupJoinPrivate 2408 copy.JoinType = semiAntiJoinToInnerJoin(t.JoinType) 2409 lookupJoinPrivate = © 2410 2411 default: 2412 if t.On.IsTrue() { 2413 // If there are no additional ON filters, the number of rows processed 2414 // equals the number of output rows. 2415 return e.Relational().Stats.RowCount 2416 } 2417 lookupJoinPrivate = &t.GeoLookupJoinPrivate 2418 } 2419 2420 // We need to determine the row count of the join before the 2421 // ON conditions are applied. 2422 withoutOn := e.Memo().MemoizeGeoLookupJoin(t.Input, nil /* on */, lookupJoinPrivate) 2423 return withoutOn.Relational().Stats.RowCount 2424 2425 case *MergeJoinExpr: 2426 var mergeJoinPrivate *MergeJoinPrivate 2427 switch t.JoinType { 2428 case opt.SemiJoinOp, opt.SemiJoinApplyOp, opt.AntiJoinOp, opt.AntiJoinApplyOp: 2429 // The number of rows processed for semi and anti joins is closer to the 2430 // number of output rows for an equivalent inner join. 2431 copy := t.MergeJoinPrivate 2432 copy.JoinType = semiAntiJoinToInnerJoin(t.JoinType) 2433 mergeJoinPrivate = © 2434 2435 default: 2436 if t.On.IsTrue() { 2437 // If there are no additional ON filters, the number of rows processed 2438 // equals the number of output rows. 2439 return e.Relational().Stats.RowCount 2440 } 2441 mergeJoinPrivate = &t.MergeJoinPrivate 2442 } 2443 2444 // We need to determine the row count of the join before the 2445 // ON conditions are applied. 2446 withoutOn := e.Memo().MemoizeMergeJoin(t.Left, t.Right, nil /* on */, mergeJoinPrivate) 2447 return withoutOn.Relational().Stats.RowCount 2448 2449 default: 2450 if !opt.IsJoinOp(e) { 2451 panic(errors.AssertionFailedf("rowsProcessed not supported for operator type %v", log.Safe(e.Op()))) 2452 } 2453 2454 leftCols := e.Child(0).(RelExpr).Relational().OutputCols 2455 rightCols := e.Child(1).(RelExpr).Relational().OutputCols 2456 filters := e.Child(2).(*FiltersExpr) 2457 2458 // Remove ON conditions that are not equality conditions, 2459 on := ExtractJoinEqualityFilters(leftCols, rightCols, *filters) 2460 2461 switch t := e.(type) { 2462 // The number of rows processed for semi and anti joins is closer to the 2463 // number of output rows for an equivalent inner join. 2464 case *SemiJoinExpr: 2465 e = e.Memo().MemoizeInnerJoin(t.Left, t.Right, on, &t.JoinPrivate) 2466 case *SemiJoinApplyExpr: 2467 e = e.Memo().MemoizeInnerJoinApply(t.Left, t.Right, on, &t.JoinPrivate) 2468 case *AntiJoinExpr: 2469 e = e.Memo().MemoizeInnerJoin(t.Left, t.Right, on, &t.JoinPrivate) 2470 case *AntiJoinApplyExpr: 2471 e = e.Memo().MemoizeInnerJoinApply(t.Left, t.Right, on, &t.JoinPrivate) 2472 2473 default: 2474 if len(on) == len(*filters) { 2475 // No filters were removed. 2476 return e.Relational().Stats.RowCount 2477 } 2478 2479 switch t := e.(type) { 2480 case *InnerJoinExpr: 2481 e = e.Memo().MemoizeInnerJoin(t.Left, t.Right, on, &t.JoinPrivate) 2482 case *InnerJoinApplyExpr: 2483 e = e.Memo().MemoizeInnerJoinApply(t.Left, t.Right, on, &t.JoinPrivate) 2484 case *LeftJoinExpr: 2485 e = e.Memo().MemoizeLeftJoin(t.Left, t.Right, on, &t.JoinPrivate) 2486 case *LeftJoinApplyExpr: 2487 e = e.Memo().MemoizeLeftJoinApply(t.Left, t.Right, on, &t.JoinPrivate) 2488 case *RightJoinExpr: 2489 e = e.Memo().MemoizeRightJoin(t.Left, t.Right, on, &t.JoinPrivate) 2490 case *FullJoinExpr: 2491 e = e.Memo().MemoizeFullJoin(t.Left, t.Right, on, &t.JoinPrivate) 2492 default: 2493 panic(errors.AssertionFailedf("join type %v not handled", log.Safe(e.Op()))) 2494 } 2495 } 2496 return e.Relational().Stats.RowCount 2497 } 2498 } 2499 2500 func min(a, b float64) float64 { 2501 if a < b { 2502 return a 2503 } 2504 return b 2505 } 2506 2507 func max(a, b float64) float64 { 2508 if a > b { 2509 return a 2510 } 2511 return b 2512 } 2513 2514 // fraction returns a/b if a is less than b. Otherwise returns 1. 2515 func fraction(a, b float64) float64 { 2516 if a < b { 2517 return a / b 2518 } 2519 return 1 2520 } 2521 2522 ////////////////////////////////////////////////// 2523 // Helper functions for selectivity calculation // 2524 ////////////////////////////////////////////////// 2525 2526 const ( 2527 // This is the value used for inequality filters such as x < 1 in 2528 // "Access Path Selection in a Relational Database Management System" 2529 // by Pat Selinger et al. 2530 unknownFilterSelectivity = 1.0 / 3.0 2531 2532 // TODO(rytaft): Add other selectivities for other types of predicates. 2533 2534 // This is an arbitrary row count used in the absence of any real statistics. 2535 unknownRowCount = 1000 2536 2537 // This is the ratio of distinct column values to number of rows, which is 2538 // used in the absence of any real statistics for non-key columns. 2539 // TODO(rytaft): See if there is an industry standard value for this. 2540 unknownDistinctCountRatio = 0.1 2541 2542 // This is the ratio of null column values to number of rows for nullable 2543 // columns, which is used in the absence of any real statistics. 2544 unknownNullCountRatio = 0.01 2545 2546 // Use a small row count for generator functions; this allows use of lookup 2547 // join in cases like using json_array_elements with a small constant array. 2548 unknownGeneratorRowCount = 10 2549 2550 // Since the generator row count is so small, we need a larger distinct count 2551 // ratio for generator functions. 2552 unknownGeneratorDistinctCountRatio = 0.7 2553 2554 // When subtracting floating point numbers, avoid precision errors by making 2555 // sure the result is greater than or equal to epsilon. 2556 epsilon = 1e-10 2557 2558 // This is the minimum cardinality an expression should have in order to make 2559 // it worth adding the overhead of using a histogram. 2560 minCardinalityForHistogram = 100 2561 2562 // This is the default selectivity estimated for geospatial lookup joins 2563 // until we can get better statistics on inverted indexes and geospatial 2564 // columns. 2565 unknownGeoRelationshipSelectivity = 1.0 / 100.0 2566 2567 // multiColWeight is the weight to assign the selectivity calculation using 2568 // multi-column statistics versus the calculation using single-column 2569 // statistics. See the comment above selectivityFromMultiColDistinctCounts for 2570 // details. 2571 multiColWeight = 9.0 / 10.0 2572 ) 2573 2574 // countJSONPaths returns the number of JSON paths in the specified 2575 // FiltersItem. Used in the calculation of unapplied conjuncts in a JSON 2576 // Contains operator. Returns 0 if paths could not be counted for any 2577 // reason, such as malformed JSON. 2578 func countJSONPaths(conjunct *FiltersItem) int { 2579 rhs := conjunct.Condition.Child(1) 2580 if !CanExtractConstDatum(rhs) { 2581 return 0 2582 } 2583 rightDatum := ExtractConstDatum(rhs) 2584 if rightDatum == tree.DNull { 2585 return 0 2586 } 2587 rd, ok := rightDatum.(*tree.DJSON) 2588 if !ok { 2589 return 0 2590 } 2591 2592 // TODO(itsbilal): Replace this with a method that only counts paths 2593 // instead of generating a slice for all of them. 2594 paths, err := json.AllPaths(rd.JSON) 2595 if err != nil { 2596 return 0 2597 } 2598 return len(paths) 2599 } 2600 2601 // applyFilter uses constraints to update the distinct counts and histograms 2602 // for the constrained columns in the filter. The changes in the distinct 2603 // counts and histograms will be used later to determine the selectivity of 2604 // the filter. 2605 // 2606 // Some filters can be translated directly to distinct counts using the 2607 // constraint set. For example, the tight constraint `/a: [/1 - /1]` indicates 2608 // that column `a` has exactly one distinct value. Other filters, such as 2609 // `a % 2 = 0` may not have a tight constraint. In this case, it is not 2610 // possible to determine the distinct count for column `a`, so instead we 2611 // increment numUnappliedConjuncts, which will be used later for selectivity 2612 // calculation. See comments in applyConstraintSet and 2613 // updateDistinctCountsFromConstraint for more details about how distinct 2614 // counts are calculated from constraints. 2615 // 2616 // Equalities between two variables (e.g., var1=var2) are handled separately. 2617 // See applyEquivalencies and selectivityFromEquivalencies for details. 2618 // 2619 func (sb *statisticsBuilder) applyFilter( 2620 filters FiltersExpr, e RelExpr, relProps *props.Relational, 2621 ) (numUnappliedConjuncts float64, constrainedCols, histCols opt.ColSet) { 2622 if lookupJoin, ok := e.(*LookupJoinExpr); ok { 2623 // Special hack for lookup joins. Add constant filters from the equality 2624 // conditions. 2625 // TODO(rytaft): the correct way to do this is probably to fully implement 2626 // histograms in Project and Join expressions, and use them in 2627 // selectivityFromEquivalencies. See Issue #38082. 2628 filters = append(filters, lookupJoin.ConstFilters...) 2629 } 2630 2631 applyConjunct := func(conjunct *FiltersItem) { 2632 if isEqualityWithTwoVars(conjunct.Condition) { 2633 // We'll handle equalities later. 2634 return 2635 } 2636 2637 // Special case: The current conjunct is a JSON Contains operator. 2638 // If so, count every path to a leaf node in the RHS as a separate 2639 // conjunct. If for whatever reason we can't get to the JSON datum 2640 // or enumerate its paths, count the whole operator as one conjunct. 2641 if conjunct.Condition.Op() == opt.ContainsOp { 2642 numPaths := countJSONPaths(conjunct) 2643 if numPaths == 0 { 2644 numUnappliedConjuncts++ 2645 } else { 2646 // Multiply the number of paths by 2 to mimic the logic in 2647 // numConjunctsInConstraint, for constraints like 2648 // /1: [/'{"a":"b"}' - /'{"a":"b"}'] . That function counts 2649 // this as 2 conjuncts, and to keep row counts as consistent 2650 // as possible between competing filtered selects and 2651 // constrained scans, we apply the same logic here. 2652 numUnappliedConjuncts += 2 * float64(numPaths) 2653 } 2654 return 2655 } 2656 2657 // Update constrainedCols after the above check for isEqualityWithTwoVars. 2658 // We will use constrainedCols later to determine which columns to use for 2659 // selectivity calculation in selectivityFromMultiColDistinctCounts, and we 2660 // want to make sure that we don't include columns that were only present in 2661 // equality conjuncts such as var1=var2. The selectivity of these conjuncts 2662 // will be accounted for in selectivityFromEquivalencies. 2663 scalarProps := conjunct.ScalarProps() 2664 constrainedCols.UnionWith(scalarProps.OuterCols) 2665 if scalarProps.Constraints != nil { 2666 histColsLocal := sb.applyConstraintSet( 2667 scalarProps.Constraints, scalarProps.TightConstraints, e, relProps, 2668 ) 2669 histCols.UnionWith(histColsLocal) 2670 if !scalarProps.TightConstraints { 2671 numUnappliedConjuncts++ 2672 } 2673 } else { 2674 numUnappliedConjuncts++ 2675 } 2676 } 2677 2678 for i := range filters { 2679 applyConjunct(&filters[i]) 2680 } 2681 2682 return numUnappliedConjuncts, constrainedCols, histCols 2683 } 2684 2685 // applyIndexConstraint is used to update the distinct counts and histograms 2686 // for the constrained columns in an index constraint. Returns the set of 2687 // constrained columns and the set of columns with a filtered histogram. 2688 func (sb *statisticsBuilder) applyIndexConstraint( 2689 c *constraint.Constraint, e RelExpr, relProps *props.Relational, 2690 ) (constrainedCols, histCols opt.ColSet) { 2691 // If unconstrained, then no constraint could be derived from the expression, 2692 // so fall back to estimate. 2693 // If a contradiction, then optimizations must not be enabled (say for 2694 // testing), or else this would have been reduced. 2695 if c.IsUnconstrained() || c.IsContradiction() { 2696 return 2697 } 2698 2699 // Calculate distinct counts. 2700 applied, lastColMinDistinct := sb.updateDistinctCountsFromConstraint(c, e, relProps) 2701 2702 // Collect the set of constrained columns for which we were able to estimate 2703 // a distinct count, including the first column after the constraint prefix 2704 // (if applicable, add a distinct count estimate for that column using the 2705 // function updateDistinctCountFromUnappliedConjuncts). 2706 // 2707 // Note that the resulting set might not include *all* constrained columns. 2708 // For example, we cannot make any assumptions about the distinct count of 2709 // column b based on the constraints /a/b: [/1 - /5/6] or /a/b: [ - /5/6]. 2710 // TODO(rytaft): Consider treating remaining constrained columns as 2711 // "unapplied conjuncts" and account for their selectivity in 2712 // selectivityFromUnappliedConjuncts. 2713 prefix := c.Prefix(sb.evalCtx) 2714 for i, n := 0, c.ConstrainedColumns(sb.evalCtx); i < n && i <= prefix; i++ { 2715 col := c.Columns.Get(i).ID() 2716 constrainedCols.Add(col) 2717 if i < applied { 2718 continue 2719 } 2720 2721 // Unlike the constraints found in Select and Join filters, an index 2722 // constraint may represent multiple conjuncts. Therefore, we need to 2723 // calculate the number of unapplied conjuncts for each constrained column. 2724 numConjuncts := sb.numConjunctsInConstraint(c, i) 2725 2726 // Set the distinct count for the current column of the constraint 2727 // according to unknownDistinctCountRatio. 2728 var lowerBound float64 2729 if i == applied { 2730 lowerBound = lastColMinDistinct 2731 } 2732 sb.updateDistinctCountFromUnappliedConjuncts(col, e, relProps, numConjuncts, lowerBound) 2733 } 2734 2735 if !sb.shouldUseHistogram(relProps) { 2736 return constrainedCols, histCols 2737 } 2738 2739 // Calculate histograms. 2740 constrainedCols.ForEach(func(col opt.ColumnID) { 2741 colSet := opt.MakeColSet(col) 2742 inputStat, _ := sb.colStatFromInput(colSet, e) 2743 inputHist := inputStat.Histogram 2744 if inputHist != nil { 2745 if _, _, ok := inputHist.CanFilter(c); ok { 2746 s := &relProps.Stats 2747 if colStat, ok := s.ColStats.Lookup(colSet); ok { 2748 colStat.Histogram = inputHist.Filter(c) 2749 histCols.Add(col) 2750 sb.updateDistinctCountFromHistogram(colStat, inputStat.DistinctCount) 2751 } 2752 } 2753 } 2754 }) 2755 2756 return constrainedCols, histCols 2757 } 2758 2759 // applyConstraintSet is used to update the distinct counts and histograms 2760 // for the constrained columns in a constraint set. Returns the set of 2761 // columns with a filtered histogram. 2762 func (sb *statisticsBuilder) applyConstraintSet( 2763 cs *constraint.Set, tight bool, e RelExpr, relProps *props.Relational, 2764 ) (histCols opt.ColSet) { 2765 // If unconstrained, then no constraint could be derived from the expression, 2766 // so fall back to estimate. 2767 // If a contradiction, then optimizations must not be enabled (say for 2768 // testing), or else this would have been reduced. 2769 if cs.IsUnconstrained() || cs == constraint.Contradiction { 2770 return opt.ColSet{} 2771 } 2772 2773 s := &relProps.Stats 2774 for i := 0; i < cs.Length(); i++ { 2775 c := cs.Constraint(i) 2776 col := c.Columns.Get(0).ID() 2777 2778 // Calculate distinct counts. 2779 applied, lastColMinDistinct := sb.updateDistinctCountsFromConstraint(c, e, relProps) 2780 if applied == 0 { 2781 // If a constraint cannot be applied, it may represent an 2782 // inequality like x < 1. As a result, distinctCounts does not fully 2783 // represent the selectivity of the constraint set. 2784 // We return an estimate of the number of unapplied conjuncts to the 2785 // caller function to be used for selectivity calculation. 2786 numConjuncts := sb.numConjunctsInConstraint(c, 0 /* nth */) 2787 2788 // Set the distinct count for the first column of the constraint 2789 // according to unknownDistinctCountRatio. 2790 sb.updateDistinctCountFromUnappliedConjuncts(col, e, relProps, numConjuncts, lastColMinDistinct) 2791 } 2792 2793 if !tight { 2794 // TODO(rytaft): it may still be beneficial to calculate the histogram 2795 // even if the constraint is not tight, but don't bother for now. 2796 continue 2797 } 2798 2799 if !sb.shouldUseHistogram(relProps) { 2800 continue 2801 } 2802 2803 // Calculate histogram. 2804 cols := opt.MakeColSet(col) 2805 inputStat, _ := sb.colStatFromInput(cols, e) 2806 inputHist := inputStat.Histogram 2807 if inputHist != nil { 2808 if _, _, ok := inputHist.CanFilter(c); ok { 2809 if colStat, ok := s.ColStats.Lookup(cols); ok { 2810 colStat.Histogram = inputHist.Filter(c) 2811 histCols.UnionWith(cols) 2812 sb.updateDistinctCountFromHistogram(colStat, inputStat.DistinctCount) 2813 } 2814 } 2815 } 2816 } 2817 2818 return histCols 2819 } 2820 2821 // updateNullCountsFromProps zeroes null counts for columns that cannot 2822 // have nulls in them, usually due to a column property or an application. 2823 // of a null-excluding filter. The actual determination of non-nullable 2824 // columns is done in the logical props builder. 2825 // 2826 // For example, consider the following constraint sets: 2827 // /a/b/c: [/1 - /1/2/3] [/1/2/5 - /1/2/8] 2828 // /c: [/6 - /6] 2829 // 2830 // The first constraint set filters nulls out of column a, and the 2831 // second constraint set filters nulls out of column c. 2832 // 2833 func (sb *statisticsBuilder) updateNullCountsFromProps(e RelExpr, relProps *props.Relational) { 2834 s := &relProps.Stats 2835 relProps.NotNullCols.ForEach(func(col opt.ColumnID) { 2836 colSet := opt.MakeColSet(col) 2837 colStat, ok := s.ColStats.Lookup(colSet) 2838 if ok { 2839 colStat.NullCount = 0 2840 } 2841 }) 2842 } 2843 2844 // updateDistinctCountsFromConstraint updates the distinct count for each 2845 // column in a constraint that can be determined to have a finite number of 2846 // possible values. It returns the number of columns for which the distinct 2847 // count could be inferred from the constraint. If the same column appears 2848 // in multiple constraints, the distinct count is the minimum for that column 2849 // across all constraints. 2850 // 2851 // For example, consider the following constraint set: 2852 // 2853 // /a/b/c: [/1/2/3 - /1/2/3] [/1/2/5 - /1/2/8] 2854 // /c: [/6 - /6] 2855 // 2856 // After the first constraint is processed, s.ColStats contains the 2857 // following: 2858 // [a] -> { ... DistinctCount: 1 ... } 2859 // [b] -> { ... DistinctCount: 1 ... } 2860 // [c] -> { ... DistinctCount: 5 ... } 2861 // 2862 // After the second constraint is processed, column c is further constrained, 2863 // so s.ColStats contains the following: 2864 // [a] -> { ... DistinctCount: 1 ... } 2865 // [b] -> { ... DistinctCount: 1 ... } 2866 // [c] -> { ... DistinctCount: 1 ... } 2867 // 2868 // Note that updateDistinctCountsFromConstraint is pessimistic, and assumes 2869 // that there is at least one row for every possible value provided by the 2870 // constraint. For example, /a: [/1 - /1000000] would find a distinct count of 2871 // 1000000 for column "a" even if there are only 10 rows in the table. This 2872 // discrepancy must be resolved by the calling function. 2873 // 2874 // Even if the distinct count can not be inferred for a particular column, 2875 // It's possible that we can determine a lower bound. In particular, if the 2876 // query specifically mentions some exact values we should use that as a hint. 2877 // For example, consider the following constraint: 2878 // 2879 // /a: [ - 5][10 - 10][15 - 15] 2880 // 2881 // In this case, updateDistinctCountsFromConstraint will infer that there 2882 // are at least two distinct values (10 and 15). This lower bound will be 2883 // returned in the second return value, lastColMinDistinct. 2884 func (sb *statisticsBuilder) updateDistinctCountsFromConstraint( 2885 c *constraint.Constraint, e RelExpr, relProps *props.Relational, 2886 ) (applied int, lastColMinDistinct float64) { 2887 // All of the columns that are part of the prefix have a finite number of 2888 // distinct values. 2889 prefix := c.Prefix(sb.evalCtx) 2890 2891 // If there are any other columns beyond the prefix, we may be able to 2892 // determine the number of distinct values for the first one. For example: 2893 // /a/b/c: [/1/2/3 - /1/2/3] [/1/4/5 - /1/4/8] 2894 // -> Column a has DistinctCount = 1. 2895 // -> Column b has DistinctCount = 2. 2896 // -> Column c has DistinctCount = 5. 2897 for col := 0; col <= prefix; col++ { 2898 // All columns should have at least one distinct value. 2899 distinctCount := 1.0 2900 2901 var val tree.Datum 2902 countable := true 2903 for i := 0; i < c.Spans.Count(); i++ { 2904 sp := c.Spans.Get(i) 2905 if sp.StartKey().Length() <= col || sp.EndKey().Length() <= col { 2906 // We can't determine the distinct count for this column. For example, 2907 // the number of distinct values for column b in the constraint 2908 // /a/b: [/1/1 - /1] cannot be determined. 2909 countable = false 2910 continue 2911 } 2912 startVal := sp.StartKey().Value(col) 2913 endVal := sp.EndKey().Value(col) 2914 if startVal.Compare(sb.evalCtx, endVal) != 0 { 2915 var start, end float64 2916 if startVal.ResolvedType().Family() == types.IntFamily && 2917 endVal.ResolvedType().Family() == types.IntFamily { 2918 start = float64(*startVal.(*tree.DInt)) 2919 end = float64(*endVal.(*tree.DInt)) 2920 } else if startVal.ResolvedType().Family() == types.DateFamily && 2921 endVal.ResolvedType().Family() == types.DateFamily { 2922 startDate := startVal.(*tree.DDate) 2923 endDate := endVal.(*tree.DDate) 2924 if !startDate.IsFinite() || !endDate.IsFinite() { 2925 // One of the boundaries is not finite, so we can't determine the 2926 // distinct count for this column. 2927 countable = false 2928 continue 2929 } 2930 start = float64(startDate.PGEpochDays()) 2931 end = float64(endDate.PGEpochDays()) 2932 } else { 2933 // We can't determine the distinct count for this column. For example, 2934 // the number of distinct values in the constraint 2935 // /a: [/'cherry' - /'mango'] cannot be determined. 2936 countable = false 2937 continue 2938 } 2939 // We assume that both start and end boundaries are inclusive. This 2940 // should be the case for integer and date columns (due to 2941 // normalization by constraint.PreferInclusive). 2942 if c.Columns.Get(col).Ascending() { 2943 distinctCount += end - start 2944 } else { 2945 distinctCount += start - end 2946 } 2947 } 2948 if i != 0 && val != nil { 2949 compare := startVal.Compare(sb.evalCtx, val) 2950 ascending := c.Columns.Get(col).Ascending() 2951 if (compare > 0 && ascending) || (compare < 0 && !ascending) { 2952 // This check is needed to ensure that we calculate the correct distinct 2953 // value count for constraints such as: 2954 // /a/b: [/1/2 - /1/2] [/1/4 - /1/4] [/2 - /2] 2955 // We should only increment the distinct count for column "a" once we 2956 // reach the third span. 2957 distinctCount++ 2958 } else if compare != 0 { 2959 // This can happen if we have a prefix, but not an exact prefix. For 2960 // example: 2961 // /a/b: [/1/2 - /1/4] [/3/2 - /3/5] [/6/0 - /6/0] 2962 // In this case, /a is a prefix, but not an exact prefix. Trying to 2963 // figure out the distinct count for column b may be more trouble 2964 // than it's worth. For now, don't bother trying. 2965 countable = false 2966 continue 2967 } 2968 } 2969 val = endVal 2970 } 2971 2972 if !countable { 2973 // The last column was not fully applied since there was at least one 2974 // uncountable span. The calculated distinct count will be used as a 2975 // lower bound for updateDistinctCountFromUnappliedConjuncts. 2976 return applied, distinctCount 2977 } 2978 2979 colID := c.Columns.Get(col).ID() 2980 sb.ensureColStat(opt.MakeColSet(colID), distinctCount, e, relProps) 2981 applied = col + 1 2982 } 2983 2984 return applied, 0 2985 } 2986 2987 // updateDistinctCountFromUnappliedConjuncts is used to update the distinct 2988 // count for a constrained column when the exact count cannot be determined. 2989 // The provided lowerBound serves as a lower bound on the calculated distinct 2990 // count. 2991 func (sb *statisticsBuilder) updateDistinctCountFromUnappliedConjuncts( 2992 colID opt.ColumnID, e RelExpr, relProps *props.Relational, numConjuncts, lowerBound float64, 2993 ) { 2994 colSet := opt.MakeColSet(colID) 2995 inputStat, _ := sb.colStatFromInput(colSet, e) 2996 distinctCount := inputStat.DistinctCount * math.Pow(unknownFilterSelectivity, numConjuncts) 2997 distinctCount = max(distinctCount, lowerBound) 2998 sb.ensureColStat(colSet, distinctCount, e, relProps) 2999 } 3000 3001 // updateDistinctCountFromHistogram updates the distinct count for the given 3002 // column statistic based on the estimated number of distinct values in the 3003 // histogram. The updated count will be no larger than the provided value for 3004 // maxDistinctCount. 3005 // 3006 // This function should be called after updateDistinctCountsFromConstraint 3007 // and/or updateDistinctCountFromUnappliedConjuncts. 3008 func (sb *statisticsBuilder) updateDistinctCountFromHistogram( 3009 colStat *props.ColumnStatistic, maxDistinctCount float64, 3010 ) { 3011 distinct := colStat.Histogram.DistinctValuesCount() 3012 if distinct == 0 { 3013 // Make sure the count is non-zero. The stats may be stale, and we 3014 // can end up with weird and inefficient plans if we estimate 0 rows. 3015 distinct = min(1, colStat.DistinctCount) 3016 } 3017 3018 // The distinct count estimate from the histogram should always be more 3019 // accurate than the distinct count analysis performed in 3020 // updateDistinctCountsFromConstraint, so use the histogram estimate to 3021 // replace the distinct count value. 3022 colStat.DistinctCount = min(distinct, maxDistinctCount) 3023 3024 // The histogram does not include null values, so add 1 if there are any 3025 // null values. 3026 if colStat.NullCount > 0 { 3027 colStat.DistinctCount++ 3028 } 3029 } 3030 3031 func (sb *statisticsBuilder) applyEquivalencies( 3032 equivReps opt.ColSet, filterFD *props.FuncDepSet, e RelExpr, relProps *props.Relational, 3033 ) { 3034 equivReps.ForEach(func(i opt.ColumnID) { 3035 equivGroup := filterFD.ComputeEquivGroup(i) 3036 sb.updateDistinctNullCountsFromEquivalency(equivGroup, e, relProps) 3037 }) 3038 } 3039 3040 func (sb *statisticsBuilder) updateDistinctNullCountsFromEquivalency( 3041 equivGroup opt.ColSet, e RelExpr, relProps *props.Relational, 3042 ) { 3043 s := &relProps.Stats 3044 3045 // Find the minimum distinct and null counts for all columns in this equivalency 3046 // group. 3047 minDistinctCount := s.RowCount 3048 minNullCount := s.RowCount 3049 equivGroup.ForEach(func(i opt.ColumnID) { 3050 colSet := opt.MakeColSet(i) 3051 colStat, ok := s.ColStats.Lookup(colSet) 3052 if !ok { 3053 colStat, _ = sb.colStatFromInput(colSet, e) 3054 colStat = sb.copyColStat(colSet, s, colStat) 3055 if colStat.NullCount > 0 && colSet.Intersects(relProps.NotNullCols) { 3056 colStat.NullCount = 0 3057 colStat.DistinctCount = max(colStat.DistinctCount-1, epsilon) 3058 } 3059 } 3060 if colStat.DistinctCount < minDistinctCount { 3061 minDistinctCount = colStat.DistinctCount 3062 } 3063 if colStat.NullCount < minNullCount { 3064 minNullCount = colStat.NullCount 3065 } 3066 }) 3067 3068 // Set the distinct and null counts to the minimum for all columns in this 3069 // equivalency group. 3070 equivGroup.ForEach(func(i opt.ColumnID) { 3071 colStat, _ := s.ColStats.Lookup(opt.MakeColSet(i)) 3072 colStat.DistinctCount = minDistinctCount 3073 colStat.NullCount = minNullCount 3074 }) 3075 } 3076 3077 // selectivityFromMultiColDistinctCounts calculates the selectivity of a filter 3078 // by using estimated distinct counts of each constrained column before 3079 // and after the filter was applied. We can perform this calculation in 3080 // two different ways: (1) by treating the columns as completely independent, 3081 // or (2) by assuming they are correlated. 3082 // 3083 // (1) Assuming independence between columns, we can calculate the selectivity 3084 // by taking the product of selectivities of each constrained column. In 3085 // the general case, this can be represented by the formula: 3086 // 3087 // ┬-┬ ⎛ new_distinct(i) ⎞ 3088 // selectivity = │ │ ⎜ --------------- ⎟ 3089 // ┴ ┴ ⎝ old_distinct(i) ⎠ 3090 // i in 3091 // {constrained 3092 // columns} 3093 // 3094 // (2) If we instead assume there is some correlation between columns, we 3095 // calculate the selectivity using multi-column statistics. 3096 // 3097 // ⎛ new_distinct({constrained columns}) ⎞ 3098 // selectivity = ⎜ ----------------------------------- ⎟ 3099 // ⎝ old_distinct({constrained columns}) ⎠ 3100 // 3101 // This formula looks simple, but the challenge is that it is difficult 3102 // to determine the correct value for new_distinct({constrained columns}) 3103 // if each column is not constrained to a single value. For example, if 3104 // new_distinct(x)=2 and new_distinct(y)=2, new_distinct({x,y}) could be 2, 3105 // 3 or 4. We estimate the new distinct count as follows, using the concept 3106 // of "soft functional dependency (FD) strength" as defined in [1]: 3107 // 3108 // new_distinct({x,y}) = min_value + range * (1 - FD_strength_scaled) 3109 // 3110 // where 3111 // 3112 // min_value = max(new_distinct(x), new_distinct(y)) 3113 // max_value = new_distinct(x) * new_distinct(y) 3114 // range = max_value - min_value 3115 // 3116 // ⎛ max(old_distinct(x),old_distinct(y)) ⎞ 3117 // FD_strength = ⎜ ------------------------------------ ⎟ 3118 // ⎝ old_distinct({x,y}) ⎠ 3119 // 3120 // ⎛ max(old_distinct(x), old_distinct(y)) ⎞ 3121 // min_FD_strength = ⎜ ------------------------------------- ⎟ 3122 // ⎝ old_distinct(x) * old_distinct(y) ⎠ 3123 // 3124 // ⎛ FD_strength - min_FD_strength ⎞ // scales FD_strength 3125 // FD_strength_scaled = ⎜ ----------------------------- ⎟ // to be between 3126 // ⎝ 1 - min_FD_strength ⎠ // 0 and 1 3127 // 3128 // Suppose that old_distinct(x)=100 and old_distinct(y)=10. If x and y are 3129 // perfectly correlated, old_distinct({x,y})=100. Using the example from 3130 // above, new_distinct(x)=2 and new_distinct(y)=2. Plugging in the values 3131 // into the equation, we get: 3132 // 3133 // FD_strength_scaled = 1 3134 // new_distinct({x,y}) = 2 + (4 - 2) * (1 - 1) = 2 3135 // 3136 // If x and y are completely independent, however, old_distinct({x,y})=1000. 3137 // In this case, we get: 3138 // 3139 // FD_strength_scaled = 0 3140 // new_distinct({x,y}) = 2 + (4 - 2) * (1 - 0) = 4 3141 // 3142 // Note that even if we calculate the selectivity based on equation (2) above, 3143 // we still want to take equation (1) into account. This is because it is 3144 // possible that there are two predicates that each have selectivity s, but the 3145 // multi-column selectivity is also s. In order to ensure that the cost model 3146 // considers the two predicates combined to be more selective than either one 3147 // individually, we must give some weight to equation (1). Therefore, instead 3148 // of equation (2) we actually return the following selectivity: 3149 // 3150 // selectivity = (1 - w) * (equation 1) + w * (equation 2) 3151 // 3152 // where w is the constant multiColWeight. 3153 // 3154 // This selectivity will be used later to update the row count and the 3155 // distinct count for the unconstrained columns. 3156 // 3157 // [1] Ilyas, Ihab F., et al. "CORDS: automatic discovery of correlations and 3158 // soft functional dependencies." SIGMOD 2004. 3159 // 3160 func (sb *statisticsBuilder) selectivityFromMultiColDistinctCounts( 3161 cols opt.ColSet, e RelExpr, s *props.Statistics, 3162 ) (selectivity float64) { 3163 // Respect the session setting OptimizerUseMultiColStats. 3164 if !sb.evalCtx.SessionData.OptimizerUseMultiColStats { 3165 return sb.selectivityFromSingleColDistinctCounts(cols, e, s) 3166 } 3167 3168 // Make a copy of cols so we can remove columns that are not constrained. 3169 multiColSet := cols.Copy() 3170 3171 // First calculate the selectivity from equation (1) (see function comment), 3172 // and collect the inputs to equation (2). 3173 singleColSelectivity := 1.0 3174 newDistinctProduct, oldDistinctProduct := 1.0, 1.0 3175 maxNewDistinct, maxOldDistinct := float64(0), float64(0) 3176 multiColNullCount := -1.0 3177 minLocalSel := math.MaxFloat64 3178 for col, ok := cols.Next(0); ok; col, ok = cols.Next(col + 1) { 3179 colStat, ok := s.ColStats.Lookup(opt.MakeColSet(col)) 3180 if !ok { 3181 multiColSet.Remove(col) 3182 continue 3183 } 3184 3185 inputColStat, inputStats := sb.colStatFromInput(colStat.Cols, e) 3186 localSel := sb.selectivityFromDistinctCount(colStat, inputColStat, inputStats.RowCount) 3187 singleColSelectivity *= localSel 3188 3189 // Don't bother including columns in the multi-column calculation that 3190 // don't contribute to the selectivity. 3191 if localSel == 1 { 3192 multiColSet.Remove(col) 3193 continue 3194 } 3195 3196 // Calculate values needed for the multi-column stats calculation below. 3197 newDistinctProduct *= colStat.DistinctCount 3198 oldDistinctProduct *= inputColStat.DistinctCount 3199 if colStat.DistinctCount > maxNewDistinct { 3200 maxNewDistinct = colStat.DistinctCount 3201 } 3202 if inputColStat.DistinctCount > maxOldDistinct { 3203 maxOldDistinct = inputColStat.DistinctCount 3204 } 3205 if localSel < minLocalSel { 3206 minLocalSel = localSel 3207 } 3208 if multiColNullCount < 0 { 3209 multiColNullCount = inputStats.RowCount 3210 } 3211 // Multiply by the expected chance of collisions with nulls already 3212 // collected. 3213 multiColNullCount *= colStat.NullCount / inputStats.RowCount 3214 } 3215 3216 // If we don't need to use a multi-column statistic, we're done. 3217 if multiColSet.Len() <= 1 { 3218 return singleColSelectivity 3219 } 3220 3221 // Otherwise, calculate the selectivity using multi-column stats from 3222 // equation (2). See the comment above the function definition for details 3223 // about the formula. 3224 inputColStat, inputStats := sb.colStatFromInput(multiColSet, e) 3225 fdStrength := min(maxOldDistinct/inputColStat.DistinctCount, 1.0) 3226 maxMutiColOldDistinct := min(oldDistinctProduct, inputStats.RowCount) 3227 minFdStrength := min(maxOldDistinct/maxMutiColOldDistinct, fdStrength) 3228 if minFdStrength < 1 { 3229 // Scale the fdStrength so it ranges between 0 and 1. 3230 fdStrength = (fdStrength - minFdStrength) / (1 - minFdStrength) 3231 } 3232 distinctCountRange := max(newDistinctProduct-maxNewDistinct, 0) 3233 3234 colStat, _ := s.ColStats.Add(multiColSet) 3235 colStat.DistinctCount = maxNewDistinct + distinctCountRange*(1-fdStrength) 3236 colStat.NullCount = multiColNullCount 3237 multiColSelectivity := sb.selectivityFromDistinctCount(colStat, inputColStat, inputStats.RowCount) 3238 3239 // Now, we must adjust multiColSelectivity so that it is not greater than 3240 // the selectivity of any subset of the columns in multiColSet. This would 3241 // be internally inconsistent and could lead to bad plans. For example, 3242 // x=1 AND y=1 should always be considered more selective (i.e., with lower 3243 // selectivity) than x=1 alone. 3244 // 3245 // We have already found the minimum selectivity of all the individual 3246 // columns (subsets of size 1) above and stored it in minLocalSel. It's not 3247 // practical, however, to calculate the minimum selectivity for all subsets 3248 // larger than size 1. 3249 // 3250 // Instead, we focus on a specific case known to occasionally have this 3251 // problem: when multiColSet contains 3 or more columns and at least one has 3252 // distinct count greater than 1, the subset of columns that have distinct 3253 // count less than or equal to 1 may have a smaller selectivity according to 3254 // equation (2). 3255 // 3256 // In this case, update minLocalSel and adjust multiColSelectivity as needed. 3257 // 3258 if maxNewDistinct > 1 && multiColSet.Len() > 2 { 3259 var lowDistinctCountCols opt.ColSet 3260 multiColSet.ForEach(func(col opt.ColumnID) { 3261 // We already know the column stat exists if it's in multiColSet. 3262 colStat, _ := s.ColStats.Lookup(opt.MakeColSet(col)) 3263 if colStat.DistinctCount <= 1 { 3264 lowDistinctCountCols.Add(col) 3265 } 3266 }) 3267 3268 if lowDistinctCountCols.Len() > 1 { 3269 selLowDistinctCountCols := sb.selectivityFromMultiColDistinctCounts( 3270 lowDistinctCountCols, e, s, 3271 ) 3272 if selLowDistinctCountCols < minLocalSel { 3273 minLocalSel = selLowDistinctCountCols 3274 } 3275 } 3276 } 3277 multiColSelectivity = min(multiColSelectivity, minLocalSel) 3278 3279 // As described in the function comment, we actually return a weighted sum 3280 // of multi-column and single-column selectivity estimates. 3281 return (1-multiColWeight)*singleColSelectivity + multiColWeight*multiColSelectivity 3282 } 3283 3284 // selectivityFromSingleColDistinctCounts calculates the selectivity of a 3285 // filter by using estimated distinct counts of each constrained column before 3286 // and after the filter was applied. It assumes independence between columns, 3287 // so it uses equation (1) from selectivityFromMultiColDistinctCounts. See the 3288 // comment above that function for details. 3289 func (sb *statisticsBuilder) selectivityFromSingleColDistinctCounts( 3290 cols opt.ColSet, e RelExpr, s *props.Statistics, 3291 ) (selectivity float64) { 3292 selectivity = 1.0 3293 for col, ok := cols.Next(0); ok; col, ok = cols.Next(col + 1) { 3294 colStat, ok := s.ColStats.Lookup(opt.MakeColSet(col)) 3295 if !ok { 3296 continue 3297 } 3298 3299 inputColStat, inputStats := sb.colStatFromInput(colStat.Cols, e) 3300 selectivity *= sb.selectivityFromDistinctCount(colStat, inputColStat, inputStats.RowCount) 3301 } 3302 3303 return selectivity 3304 } 3305 3306 // selectivityFromDistinctCount calculates the selectivity of a filter by using 3307 // the estimated distinct count of a single constrained column or set of 3308 // columns before and after the filter was applied. 3309 func (sb *statisticsBuilder) selectivityFromDistinctCount( 3310 colStat, inputColStat *props.ColumnStatistic, inputRowCount float64, 3311 ) float64 { 3312 newDistinct := colStat.DistinctCount 3313 oldDistinct := inputColStat.DistinctCount 3314 3315 // Nulls are included in the distinct count, so remove 1 from the 3316 // distinct counts if needed. 3317 if inputColStat.NullCount > 0 { 3318 oldDistinct = max(oldDistinct-1, 0) 3319 } 3320 if colStat.NullCount > 0 { 3321 newDistinct = max(newDistinct-1, 0) 3322 } 3323 3324 // Calculate the selectivity of the predicate. 3325 nonNullSelectivity := fraction(newDistinct, oldDistinct) 3326 nullSelectivity := fraction(colStat.NullCount, inputColStat.NullCount) 3327 return sb.predicateSelectivity( 3328 nonNullSelectivity, nullSelectivity, inputColStat.NullCount, inputRowCount, 3329 ) 3330 } 3331 3332 // selectivityFromHistograms is similar to selectivityFromSingleColDistinctCounts, 3333 // in that it calculates the selectivity of a filter by taking the product of 3334 // selectivities of each constrained column. 3335 // 3336 // For histograms, the selectivity of a constrained column is calculated as 3337 // (# values in histogram after filter) / (# values in histogram before filter). 3338 func (sb *statisticsBuilder) selectivityFromHistograms( 3339 cols opt.ColSet, e RelExpr, s *props.Statistics, 3340 ) (selectivity float64) { 3341 selectivity = 1.0 3342 for col, ok := cols.Next(0); ok; col, ok = cols.Next(col + 1) { 3343 colStat, ok := s.ColStats.Lookup(opt.MakeColSet(col)) 3344 if !ok { 3345 continue 3346 } 3347 3348 inputColStat, inputStats := sb.colStatFromInput(colStat.Cols, e) 3349 newHist := colStat.Histogram 3350 oldHist := inputColStat.Histogram 3351 if newHist == nil || oldHist == nil { 3352 continue 3353 } 3354 3355 newCount := newHist.ValuesCount() 3356 oldCount := oldHist.ValuesCount() 3357 3358 // Calculate the selectivity of the predicate. 3359 nonNullSelectivity := fraction(newCount, oldCount) 3360 nullSelectivity := fraction(colStat.NullCount, inputColStat.NullCount) 3361 selectivity *= sb.predicateSelectivity( 3362 nonNullSelectivity, nullSelectivity, inputColStat.NullCount, inputStats.RowCount, 3363 ) 3364 } 3365 return selectivity 3366 } 3367 3368 // selectivityFromNullsRemoved calculates the selectivity from null-rejecting 3369 // filters that were not already accounted for in selectivityFromMultiColDistinctCounts 3370 // or selectivityFromHistograms. The columns for filters already accounted for 3371 // should be designated by ignoreCols. 3372 func (sb *statisticsBuilder) selectivityFromNullsRemoved( 3373 e RelExpr, relProps *props.Relational, ignoreCols opt.ColSet, 3374 ) (selectivity float64) { 3375 selectivity = 1.0 3376 relProps.NotNullCols.ForEach(func(col opt.ColumnID) { 3377 if !ignoreCols.Contains(col) { 3378 inputColStat, inputStats := sb.colStatFromInput(opt.MakeColSet(col), e) 3379 selectivity *= sb.predicateSelectivity( 3380 1, /* nonNullSelectivity */ 3381 0, /* nullSelectivity */ 3382 inputColStat.NullCount, 3383 inputStats.RowCount, 3384 ) 3385 } 3386 }) 3387 3388 return selectivity 3389 } 3390 3391 // predicateSelectivity calculates the selectivity of a predicate, using the 3392 // following formula: 3393 // 3394 // sel = (output row count) / (input row count) 3395 // 3396 // where 3397 // 3398 // output row count = 3399 // (fraction of non-null values preserved) * (number of non-null input rows) + 3400 // (fraction of null values preserved) * (number of null input rows) 3401 // 3402 func (sb *statisticsBuilder) predicateSelectivity( 3403 nonNullSelectivity, nullSelectivity, inputNullCount, inputRowCount float64, 3404 ) float64 { 3405 outRowCount := nonNullSelectivity*(inputRowCount-inputNullCount) + nullSelectivity*inputNullCount 3406 sel := outRowCount / inputRowCount 3407 3408 // Avoid setting selectivity to 0. The stats may be stale, and we 3409 // can end up with weird and inefficient plans if we estimate 0 rows. 3410 return max(sel, epsilon) 3411 } 3412 3413 // selectivityFromEquivalencies determines the selectivity of equality 3414 // constraints. It must be called before applyEquivalencies. 3415 func (sb *statisticsBuilder) selectivityFromEquivalencies( 3416 equivReps opt.ColSet, filterFD *props.FuncDepSet, e RelExpr, s *props.Statistics, 3417 ) (selectivity float64) { 3418 selectivity = 1.0 3419 equivReps.ForEach(func(i opt.ColumnID) { 3420 equivGroup := filterFD.ComputeEquivGroup(i) 3421 selectivity *= sb.selectivityFromEquivalency(equivGroup, e, s) 3422 }) 3423 return selectivity 3424 } 3425 3426 func (sb *statisticsBuilder) selectivityFromEquivalency( 3427 equivGroup opt.ColSet, e RelExpr, s *props.Statistics, 3428 ) (selectivity float64) { 3429 // Find the maximum input distinct count for all columns in this equivalency 3430 // group. 3431 maxDistinctCount := float64(0) 3432 equivGroup.ForEach(func(i opt.ColumnID) { 3433 // If any of the distinct counts were updated by the filter, we want to use 3434 // the updated value. 3435 colSet := opt.MakeColSet(i) 3436 colStat, ok := s.ColStats.Lookup(colSet) 3437 if !ok { 3438 colStat, _ = sb.colStatFromInput(colSet, e) 3439 } 3440 if maxDistinctCount < colStat.DistinctCount { 3441 maxDistinctCount = colStat.DistinctCount 3442 } 3443 }) 3444 if maxDistinctCount > s.RowCount { 3445 maxDistinctCount = s.RowCount 3446 } 3447 3448 // The selectivity of an equality condition var1=var2 is 3449 // 1/max(distinct(var1), distinct(var2)). 3450 return fraction(1, maxDistinctCount) 3451 } 3452 3453 // selectivityFromEquivalenciesSemiJoin determines the selectivity of equality 3454 // constraints on a semi join. It must be called before applyEquivalencies. 3455 func (sb *statisticsBuilder) selectivityFromEquivalenciesSemiJoin( 3456 equivReps, leftOutputCols, rightOutputCols opt.ColSet, 3457 filterFD *props.FuncDepSet, 3458 e RelExpr, 3459 s *props.Statistics, 3460 ) (selectivity float64) { 3461 selectivity = 1.0 3462 equivReps.ForEach(func(i opt.ColumnID) { 3463 equivGroup := filterFD.ComputeEquivGroup(i) 3464 selectivity *= sb.selectivityFromEquivalencySemiJoin( 3465 equivGroup, leftOutputCols, rightOutputCols, e, s, 3466 ) 3467 }) 3468 return selectivity 3469 } 3470 3471 func (sb *statisticsBuilder) selectivityFromEquivalencySemiJoin( 3472 equivGroup, leftOutputCols, rightOutputCols opt.ColSet, e RelExpr, s *props.Statistics, 3473 ) (selectivity float64) { 3474 // Find the minimum (maximum) input distinct count for all columns in this 3475 // equivalency group from the right (left). 3476 minDistinctCountRight := math.MaxFloat64 3477 maxDistinctCountLeft := float64(0) 3478 equivGroup.ForEach(func(i opt.ColumnID) { 3479 // If any of the distinct counts were updated by the filter, we want to use 3480 // the updated value. 3481 colSet := opt.MakeColSet(i) 3482 colStat, ok := s.ColStats.Lookup(colSet) 3483 if !ok { 3484 colStat, _ = sb.colStatFromInput(colSet, e) 3485 } 3486 if leftOutputCols.Contains(i) { 3487 if maxDistinctCountLeft < colStat.DistinctCount { 3488 maxDistinctCountLeft = colStat.DistinctCount 3489 } 3490 } else if rightOutputCols.Contains(i) { 3491 if minDistinctCountRight > colStat.DistinctCount { 3492 minDistinctCountRight = colStat.DistinctCount 3493 } 3494 } 3495 }) 3496 if maxDistinctCountLeft > s.RowCount { 3497 maxDistinctCountLeft = s.RowCount 3498 } 3499 3500 return fraction(minDistinctCountRight, maxDistinctCountLeft) 3501 } 3502 3503 func (sb *statisticsBuilder) selectivityFromGeoRelationship( 3504 e RelExpr, s *props.Statistics, 3505 ) (selectivity float64) { 3506 return unknownGeoRelationshipSelectivity 3507 } 3508 3509 func (sb *statisticsBuilder) selectivityFromUnappliedConjuncts( 3510 numUnappliedConjuncts float64, 3511 ) (selectivity float64) { 3512 return math.Pow(unknownFilterSelectivity, numUnappliedConjuncts) 3513 } 3514 3515 // tryReduceCols is used to determine which columns to use for selectivity 3516 // calculation. 3517 // 3518 // When columns in the colStats are functionally determined by other columns, 3519 // and the determinant columns each have distinctCount = 1, we should consider 3520 // the implied correlations for selectivity calculation. Consider the query: 3521 // 3522 // SELECT * FROM customer WHERE id = 123 and name = 'John Smith' 3523 // 3524 // If id is the primary key of customer, then name is functionally determined 3525 // by id. We only need to consider the selectivity of id, not name, since id 3526 // and name are fully correlated. To determine if we have a case such as this 3527 // one, we functionally reduce the set of columns which have column statistics, 3528 // eliminating columns that can be functionally determined by other columns. 3529 // If the distinct count on all of these reduced columns is one, then we return 3530 // this reduced column set to be used for selectivity calculation. 3531 // 3532 func (sb *statisticsBuilder) tryReduceCols( 3533 cols opt.ColSet, s *props.Statistics, fd *props.FuncDepSet, 3534 ) opt.ColSet { 3535 reducedCols := fd.ReduceCols(cols) 3536 if reducedCols.Empty() { 3537 // There are no reduced columns so we return the original column set. 3538 return cols 3539 } 3540 3541 for i, ok := reducedCols.Next(0); ok; i, ok = reducedCols.Next(i + 1) { 3542 colStat, ok := s.ColStats.Lookup(opt.MakeColSet(i)) 3543 if !ok || colStat.DistinctCount != 1 { 3544 // The reduced columns are not all constant, so return the original 3545 // column set. 3546 return cols 3547 } 3548 } 3549 3550 return reducedCols 3551 } 3552 3553 // tryReduceJoinCols is used to determine which columns to use for join ON 3554 // condition selectivity calculation. See tryReduceCols. 3555 func (sb *statisticsBuilder) tryReduceJoinCols( 3556 cols opt.ColSet, 3557 s *props.Statistics, 3558 leftCols, rightCols opt.ColSet, 3559 leftFD, rightFD *props.FuncDepSet, 3560 ) opt.ColSet { 3561 leftCols = sb.tryReduceCols(leftCols.Intersection(cols), s, leftFD) 3562 rightCols = sb.tryReduceCols(rightCols.Intersection(cols), s, rightFD) 3563 return leftCols.Union(rightCols) 3564 } 3565 3566 func isEqualityWithTwoVars(cond opt.ScalarExpr) bool { 3567 if eq, ok := cond.(*EqExpr); ok { 3568 return eq.Left.Op() == opt.VariableOp && eq.Right.Op() == opt.VariableOp 3569 } 3570 return false 3571 } 3572 3573 // numConjunctsInConstraint returns a rough estimate of the number of conjuncts 3574 // used to build the given constraint for the column at position nth. 3575 func (sb *statisticsBuilder) numConjunctsInConstraint( 3576 c *constraint.Constraint, nth int, 3577 ) (numConjuncts float64) { 3578 if c.Spans.Count() == 0 { 3579 return 0 /* numConjuncts */ 3580 } 3581 3582 numConjuncts = math.MaxFloat64 3583 for i := 0; i < c.Spans.Count(); i++ { 3584 span := c.Spans.Get(i) 3585 numSpanConjuncts := float64(0) 3586 if span.StartKey().Length() > nth { 3587 // Cases of NULL in a constraint should be ignored. For example, 3588 // without knowledge of the data distribution, /a: (/NULL - /10] should 3589 // have the same estimated selectivity as /a: [/10 - ]. Selectivity 3590 // of NULL constraints is handled in selectivityFromMultiColDistinctCounts, 3591 // selectivityFromHistograms, and selectivityFromNullsRemoved. 3592 if c.Columns.Get(nth).Descending() || 3593 span.StartKey().Value(nth) != tree.DNull { 3594 numSpanConjuncts++ 3595 } 3596 } 3597 if span.EndKey().Length() > nth { 3598 // Ignore cases of NULL in constraints. (see above comment). 3599 if !c.Columns.Get(nth).Descending() || 3600 span.EndKey().Value(nth) != tree.DNull { 3601 numSpanConjuncts++ 3602 } 3603 } 3604 if numSpanConjuncts < numConjuncts { 3605 numConjuncts = numSpanConjuncts 3606 } 3607 } 3608 3609 return numConjuncts 3610 } 3611 3612 // RequestColStat causes a column statistic to be calculated on the relational 3613 // expression. This is used for testing. 3614 func RequestColStat(evalCtx *tree.EvalContext, e RelExpr, cols opt.ColSet) { 3615 var sb statisticsBuilder 3616 sb.init(evalCtx, e.Memo().Metadata()) 3617 sb.colStat(cols, e) 3618 }