github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/opt/memo/statistics_builder.go

github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/opt/memo/statistics_builder.go (about)

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package memo
    12  
    13  import (
    14  	"math"
    15  	"reflect"
    16  
    17  	"github.com/cockroachdb/cockroach/pkg/sql/opt"
    18  	"github.com/cockroachdb/cockroach/pkg/sql/opt/constraint"
    19  	"github.com/cockroachdb/cockroach/pkg/sql/opt/props"
    20  	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
    21  	"github.com/cockroachdb/cockroach/pkg/sql/types"
    22  	"github.com/cockroachdb/cockroach/pkg/util/json"
    23  	"github.com/cockroachdb/cockroach/pkg/util/log"
    24  	"github.com/cockroachdb/errors"
    25  )
    26  
    27  var statsAnnID = opt.NewTableAnnID()
    28  
    29  // statisticsBuilder is responsible for building the statistics that are
    30  // used by the coster to estimate the cost of expressions.
    31  //
    32  // Background
    33  // ----------
    34  //
    35  // Conceptually, there are two kinds of statistics: table statistics and
    36  // relational expression statistics.
    37  //
    38  // 1. Table statistics
    39  //
    40  // Table statistics are stats derived from the underlying data in the
    41  // database. These stats are calculated either automatically or on-demand for
    42  // each table, and include the number of rows in the table as well as
    43  // statistics about selected individual columns or sets of columns. The column
    44  // statistics include the number of null values, the number of distinct values,
    45  // and optionally, a histogram of the data distribution (only applicable for
    46  // single columns, not sets of columns). These stats are only collected
    47  // periodically to avoid overloading the database, so they may be stale. They
    48  // are currently persisted in the system.table_statistics table (see sql/stats
    49  // for details). Inside the optimizer, they are cached in a props.Statistics
    50  // object as a table annotation in opt.Metadata.
    51  //
    52  // 2. Relational expression statistics
    53  //
    54  // Relational expression statistics are derived from table statistics, and are
    55  // only valid for a particular memo group. They are used to estimate how the
    56  // underlying table statistics change as different relational operators are
    57  // applied. The same types of statistics are stored for relational expressions
    58  // as for tables (row count, null count, distinct count, etc.). Inside the
    59  // optimizer, they are stored in a props.Statistics object in the logical
    60  // properties of the relational expression's memo group.
    61  //
    62  // For example, here is a query plan with corresponding estimated statistics at
    63  // each level:
    64  //
    65  //        Query:    SELECT y FROM a WHERE x=1
    66  //
    67  //        Plan:            Project y        Row Count: 10, Distinct(x): 1
    68  //                             |
    69  //                         Select x=1       Row Count: 10, Distinct(x): 1
    70  //                             |
    71  //                          Scan a          Row Count: 100, Distinct(x): 10
    72  //
    73  // The statistics for the Scan operator were presumably retrieved from the
    74  // underlying table statistics cached in the metadata. The statistics for
    75  // the Select operator are determined as follows: Since the predicate x=1
    76  // reduces the number of distinct values of x down to 1, and the previous
    77  // distinct count of x was 10, the selectivity of the predicate is 1/10.
    78  // Thus, the estimated number of output rows is 1/10 * 100 = 10. Finally, the
    79  // Project operator passes through the statistics from its child expression.
    80  //
    81  // Statistics for expressions high up in the query tree tend to be quite
    82  // inaccurate since the estimation errors from lower expressions are
    83  // compounded. Still, statistics are useful throughout the query tree to help
    84  // the optimizer choose between multiple alternative, logically equivalent
    85  // plans.
    86  //
    87  // How statisticsBuilder works
    88  // ---------------------------
    89  //
    90  // statisticsBuilder is responsible for building the second type of statistics,
    91  // relational expression statistics. It builds the statistics lazily, and only
    92  // calculates column statistics if needed to estimate the row count of an
    93  // expression (currently, the row count is the only statistic used by the
    94  // coster).
    95  //
    96  // Every relational operator has a buildXXX and a colStatXXX function. For
    97  // example, Scan has buildScan and colStatScan. buildScan is called when the
    98  // logical properties of a Scan expression are built. The goal of each buildXXX
    99  // function is to calculate the number of rows output by the expression so that
   100  // its cost can be estimated by the coster.
   101  //
   102  // In order to determine the row count, column statistics may be required for a
   103  // subset of the columns of the expression. Column statistics are calculated
   104  // recursively from the child expression(s) via calls to the colStatFromInput
   105  // function. colStatFromInput finds the child expression that might contain the
   106  // requested stats, and calls colStat on the child. colStat checks if the
   107  // requested stats are already cached for the child expression, and if not,
   108  // calls colStatXXX (where the XXX corresponds to the operator of the child
   109  // expression). The child expression may need to calculate column statistics
   110  // from its children, and if so, it makes another recursive call to
   111  // colStatFromInput.
   112  //
   113  // The "base case" for colStatFromInput is a Scan, where the "input" is the raw
   114  // table itself; the table statistics are retrieved from the metadata (the
   115  // metadata may in turn need to fetch the stats from the database if they are
   116  // not already cached). If a particular table statistic is not available, a
   117  // best-effort guess is made (see colStatLeaf for details).
   118  //
   119  // To better understand how the statisticsBuilder works, let us consider this
   120  // simple query, which consists of a scan followed by an aggregation:
   121  //
   122  //   SELECT count(*), x, y FROM t GROUP BY x, y
   123  //
   124  // The statistics for the scan of t will be calculated first, since logical
   125  // properties are built bottom-up. The estimated row count is retrieved from
   126  // the table statistics in the metadata, so no column statistics are needed.
   127  //
   128  // The statistics for the group by operator are calculated second. The row
   129  // count for GROUP BY can be determined by the distinct count of its grouping
   130  // columns. Therefore, the statisticsBuilder recursively updates the statistics
   131  // for the scan operator to include column stats for x and y, and then uses
   132  // these column stats to update the statistics for GROUP BY.
   133  //
   134  // At each stage where column statistics are requested, the statisticsBuilder
   135  // makes a call to colStatFromChild, which in turn calls colStat on the child
   136  // to retrieve the cached statistics or calculate them recursively. Assuming
   137  // that no statistics are cached, this is the order of function calls for the
   138  // above example (somewhat simplified):
   139  //
   140  //        +-------------+               +--------------+
   141  //  1.    | buildScan t |           2.  | buildGroupBy |
   142  //        +-------------+               +--------------+
   143  //               |                             |
   144  //     +-----------------------+   +-------------------------+
   145  //     | makeTableStatistics t |   | colStatFromChild (x, y) |
   146  //     +-----------------------+   +-------------------------+
   147  //                                             |
   148  //                                   +--------------------+
   149  //                                   | colStatScan (x, y) |
   150  //                                   +--------------------+
   151  //                                             |
   152  //                                   +---------------------+
   153  //                                   | colStatTable (x, y) |
   154  //                                   +---------------------+
   155  //                                             |
   156  //                                   +--------------------+
   157  //                                   | colStatLeaf (x, y) |
   158  //                                   +--------------------+
   159  //
   160  // See props/statistics.go for more details.
   161  type statisticsBuilder struct {
   162  	evalCtx *tree.EvalContext
   163  	md      *opt.Metadata
   164  }
   165  
   166  func (sb *statisticsBuilder) init(evalCtx *tree.EvalContext, md *opt.Metadata) {
   167  	sb.evalCtx = evalCtx
   168  	sb.md = md
   169  }
   170  
   171  func (sb *statisticsBuilder) clear() {
   172  	sb.evalCtx = nil
   173  	sb.md = nil
   174  }
   175  
   176  // colStatFromChild retrieves a column statistic from a specific child of the
   177  // given expression.
   178  func (sb *statisticsBuilder) colStatFromChild(
   179  	colSet opt.ColSet, e RelExpr, childIdx int,
   180  ) *props.ColumnStatistic {
   181  	// Helper function to return the column statistic if the output columns of
   182  	// the child with the given index intersect colSet.
   183  	child := e.Child(childIdx).(RelExpr)
   184  	childProps := child.Relational()
   185  	if !colSet.SubsetOf(childProps.OutputCols) {
   186  		colSet = colSet.Intersection(childProps.OutputCols)
   187  		if colSet.Empty() {
   188  			// All the columns in colSet are outer columns; therefore, we can treat
   189  			// them as a constant.
   190  			return &props.ColumnStatistic{Cols: colSet, DistinctCount: 1}
   191  		}
   192  	}
   193  	return sb.colStat(colSet, child)
   194  }
   195  
   196  // statsFromChild retrieves the main statistics struct from a specific child
   197  // of the given expression.
   198  func (sb *statisticsBuilder) statsFromChild(e RelExpr, childIdx int) *props.Statistics {
   199  	return &e.Child(childIdx).(RelExpr).Relational().Stats
   200  }
   201  
   202  // availabilityFromInput determines the availability of the underlying table
   203  // statistics from the children of the expression.
   204  func (sb *statisticsBuilder) availabilityFromInput(e RelExpr) bool {
   205  	switch t := e.(type) {
   206  	case *ScanExpr:
   207  		return sb.makeTableStatistics(t.Table).Available
   208  
   209  	case *LookupJoinExpr:
   210  		ensureLookupJoinInputProps(t, sb)
   211  		return t.lookupProps.Stats.Available && t.Input.Relational().Stats.Available
   212  
   213  	case *GeoLookupJoinExpr:
   214  		ensureGeoLookupJoinInputProps(t, sb)
   215  		return t.lookupProps.Stats.Available && t.Input.Relational().Stats.Available
   216  
   217  	case *ZigzagJoinExpr:
   218  		ensureZigzagJoinInputProps(t, sb)
   219  		return t.leftProps.Stats.Available
   220  
   221  	case *WithScanExpr:
   222  		return t.BindingProps.Stats.Available
   223  	}
   224  
   225  	available := true
   226  	for i, n := 0, e.ChildCount(); i < n; i++ {
   227  		if child, ok := e.Child(i).(RelExpr); ok {
   228  			available = available && child.Relational().Stats.Available
   229  		}
   230  	}
   231  	return available
   232  }
   233  
   234  // colStatFromInput retrieves a column statistic from the input(s) of a Scan,
   235  // Select, or Join. The input to the Scan is the "raw" table.
   236  //
   237  // colStatFromInput also retrieves a pointer to the full statistics from the
   238  // relevant input.
   239  func (sb *statisticsBuilder) colStatFromInput(
   240  	colSet opt.ColSet, e RelExpr,
   241  ) (*props.ColumnStatistic, *props.Statistics) {
   242  	var lookupJoin *LookupJoinExpr
   243  	var geospatialLookupJoin *GeoLookupJoinExpr
   244  	var zigzagJoin *ZigzagJoinExpr
   245  
   246  	switch t := e.(type) {
   247  	case *ScanExpr:
   248  		return sb.colStatTable(t.Table, colSet), sb.makeTableStatistics(t.Table)
   249  
   250  	case *SelectExpr:
   251  		return sb.colStatFromChild(colSet, t, 0 /* childIdx */), sb.statsFromChild(e, 0 /* childIdx */)
   252  
   253  	case *LookupJoinExpr:
   254  		lookupJoin = t
   255  		ensureLookupJoinInputProps(lookupJoin, sb)
   256  
   257  	case *GeoLookupJoinExpr:
   258  		geospatialLookupJoin = t
   259  		ensureGeoLookupJoinInputProps(geospatialLookupJoin, sb)
   260  
   261  	case *ZigzagJoinExpr:
   262  		zigzagJoin = t
   263  		ensureZigzagJoinInputProps(zigzagJoin, sb)
   264  	}
   265  
   266  	if lookupJoin != nil || geospatialLookupJoin != nil || zigzagJoin != nil ||
   267  		opt.IsJoinOp(e) || e.Op() == opt.MergeJoinOp {
   268  		var leftProps *props.Relational
   269  		if zigzagJoin != nil {
   270  			leftProps = &zigzagJoin.leftProps
   271  		} else {
   272  			leftProps = e.Child(0).(RelExpr).Relational()
   273  		}
   274  
   275  		intersectsLeft := leftProps.OutputCols.Intersects(colSet)
   276  		var intersectsRight bool
   277  		if lookupJoin != nil {
   278  			intersectsRight = lookupJoin.lookupProps.OutputCols.Intersects(colSet)
   279  		} else if geospatialLookupJoin != nil {
   280  			intersectsRight = geospatialLookupJoin.lookupProps.OutputCols.Intersects(colSet)
   281  		} else if zigzagJoin != nil {
   282  			intersectsRight = zigzagJoin.rightProps.OutputCols.Intersects(colSet)
   283  		} else {
   284  			intersectsRight = e.Child(1).(RelExpr).Relational().OutputCols.Intersects(colSet)
   285  		}
   286  
   287  		// It's possible that colSet intersects both left and right if we have a
   288  		// lookup join that was converted from an index join, so check the left
   289  		// side first.
   290  		if intersectsLeft {
   291  			if zigzagJoin != nil {
   292  				return sb.colStatTable(zigzagJoin.LeftTable, colSet),
   293  					sb.makeTableStatistics(zigzagJoin.LeftTable)
   294  			}
   295  			return sb.colStatFromChild(colSet, e, 0 /* childIdx */),
   296  				sb.statsFromChild(e, 0 /* childIdx */)
   297  		}
   298  		if intersectsRight {
   299  			if lookupJoin != nil {
   300  				return sb.colStatTable(lookupJoin.Table, colSet),
   301  					sb.makeTableStatistics(lookupJoin.Table)
   302  			}
   303  			if geospatialLookupJoin != nil {
   304  				// TODO(rytaft): use inverted index stats when available.
   305  				return sb.colStatTable(geospatialLookupJoin.Table, colSet),
   306  					sb.makeTableStatistics(geospatialLookupJoin.Table)
   307  			}
   308  			if zigzagJoin != nil {
   309  				return sb.colStatTable(zigzagJoin.RightTable, colSet),
   310  					sb.makeTableStatistics(zigzagJoin.RightTable)
   311  			}
   312  			return sb.colStatFromChild(colSet, e, 1 /* childIdx */),
   313  				sb.statsFromChild(e, 1 /* childIdx */)
   314  		}
   315  		// All columns in colSet are outer columns; therefore, we can treat them
   316  		// as a constant. Return table stats from the left side.
   317  		if zigzagJoin != nil {
   318  			return &props.ColumnStatistic{Cols: colSet, DistinctCount: 1},
   319  				sb.makeTableStatistics(zigzagJoin.LeftTable)
   320  		}
   321  		return &props.ColumnStatistic{Cols: colSet, DistinctCount: 1}, sb.statsFromChild(e, 0 /* childIdx */)
   322  	}
   323  
   324  	panic(errors.AssertionFailedf("unsupported operator type %s", log.Safe(e.Op())))
   325  }
   326  
   327  // colStat gets a column statistic for the given set of columns if it exists.
   328  // If the column statistic is not available in the current expression, colStat
   329  // recursively tries to find it in the children of the expression, lazily
   330  // populating s.ColStats with the statistic as it gets passed up the expression
   331  // tree.
   332  func (sb *statisticsBuilder) colStat(colSet opt.ColSet, e RelExpr) *props.ColumnStatistic {
   333  	if colSet.Empty() {
   334  		panic(errors.AssertionFailedf("column statistics cannot be determined for empty column set"))
   335  	}
   336  
   337  	// Check if the requested column statistic is already cached.
   338  	if stat, ok := e.Relational().Stats.ColStats.Lookup(colSet); ok {
   339  		return stat
   340  	}
   341  
   342  	// We only calculate statistics on the normalized expression in a memo group.
   343  	e = e.FirstExpr()
   344  
   345  	// The statistic was not found in the cache, so calculate it based on the
   346  	// type of expression.
   347  	switch e.Op() {
   348  	case opt.ScanOp:
   349  		return sb.colStatScan(colSet, e.(*ScanExpr))
   350  
   351  	case opt.SelectOp:
   352  		return sb.colStatSelect(colSet, e.(*SelectExpr))
   353  
   354  	case opt.ProjectOp:
   355  		return sb.colStatProject(colSet, e.(*ProjectExpr))
   356  
   357  	case opt.ValuesOp:
   358  		return sb.colStatValues(colSet, e.(*ValuesExpr))
   359  
   360  	case opt.InnerJoinOp, opt.LeftJoinOp, opt.RightJoinOp, opt.FullJoinOp,
   361  		opt.SemiJoinOp, opt.AntiJoinOp, opt.InnerJoinApplyOp, opt.LeftJoinApplyOp,
   362  		opt.SemiJoinApplyOp, opt.AntiJoinApplyOp, opt.MergeJoinOp, opt.LookupJoinOp,
   363  		opt.GeoLookupJoinOp, opt.ZigzagJoinOp:
   364  		return sb.colStatJoin(colSet, e)
   365  
   366  	case opt.IndexJoinOp:
   367  		return sb.colStatIndexJoin(colSet, e.(*IndexJoinExpr))
   368  
   369  	case opt.UnionOp, opt.IntersectOp, opt.ExceptOp,
   370  		opt.UnionAllOp, opt.IntersectAllOp, opt.ExceptAllOp:
   371  		return sb.colStatSetNode(colSet, e)
   372  
   373  	case opt.GroupByOp, opt.ScalarGroupByOp, opt.DistinctOnOp, opt.EnsureDistinctOnOp,
   374  		opt.UpsertDistinctOnOp, opt.EnsureUpsertDistinctOnOp:
   375  		return sb.colStatGroupBy(colSet, e)
   376  
   377  	case opt.LimitOp:
   378  		return sb.colStatLimit(colSet, e.(*LimitExpr))
   379  
   380  	case opt.OffsetOp:
   381  		return sb.colStatOffset(colSet, e.(*OffsetExpr))
   382  
   383  	case opt.Max1RowOp:
   384  		return sb.colStatMax1Row(colSet, e.(*Max1RowExpr))
   385  
   386  	case opt.OrdinalityOp:
   387  		return sb.colStatOrdinality(colSet, e.(*OrdinalityExpr))
   388  
   389  	case opt.WindowOp:
   390  		return sb.colStatWindow(colSet, e.(*WindowExpr))
   391  
   392  	case opt.ProjectSetOp:
   393  		return sb.colStatProjectSet(colSet, e.(*ProjectSetExpr))
   394  
   395  	case opt.WithScanOp:
   396  		return sb.colStatWithScan(colSet, e.(*WithScanExpr))
   397  
   398  	case opt.InsertOp, opt.UpdateOp, opt.UpsertOp, opt.DeleteOp:
   399  		return sb.colStatMutation(colSet, e)
   400  
   401  	case opt.SequenceSelectOp:
   402  		return sb.colStatSequenceSelect(colSet, e.(*SequenceSelectExpr))
   403  
   404  	case opt.ExplainOp, opt.ShowTraceForSessionOp,
   405  		opt.OpaqueRelOp, opt.OpaqueMutationOp, opt.OpaqueDDLOp, opt.RecursiveCTEOp:
   406  		return sb.colStatUnknown(colSet, e.Relational())
   407  
   408  	case opt.WithOp:
   409  		return sb.colStat(colSet, e.Child(1).(RelExpr))
   410  
   411  	case opt.FakeRelOp:
   412  		panic(errors.AssertionFailedf("FakeRelOp does not contain col stat for %v", colSet))
   413  	}
   414  
   415  	panic(errors.AssertionFailedf("unrecognized relational expression type: %v", log.Safe(e.Op())))
   416  }
   417  
   418  // colStatLeaf creates a column statistic for a given column set (if it doesn't
   419  // already exist in s), by deriving the statistic from the general statistics.
   420  // Used when there is no child expression to retrieve statistics from, typically
   421  // with the Statistics derived for a table.
   422  func (sb *statisticsBuilder) colStatLeaf(
   423  	colSet opt.ColSet, s *props.Statistics, fd *props.FuncDepSet, notNullCols opt.ColSet,
   424  ) *props.ColumnStatistic {
   425  	// Ensure that the requested column statistic is in the cache.
   426  	colStat, added := s.ColStats.Add(colSet)
   427  	if !added {
   428  		// Already in the cache.
   429  		return colStat
   430  	}
   431  
   432  	// If some of the columns are a lax key, the distinct count equals the row
   433  	// count. The null count is 0 if any of these columns are not nullable,
   434  	// otherwise copy the null count from the nullable columns in colSet.
   435  	if fd.ColsAreLaxKey(colSet) {
   436  		if colSet.Intersects(notNullCols) {
   437  			colStat.NullCount = 0
   438  		} else {
   439  			nullableCols := colSet.Difference(notNullCols)
   440  			if nullableCols.Equals(colSet) {
   441  				// No column statistics on this colSet - use the unknown
   442  				// null count ratio.
   443  				colStat.NullCount = s.RowCount * unknownNullCountRatio
   444  			} else {
   445  				colStatLeaf := sb.colStatLeaf(nullableCols, s, fd, notNullCols)
   446  				// Fetch the colStat again since it may now have a different address.
   447  				colStat, _ = s.ColStats.Lookup(colSet)
   448  				colStat.NullCount = colStatLeaf.NullCount
   449  			}
   450  		}
   451  		// Only one of the null values counts towards the distinct count.
   452  		colStat.DistinctCount = s.RowCount - max(colStat.NullCount-1, 0)
   453  		return colStat
   454  	}
   455  
   456  	if colSet.Len() == 1 {
   457  		col, _ := colSet.Next(0)
   458  		colStat.DistinctCount = unknownDistinctCountRatio * s.RowCount
   459  		colStat.NullCount = unknownNullCountRatio * s.RowCount
   460  		if notNullCols.Contains(col) {
   461  			colStat.NullCount = 0
   462  		}
   463  		if sb.md.ColumnMeta(col).Type.Family() == types.BoolFamily {
   464  			// There are maximum three distinct values: true, false, and null.
   465  			maxDistinct := float64(2)
   466  			if colStat.NullCount > 0 {
   467  				maxDistinct++
   468  			}
   469  			colStat.DistinctCount = min(colStat.DistinctCount, maxDistinct)
   470  		}
   471  	} else {
   472  		distinctCount := 1.0
   473  		nullCount := s.RowCount
   474  		colSet.ForEach(func(i opt.ColumnID) {
   475  			colStatLeaf := sb.colStatLeaf(opt.MakeColSet(i), s, fd, notNullCols)
   476  			distinctCount *= colStatLeaf.DistinctCount
   477  			// Multiply by the expected chance of collisions with nulls already
   478  			// collected.
   479  			nullCount *= colStatLeaf.NullCount / s.RowCount
   480  		})
   481  		// Fetch the colStat again since it may now have a different address.
   482  		colStat, _ = s.ColStats.Lookup(colSet)
   483  		colStat.DistinctCount = min(distinctCount, s.RowCount)
   484  		colStat.NullCount = min(nullCount, s.RowCount)
   485  	}
   486  
   487  	return colStat
   488  }
   489  
   490  // +-------+
   491  // | Table |
   492  // +-------+
   493  
   494  // makeTableStatistics returns the available statistics for the given table.
   495  // Statistics are derived lazily and are cached in the metadata, since they may
   496  // be accessed multiple times during query optimization. For more details, see
   497  // props.Statistics.
   498  func (sb *statisticsBuilder) makeTableStatistics(tabID opt.TableID) *props.Statistics {
   499  	stats, ok := sb.md.TableAnnotation(tabID, statsAnnID).(*props.Statistics)
   500  	if ok {
   501  		// Already made.
   502  		return stats
   503  	}
   504  
   505  	// Make now and annotate the metadata table with it for next time.
   506  	tab := sb.md.Table(tabID)
   507  	stats = &props.Statistics{}
   508  	if tab.StatisticCount() == 0 {
   509  		// No statistics.
   510  		stats.Available = false
   511  		stats.RowCount = unknownRowCount
   512  	} else {
   513  		// Get the RowCount from the most recent statistic. Stats are ordered
   514  		// with most recent first.
   515  		stats.Available = true
   516  		stats.RowCount = float64(tab.Statistic(0).RowCount())
   517  
   518  		// Make sure the row count is at least 1. The stats may be stale, and we
   519  		// can end up with weird and inefficient plans if we estimate 0 rows.
   520  		stats.RowCount = max(stats.RowCount, 1)
   521  
   522  		// Add all the column statistics, using the most recent statistic for each
   523  		// column set. Stats are ordered with most recent first.
   524  		for i := 0; i < tab.StatisticCount(); i++ {
   525  			stat := tab.Statistic(i)
   526  			if stat.ColumnCount() > 1 && !sb.evalCtx.SessionData.OptimizerUseMultiColStats {
   527  				continue
   528  			}
   529  
   530  			var cols opt.ColSet
   531  			for i := 0; i < stat.ColumnCount(); i++ {
   532  				cols.Add(tabID.ColumnID(stat.ColumnOrdinal(i)))
   533  			}
   534  
   535  			if colStat, ok := stats.ColStats.Add(cols); ok {
   536  				colStat.DistinctCount = float64(stat.DistinctCount())
   537  				colStat.NullCount = float64(stat.NullCount())
   538  				if cols.Len() == 1 && stat.Histogram() != nil &&
   539  					sb.evalCtx.SessionData.OptimizerUseHistograms {
   540  					col, _ := cols.Next(0)
   541  					colStat.Histogram = &props.Histogram{}
   542  					colStat.Histogram.Init(sb.evalCtx, col, stat.Histogram())
   543  				}
   544  
   545  				// Make sure the distinct count is at least 1, for the same reason as
   546  				// the row count above.
   547  				colStat.DistinctCount = max(colStat.DistinctCount, 1)
   548  
   549  				// Make sure the values are consistent in case some of the column stats
   550  				// were added at different times (and therefore have a different row
   551  				// count).
   552  				sb.finalizeFromRowCountAndDistinctCounts(colStat, stats)
   553  			}
   554  		}
   555  	}
   556  	sb.md.SetTableAnnotation(tabID, statsAnnID, stats)
   557  	return stats
   558  }
   559  
   560  func (sb *statisticsBuilder) colStatTable(
   561  	tabID opt.TableID, colSet opt.ColSet,
   562  ) *props.ColumnStatistic {
   563  	tableStats := sb.makeTableStatistics(tabID)
   564  	tableFD := MakeTableFuncDep(sb.md, tabID)
   565  	tableNotNullCols := tableNotNullCols(sb.md, tabID)
   566  	return sb.colStatLeaf(colSet, tableStats, tableFD, tableNotNullCols)
   567  }
   568  
   569  // +------+
   570  // | Scan |
   571  // +------+
   572  
   573  func (sb *statisticsBuilder) buildScan(scan *ScanExpr, relProps *props.Relational) {
   574  	s := &relProps.Stats
   575  	if zeroCardinality := s.Init(relProps); zeroCardinality {
   576  		// Short cut if cardinality is 0.
   577  		return
   578  	}
   579  	s.Available = sb.availabilityFromInput(scan)
   580  
   581  	inputStats := sb.makeTableStatistics(scan.Table)
   582  	s.RowCount = inputStats.RowCount
   583  
   584  	if scan.Constraint != nil {
   585  		// Calculate distinct counts and histograms for constrained columns
   586  		// ----------------------------------------------------------------
   587  		var numUnappliedConjuncts float64
   588  		var constrainedCols, histCols opt.ColSet
   589  		// Inverted indexes are a special case; a constraint like:
   590  		// /1: [/'{"a": "b"}' - /'{"a": "b"}']
   591  		// does not necessarily mean there is only going to be one distinct
   592  		// value for column 1, if it is being applied to an inverted index.
   593  		// This is because inverted index keys could correspond to partial
   594  		// column values, such as one path-to-a-leaf through a JSON object.
   595  		//
   596  		// For now, don't apply constraints on inverted index columns.
   597  		if sb.md.Table(scan.Table).Index(scan.Index).IsInverted() {
   598  			for i, n := 0, scan.Constraint.ConstrainedColumns(sb.evalCtx); i < n; i++ {
   599  				numUnappliedConjuncts += sb.numConjunctsInConstraint(scan.Constraint, i)
   600  			}
   601  		} else {
   602  			constrainedCols, histCols = sb.applyIndexConstraint(scan.Constraint, scan, relProps)
   603  		}
   604  
   605  		// Set null counts to 0 for non-nullable columns
   606  		// ---------------------------------------------
   607  		sb.updateNullCountsFromProps(scan, relProps)
   608  
   609  		// Calculate row count and selectivity
   610  		// -----------------------------------
   611  		s.ApplySelectivity(sb.selectivityFromHistograms(histCols, scan, s))
   612  		s.ApplySelectivity(sb.selectivityFromMultiColDistinctCounts(constrainedCols, scan, s))
   613  		s.ApplySelectivity(sb.selectivityFromUnappliedConjuncts(numUnappliedConjuncts))
   614  		s.ApplySelectivity(sb.selectivityFromNullsRemoved(scan, relProps, constrainedCols))
   615  
   616  		// Adjust the selectivity so we don't double-count the histogram columns.
   617  		s.ApplySelectivity(1.0 / sb.selectivityFromSingleColDistinctCounts(histCols, scan, s))
   618  	}
   619  
   620  	sb.finalizeFromCardinality(relProps)
   621  }
   622  
   623  func (sb *statisticsBuilder) colStatScan(colSet opt.ColSet, scan *ScanExpr) *props.ColumnStatistic {
   624  	relProps := scan.Relational()
   625  	s := &relProps.Stats
   626  
   627  	inputColStat := sb.colStatTable(scan.Table, colSet)
   628  	colStat := sb.copyColStat(colSet, s, inputColStat)
   629  
   630  	if sb.shouldUseHistogram(relProps) {
   631  		colStat.Histogram = inputColStat.Histogram
   632  	}
   633  
   634  	if s.Selectivity != 1 {
   635  		tableStats := sb.makeTableStatistics(scan.Table)
   636  		colStat.ApplySelectivity(s.Selectivity, tableStats.RowCount)
   637  	}
   638  
   639  	if colSet.Intersects(relProps.NotNullCols) {
   640  		colStat.NullCount = 0
   641  	}
   642  
   643  	sb.finalizeFromRowCountAndDistinctCounts(colStat, s)
   644  	return colStat
   645  }
   646  
   647  // +--------+
   648  // | Select |
   649  // +--------+
   650  
   651  func (sb *statisticsBuilder) buildSelect(sel *SelectExpr, relProps *props.Relational) {
   652  	s := &relProps.Stats
   653  	if zeroCardinality := s.Init(relProps); zeroCardinality {
   654  		// Short cut if cardinality is 0.
   655  		return
   656  	}
   657  	s.Available = sb.availabilityFromInput(sel)
   658  
   659  	// Update stats based on equivalencies in the filter conditions. Note that
   660  	// EquivReps from the Select FD should not be used, as they include
   661  	// equivalencies derived from input expressions.
   662  	var equivFD props.FuncDepSet
   663  	for i := range sel.Filters {
   664  		equivFD.AddEquivFrom(&sel.Filters[i].ScalarProps().FuncDeps)
   665  	}
   666  	equivReps := equivFD.EquivReps()
   667  
   668  	// Calculate distinct counts and histograms for constrained columns
   669  	// ----------------------------------------------------------------
   670  	numUnappliedConjuncts, constrainedCols, histCols := sb.applyFilter(sel.Filters, sel, relProps)
   671  
   672  	// Try to reduce the number of columns used for selectivity
   673  	// calculation based on functional dependencies.
   674  	inputFD := &sel.Input.Relational().FuncDeps
   675  	constrainedCols = sb.tryReduceCols(constrainedCols, s, inputFD)
   676  
   677  	// Set null counts to 0 for non-nullable columns
   678  	// -------------------------------------------
   679  	sb.updateNullCountsFromProps(sel, relProps)
   680  
   681  	// Calculate selectivity and row count
   682  	// -----------------------------------
   683  	inputStats := &sel.Input.Relational().Stats
   684  	s.RowCount = inputStats.RowCount
   685  	s.ApplySelectivity(sb.selectivityFromHistograms(histCols, sel, s))
   686  	s.ApplySelectivity(sb.selectivityFromMultiColDistinctCounts(constrainedCols, sel, s))
   687  	s.ApplySelectivity(sb.selectivityFromEquivalencies(equivReps, &relProps.FuncDeps, sel, s))
   688  	s.ApplySelectivity(sb.selectivityFromUnappliedConjuncts(numUnappliedConjuncts))
   689  	s.ApplySelectivity(sb.selectivityFromNullsRemoved(sel, relProps, constrainedCols))
   690  
   691  	// Adjust the selectivity so we don't double-count the histogram columns.
   692  	s.ApplySelectivity(1.0 / sb.selectivityFromSingleColDistinctCounts(histCols, sel, s))
   693  
   694  	// Update distinct counts based on equivalencies; this should happen after
   695  	// selectivityFromMultiColDistinctCounts and selectivityFromEquivalencies.
   696  	sb.applyEquivalencies(equivReps, &relProps.FuncDeps, sel, relProps)
   697  
   698  	sb.finalizeFromCardinality(relProps)
   699  }
   700  
   701  func (sb *statisticsBuilder) colStatSelect(
   702  	colSet opt.ColSet, sel *SelectExpr,
   703  ) *props.ColumnStatistic {
   704  	relProps := sel.Relational()
   705  	s := &relProps.Stats
   706  	inputStats := &sel.Input.Relational().Stats
   707  	colStat := sb.copyColStatFromChild(colSet, sel, s)
   708  
   709  	// It's not safe to use s.Selectivity, because it's possible that some of the
   710  	// filter conditions were pushed down into the input after s.Selectivity
   711  	// was calculated. For example, an index scan or index join created during
   712  	// exploration could absorb some of the filter conditions.
   713  	selectivity := s.RowCount / inputStats.RowCount
   714  	colStat.ApplySelectivity(selectivity, inputStats.RowCount)
   715  	if colSet.Intersects(relProps.NotNullCols) {
   716  		colStat.NullCount = 0
   717  	}
   718  	sb.finalizeFromRowCountAndDistinctCounts(colStat, s)
   719  	return colStat
   720  }
   721  
   722  // +---------+
   723  // | Project |
   724  // +---------+
   725  
   726  func (sb *statisticsBuilder) buildProject(prj *ProjectExpr, relProps *props.Relational) {
   727  	s := &relProps.Stats
   728  	if zeroCardinality := s.Init(relProps); zeroCardinality {
   729  		// Short cut if cardinality is 0.
   730  		return
   731  	}
   732  	s.Available = sb.availabilityFromInput(prj)
   733  
   734  	inputStats := &prj.Input.Relational().Stats
   735  
   736  	s.RowCount = inputStats.RowCount
   737  	sb.finalizeFromCardinality(relProps)
   738  }
   739  
   740  func (sb *statisticsBuilder) colStatProject(
   741  	colSet opt.ColSet, prj *ProjectExpr,
   742  ) *props.ColumnStatistic {
   743  	relProps := prj.Relational()
   744  	s := &relProps.Stats
   745  
   746  	// Columns may be passed through from the input, or they may reference a
   747  	// higher scope (in the case of a correlated subquery), or they
   748  	// may be synthesized by the projection operation.
   749  	inputCols := prj.Input.Relational().OutputCols
   750  	reqInputCols := colSet.Intersection(inputCols)
   751  	nonNullFound := false
   752  	reqSynthCols := colSet.Difference(inputCols)
   753  	if !reqSynthCols.Empty() {
   754  		// Some of the columns in colSet were synthesized or from a higher scope
   755  		// (in the case of a correlated subquery). We assume that the statistics of
   756  		// the synthesized columns are the same as the statistics of their input
   757  		// columns. For example, the distinct count of (x + 2) is the same as the
   758  		// distinct count of x.
   759  		// TODO(rytaft): This assumption breaks down for certain types of
   760  		// expressions, such as (x < y).
   761  		for i := range prj.Projections {
   762  			item := &prj.Projections[i]
   763  			if reqSynthCols.Contains(item.Col) {
   764  				reqInputCols.UnionWith(item.scalar.OuterCols)
   765  
   766  				// If the element is not a null constant, account for that
   767  				// when calculating null counts.
   768  				if item.Element.Op() != opt.NullOp {
   769  					nonNullFound = true
   770  				}
   771  			}
   772  		}
   773  
   774  		// Intersect with the input columns one more time to remove any columns
   775  		// from higher scopes. Columns from higher scopes are effectively constant
   776  		// in this scope, and therefore have distinct count = 1.
   777  		reqInputCols.IntersectionWith(inputCols)
   778  	}
   779  
   780  	colStat, _ := s.ColStats.Add(colSet)
   781  
   782  	if !reqInputCols.Empty() {
   783  		// Inherit column statistics from input, using the reqInputCols identified
   784  		// above.
   785  		inputColStat := sb.colStatFromChild(reqInputCols, prj, 0 /* childIdx */)
   786  		colStat.DistinctCount = inputColStat.DistinctCount
   787  		if nonNullFound {
   788  			colStat.NullCount = 0
   789  		} else {
   790  			colStat.NullCount = inputColStat.NullCount
   791  		}
   792  	} else {
   793  		// There are no columns in this expression, so it must be a constant.
   794  		colStat.DistinctCount = 1
   795  		if nonNullFound {
   796  			colStat.NullCount = 0
   797  		} else {
   798  			colStat.NullCount = s.RowCount
   799  		}
   800  	}
   801  	if colSet.Intersects(relProps.NotNullCols) {
   802  		colStat.NullCount = 0
   803  	}
   804  	sb.finalizeFromRowCountAndDistinctCounts(colStat, s)
   805  	return colStat
   806  }
   807  
   808  // +------+
   809  // | Join |
   810  // +------+
   811  
   812  func (sb *statisticsBuilder) buildJoin(
   813  	join RelExpr, relProps *props.Relational, h *joinPropsHelper,
   814  ) {
   815  	// Zigzag joins have their own stats builder case.
   816  	if join.Op() == opt.ZigzagJoinOp {
   817  		sb.buildZigzagJoin(join.(*ZigzagJoinExpr), relProps, h)
   818  		return
   819  	}
   820  
   821  	s := &relProps.Stats
   822  	if zeroCardinality := s.Init(relProps); zeroCardinality {
   823  		// Short cut if cardinality is 0.
   824  		return
   825  	}
   826  	s.Available = sb.availabilityFromInput(join)
   827  
   828  	leftStats := &h.leftProps.Stats
   829  	rightStats := &h.rightProps.Stats
   830  	leftCols := h.leftProps.OutputCols.Copy()
   831  	rightCols := h.rightProps.OutputCols.Copy()
   832  	equivReps := h.filtersFD.EquivReps()
   833  
   834  	// Shortcut if there are no ON conditions. Note that for lookup join, there
   835  	// are implicit equality conditions on KeyCols.
   836  	if h.filterIsTrue {
   837  		s.RowCount = leftStats.RowCount * rightStats.RowCount
   838  		s.Selectivity = 1
   839  		switch h.joinType {
   840  		case opt.InnerJoinOp, opt.InnerJoinApplyOp:
   841  		case opt.LeftJoinOp, opt.LeftJoinApplyOp:
   842  			// All rows from left side should be in the result.
   843  			s.RowCount = max(s.RowCount, leftStats.RowCount)
   844  
   845  		case opt.RightJoinOp:
   846  			// All rows from right side should be in the result.
   847  			s.RowCount = max(s.RowCount, rightStats.RowCount)
   848  
   849  		case opt.FullJoinOp:
   850  			// All rows from both sides should be in the result.
   851  			s.RowCount = max(s.RowCount, leftStats.RowCount)
   852  			s.RowCount = max(s.RowCount, rightStats.RowCount)
   853  
   854  		case opt.SemiJoinOp, opt.SemiJoinApplyOp:
   855  			s.RowCount = leftStats.RowCount
   856  
   857  		case opt.AntiJoinOp, opt.AntiJoinApplyOp:
   858  			// Don't set the row count to 0 since we can't guarantee that the
   859  			// cardinality is 0.
   860  			s.RowCount = epsilon
   861  			s.Selectivity = epsilon
   862  		}
   863  		return
   864  	}
   865  
   866  	// Shortcut if the ON condition is false or there is a contradiction.
   867  	if h.filters.IsFalse() {
   868  		s.Selectivity = 0
   869  		switch h.joinType {
   870  		case opt.InnerJoinOp, opt.InnerJoinApplyOp:
   871  			s.RowCount = 0
   872  
   873  		case opt.LeftJoinOp, opt.LeftJoinApplyOp:
   874  			// All rows from left side should be in the result.
   875  			s.RowCount = leftStats.RowCount
   876  
   877  		case opt.RightJoinOp:
   878  			// All rows from right side should be in the result.
   879  			s.RowCount = rightStats.RowCount
   880  
   881  		case opt.FullJoinOp:
   882  			// All rows from both sides should be in the result.
   883  			s.RowCount = leftStats.RowCount + rightStats.RowCount
   884  
   885  		case opt.SemiJoinOp, opt.SemiJoinApplyOp:
   886  			s.RowCount = 0
   887  
   888  		case opt.AntiJoinOp, opt.AntiJoinApplyOp:
   889  			s.RowCount = leftStats.RowCount
   890  			s.Selectivity = 1
   891  		}
   892  		return
   893  	}
   894  
   895  	// Calculate distinct counts for constrained columns in the ON conditions
   896  	// ----------------------------------------------------------------------
   897  	numUnappliedConjuncts, constrainedCols, histCols := sb.applyFilter(h.filters, join, relProps)
   898  
   899  	// Try to reduce the number of columns used for selectivity
   900  	// calculation based on functional dependencies.
   901  	constrainedCols = sb.tryReduceJoinCols(
   902  		constrainedCols,
   903  		s,
   904  		h.leftProps.OutputCols,
   905  		h.rightProps.OutputCols,
   906  		&h.leftProps.FuncDeps,
   907  		&h.rightProps.FuncDeps,
   908  	)
   909  
   910  	// Set null counts to 0 for non-nullable columns
   911  	// ---------------------------------------------
   912  	sb.updateNullCountsFromProps(join, relProps)
   913  
   914  	// Calculate selectivity and row count
   915  	// -----------------------------------
   916  	switch h.joinType {
   917  	case opt.SemiJoinOp, opt.SemiJoinApplyOp, opt.AntiJoinOp, opt.AntiJoinApplyOp:
   918  		// Treat anti join as if it were a semi join for the selectivity
   919  		// calculations. It will be fixed below.
   920  		s.RowCount = leftStats.RowCount
   921  		s.ApplySelectivity(sb.selectivityFromEquivalenciesSemiJoin(
   922  			equivReps, h.leftProps.OutputCols, h.rightProps.OutputCols, &h.filtersFD, join, s,
   923  		))
   924  
   925  	default:
   926  		s.RowCount = leftStats.RowCount * rightStats.RowCount
   927  		if h.rightProps.FuncDeps.ColsAreStrictKey(h.selfJoinCols) {
   928  			// This is like an index join, so apply a selectivity that will result
   929  			// in leftStats.RowCount rows.
   930  			if rightStats.RowCount != 0 {
   931  				s.ApplySelectivity(1 / rightStats.RowCount)
   932  			}
   933  		} else {
   934  			// Add the self join columns to equivReps so they are included in the
   935  			// calculation for selectivityFromEquivalencies below.
   936  			equivReps.UnionWith(h.selfJoinCols)
   937  		}
   938  
   939  		s.ApplySelectivity(sb.selectivityFromEquivalencies(equivReps, &h.filtersFD, join, s))
   940  	}
   941  
   942  	if join.Op() == opt.GeoLookupJoinOp {
   943  		s.ApplySelectivity(sb.selectivityFromGeoRelationship(join, s))
   944  	}
   945  	s.ApplySelectivity(sb.selectivityFromHistograms(histCols, join, s))
   946  	s.ApplySelectivity(sb.selectivityFromMultiColDistinctCounts(
   947  		constrainedCols.Intersection(leftCols), join, s,
   948  	))
   949  	s.ApplySelectivity(sb.selectivityFromMultiColDistinctCounts(
   950  		constrainedCols.Intersection(rightCols), join, s,
   951  	))
   952  	s.ApplySelectivity(sb.selectivityFromUnappliedConjuncts(numUnappliedConjuncts))
   953  	s.ApplySelectivity(sb.selectivityFromNullsRemoved(join, relProps, constrainedCols))
   954  
   955  	// Adjust the selectivity so we don't double-count the histogram columns.
   956  	s.ApplySelectivity(1.0 / sb.selectivityFromSingleColDistinctCounts(histCols, join, s))
   957  
   958  	// Update distinct counts based on equivalencies; this should happen after
   959  	// selectivityFromMultiColDistinctCounts and selectivityFromEquivalencies.
   960  	sb.applyEquivalencies(equivReps, &h.filtersFD, join, relProps)
   961  
   962  	switch h.joinType {
   963  	case opt.SemiJoinOp, opt.SemiJoinApplyOp, opt.AntiJoinOp, opt.AntiJoinApplyOp:
   964  		// Keep only column stats from the left side.
   965  		s.ColStats.RemoveIntersecting(h.rightProps.OutputCols)
   966  	}
   967  
   968  	// The above calculation is for inner joins. Other joins need to remove stats
   969  	// that involve outer columns.
   970  	switch h.joinType {
   971  	case opt.LeftJoinOp, opt.LeftJoinApplyOp:
   972  		// Keep only column stats from the right side. The stats from the left side
   973  		// are not valid.
   974  		s.ColStats.RemoveIntersecting(h.leftProps.OutputCols)
   975  
   976  	case opt.RightJoinOp:
   977  		// Keep only column stats from the left side. The stats from the right side
   978  		// are not valid.
   979  		s.ColStats.RemoveIntersecting(h.rightProps.OutputCols)
   980  
   981  	case opt.FullJoinOp:
   982  		// Do not keep any column stats.
   983  		s.ColStats.Clear()
   984  	}
   985  
   986  	// Tweak the row count.
   987  	innerJoinRowCount := s.RowCount
   988  	switch h.joinType {
   989  	case opt.LeftJoinOp, opt.LeftJoinApplyOp:
   990  		// All rows from left side should be in the result.
   991  		s.RowCount = max(innerJoinRowCount, leftStats.RowCount)
   992  
   993  	case opt.RightJoinOp:
   994  		// All rows from right side should be in the result.
   995  		s.RowCount = max(innerJoinRowCount, rightStats.RowCount)
   996  
   997  	case opt.FullJoinOp:
   998  		// All rows from both sides should be in the result.
   999  		// T(A FOJ B) = T(A LOJ B) + T(A ROJ B) - T(A IJ B)
  1000  		leftJoinRowCount := max(innerJoinRowCount, leftStats.RowCount)
  1001  		rightJoinRowCount := max(innerJoinRowCount, rightStats.RowCount)
  1002  		s.RowCount = leftJoinRowCount + rightJoinRowCount - innerJoinRowCount
  1003  	}
  1004  
  1005  	// Fix the stats for anti join.
  1006  	switch h.joinType {
  1007  	case opt.AntiJoinOp, opt.AntiJoinApplyOp:
  1008  		s.RowCount = max(leftStats.RowCount-s.RowCount, epsilon)
  1009  		s.Selectivity = max(1-s.Selectivity, epsilon)
  1010  
  1011  		// Converting column stats is error-prone. If any column stats are needed,
  1012  		// colStatJoin will use the selectivity calculated above to estimate the
  1013  		// column stats from the input.
  1014  		s.ColStats.Clear()
  1015  	}
  1016  
  1017  	// Loop through all colSets added in this step, and adjust null counts,
  1018  	// distinct counts, and histograms.
  1019  	for i := 0; i < s.ColStats.Count(); i++ {
  1020  		colStat := s.ColStats.Get(i)
  1021  		leftSideCols := leftCols.Intersection(colStat.Cols)
  1022  		rightSideCols := rightCols.Intersection(colStat.Cols)
  1023  		leftNullCount, rightNullCount := sb.leftRightNullCounts(
  1024  			join,
  1025  			leftSideCols,
  1026  			rightSideCols,
  1027  			leftStats.RowCount,
  1028  			rightStats.RowCount,
  1029  		)
  1030  
  1031  		// Update all null counts not zeroed out in updateNullCountsFromProps
  1032  		// to equal the inner join count, instead of what it currently is (either
  1033  		// leftNullCount or rightNullCount).
  1034  		if colStat.NullCount != 0 {
  1035  			colStat.NullCount = innerJoinNullCount(
  1036  				s.RowCount,
  1037  				leftNullCount,
  1038  				leftStats.RowCount,
  1039  				rightNullCount,
  1040  				rightStats.RowCount,
  1041  			)
  1042  		}
  1043  
  1044  		switch h.joinType {
  1045  		case opt.LeftJoinOp, opt.LeftJoinApplyOp, opt.RightJoinOp, opt.FullJoinOp:
  1046  			if !colStat.Cols.Intersects(relProps.NotNullCols) {
  1047  				sb.adjustNullCountsForOuterJoins(
  1048  					colStat,
  1049  					h.joinType,
  1050  					leftSideCols,
  1051  					rightSideCols,
  1052  					leftNullCount,
  1053  					leftStats.RowCount,
  1054  					rightNullCount,
  1055  					rightStats.RowCount,
  1056  					s.RowCount,
  1057  					innerJoinRowCount,
  1058  				)
  1059  			}
  1060  
  1061  			// Ensure distinct count is non-zero.
  1062  			colStat.DistinctCount = max(colStat.DistinctCount, 1)
  1063  		}
  1064  
  1065  		// We don't yet calculate histograms correctly for joins, so remove any
  1066  		// histograms that have been created above.
  1067  		colStat.Histogram = nil
  1068  	}
  1069  
  1070  	sb.finalizeFromCardinality(relProps)
  1071  }
  1072  
  1073  func (sb *statisticsBuilder) colStatJoin(colSet opt.ColSet, join RelExpr) *props.ColumnStatistic {
  1074  	relProps := join.Relational()
  1075  	s := &relProps.Stats
  1076  
  1077  	var joinType opt.Operator
  1078  	var leftProps, rightProps *props.Relational
  1079  
  1080  	switch j := join.(type) {
  1081  	case *LookupJoinExpr:
  1082  		joinType = j.JoinType
  1083  		leftProps = j.Input.Relational()
  1084  		ensureLookupJoinInputProps(j, sb)
  1085  		rightProps = &j.lookupProps
  1086  
  1087  	case *GeoLookupJoinExpr:
  1088  		joinType = j.JoinType
  1089  		leftProps = j.Input.Relational()
  1090  		ensureGeoLookupJoinInputProps(j, sb)
  1091  		rightProps = &j.lookupProps
  1092  
  1093  	case *ZigzagJoinExpr:
  1094  		joinType = opt.InnerJoinOp
  1095  		ensureZigzagJoinInputProps(j, sb)
  1096  		leftProps = &j.leftProps
  1097  		rightProps = &j.rightProps
  1098  
  1099  	default:
  1100  		joinType = join.Op()
  1101  		leftProps = join.Child(0).(RelExpr).Relational()
  1102  		rightProps = join.Child(1).(RelExpr).Relational()
  1103  	}
  1104  
  1105  	switch joinType {
  1106  	case opt.SemiJoinOp, opt.SemiJoinApplyOp, opt.AntiJoinOp, opt.AntiJoinApplyOp:
  1107  		// Column stats come from left side of join.
  1108  		colStat := sb.copyColStat(colSet, s, sb.colStatFromJoinLeft(colSet, join))
  1109  		colStat.ApplySelectivity(s.Selectivity, leftProps.Stats.RowCount)
  1110  		sb.finalizeFromRowCountAndDistinctCounts(colStat, s)
  1111  		return colStat
  1112  
  1113  	default:
  1114  		// Column stats come from both sides of join.
  1115  		leftCols := leftProps.OutputCols.Intersection(colSet)
  1116  		rightCols := rightProps.OutputCols.Intersection(colSet)
  1117  
  1118  		// Join selectivity affects the distinct counts for different columns
  1119  		// in different ways depending on the type of join.
  1120  		//
  1121  		// - For FULL OUTER joins, the selectivity has no impact on distinct count;
  1122  		//   all rows from the input are included at least once in the output.
  1123  		// - For LEFT OUTER joins, the selectivity only impacts the distinct count
  1124  		//   of columns from the right side of the join; all rows from the left
  1125  		//   side are included at least once in the output.
  1126  		// - For RIGHT OUTER joins, the selectivity only impacts the distinct count
  1127  		//   of columns from the left side of the join; all rows from the right
  1128  		//   side are included at least once in the output.
  1129  		// - For INNER joins, the selectivity impacts the distinct count of all
  1130  		//   columns.
  1131  		var colStat *props.ColumnStatistic
  1132  		inputRowCount := leftProps.Stats.RowCount * rightProps.Stats.RowCount
  1133  		leftNullCount, rightNullCount := leftProps.Stats.RowCount, rightProps.Stats.RowCount
  1134  		if rightCols.Empty() {
  1135  			colStat = sb.copyColStat(colSet, s, sb.colStatFromJoinLeft(colSet, join))
  1136  			leftNullCount = colStat.NullCount
  1137  			switch joinType {
  1138  			case opt.InnerJoinOp, opt.InnerJoinApplyOp, opt.RightJoinOp:
  1139  				colStat.ApplySelectivity(s.Selectivity, inputRowCount)
  1140  			}
  1141  		} else if leftCols.Empty() {
  1142  			colStat = sb.copyColStat(colSet, s, sb.colStatFromJoinRight(colSet, join))
  1143  			rightNullCount = colStat.NullCount
  1144  			switch joinType {
  1145  			case opt.InnerJoinOp, opt.InnerJoinApplyOp, opt.LeftJoinOp, opt.LeftJoinApplyOp:
  1146  				colStat.ApplySelectivity(s.Selectivity, inputRowCount)
  1147  			}
  1148  		} else {
  1149  			// Make a copy of the input column stats so we don't modify the originals.
  1150  			leftColStat := *sb.colStatFromJoinLeft(leftCols, join)
  1151  			rightColStat := *sb.colStatFromJoinRight(rightCols, join)
  1152  
  1153  			leftNullCount = leftColStat.NullCount
  1154  			rightNullCount = rightColStat.NullCount
  1155  			switch joinType {
  1156  			case opt.InnerJoinOp, opt.InnerJoinApplyOp:
  1157  				leftColStat.ApplySelectivity(s.Selectivity, inputRowCount)
  1158  				rightColStat.ApplySelectivity(s.Selectivity, inputRowCount)
  1159  
  1160  			case opt.LeftJoinOp, opt.LeftJoinApplyOp:
  1161  				rightColStat.ApplySelectivity(s.Selectivity, inputRowCount)
  1162  
  1163  			case opt.RightJoinOp:
  1164  				leftColStat.ApplySelectivity(s.Selectivity, inputRowCount)
  1165  			}
  1166  			colStat, _ = s.ColStats.Add(colSet)
  1167  			colStat.DistinctCount = leftColStat.DistinctCount * rightColStat.DistinctCount
  1168  		}
  1169  
  1170  		// Null count estimation - assume an inner join and then bump the null count later
  1171  		// based on the type of join.
  1172  		colStat.NullCount = innerJoinNullCount(
  1173  			s.RowCount,
  1174  			leftNullCount,
  1175  			leftProps.Stats.RowCount,
  1176  			rightNullCount,
  1177  			rightProps.Stats.RowCount,
  1178  		)
  1179  
  1180  		switch joinType {
  1181  		case opt.LeftJoinOp, opt.LeftJoinApplyOp, opt.RightJoinOp, opt.FullJoinOp:
  1182  			sb.adjustNullCountsForOuterJoins(
  1183  				colStat,
  1184  				joinType,
  1185  				leftCols,
  1186  				rightCols,
  1187  				leftNullCount,
  1188  				leftProps.Stats.RowCount,
  1189  				rightNullCount,
  1190  				rightProps.Stats.RowCount,
  1191  				s.RowCount,
  1192  				s.Selectivity*inputRowCount,
  1193  			)
  1194  
  1195  			// Ensure distinct count is non-zero.
  1196  			colStat.DistinctCount = max(colStat.DistinctCount, 1)
  1197  		}
  1198  
  1199  		if colSet.Intersects(relProps.NotNullCols) {
  1200  			colStat.NullCount = 0
  1201  		}
  1202  		sb.finalizeFromRowCountAndDistinctCounts(colStat, s)
  1203  		return colStat
  1204  	}
  1205  }
  1206  
  1207  // leftRightNullCounts returns null counts on the left and right sides of the
  1208  // specified join. If either leftCols or rightCols are empty, the corresponding
  1209  // side's return value is equal to the number of rows on that side.
  1210  func (sb *statisticsBuilder) leftRightNullCounts(
  1211  	e RelExpr, leftCols, rightCols opt.ColSet, leftRowCount, rightRowCount float64,
  1212  ) (leftNullCount, rightNullCount float64) {
  1213  	leftNullCount, rightNullCount = leftRowCount, rightRowCount
  1214  	if !leftCols.Empty() {
  1215  		leftColStat := sb.colStatFromJoinLeft(leftCols, e)
  1216  		leftNullCount = leftColStat.NullCount
  1217  	}
  1218  	if !rightCols.Empty() {
  1219  		rightColStat := sb.colStatFromJoinRight(rightCols, e)
  1220  		rightNullCount = rightColStat.NullCount
  1221  	}
  1222  
  1223  	return leftNullCount, rightNullCount
  1224  }
  1225  
  1226  // innerJoinNullCount returns an estimate of the number of nulls in an inner
  1227  // join with the specified column stats.
  1228  //
  1229  // The caller should ensure that if leftCols are empty then leftNullCount
  1230  // equals leftRowCount, and if rightCols are empty then rightNullCount equals
  1231  // rightRowCount.
  1232  func innerJoinNullCount(
  1233  	rowCount, leftNullCount, leftRowCount, rightNullCount, rightRowCount float64,
  1234  ) float64 {
  1235  	// Here, f1 and f2 are probabilities of nulls on either side of the join.
  1236  	var f1, f2 float64
  1237  	if leftRowCount != 0 {
  1238  		f1 = leftNullCount / leftRowCount
  1239  	}
  1240  	if rightRowCount != 0 {
  1241  		f2 = rightNullCount / rightRowCount
  1242  	}
  1243  
  1244  	// Rough estimate of nulls in the combined result set, assuming independence.
  1245  	return rowCount * f1 * f2
  1246  }
  1247  
  1248  // adjustNullCountsForOuterJoins modifies the null counts for the specified
  1249  // columns to adjust for additional nulls created in this type of outer join.
  1250  // It adds an expected number of nulls created by column extension on
  1251  // non-matching rows (such as on right cols for left joins and both for full).
  1252  //
  1253  // The caller should ensure that if leftCols are empty then leftNullCount
  1254  // equals leftRowCount, and if rightCols are empty then rightNullCount equals
  1255  // rightRowCount.
  1256  func (sb *statisticsBuilder) adjustNullCountsForOuterJoins(
  1257  	colStat *props.ColumnStatistic,
  1258  	joinType opt.Operator,
  1259  	leftCols, rightCols opt.ColSet,
  1260  	leftNullCount, leftRowCount, rightNullCount, rightRowCount, rowCount, innerJoinRowCount float64,
  1261  ) {
  1262  	// Adjust null counts for non-inner joins, adding nulls created due to column
  1263  	// extension - such as right columns for non-matching rows in left joins.
  1264  	switch joinType {
  1265  	case opt.LeftJoinOp, opt.LeftJoinApplyOp:
  1266  		if !rightCols.Empty() {
  1267  			colStat.NullCount += (rowCount - innerJoinRowCount) * leftNullCount / leftRowCount
  1268  		}
  1269  
  1270  	case opt.RightJoinOp:
  1271  		if !leftCols.Empty() {
  1272  			colStat.NullCount += (rowCount - innerJoinRowCount) * rightNullCount / rightRowCount
  1273  		}
  1274  
  1275  	case opt.FullJoinOp:
  1276  		leftJoinRowCount := max(innerJoinRowCount, leftRowCount)
  1277  		rightJoinRowCount := max(innerJoinRowCount, rightRowCount)
  1278  
  1279  		if !leftCols.Empty() {
  1280  			colStat.NullCount += (rightJoinRowCount - innerJoinRowCount) * rightNullCount / rightRowCount
  1281  		}
  1282  		if !rightCols.Empty() {
  1283  			colStat.NullCount += (leftJoinRowCount - innerJoinRowCount) * leftNullCount / leftRowCount
  1284  		}
  1285  	}
  1286  }
  1287  
  1288  // colStatfromJoinLeft returns a column statistic from the left input of a join.
  1289  func (sb *statisticsBuilder) colStatFromJoinLeft(
  1290  	cols opt.ColSet, join RelExpr,
  1291  ) *props.ColumnStatistic {
  1292  	if join.Op() == opt.ZigzagJoinOp {
  1293  		return sb.colStatTable(join.Private().(*ZigzagJoinPrivate).LeftTable, cols)
  1294  	}
  1295  	return sb.colStatFromChild(cols, join, 0 /* childIdx */)
  1296  }
  1297  
  1298  // colStatfromJoinRight returns a column statistic from the right input of a
  1299  // join (or the table for a lookup join).
  1300  func (sb *statisticsBuilder) colStatFromJoinRight(
  1301  	cols opt.ColSet, join RelExpr,
  1302  ) *props.ColumnStatistic {
  1303  	if join.Op() == opt.ZigzagJoinOp {
  1304  		return sb.colStatTable(join.Private().(*ZigzagJoinPrivate).RightTable, cols)
  1305  	} else if join.Op() == opt.LookupJoinOp {
  1306  		lookupPrivate := join.Private().(*LookupJoinPrivate)
  1307  		return sb.colStatTable(lookupPrivate.Table, cols)
  1308  	}
  1309  	return sb.colStatFromChild(cols, join, 1 /* childIdx */)
  1310  }
  1311  
  1312  // +------------+
  1313  // | Index Join |
  1314  // +------------+
  1315  
  1316  func (sb *statisticsBuilder) buildIndexJoin(indexJoin *IndexJoinExpr, relProps *props.Relational) {
  1317  	s := &relProps.Stats
  1318  	if zeroCardinality := s.Init(relProps); zeroCardinality {
  1319  		// Short cut if cardinality is 0.
  1320  		return
  1321  	}
  1322  	s.Available = sb.availabilityFromInput(indexJoin)
  1323  
  1324  	inputStats := &indexJoin.Input.Relational().Stats
  1325  
  1326  	s.RowCount = inputStats.RowCount
  1327  	sb.finalizeFromCardinality(relProps)
  1328  }
  1329  
  1330  func (sb *statisticsBuilder) colStatIndexJoin(
  1331  	colSet opt.ColSet, join *IndexJoinExpr,
  1332  ) *props.ColumnStatistic {
  1333  	relProps := join.Relational()
  1334  	s := &relProps.Stats
  1335  
  1336  	inputProps := join.Input.Relational()
  1337  	inputCols := inputProps.OutputCols
  1338  
  1339  	colStat, _ := s.ColStats.Add(colSet)
  1340  	colStat.DistinctCount = 1
  1341  	colStat.NullCount = s.RowCount
  1342  
  1343  	// Some of the requested columns may be from the input index.
  1344  	reqInputCols := colSet.Intersection(inputCols)
  1345  	if !reqInputCols.Empty() {
  1346  		inputColStat := sb.colStatFromChild(reqInputCols, join, 0 /* childIdx */)
  1347  		colStat.DistinctCount = inputColStat.DistinctCount
  1348  		colStat.NullCount = inputColStat.NullCount
  1349  	}
  1350  
  1351  	// Other requested columns may be from the primary index.
  1352  	reqLookupCols := colSet.Difference(inputCols).Intersection(join.Cols)
  1353  	if !reqLookupCols.Empty() {
  1354  		// Make a copy of the lookup column stats so we don't modify the originals.
  1355  		lookupColStat := *sb.colStatTable(join.Table, reqLookupCols)
  1356  
  1357  		// Calculate the distinct count of the lookup columns given the selectivity
  1358  		// of any filters on the input.
  1359  		inputStats := &inputProps.Stats
  1360  		tableStats := sb.makeTableStatistics(join.Table)
  1361  		selectivity := inputStats.RowCount / tableStats.RowCount
  1362  		lookupColStat.ApplySelectivity(selectivity, tableStats.RowCount)
  1363  
  1364  		// Multiply the distinct counts in case colStat.DistinctCount is
  1365  		// already populated with a statistic from the subset of columns
  1366  		// provided by the input index. Multiplying the counts gives a worst-case
  1367  		// estimate of the joint distinct count.
  1368  		colStat.DistinctCount *= lookupColStat.DistinctCount
  1369  
  1370  		// Assuming null columns are completely independent, calculate
  1371  		// the expected value of having nulls in both column sets.
  1372  		f1 := lookupColStat.NullCount / inputStats.RowCount
  1373  		f2 := colStat.NullCount / inputStats.RowCount
  1374  		colStat.NullCount = inputStats.RowCount * f1 * f2
  1375  	}
  1376  
  1377  	if colSet.Intersects(relProps.NotNullCols) {
  1378  		colStat.NullCount = 0
  1379  	}
  1380  	sb.finalizeFromRowCountAndDistinctCounts(colStat, s)
  1381  	return colStat
  1382  }
  1383  
  1384  // +-------------+
  1385  // | Zigzag Join |
  1386  // +-------------+
  1387  
  1388  // buildZigzagJoin builds the rowCount for a zigzag join. The colStat case
  1389  // for ZigzagJoins is shared with Joins, while the builder code is more similar
  1390  // to that for a Select. This is to ensure zigzag joins have select/scan-like
  1391  // RowCounts, while at an individual column stats level, distinct and null
  1392  // counts are handled like they would be for a join with two sides.
  1393  func (sb *statisticsBuilder) buildZigzagJoin(
  1394  	zigzag *ZigzagJoinExpr, relProps *props.Relational, h *joinPropsHelper,
  1395  ) {
  1396  	s := &relProps.Stats
  1397  	if zeroCardinality := s.Init(relProps); zeroCardinality {
  1398  		// Short cut if cardinality is 0.
  1399  		return
  1400  	}
  1401  	s.Available = sb.availabilityFromInput(zigzag)
  1402  
  1403  	leftStats := zigzag.leftProps.Stats
  1404  	equivReps := h.filtersFD.EquivReps()
  1405  
  1406  	// We assume that we only plan zigzag joins in cases where the result set
  1407  	// will have a row count smaller than or equal to the left/right index
  1408  	// row counts, and where the left and right sides are indexes on the same
  1409  	// table. Their row count should be the same, so use either row count.
  1410  	s.RowCount = leftStats.RowCount
  1411  
  1412  	// Calculate distinct counts for constrained columns
  1413  	// -------------------------------------------------
  1414  	// Note that fixed columns (i.e. columns constrained to constant values)
  1415  	// specified in zigzag.FixedVals and zigzag.{Left,Right}FixedCols
  1416  	// still have corresponding filters in zigzag.On. So we don't need
  1417  	// to iterate through FixedCols here if we are already processing the ON
  1418  	// clause.
  1419  	// TODO(rytaft): use histogram for zig zag join.
  1420  	numUnappliedConjuncts, constrainedCols, _ := sb.applyFilter(zigzag.On, zigzag, relProps)
  1421  
  1422  	// Application of constraints on inverted indexes needs to be handled a
  1423  	// little differently since a constraint on an inverted index key column
  1424  	// could match multiple distinct values in the actual column. This is
  1425  	// because inverted index keys could correspond to partial values, like JSON
  1426  	// paths.
  1427  	//
  1428  	// Since filters on inverted index columns cannot be pushed down into the ON
  1429  	// clause (due to them containing partial values), we need to explicitly
  1430  	// increment numUnappliedConjuncts here for every constrained inverted index
  1431  	// column. The multiplication by 2 is to mimic the logic in
  1432  	// numConjunctsInConstraint, which counts an equality as 2 conjuncts. This
  1433  	// comes from the intuition that a = 1 is probably more selective than a > 1
  1434  	// and if we do not mimic a similar selectivity calculation here, the zigzag
  1435  	// join ends up having a higher row count and therefore higher cost than
  1436  	// a competing index join + constrained scan.
  1437  	tab := sb.md.Table(zigzag.LeftTable)
  1438  	if tab.Index(zigzag.LeftIndex).IsInverted() {
  1439  		numUnappliedConjuncts += float64(len(zigzag.LeftFixedCols) * 2)
  1440  	}
  1441  	if tab.Index(zigzag.RightIndex).IsInverted() {
  1442  		numUnappliedConjuncts += float64(len(zigzag.RightFixedCols) * 2)
  1443  	}
  1444  
  1445  	// Try to reduce the number of columns used for selectivity
  1446  	// calculation based on functional dependencies. Note that
  1447  	// these functional dependencies already include equalities
  1448  	// inferred by zigzag.{Left,Right}EqCols.
  1449  	inputFD := &zigzag.Relational().FuncDeps
  1450  	constrainedCols = sb.tryReduceCols(constrainedCols, s, inputFD)
  1451  
  1452  	// Set null counts to 0 for non-nullable columns
  1453  	// ---------------------------------------------
  1454  	sb.updateNullCountsFromProps(zigzag, relProps)
  1455  
  1456  	// Calculate selectivity and row count
  1457  	// -----------------------------------
  1458  	s.ApplySelectivity(sb.selectivityFromMultiColDistinctCounts(constrainedCols, zigzag, s))
  1459  	s.ApplySelectivity(sb.selectivityFromEquivalencies(equivReps, &relProps.FuncDeps, zigzag, s))
  1460  	s.ApplySelectivity(sb.selectivityFromUnappliedConjuncts(numUnappliedConjuncts))
  1461  	s.ApplySelectivity(sb.selectivityFromNullsRemoved(zigzag, relProps, constrainedCols))
  1462  
  1463  	// Update distinct counts based on equivalencies; this should happen after
  1464  	// selectivityFromMultiColDistinctCounts and selectivityFromEquivalencies.
  1465  	sb.applyEquivalencies(equivReps, &relProps.FuncDeps, zigzag, relProps)
  1466  
  1467  	sb.finalizeFromCardinality(relProps)
  1468  }
  1469  
  1470  // +----------+
  1471  // | Group By |
  1472  // +----------+
  1473  
  1474  func (sb *statisticsBuilder) buildGroupBy(groupNode RelExpr, relProps *props.Relational) {
  1475  	s := &relProps.Stats
  1476  	if zeroCardinality := s.Init(relProps); zeroCardinality {
  1477  		// Short cut if cardinality is 0.
  1478  		return
  1479  	}
  1480  	s.Available = sb.availabilityFromInput(groupNode)
  1481  
  1482  	groupingPrivate := groupNode.Private().(*GroupingPrivate)
  1483  	groupingColSet := groupingPrivate.GroupingCols
  1484  
  1485  	if groupingColSet.Empty() {
  1486  		if groupNode.Op() == opt.ScalarGroupByOp {
  1487  			// ScalarGroupBy always returns exactly one row.
  1488  			s.RowCount = 1
  1489  		} else {
  1490  			// GroupBy with empty grouping columns returns 0 or 1 rows, depending
  1491  			// on whether input has rows. If input has < 1 row, use that, as that
  1492  			// represents the probability of having 0 vs. 1 rows.
  1493  			inputStats := sb.statsFromChild(groupNode, 0 /* childIdx */)
  1494  			s.RowCount = min(1, inputStats.RowCount)
  1495  		}
  1496  	} else {
  1497  		inputStats := sb.statsFromChild(groupNode, 0 /* childIdx */)
  1498  
  1499  		if groupingPrivate.ErrorOnDup != "" {
  1500  			// If any input group has more than one row, then the distinct operator
  1501  			// will raise an error, so in non-error cases it has the same number of
  1502  			// rows as its input.
  1503  			s.RowCount = inputStats.RowCount
  1504  		} else {
  1505  			// Estimate the row count based on the distinct count of the grouping
  1506  			// columns. Non-scalar GroupBy should never increase the number of rows.
  1507  			colStat := sb.copyColStatFromChild(groupingColSet, groupNode, s)
  1508  			s.RowCount = min(colStat.DistinctCount, inputStats.RowCount)
  1509  
  1510  			// Update the null counts for the column statistic.
  1511  			if groupNode.Op() != opt.UpsertDistinctOnOp {
  1512  				// UpsertDistinctOp inherits NullCount from child, since it does not
  1513  				// group NULL values. Other group by operators only have 1 possible
  1514  				// null value.
  1515  				colStat.NullCount = min(1, colStat.NullCount)
  1516  			}
  1517  			if groupingColSet.Intersects(relProps.NotNullCols) {
  1518  				colStat.NullCount = 0
  1519  			}
  1520  		}
  1521  	}
  1522  
  1523  	sb.finalizeFromCardinality(relProps)
  1524  }
  1525  
  1526  func (sb *statisticsBuilder) colStatGroupBy(
  1527  	colSet opt.ColSet, groupNode RelExpr,
  1528  ) *props.ColumnStatistic {
  1529  	relProps := groupNode.Relational()
  1530  	s := &relProps.Stats
  1531  
  1532  	groupingPrivate := groupNode.Private().(*GroupingPrivate)
  1533  	groupingColSet := groupingPrivate.GroupingCols
  1534  	if groupingColSet.Empty() {
  1535  		// ScalarGroupBy or GroupBy with empty grouping columns.
  1536  		colStat, _ := s.ColStats.Add(colSet)
  1537  		colStat.DistinctCount = 1
  1538  		// TODO(itsbilal): Handle case where the scalar resolves to NULL.
  1539  		colStat.NullCount = 0
  1540  		return colStat
  1541  	}
  1542  
  1543  	var colStat *props.ColumnStatistic
  1544  	var inputColStat *props.ColumnStatistic
  1545  	if !colSet.SubsetOf(groupingColSet) {
  1546  		// Some of the requested columns are aggregates. Estimate the distinct
  1547  		// count to be the same as the grouping columns.
  1548  		colStat, _ = s.ColStats.Add(colSet)
  1549  		inputColStat = sb.colStatFromChild(groupingColSet, groupNode, 0 /* childIdx */)
  1550  		colStat.DistinctCount = inputColStat.DistinctCount
  1551  	} else {
  1552  		// Make a copy so we don't modify the original
  1553  		colStat = sb.copyColStatFromChild(colSet, groupNode, s)
  1554  		inputColStat = sb.colStatFromChild(colSet, groupNode, 0 /* childIdx */)
  1555  
  1556  		if groupingPrivate.ErrorOnDup != "" && colSet.Equals(groupingColSet) {
  1557  			// If any input group has more than one row, then the distinct operator
  1558  			// will raise an error, so in non-error cases its distinct count is the
  1559  			// same as its row count.
  1560  			colStat.DistinctCount = s.RowCount
  1561  		}
  1562  	}
  1563  
  1564  	if groupNode.Op() == opt.UpsertDistinctOnOp || groupNode.Op() == opt.EnsureUpsertDistinctOnOp {
  1565  		// UpsertDistinctOnOp and EnsureUpsertDistinctOnOp inherit NullCount from
  1566  		// child, since they do not group NULL values.
  1567  		colStat.NullCount = inputColStat.NullCount
  1568  	} else {
  1569  		// For other group by operators, we only have 1 possible null value.
  1570  		colStat.NullCount = min(1, inputColStat.NullCount)
  1571  	}
  1572  
  1573  	if colSet.Intersects(relProps.NotNullCols) {
  1574  		colStat.NullCount = 0
  1575  	}
  1576  	sb.finalizeFromRowCountAndDistinctCounts(colStat, s)
  1577  	return colStat
  1578  }
  1579  
  1580  // +--------+
  1581  // | Set Op |
  1582  // +--------+
  1583  
  1584  func (sb *statisticsBuilder) buildSetNode(setNode RelExpr, relProps *props.Relational) {
  1585  	s := &relProps.Stats
  1586  	if zeroCardinality := s.Init(relProps); zeroCardinality {
  1587  		// Short cut if cardinality is 0.
  1588  		return
  1589  	}
  1590  	s.Available = sb.availabilityFromInput(setNode)
  1591  
  1592  	leftStats := sb.statsFromChild(setNode, 0 /* childIdx */)
  1593  	rightStats := sb.statsFromChild(setNode, 1 /* childIdx */)
  1594  
  1595  	// These calculations are an upper bound on the row count. It's likely that
  1596  	// there is some overlap between the two sets, but not full overlap.
  1597  	switch setNode.Op() {
  1598  	case opt.UnionOp, opt.UnionAllOp:
  1599  		s.RowCount = leftStats.RowCount + rightStats.RowCount
  1600  
  1601  	case opt.IntersectOp, opt.IntersectAllOp:
  1602  		s.RowCount = min(leftStats.RowCount, rightStats.RowCount)
  1603  
  1604  	case opt.ExceptOp, opt.ExceptAllOp:
  1605  		s.RowCount = leftStats.RowCount
  1606  	}
  1607  
  1608  	switch setNode.Op() {
  1609  	case opt.UnionOp, opt.IntersectOp, opt.ExceptOp:
  1610  		// Since UNION, INTERSECT and EXCEPT eliminate duplicate rows, the row
  1611  		// count will equal the distinct count of the set of output columns.
  1612  		setPrivate := setNode.Private().(*SetPrivate)
  1613  		outputCols := setPrivate.OutCols.ToSet()
  1614  		colStat := sb.colStatSetNodeImpl(outputCols, setNode, relProps)
  1615  		s.RowCount = colStat.DistinctCount
  1616  	}
  1617  
  1618  	sb.finalizeFromCardinality(relProps)
  1619  }
  1620  
  1621  func (sb *statisticsBuilder) colStatSetNode(
  1622  	colSet opt.ColSet, setNode RelExpr,
  1623  ) *props.ColumnStatistic {
  1624  	return sb.colStatSetNodeImpl(colSet, setNode, setNode.Relational())
  1625  }
  1626  
  1627  func (sb *statisticsBuilder) colStatSetNodeImpl(
  1628  	outputCols opt.ColSet, setNode RelExpr, relProps *props.Relational,
  1629  ) *props.ColumnStatistic {
  1630  	s := &relProps.Stats
  1631  	setPrivate := setNode.Private().(*SetPrivate)
  1632  
  1633  	leftCols := opt.TranslateColSet(outputCols, setPrivate.OutCols, setPrivate.LeftCols)
  1634  	rightCols := opt.TranslateColSet(outputCols, setPrivate.OutCols, setPrivate.RightCols)
  1635  	leftColStat := sb.colStatFromChild(leftCols, setNode, 0 /* childIdx */)
  1636  	rightColStat := sb.colStatFromChild(rightCols, setNode, 1 /* childIdx */)
  1637  
  1638  	colStat, _ := s.ColStats.Add(outputCols)
  1639  
  1640  	leftNullCount := leftColStat.NullCount
  1641  	rightNullCount := rightColStat.NullCount
  1642  
  1643  	// These calculations are an upper bound on the distinct count. It's likely
  1644  	// that there is some overlap between the two sets, but not full overlap.
  1645  	switch setNode.Op() {
  1646  	case opt.UnionOp, opt.UnionAllOp:
  1647  		colStat.DistinctCount = leftColStat.DistinctCount + rightColStat.DistinctCount
  1648  		colStat.NullCount = leftNullCount + rightNullCount
  1649  
  1650  	case opt.IntersectOp, opt.IntersectAllOp:
  1651  		colStat.DistinctCount = min(leftColStat.DistinctCount, rightColStat.DistinctCount)
  1652  		colStat.NullCount = min(leftNullCount, rightNullCount)
  1653  
  1654  	case opt.ExceptOp, opt.ExceptAllOp:
  1655  		colStat.DistinctCount = leftColStat.DistinctCount
  1656  		colStat.NullCount = max(leftNullCount-rightNullCount, 0)
  1657  	}
  1658  
  1659  	// Use the actual null counts for bag operations, and normalize them for set
  1660  	// operations.
  1661  	switch setNode.Op() {
  1662  	case opt.UnionOp, opt.IntersectOp:
  1663  		colStat.NullCount = min(1, colStat.NullCount)
  1664  	case opt.ExceptOp:
  1665  		colStat.NullCount = min(1, colStat.NullCount)
  1666  		if rightNullCount > 0 {
  1667  			colStat.NullCount = 0
  1668  		}
  1669  	}
  1670  
  1671  	if outputCols.Intersects(relProps.NotNullCols) {
  1672  		colStat.NullCount = 0
  1673  	}
  1674  	sb.finalizeFromRowCountAndDistinctCounts(colStat, s)
  1675  	return colStat
  1676  }
  1677  
  1678  // +--------+
  1679  // | Values |
  1680  // +--------+
  1681  
  1682  // buildValues builds the statistics for a VALUES expression.
  1683  func (sb *statisticsBuilder) buildValues(values *ValuesExpr, relProps *props.Relational) {
  1684  	s := &relProps.Stats
  1685  	if zeroCardinality := s.Init(relProps); zeroCardinality {
  1686  		// Short cut if cardinality is 0.
  1687  		return
  1688  	}
  1689  	s.Available = sb.availabilityFromInput(values)
  1690  
  1691  	s.RowCount = float64(len(values.Rows))
  1692  	sb.finalizeFromCardinality(relProps)
  1693  }
  1694  
  1695  func (sb *statisticsBuilder) colStatValues(
  1696  	colSet opt.ColSet, values *ValuesExpr,
  1697  ) *props.ColumnStatistic {
  1698  	s := &values.Relational().Stats
  1699  	if len(values.Rows) == 0 {
  1700  		colStat, _ := s.ColStats.Add(colSet)
  1701  		return colStat
  1702  	}
  1703  
  1704  	// Determine distinct count from the number of distinct memo groups. Use a
  1705  	// map to find the exact count of distinct values for the columns in colSet.
  1706  	// Use a hash to combine column values (this does not have to be exact).
  1707  	distinct := make(map[uint64]struct{}, values.Rows[0].ChildCount())
  1708  	// Determine null count by looking at tuples that have only NullOps in them.
  1709  	nullCount := 0
  1710  	for _, row := range values.Rows {
  1711  		// Use the FNV-1a algorithm. See comments for the interner class.
  1712  		hash := uint64(offset64)
  1713  		hasNonNull := false
  1714  		for i, elem := range row.(*TupleExpr).Elems {
  1715  			if colSet.Contains(values.Cols[i]) {
  1716  				if elem.Op() != opt.NullOp {
  1717  					hasNonNull = true
  1718  				}
  1719  				// Use the pointer value of the scalar expression, since it's already
  1720  				// been interned. Therefore, two expressions with the same pointer
  1721  				// have the same value.
  1722  				ptr := reflect.ValueOf(elem).Pointer()
  1723  				hash ^= uint64(ptr)
  1724  				hash *= prime64
  1725  			}
  1726  		}
  1727  		if !hasNonNull {
  1728  			nullCount++
  1729  		}
  1730  		distinct[hash] = struct{}{}
  1731  	}
  1732  
  1733  	// Update the column statistics.
  1734  	colStat, _ := s.ColStats.Add(colSet)
  1735  	colStat.DistinctCount = float64(len(distinct))
  1736  	colStat.NullCount = float64(nullCount)
  1737  	sb.finalizeFromRowCountAndDistinctCounts(colStat, s)
  1738  	return colStat
  1739  }
  1740  
  1741  // +-------+
  1742  // | Limit |
  1743  // +-------+
  1744  
  1745  func (sb *statisticsBuilder) buildLimit(limit *LimitExpr, relProps *props.Relational) {
  1746  	s := &relProps.Stats
  1747  	if zeroCardinality := s.Init(relProps); zeroCardinality {
  1748  		// Short cut if cardinality is 0.
  1749  		return
  1750  	}
  1751  	s.Available = sb.availabilityFromInput(limit)
  1752  
  1753  	inputStats := &limit.Input.Relational().Stats
  1754  
  1755  	// Copy row count from input.
  1756  	s.RowCount = inputStats.RowCount
  1757  
  1758  	// Update row count if limit is a constant and row count is non-zero.
  1759  	if cnst, ok := limit.Limit.(*ConstExpr); ok && inputStats.RowCount > 0 {
  1760  		hardLimit := *cnst.Value.(*tree.DInt)
  1761  		if hardLimit > 0 {
  1762  			s.RowCount = min(float64(hardLimit), inputStats.RowCount)
  1763  			s.Selectivity = s.RowCount / inputStats.RowCount
  1764  		}
  1765  	}
  1766  
  1767  	sb.finalizeFromCardinality(relProps)
  1768  }
  1769  
  1770  func (sb *statisticsBuilder) colStatLimit(
  1771  	colSet opt.ColSet, limit *LimitExpr,
  1772  ) *props.ColumnStatistic {
  1773  	relProps := limit.Relational()
  1774  	s := &relProps.Stats
  1775  	inputStats := &limit.Input.Relational().Stats
  1776  	colStat := sb.copyColStatFromChild(colSet, limit, s)
  1777  
  1778  	// Scale distinct count based on the selectivity of the limit operation.
  1779  	colStat.ApplySelectivity(s.Selectivity, inputStats.RowCount)
  1780  	if colSet.Intersects(relProps.NotNullCols) {
  1781  		colStat.NullCount = 0
  1782  	}
  1783  	sb.finalizeFromRowCountAndDistinctCounts(colStat, s)
  1784  	return colStat
  1785  }
  1786  
  1787  // +--------+
  1788  // | Offset |
  1789  // +--------+
  1790  
  1791  func (sb *statisticsBuilder) buildOffset(offset *OffsetExpr, relProps *props.Relational) {
  1792  	s := &relProps.Stats
  1793  	if zeroCardinality := s.Init(relProps); zeroCardinality {
  1794  		// Short cut if cardinality is 0.
  1795  		return
  1796  	}
  1797  	s.Available = sb.availabilityFromInput(offset)
  1798  
  1799  	inputStats := &offset.Input.Relational().Stats
  1800  
  1801  	// Copy row count from input.
  1802  	s.RowCount = inputStats.RowCount
  1803  
  1804  	// Update row count if offset is a constant and row count is non-zero.
  1805  	if cnst, ok := offset.Offset.(*ConstExpr); ok && inputStats.RowCount > 0 {
  1806  		hardOffset := *cnst.Value.(*tree.DInt)
  1807  		if float64(hardOffset) >= inputStats.RowCount {
  1808  			// The correct estimate for row count here is 0, but we don't ever want
  1809  			// row count to be zero unless the cardinality is zero. This is because
  1810  			// the stats may be stale, and we can end up with weird and inefficient
  1811  			// plans if we estimate 0 rows. Use a small number instead.
  1812  			s.RowCount = min(1, inputStats.RowCount)
  1813  		} else if hardOffset > 0 {
  1814  			s.RowCount = inputStats.RowCount - float64(hardOffset)
  1815  		}
  1816  		s.Selectivity = s.RowCount / inputStats.RowCount
  1817  	}
  1818  
  1819  	sb.finalizeFromCardinality(relProps)
  1820  }
  1821  
  1822  func (sb *statisticsBuilder) colStatOffset(
  1823  	colSet opt.ColSet, offset *OffsetExpr,
  1824  ) *props.ColumnStatistic {
  1825  	relProps := offset.Relational()
  1826  	s := &relProps.Stats
  1827  	inputStats := &offset.Input.Relational().Stats
  1828  	colStat := sb.copyColStatFromChild(colSet, offset, s)
  1829  
  1830  	// Scale distinct count based on the selectivity of the offset operation.
  1831  	colStat.ApplySelectivity(s.Selectivity, inputStats.RowCount)
  1832  	if colSet.Intersects(relProps.NotNullCols) {
  1833  		colStat.NullCount = 0
  1834  	}
  1835  	sb.finalizeFromRowCountAndDistinctCounts(colStat, s)
  1836  	return colStat
  1837  }
  1838  
  1839  // +---------+
  1840  // | Max1Row |
  1841  // +---------+
  1842  
  1843  func (sb *statisticsBuilder) buildMax1Row(max1Row *Max1RowExpr, relProps *props.Relational) {
  1844  	s := &relProps.Stats
  1845  	if zeroCardinality := s.Init(relProps); zeroCardinality {
  1846  		// Short cut if cardinality is 0.
  1847  		return
  1848  	}
  1849  	s.Available = true
  1850  
  1851  	s.RowCount = 1
  1852  	sb.finalizeFromCardinality(relProps)
  1853  }
  1854  
  1855  func (sb *statisticsBuilder) colStatMax1Row(
  1856  	colSet opt.ColSet, max1Row *Max1RowExpr,
  1857  ) *props.ColumnStatistic {
  1858  	s := &max1Row.Relational().Stats
  1859  	colStat, _ := s.ColStats.Add(colSet)
  1860  	colStat.DistinctCount = 1
  1861  	colStat.NullCount = s.RowCount * unknownNullCountRatio
  1862  	if colSet.Intersects(max1Row.Relational().NotNullCols) {
  1863  		colStat.NullCount = 0
  1864  	}
  1865  	sb.finalizeFromRowCountAndDistinctCounts(colStat, s)
  1866  	return colStat
  1867  }
  1868  
  1869  // +------------+
  1870  // | Row Number |
  1871  // +------------+
  1872  
  1873  func (sb *statisticsBuilder) buildOrdinality(ord *OrdinalityExpr, relProps *props.Relational) {
  1874  	s := &relProps.Stats
  1875  	if zeroCardinality := s.Init(relProps); zeroCardinality {
  1876  		// Short cut if cardinality is 0.
  1877  		return
  1878  	}
  1879  	s.Available = sb.availabilityFromInput(ord)
  1880  
  1881  	inputStats := &ord.Input.Relational().Stats
  1882  
  1883  	s.RowCount = inputStats.RowCount
  1884  	sb.finalizeFromCardinality(relProps)
  1885  }
  1886  
  1887  func (sb *statisticsBuilder) colStatOrdinality(
  1888  	colSet opt.ColSet, ord *OrdinalityExpr,
  1889  ) *props.ColumnStatistic {
  1890  	relProps := ord.Relational()
  1891  	s := &relProps.Stats
  1892  
  1893  	colStat, _ := s.ColStats.Add(colSet)
  1894  
  1895  	if colSet.Contains(ord.ColID) {
  1896  		// The ordinality column is a key, so every row is distinct.
  1897  		colStat.DistinctCount = s.RowCount
  1898  		colStat.NullCount = 0
  1899  	} else {
  1900  		inputColStat := sb.colStatFromChild(colSet, ord, 0 /* childIdx */)
  1901  		colStat.DistinctCount = inputColStat.DistinctCount
  1902  		colStat.NullCount = inputColStat.NullCount
  1903  	}
  1904  
  1905  	if colSet.Intersects(relProps.NotNullCols) {
  1906  		colStat.NullCount = 0
  1907  	}
  1908  	sb.finalizeFromRowCountAndDistinctCounts(colStat, s)
  1909  	return colStat
  1910  }
  1911  
  1912  // +------------+
  1913  // |   Window   |
  1914  // +------------+
  1915  
  1916  func (sb *statisticsBuilder) buildWindow(window *WindowExpr, relProps *props.Relational) {
  1917  	s := &relProps.Stats
  1918  	if zeroCardinality := s.Init(relProps); zeroCardinality {
  1919  		// Short cut if cardinality is 0.
  1920  		return
  1921  	}
  1922  	s.Available = sb.availabilityFromInput(window)
  1923  
  1924  	inputStats := &window.Input.Relational().Stats
  1925  
  1926  	// The row count of a window is equal to the row count of its input.
  1927  	s.RowCount = inputStats.RowCount
  1928  
  1929  	sb.finalizeFromCardinality(relProps)
  1930  }
  1931  
  1932  func (sb *statisticsBuilder) colStatWindow(
  1933  	colSet opt.ColSet, window *WindowExpr,
  1934  ) *props.ColumnStatistic {
  1935  	relProps := window.Relational()
  1936  	s := &relProps.Stats
  1937  
  1938  	colStat, _ := s.ColStats.Add(colSet)
  1939  
  1940  	var windowCols opt.ColSet
  1941  	for _, w := range window.Windows {
  1942  		windowCols.Add(w.Col)
  1943  	}
  1944  
  1945  	if colSet.Intersects(windowCols) {
  1946  		// These can be quite complicated and differ dramatically based on which
  1947  		// window function is being computed. For now, just assume row_number and
  1948  		// that every row is distinct.
  1949  		// TODO(justin): make these accurate and take into consideration the window
  1950  		// function being computed.
  1951  		colStat.DistinctCount = s.RowCount
  1952  
  1953  		// Just assume that no NULLs are output.
  1954  		// TODO(justin): there are window fns for which this is not true, make
  1955  		// sure those are handled.
  1956  		if colSet.SubsetOf(windowCols) {
  1957  			// The generated columns are the only columns being requested.
  1958  			colStat.NullCount = 0
  1959  		} else {
  1960  			// Copy NullCount from child.
  1961  			colSetChild := colSet.Difference(windowCols)
  1962  			inputColStat := sb.colStatFromChild(colSetChild, window, 0 /* childIdx */)
  1963  			colStat.NullCount = inputColStat.NullCount
  1964  		}
  1965  	} else {
  1966  		inputColStat := sb.colStatFromChild(colSet, window, 0 /* childIdx */)
  1967  		colStat.DistinctCount = inputColStat.DistinctCount
  1968  		colStat.NullCount = inputColStat.NullCount
  1969  	}
  1970  
  1971  	if colSet.Intersects(relProps.NotNullCols) {
  1972  		colStat.NullCount = 0
  1973  	}
  1974  	sb.finalizeFromRowCountAndDistinctCounts(colStat, s)
  1975  	return colStat
  1976  }
  1977  
  1978  // +-------------+
  1979  // | Project Set |
  1980  // +-------------+
  1981  
  1982  func (sb *statisticsBuilder) buildProjectSet(
  1983  	projectSet *ProjectSetExpr, relProps *props.Relational,
  1984  ) {
  1985  	s := &relProps.Stats
  1986  	if zeroCardinality := s.Init(relProps); zeroCardinality {
  1987  		// Short cut if cardinality is 0.
  1988  		return
  1989  	}
  1990  	s.Available = sb.availabilityFromInput(projectSet)
  1991  
  1992  	// The row count of a zip operation is equal to the maximum row count of its
  1993  	// children.
  1994  	var zipRowCount float64
  1995  	for i := range projectSet.Zip {
  1996  		if fn, ok := projectSet.Zip[i].Fn.(*FunctionExpr); ok {
  1997  			if fn.Overload.Generator != nil {
  1998  				// TODO(rytaft): We may want to estimate the number of rows based on
  1999  				// the type of generator function and its parameters.
  2000  				zipRowCount = unknownGeneratorRowCount
  2001  				break
  2002  			}
  2003  		}
  2004  
  2005  		// A scalar function generates one row.
  2006  		zipRowCount = 1
  2007  	}
  2008  
  2009  	// Multiply by the input row count to get the total.
  2010  	inputStats := &projectSet.Input.Relational().Stats
  2011  	s.RowCount = zipRowCount * inputStats.RowCount
  2012  
  2013  	sb.finalizeFromCardinality(relProps)
  2014  }
  2015  
  2016  func (sb *statisticsBuilder) colStatProjectSet(
  2017  	colSet opt.ColSet, projectSet *ProjectSetExpr,
  2018  ) *props.ColumnStatistic {
  2019  	s := &projectSet.Relational().Stats
  2020  	if s.RowCount == 0 {
  2021  		// Short cut if cardinality is 0.
  2022  		colStat, _ := s.ColStats.Add(colSet)
  2023  		return colStat
  2024  	}
  2025  
  2026  	inputProps := projectSet.Input.Relational()
  2027  	inputStats := inputProps.Stats
  2028  	inputCols := inputProps.OutputCols
  2029  
  2030  	colStat, _ := s.ColStats.Add(colSet)
  2031  	colStat.DistinctCount = 1
  2032  	colStat.NullCount = s.RowCount
  2033  
  2034  	// Some of the requested columns may be from the input.
  2035  	reqInputCols := colSet.Intersection(inputCols)
  2036  	if !reqInputCols.Empty() {
  2037  		inputColStat := sb.colStatFromChild(reqInputCols, projectSet, 0 /* childIdx */)
  2038  		colStat.DistinctCount = inputColStat.DistinctCount
  2039  		colStat.NullCount = inputColStat.NullCount * (s.RowCount / inputStats.RowCount)
  2040  	}
  2041  
  2042  	// Other requested columns may be from the output columns of the zip.
  2043  	zipCols := projectSet.Zip.OutputCols()
  2044  	reqZipCols := colSet.Difference(inputCols).Intersection(zipCols)
  2045  	if !reqZipCols.Empty() {
  2046  		// Calculate the the distinct count and null count for the zip columns
  2047  		// after the cross join has been applied.
  2048  		zipColsDistinctCount := float64(1)
  2049  		zipColsNullCount := s.RowCount
  2050  		for i := range projectSet.Zip {
  2051  			item := &projectSet.Zip[i]
  2052  			if item.Cols.ToSet().Intersects(reqZipCols) {
  2053  				if fn, ok := item.Fn.(*FunctionExpr); ok && fn.Overload.Generator != nil {
  2054  					// The columns(s) contain a generator function.
  2055  					// TODO(rytaft): We may want to determine which generator function the
  2056  					// requested columns correspond to, and estimate the distinct count and
  2057  					// null count based on the type of generator function and its parameters.
  2058  					zipColsDistinctCount *= unknownGeneratorRowCount * unknownGeneratorDistinctCountRatio
  2059  					zipColsNullCount *= unknownNullCountRatio
  2060  				} else {
  2061  					// The columns(s) contain a scalar function or expression.
  2062  					// These columns can contain many null values if the zip also
  2063  					// contains a generator function. For example:
  2064  					//
  2065  					//    SELECT * FROM ROWS FROM (generate_series(0, 3), upper('abc'));
  2066  					//
  2067  					// Produces:
  2068  					//
  2069  					//     generate_series | upper
  2070  					//    -----------------+-------
  2071  					//                   0 | ABC
  2072  					//                   1 | NULL
  2073  					//                   2 | NULL
  2074  					//                   3 | NULL
  2075  					//
  2076  					// After the cross product with the input, the total number of nulls
  2077  					// for the column(s) equals (output row count - input row count).
  2078  					// (Also multiply by the expected chance of collisions with nulls
  2079  					// already collected.)
  2080  					zipColsNullCount *= (s.RowCount - inputStats.RowCount) / s.RowCount
  2081  				}
  2082  
  2083  				if item.ScalarProps().OuterCols.Intersects(inputProps.OutputCols) {
  2084  					// The column(s) are correlated with the input, so they may have a
  2085  					// distinct value for each distinct row of the input.
  2086  					zipColsDistinctCount *= inputStats.RowCount * unknownDistinctCountRatio
  2087  				}
  2088  			}
  2089  		}
  2090  
  2091  		// Multiply the distinct counts in case colStat.DistinctCount is
  2092  		// already populated with a statistic from the subset of columns
  2093  		// provided by the input. Multiplying the counts gives a worst-case
  2094  		// estimate of the joint distinct count.
  2095  		colStat.DistinctCount *= zipColsDistinctCount
  2096  
  2097  		// Assuming null columns are completely independent, calculate
  2098  		// the expected value of having nulls in both column sets.
  2099  		f1 := zipColsNullCount / s.RowCount
  2100  		f2 := colStat.NullCount / s.RowCount
  2101  		colStat.NullCount = s.RowCount * f1 * f2
  2102  	}
  2103  
  2104  	if colSet.Intersects(projectSet.Relational().NotNullCols) {
  2105  		colStat.NullCount = 0
  2106  	}
  2107  	sb.finalizeFromRowCountAndDistinctCounts(colStat, s)
  2108  	return colStat
  2109  }
  2110  
  2111  // +----------+
  2112  // | WithScan |
  2113  // +----------+
  2114  
  2115  func (sb *statisticsBuilder) buildWithScan(withScan *WithScanExpr, relProps *props.Relational) {
  2116  	s := &relProps.Stats
  2117  	if zeroCardinality := s.Init(relProps); zeroCardinality {
  2118  		// Short cut if cardinality is 0.
  2119  		return
  2120  	}
  2121  	s.Available = sb.availabilityFromInput(withScan)
  2122  
  2123  	inputStats := withScan.BindingProps.Stats
  2124  
  2125  	s.RowCount = inputStats.RowCount
  2126  	sb.finalizeFromCardinality(relProps)
  2127  }
  2128  
  2129  func (sb *statisticsBuilder) colStatWithScan(
  2130  	colSet opt.ColSet, withScan *WithScanExpr,
  2131  ) *props.ColumnStatistic {
  2132  	s := &withScan.Relational().Stats
  2133  	withProps := withScan.BindingProps
  2134  	inColSet := opt.TranslateColSet(colSet, withScan.OutCols, withScan.InCols)
  2135  
  2136  	// We cannot call colStatLeaf on &withProps.Stats directly because it can
  2137  	// modify it.
  2138  	var statsCopy props.Statistics
  2139  	statsCopy.CopyFrom(&withProps.Stats)
  2140  
  2141  	// TODO(rytaft): This would be more accurate if we could access the WithExpr
  2142  	// itself.
  2143  	inColStat := sb.colStatLeaf(inColSet, &statsCopy, &withProps.FuncDeps, withProps.NotNullCols)
  2144  
  2145  	colStat, _ := s.ColStats.Add(colSet)
  2146  	colStat.DistinctCount = inColStat.DistinctCount
  2147  	colStat.NullCount = inColStat.NullCount
  2148  	sb.finalizeFromRowCountAndDistinctCounts(colStat, s)
  2149  	return colStat
  2150  }
  2151  
  2152  // +--------------------------------+
  2153  // | Insert, Update, Upsert, Delete |
  2154  // +--------------------------------+
  2155  
  2156  func (sb *statisticsBuilder) buildMutation(mutation RelExpr, relProps *props.Relational) {
  2157  	s := &relProps.Stats
  2158  	if zeroCardinality := s.Init(relProps); zeroCardinality {
  2159  		// Short cut if cardinality is 0.
  2160  		return
  2161  	}
  2162  	s.Available = sb.availabilityFromInput(mutation)
  2163  
  2164  	inputStats := sb.statsFromChild(mutation, 0 /* childIdx */)
  2165  
  2166  	s.RowCount = inputStats.RowCount
  2167  	sb.finalizeFromCardinality(relProps)
  2168  }
  2169  
  2170  func (sb *statisticsBuilder) colStatMutation(
  2171  	colSet opt.ColSet, mutation RelExpr,
  2172  ) *props.ColumnStatistic {
  2173  	s := &mutation.Relational().Stats
  2174  	private := mutation.Private().(*MutationPrivate)
  2175  
  2176  	// Get colstat from child by mapping requested columns to corresponding
  2177  	// input columns.
  2178  	inColSet := private.MapToInputCols(colSet)
  2179  	inColStat := sb.colStatFromChild(inColSet, mutation, 0 /* childIdx */)
  2180  
  2181  	// Construct mutation colstat using the corresponding input stats.
  2182  	colStat, _ := s.ColStats.Add(colSet)
  2183  	colStat.DistinctCount = inColStat.DistinctCount
  2184  	colStat.NullCount = inColStat.NullCount
  2185  	sb.finalizeFromRowCountAndDistinctCounts(colStat, s)
  2186  	return colStat
  2187  }
  2188  
  2189  // +-----------------+
  2190  // | Sequence Select |
  2191  // +-----------------+
  2192  
  2193  func (sb *statisticsBuilder) buildSequenceSelect(relProps *props.Relational) {
  2194  	s := &relProps.Stats
  2195  	s.Available = true
  2196  	s.RowCount = 1
  2197  	sb.finalizeFromCardinality(relProps)
  2198  }
  2199  
  2200  func (sb *statisticsBuilder) colStatSequenceSelect(
  2201  	colSet opt.ColSet, seq *SequenceSelectExpr,
  2202  ) *props.ColumnStatistic {
  2203  	s := &seq.Relational().Stats
  2204  
  2205  	colStat, _ := s.ColStats.Add(colSet)
  2206  	colStat.DistinctCount = 1
  2207  	colStat.NullCount = 0
  2208  	sb.finalizeFromRowCountAndDistinctCounts(colStat, s)
  2209  	return colStat
  2210  }
  2211  
  2212  // +---------+
  2213  // | Unknown |
  2214  // +---------+
  2215  
  2216  func (sb *statisticsBuilder) buildUnknown(relProps *props.Relational) {
  2217  	s := &relProps.Stats
  2218  	s.Available = false
  2219  	s.RowCount = unknownGeneratorRowCount
  2220  	sb.finalizeFromCardinality(relProps)
  2221  }
  2222  
  2223  func (sb *statisticsBuilder) colStatUnknown(
  2224  	colSet opt.ColSet, relProps *props.Relational,
  2225  ) *props.ColumnStatistic {
  2226  	s := &relProps.Stats
  2227  
  2228  	colStat, _ := s.ColStats.Add(colSet)
  2229  	colStat.DistinctCount = s.RowCount
  2230  	colStat.NullCount = 0
  2231  	sb.finalizeFromRowCountAndDistinctCounts(colStat, s)
  2232  	return colStat
  2233  }
  2234  
  2235  /////////////////////////////////////////////////
  2236  // General helper functions for building stats //
  2237  /////////////////////////////////////////////////
  2238  
  2239  // copyColStatFromChild copies the column statistic for the given colSet from
  2240  // the first child of ev into ev. colStatFromChild may trigger recursive
  2241  // calls if the requested statistic is not already cached in the child.
  2242  func (sb *statisticsBuilder) copyColStatFromChild(
  2243  	colSet opt.ColSet, e RelExpr, s *props.Statistics,
  2244  ) *props.ColumnStatistic {
  2245  	childColStat := sb.colStatFromChild(colSet, e, 0 /* childIdx */)
  2246  	return sb.copyColStat(colSet, s, childColStat)
  2247  }
  2248  
  2249  // ensureColStat creates a column statistic for column "col" if it doesn't
  2250  // already exist in s.ColStats, copying the statistic from a child.
  2251  // Then, ensureColStat sets the distinct count to the minimum of the existing
  2252  // value and the new value.
  2253  func (sb *statisticsBuilder) ensureColStat(
  2254  	colSet opt.ColSet, maxDistinctCount float64, e RelExpr, relProps *props.Relational,
  2255  ) *props.ColumnStatistic {
  2256  	s := &relProps.Stats
  2257  
  2258  	colStat, ok := s.ColStats.Lookup(colSet)
  2259  	if !ok {
  2260  		colStat, _ = sb.colStatFromInput(colSet, e)
  2261  		colStat = sb.copyColStat(colSet, s, colStat)
  2262  	}
  2263  
  2264  	colStat.DistinctCount = min(colStat.DistinctCount, maxDistinctCount)
  2265  	return colStat
  2266  }
  2267  
  2268  // copyColStat creates a column statistic and copies the data from an existing
  2269  // column statistic. Does not copy the histogram.
  2270  func (sb *statisticsBuilder) copyColStat(
  2271  	colSet opt.ColSet, s *props.Statistics, inputColStat *props.ColumnStatistic,
  2272  ) *props.ColumnStatistic {
  2273  	if !inputColStat.Cols.SubsetOf(colSet) {
  2274  		panic(errors.AssertionFailedf(
  2275  			"copyColStat colSet: %v inputColSet: %v\n", log.Safe(colSet), log.Safe(inputColStat.Cols),
  2276  		))
  2277  	}
  2278  	colStat, _ := s.ColStats.Add(colSet)
  2279  	colStat.DistinctCount = inputColStat.DistinctCount
  2280  	colStat.NullCount = inputColStat.NullCount
  2281  	return colStat
  2282  }
  2283  
  2284  func (sb *statisticsBuilder) finalizeFromCardinality(relProps *props.Relational) {
  2285  	s := &relProps.Stats
  2286  
  2287  	// We don't ever want row count to be zero unless the cardinality is zero.
  2288  	// This is because the stats may be stale, and we can end up with weird and
  2289  	// inefficient plans if we estimate 0 rows.
  2290  	if s.RowCount <= 0 && relProps.Cardinality.Max > 0 {
  2291  		panic(errors.AssertionFailedf("estimated row count must be non-zero"))
  2292  	}
  2293  
  2294  	// The row count should be between the min and max cardinality.
  2295  	if s.RowCount > float64(relProps.Cardinality.Max) && relProps.Cardinality.Max != math.MaxUint32 {
  2296  		s.RowCount = float64(relProps.Cardinality.Max)
  2297  	} else if s.RowCount < float64(relProps.Cardinality.Min) {
  2298  		s.RowCount = float64(relProps.Cardinality.Min)
  2299  	}
  2300  
  2301  	for i, n := 0, s.ColStats.Count(); i < n; i++ {
  2302  		colStat := s.ColStats.Get(i)
  2303  		sb.finalizeFromRowCountAndDistinctCounts(colStat, s)
  2304  	}
  2305  }
  2306  
  2307  func (sb *statisticsBuilder) finalizeFromRowCountAndDistinctCounts(
  2308  	colStat *props.ColumnStatistic, s *props.Statistics,
  2309  ) {
  2310  	rowCount := s.RowCount
  2311  
  2312  	// We should always have at least one distinct value if row count > 0.
  2313  	if rowCount > 0 && colStat.DistinctCount == 0 {
  2314  		panic(errors.AssertionFailedf("estimated distinct count must be non-zero"))
  2315  	}
  2316  
  2317  	// If this is a multi-column statistic, the distinct count should be no
  2318  	// larger than the product of all the distinct counts of its individual
  2319  	// columns, and no smaller than the distinct count of any single column.
  2320  	if colStat.Cols.Len() > 1 && rowCount > 1 {
  2321  		product := 1.0
  2322  		maxDistinct := 0.0
  2323  		colStat.Cols.ForEach(func(col opt.ColumnID) {
  2324  			if singleColStat, ok := s.ColStats.Lookup(opt.MakeColSet(col)); ok {
  2325  				if singleColStat.DistinctCount > 1 {
  2326  					product *= singleColStat.DistinctCount
  2327  				}
  2328  				if singleColStat.DistinctCount > maxDistinct {
  2329  					maxDistinct = singleColStat.DistinctCount
  2330  				}
  2331  			} else {
  2332  				// This is just a best-effort check, so if we don't have one of the
  2333  				// column stats, assume its distinct count equals the row count.
  2334  				product *= rowCount
  2335  			}
  2336  		})
  2337  		colStat.DistinctCount = min(colStat.DistinctCount, product)
  2338  		colStat.DistinctCount = max(colStat.DistinctCount, maxDistinct)
  2339  	}
  2340  
  2341  	// The distinct and null counts should be no larger than the row count.
  2342  	colStat.DistinctCount = min(colStat.DistinctCount, rowCount)
  2343  	colStat.NullCount = min(colStat.NullCount, rowCount)
  2344  
  2345  	// Uniformly reduce the size of each histogram bucket so the number of values
  2346  	// is no larger than the row count.
  2347  	if colStat.Histogram != nil {
  2348  		valuesCount := colStat.Histogram.ValuesCount()
  2349  		if valuesCount > rowCount {
  2350  			colStat.Histogram = colStat.Histogram.ApplySelectivity(rowCount / valuesCount)
  2351  		}
  2352  	}
  2353  }
  2354  
  2355  func (sb *statisticsBuilder) shouldUseHistogram(relProps *props.Relational) bool {
  2356  	// If we know that the cardinality is below a certain threshold (e.g., due to
  2357  	// a constraint on a key column), don't bother adding the overhead of
  2358  	// creating a histogram.
  2359  	return relProps.Cardinality.Max >= minCardinalityForHistogram
  2360  }
  2361  
  2362  // rowsProcessed calculates and returns the number of rows processed by the
  2363  // relational expression. It is currently only supported for joins.
  2364  func (sb *statisticsBuilder) rowsProcessed(e RelExpr) float64 {
  2365  	semiAntiJoinToInnerJoin := func(joinType opt.Operator) opt.Operator {
  2366  		switch joinType {
  2367  		case opt.SemiJoinOp, opt.AntiJoinOp:
  2368  			return opt.InnerJoinOp
  2369  		case opt.SemiJoinApplyOp, opt.AntiJoinApplyOp:
  2370  			return opt.InnerJoinApplyOp
  2371  		default:
  2372  			return joinType
  2373  		}
  2374  	}
  2375  
  2376  	switch t := e.(type) {
  2377  	case *LookupJoinExpr:
  2378  		var lookupJoinPrivate *LookupJoinPrivate
  2379  		switch t.JoinType {
  2380  		case opt.SemiJoinOp, opt.SemiJoinApplyOp, opt.AntiJoinOp, opt.AntiJoinApplyOp:
  2381  			// The number of rows processed for semi and anti joins is closer to the
  2382  			// number of output rows for an equivalent inner join.
  2383  			copy := t.LookupJoinPrivate
  2384  			copy.JoinType = semiAntiJoinToInnerJoin(t.JoinType)
  2385  			lookupJoinPrivate = &copy
  2386  
  2387  		default:
  2388  			if t.On.IsTrue() {
  2389  				// If there are no additional ON filters, the number of rows processed
  2390  				// equals the number of output rows.
  2391  				return e.Relational().Stats.RowCount
  2392  			}
  2393  			lookupJoinPrivate = &t.LookupJoinPrivate
  2394  		}
  2395  
  2396  		// We need to determine the row count of the join before the
  2397  		// ON conditions are applied.
  2398  		withoutOn := e.Memo().MemoizeLookupJoin(t.Input, nil /* on */, lookupJoinPrivate)
  2399  		return withoutOn.Relational().Stats.RowCount
  2400  
  2401  	case *GeoLookupJoinExpr:
  2402  		var lookupJoinPrivate *GeoLookupJoinPrivate
  2403  		switch t.JoinType {
  2404  		case opt.SemiJoinOp, opt.SemiJoinApplyOp, opt.AntiJoinOp, opt.AntiJoinApplyOp:
  2405  			// The number of rows processed for semi and anti joins is closer to the
  2406  			// number of output rows for an equivalent inner join.
  2407  			copy := t.GeoLookupJoinPrivate
  2408  			copy.JoinType = semiAntiJoinToInnerJoin(t.JoinType)
  2409  			lookupJoinPrivate = &copy
  2410  
  2411  		default:
  2412  			if t.On.IsTrue() {
  2413  				// If there are no additional ON filters, the number of rows processed
  2414  				// equals the number of output rows.
  2415  				return e.Relational().Stats.RowCount
  2416  			}
  2417  			lookupJoinPrivate = &t.GeoLookupJoinPrivate
  2418  		}
  2419  
  2420  		// We need to determine the row count of the join before the
  2421  		// ON conditions are applied.
  2422  		withoutOn := e.Memo().MemoizeGeoLookupJoin(t.Input, nil /* on */, lookupJoinPrivate)
  2423  		return withoutOn.Relational().Stats.RowCount
  2424  
  2425  	case *MergeJoinExpr:
  2426  		var mergeJoinPrivate *MergeJoinPrivate
  2427  		switch t.JoinType {
  2428  		case opt.SemiJoinOp, opt.SemiJoinApplyOp, opt.AntiJoinOp, opt.AntiJoinApplyOp:
  2429  			// The number of rows processed for semi and anti joins is closer to the
  2430  			// number of output rows for an equivalent inner join.
  2431  			copy := t.MergeJoinPrivate
  2432  			copy.JoinType = semiAntiJoinToInnerJoin(t.JoinType)
  2433  			mergeJoinPrivate = &copy
  2434  
  2435  		default:
  2436  			if t.On.IsTrue() {
  2437  				// If there are no additional ON filters, the number of rows processed
  2438  				// equals the number of output rows.
  2439  				return e.Relational().Stats.RowCount
  2440  			}
  2441  			mergeJoinPrivate = &t.MergeJoinPrivate
  2442  		}
  2443  
  2444  		// We need to determine the row count of the join before the
  2445  		// ON conditions are applied.
  2446  		withoutOn := e.Memo().MemoizeMergeJoin(t.Left, t.Right, nil /* on */, mergeJoinPrivate)
  2447  		return withoutOn.Relational().Stats.RowCount
  2448  
  2449  	default:
  2450  		if !opt.IsJoinOp(e) {
  2451  			panic(errors.AssertionFailedf("rowsProcessed not supported for operator type %v", log.Safe(e.Op())))
  2452  		}
  2453  
  2454  		leftCols := e.Child(0).(RelExpr).Relational().OutputCols
  2455  		rightCols := e.Child(1).(RelExpr).Relational().OutputCols
  2456  		filters := e.Child(2).(*FiltersExpr)
  2457  
  2458  		// Remove ON conditions that are not equality conditions,
  2459  		on := ExtractJoinEqualityFilters(leftCols, rightCols, *filters)
  2460  
  2461  		switch t := e.(type) {
  2462  		// The number of rows processed for semi and anti joins is closer to the
  2463  		// number of output rows for an equivalent inner join.
  2464  		case *SemiJoinExpr:
  2465  			e = e.Memo().MemoizeInnerJoin(t.Left, t.Right, on, &t.JoinPrivate)
  2466  		case *SemiJoinApplyExpr:
  2467  			e = e.Memo().MemoizeInnerJoinApply(t.Left, t.Right, on, &t.JoinPrivate)
  2468  		case *AntiJoinExpr:
  2469  			e = e.Memo().MemoizeInnerJoin(t.Left, t.Right, on, &t.JoinPrivate)
  2470  		case *AntiJoinApplyExpr:
  2471  			e = e.Memo().MemoizeInnerJoinApply(t.Left, t.Right, on, &t.JoinPrivate)
  2472  
  2473  		default:
  2474  			if len(on) == len(*filters) {
  2475  				// No filters were removed.
  2476  				return e.Relational().Stats.RowCount
  2477  			}
  2478  
  2479  			switch t := e.(type) {
  2480  			case *InnerJoinExpr:
  2481  				e = e.Memo().MemoizeInnerJoin(t.Left, t.Right, on, &t.JoinPrivate)
  2482  			case *InnerJoinApplyExpr:
  2483  				e = e.Memo().MemoizeInnerJoinApply(t.Left, t.Right, on, &t.JoinPrivate)
  2484  			case *LeftJoinExpr:
  2485  				e = e.Memo().MemoizeLeftJoin(t.Left, t.Right, on, &t.JoinPrivate)
  2486  			case *LeftJoinApplyExpr:
  2487  				e = e.Memo().MemoizeLeftJoinApply(t.Left, t.Right, on, &t.JoinPrivate)
  2488  			case *RightJoinExpr:
  2489  				e = e.Memo().MemoizeRightJoin(t.Left, t.Right, on, &t.JoinPrivate)
  2490  			case *FullJoinExpr:
  2491  				e = e.Memo().MemoizeFullJoin(t.Left, t.Right, on, &t.JoinPrivate)
  2492  			default:
  2493  				panic(errors.AssertionFailedf("join type %v not handled", log.Safe(e.Op())))
  2494  			}
  2495  		}
  2496  		return e.Relational().Stats.RowCount
  2497  	}
  2498  }
  2499  
  2500  func min(a, b float64) float64 {
  2501  	if a < b {
  2502  		return a
  2503  	}
  2504  	return b
  2505  }
  2506  
  2507  func max(a, b float64) float64 {
  2508  	if a > b {
  2509  		return a
  2510  	}
  2511  	return b
  2512  }
  2513  
  2514  // fraction returns a/b if a is less than b. Otherwise returns 1.
  2515  func fraction(a, b float64) float64 {
  2516  	if a < b {
  2517  		return a / b
  2518  	}
  2519  	return 1
  2520  }
  2521  
  2522  //////////////////////////////////////////////////
  2523  // Helper functions for selectivity calculation //
  2524  //////////////////////////////////////////////////
  2525  
  2526  const (
  2527  	// This is the value used for inequality filters such as x < 1 in
  2528  	// "Access Path Selection in a Relational Database Management System"
  2529  	// by Pat Selinger et al.
  2530  	unknownFilterSelectivity = 1.0 / 3.0
  2531  
  2532  	// TODO(rytaft): Add other selectivities for other types of predicates.
  2533  
  2534  	// This is an arbitrary row count used in the absence of any real statistics.
  2535  	unknownRowCount = 1000
  2536  
  2537  	// This is the ratio of distinct column values to number of rows, which is
  2538  	// used in the absence of any real statistics for non-key columns.
  2539  	// TODO(rytaft): See if there is an industry standard value for this.
  2540  	unknownDistinctCountRatio = 0.1
  2541  
  2542  	// This is the ratio of null column values to number of rows for nullable
  2543  	// columns, which is used in the absence of any real statistics.
  2544  	unknownNullCountRatio = 0.01
  2545  
  2546  	// Use a small row count for generator functions; this allows use of lookup
  2547  	// join in cases like using json_array_elements with a small constant array.
  2548  	unknownGeneratorRowCount = 10
  2549  
  2550  	// Since the generator row count is so small, we need a larger distinct count
  2551  	// ratio for generator functions.
  2552  	unknownGeneratorDistinctCountRatio = 0.7
  2553  
  2554  	// When subtracting floating point numbers, avoid precision errors by making
  2555  	// sure the result is greater than or equal to epsilon.
  2556  	epsilon = 1e-10
  2557  
  2558  	// This is the minimum cardinality an expression should have in order to make
  2559  	// it worth adding the overhead of using a histogram.
  2560  	minCardinalityForHistogram = 100
  2561  
  2562  	// This is the default selectivity estimated for geospatial lookup joins
  2563  	// until we can get better statistics on inverted indexes and geospatial
  2564  	// columns.
  2565  	unknownGeoRelationshipSelectivity = 1.0 / 100.0
  2566  
  2567  	// multiColWeight is the weight to assign the selectivity calculation using
  2568  	// multi-column statistics versus the calculation using single-column
  2569  	// statistics. See the comment above selectivityFromMultiColDistinctCounts for
  2570  	// details.
  2571  	multiColWeight = 9.0 / 10.0
  2572  )
  2573  
  2574  // countJSONPaths returns the number of JSON paths in the specified
  2575  // FiltersItem. Used in the calculation of unapplied conjuncts in a JSON
  2576  // Contains operator. Returns 0 if paths could not be counted for any
  2577  // reason, such as malformed JSON.
  2578  func countJSONPaths(conjunct *FiltersItem) int {
  2579  	rhs := conjunct.Condition.Child(1)
  2580  	if !CanExtractConstDatum(rhs) {
  2581  		return 0
  2582  	}
  2583  	rightDatum := ExtractConstDatum(rhs)
  2584  	if rightDatum == tree.DNull {
  2585  		return 0
  2586  	}
  2587  	rd, ok := rightDatum.(*tree.DJSON)
  2588  	if !ok {
  2589  		return 0
  2590  	}
  2591  
  2592  	// TODO(itsbilal): Replace this with a method that only counts paths
  2593  	// instead of generating a slice for all of them.
  2594  	paths, err := json.AllPaths(rd.JSON)
  2595  	if err != nil {
  2596  		return 0
  2597  	}
  2598  	return len(paths)
  2599  }
  2600  
  2601  // applyFilter uses constraints to update the distinct counts and histograms
  2602  // for the constrained columns in the filter. The changes in the distinct
  2603  // counts and histograms will be used later to determine the selectivity of
  2604  // the filter.
  2605  //
  2606  // Some filters can be translated directly to distinct counts using the
  2607  // constraint set. For example, the tight constraint `/a: [/1 - /1]` indicates
  2608  // that column `a` has exactly one distinct value.  Other filters, such as
  2609  // `a % 2 = 0` may not have a tight constraint. In this case, it is not
  2610  // possible to determine the distinct count for column `a`, so instead we
  2611  // increment numUnappliedConjuncts, which will be used later for selectivity
  2612  // calculation. See comments in applyConstraintSet and
  2613  // updateDistinctCountsFromConstraint for more details about how distinct
  2614  // counts are calculated from constraints.
  2615  //
  2616  // Equalities between two variables (e.g., var1=var2) are handled separately.
  2617  // See applyEquivalencies and selectivityFromEquivalencies for details.
  2618  //
  2619  func (sb *statisticsBuilder) applyFilter(
  2620  	filters FiltersExpr, e RelExpr, relProps *props.Relational,
  2621  ) (numUnappliedConjuncts float64, constrainedCols, histCols opt.ColSet) {
  2622  	if lookupJoin, ok := e.(*LookupJoinExpr); ok {
  2623  		// Special hack for lookup joins. Add constant filters from the equality
  2624  		// conditions.
  2625  		// TODO(rytaft): the correct way to do this is probably to fully implement
  2626  		// histograms in Project and Join expressions, and use them in
  2627  		// selectivityFromEquivalencies. See Issue #38082.
  2628  		filters = append(filters, lookupJoin.ConstFilters...)
  2629  	}
  2630  
  2631  	applyConjunct := func(conjunct *FiltersItem) {
  2632  		if isEqualityWithTwoVars(conjunct.Condition) {
  2633  			// We'll handle equalities later.
  2634  			return
  2635  		}
  2636  
  2637  		// Special case: The current conjunct is a JSON Contains operator.
  2638  		// If so, count every path to a leaf node in the RHS as a separate
  2639  		// conjunct. If for whatever reason we can't get to the JSON datum
  2640  		// or enumerate its paths, count the whole operator as one conjunct.
  2641  		if conjunct.Condition.Op() == opt.ContainsOp {
  2642  			numPaths := countJSONPaths(conjunct)
  2643  			if numPaths == 0 {
  2644  				numUnappliedConjuncts++
  2645  			} else {
  2646  				// Multiply the number of paths by 2 to mimic the logic in
  2647  				// numConjunctsInConstraint, for constraints like
  2648  				// /1: [/'{"a":"b"}' - /'{"a":"b"}'] . That function counts
  2649  				// this as 2 conjuncts, and to keep row counts as consistent
  2650  				// as possible between competing filtered selects and
  2651  				// constrained scans, we apply the same logic here.
  2652  				numUnappliedConjuncts += 2 * float64(numPaths)
  2653  			}
  2654  			return
  2655  		}
  2656  
  2657  		// Update constrainedCols after the above check for isEqualityWithTwoVars.
  2658  		// We will use constrainedCols later to determine which columns to use for
  2659  		// selectivity calculation in selectivityFromMultiColDistinctCounts, and we
  2660  		// want to make sure that we don't include columns that were only present in
  2661  		// equality conjuncts such as var1=var2. The selectivity of these conjuncts
  2662  		// will be accounted for in selectivityFromEquivalencies.
  2663  		scalarProps := conjunct.ScalarProps()
  2664  		constrainedCols.UnionWith(scalarProps.OuterCols)
  2665  		if scalarProps.Constraints != nil {
  2666  			histColsLocal := sb.applyConstraintSet(
  2667  				scalarProps.Constraints, scalarProps.TightConstraints, e, relProps,
  2668  			)
  2669  			histCols.UnionWith(histColsLocal)
  2670  			if !scalarProps.TightConstraints {
  2671  				numUnappliedConjuncts++
  2672  			}
  2673  		} else {
  2674  			numUnappliedConjuncts++
  2675  		}
  2676  	}
  2677  
  2678  	for i := range filters {
  2679  		applyConjunct(&filters[i])
  2680  	}
  2681  
  2682  	return numUnappliedConjuncts, constrainedCols, histCols
  2683  }
  2684  
  2685  // applyIndexConstraint is used to update the distinct counts and histograms
  2686  // for the constrained columns in an index constraint. Returns the set of
  2687  // constrained columns and the set of columns with a filtered histogram.
  2688  func (sb *statisticsBuilder) applyIndexConstraint(
  2689  	c *constraint.Constraint, e RelExpr, relProps *props.Relational,
  2690  ) (constrainedCols, histCols opt.ColSet) {
  2691  	// If unconstrained, then no constraint could be derived from the expression,
  2692  	// so fall back to estimate.
  2693  	// If a contradiction, then optimizations must not be enabled (say for
  2694  	// testing), or else this would have been reduced.
  2695  	if c.IsUnconstrained() || c.IsContradiction() {
  2696  		return
  2697  	}
  2698  
  2699  	// Calculate distinct counts.
  2700  	applied, lastColMinDistinct := sb.updateDistinctCountsFromConstraint(c, e, relProps)
  2701  
  2702  	// Collect the set of constrained columns for which we were able to estimate
  2703  	// a distinct count, including the first column after the constraint prefix
  2704  	// (if applicable, add a distinct count estimate for that column using the
  2705  	// function updateDistinctCountFromUnappliedConjuncts).
  2706  	//
  2707  	// Note that the resulting set might not include *all* constrained columns.
  2708  	// For example, we cannot make any assumptions about the distinct count of
  2709  	// column b based on the constraints /a/b: [/1 - /5/6] or /a/b: [ - /5/6].
  2710  	// TODO(rytaft): Consider treating remaining constrained columns as
  2711  	//  "unapplied conjuncts" and account for their selectivity in
  2712  	//  selectivityFromUnappliedConjuncts.
  2713  	prefix := c.Prefix(sb.evalCtx)
  2714  	for i, n := 0, c.ConstrainedColumns(sb.evalCtx); i < n && i <= prefix; i++ {
  2715  		col := c.Columns.Get(i).ID()
  2716  		constrainedCols.Add(col)
  2717  		if i < applied {
  2718  			continue
  2719  		}
  2720  
  2721  		// Unlike the constraints found in Select and Join filters, an index
  2722  		// constraint may represent multiple conjuncts. Therefore, we need to
  2723  		// calculate the number of unapplied conjuncts for each constrained column.
  2724  		numConjuncts := sb.numConjunctsInConstraint(c, i)
  2725  
  2726  		// Set the distinct count for the current column of the constraint
  2727  		// according to unknownDistinctCountRatio.
  2728  		var lowerBound float64
  2729  		if i == applied {
  2730  			lowerBound = lastColMinDistinct
  2731  		}
  2732  		sb.updateDistinctCountFromUnappliedConjuncts(col, e, relProps, numConjuncts, lowerBound)
  2733  	}
  2734  
  2735  	if !sb.shouldUseHistogram(relProps) {
  2736  		return constrainedCols, histCols
  2737  	}
  2738  
  2739  	// Calculate histograms.
  2740  	constrainedCols.ForEach(func(col opt.ColumnID) {
  2741  		colSet := opt.MakeColSet(col)
  2742  		inputStat, _ := sb.colStatFromInput(colSet, e)
  2743  		inputHist := inputStat.Histogram
  2744  		if inputHist != nil {
  2745  			if _, _, ok := inputHist.CanFilter(c); ok {
  2746  				s := &relProps.Stats
  2747  				if colStat, ok := s.ColStats.Lookup(colSet); ok {
  2748  					colStat.Histogram = inputHist.Filter(c)
  2749  					histCols.Add(col)
  2750  					sb.updateDistinctCountFromHistogram(colStat, inputStat.DistinctCount)
  2751  				}
  2752  			}
  2753  		}
  2754  	})
  2755  
  2756  	return constrainedCols, histCols
  2757  }
  2758  
  2759  // applyConstraintSet is used to update the distinct counts and histograms
  2760  // for the constrained columns in a constraint set. Returns the set of
  2761  // columns with a filtered histogram.
  2762  func (sb *statisticsBuilder) applyConstraintSet(
  2763  	cs *constraint.Set, tight bool, e RelExpr, relProps *props.Relational,
  2764  ) (histCols opt.ColSet) {
  2765  	// If unconstrained, then no constraint could be derived from the expression,
  2766  	// so fall back to estimate.
  2767  	// If a contradiction, then optimizations must not be enabled (say for
  2768  	// testing), or else this would have been reduced.
  2769  	if cs.IsUnconstrained() || cs == constraint.Contradiction {
  2770  		return opt.ColSet{}
  2771  	}
  2772  
  2773  	s := &relProps.Stats
  2774  	for i := 0; i < cs.Length(); i++ {
  2775  		c := cs.Constraint(i)
  2776  		col := c.Columns.Get(0).ID()
  2777  
  2778  		// Calculate distinct counts.
  2779  		applied, lastColMinDistinct := sb.updateDistinctCountsFromConstraint(c, e, relProps)
  2780  		if applied == 0 {
  2781  			// If a constraint cannot be applied, it may represent an
  2782  			// inequality like x < 1. As a result, distinctCounts does not fully
  2783  			// represent the selectivity of the constraint set.
  2784  			// We return an estimate of the number of unapplied conjuncts to the
  2785  			// caller function to be used for selectivity calculation.
  2786  			numConjuncts := sb.numConjunctsInConstraint(c, 0 /* nth */)
  2787  
  2788  			// Set the distinct count for the first column of the constraint
  2789  			// according to unknownDistinctCountRatio.
  2790  			sb.updateDistinctCountFromUnappliedConjuncts(col, e, relProps, numConjuncts, lastColMinDistinct)
  2791  		}
  2792  
  2793  		if !tight {
  2794  			// TODO(rytaft): it may still be beneficial to calculate the histogram
  2795  			// even if the constraint is not tight, but don't bother for now.
  2796  			continue
  2797  		}
  2798  
  2799  		if !sb.shouldUseHistogram(relProps) {
  2800  			continue
  2801  		}
  2802  
  2803  		// Calculate histogram.
  2804  		cols := opt.MakeColSet(col)
  2805  		inputStat, _ := sb.colStatFromInput(cols, e)
  2806  		inputHist := inputStat.Histogram
  2807  		if inputHist != nil {
  2808  			if _, _, ok := inputHist.CanFilter(c); ok {
  2809  				if colStat, ok := s.ColStats.Lookup(cols); ok {
  2810  					colStat.Histogram = inputHist.Filter(c)
  2811  					histCols.UnionWith(cols)
  2812  					sb.updateDistinctCountFromHistogram(colStat, inputStat.DistinctCount)
  2813  				}
  2814  			}
  2815  		}
  2816  	}
  2817  
  2818  	return histCols
  2819  }
  2820  
  2821  // updateNullCountsFromProps zeroes null counts for columns that cannot
  2822  // have nulls in them, usually due to a column property or an application.
  2823  // of a null-excluding filter. The actual determination of non-nullable
  2824  // columns is done in the logical props builder.
  2825  //
  2826  // For example, consider the following constraint sets:
  2827  //   /a/b/c: [/1 - /1/2/3] [/1/2/5 - /1/2/8]
  2828  //   /c: [/6 - /6]
  2829  //
  2830  // The first constraint set filters nulls out of column a, and the
  2831  // second constraint set filters nulls out of column c.
  2832  //
  2833  func (sb *statisticsBuilder) updateNullCountsFromProps(e RelExpr, relProps *props.Relational) {
  2834  	s := &relProps.Stats
  2835  	relProps.NotNullCols.ForEach(func(col opt.ColumnID) {
  2836  		colSet := opt.MakeColSet(col)
  2837  		colStat, ok := s.ColStats.Lookup(colSet)
  2838  		if ok {
  2839  			colStat.NullCount = 0
  2840  		}
  2841  	})
  2842  }
  2843  
  2844  // updateDistinctCountsFromConstraint updates the distinct count for each
  2845  // column in a constraint that can be determined to have a finite number of
  2846  // possible values. It returns the number of columns for which the distinct
  2847  // count could be inferred from the constraint. If the same column appears
  2848  // in multiple constraints, the distinct count is the minimum for that column
  2849  // across all constraints.
  2850  //
  2851  // For example, consider the following constraint set:
  2852  //
  2853  //   /a/b/c: [/1/2/3 - /1/2/3] [/1/2/5 - /1/2/8]
  2854  //   /c: [/6 - /6]
  2855  //
  2856  // After the first constraint is processed, s.ColStats contains the
  2857  // following:
  2858  //   [a] -> { ... DistinctCount: 1 ... }
  2859  //   [b] -> { ... DistinctCount: 1 ... }
  2860  //   [c] -> { ... DistinctCount: 5 ... }
  2861  //
  2862  // After the second constraint is processed, column c is further constrained,
  2863  // so s.ColStats contains the following:
  2864  //   [a] -> { ... DistinctCount: 1 ... }
  2865  //   [b] -> { ... DistinctCount: 1 ... }
  2866  //   [c] -> { ... DistinctCount: 1 ... }
  2867  //
  2868  // Note that updateDistinctCountsFromConstraint is pessimistic, and assumes
  2869  // that there is at least one row for every possible value provided by the
  2870  // constraint. For example, /a: [/1 - /1000000] would find a distinct count of
  2871  // 1000000 for column "a" even if there are only 10 rows in the table. This
  2872  // discrepancy must be resolved by the calling function.
  2873  //
  2874  // Even if the distinct count can not be inferred for a particular column,
  2875  // It's possible that we can determine a lower bound. In particular, if the
  2876  // query specifically mentions some exact values we should use that as a hint.
  2877  // For example, consider the following constraint:
  2878  //
  2879  //   /a: [ - 5][10 - 10][15 - 15]
  2880  //
  2881  // In this case, updateDistinctCountsFromConstraint will infer that there
  2882  // are at least two distinct values (10 and 15). This lower bound will be
  2883  // returned in the second return value, lastColMinDistinct.
  2884  func (sb *statisticsBuilder) updateDistinctCountsFromConstraint(
  2885  	c *constraint.Constraint, e RelExpr, relProps *props.Relational,
  2886  ) (applied int, lastColMinDistinct float64) {
  2887  	// All of the columns that are part of the prefix have a finite number of
  2888  	// distinct values.
  2889  	prefix := c.Prefix(sb.evalCtx)
  2890  
  2891  	// If there are any other columns beyond the prefix, we may be able to
  2892  	// determine the number of distinct values for the first one. For example:
  2893  	//   /a/b/c: [/1/2/3 - /1/2/3] [/1/4/5 - /1/4/8]
  2894  	//       -> Column a has DistinctCount = 1.
  2895  	//       -> Column b has DistinctCount = 2.
  2896  	//       -> Column c has DistinctCount = 5.
  2897  	for col := 0; col <= prefix; col++ {
  2898  		// All columns should have at least one distinct value.
  2899  		distinctCount := 1.0
  2900  
  2901  		var val tree.Datum
  2902  		countable := true
  2903  		for i := 0; i < c.Spans.Count(); i++ {
  2904  			sp := c.Spans.Get(i)
  2905  			if sp.StartKey().Length() <= col || sp.EndKey().Length() <= col {
  2906  				// We can't determine the distinct count for this column. For example,
  2907  				// the number of distinct values for column b in the constraint
  2908  				// /a/b: [/1/1 - /1] cannot be determined.
  2909  				countable = false
  2910  				continue
  2911  			}
  2912  			startVal := sp.StartKey().Value(col)
  2913  			endVal := sp.EndKey().Value(col)
  2914  			if startVal.Compare(sb.evalCtx, endVal) != 0 {
  2915  				var start, end float64
  2916  				if startVal.ResolvedType().Family() == types.IntFamily &&
  2917  					endVal.ResolvedType().Family() == types.IntFamily {
  2918  					start = float64(*startVal.(*tree.DInt))
  2919  					end = float64(*endVal.(*tree.DInt))
  2920  				} else if startVal.ResolvedType().Family() == types.DateFamily &&
  2921  					endVal.ResolvedType().Family() == types.DateFamily {
  2922  					startDate := startVal.(*tree.DDate)
  2923  					endDate := endVal.(*tree.DDate)
  2924  					if !startDate.IsFinite() || !endDate.IsFinite() {
  2925  						// One of the boundaries is not finite, so we can't determine the
  2926  						// distinct count for this column.
  2927  						countable = false
  2928  						continue
  2929  					}
  2930  					start = float64(startDate.PGEpochDays())
  2931  					end = float64(endDate.PGEpochDays())
  2932  				} else {
  2933  					// We can't determine the distinct count for this column. For example,
  2934  					// the number of distinct values in the constraint
  2935  					// /a: [/'cherry' - /'mango'] cannot be determined.
  2936  					countable = false
  2937  					continue
  2938  				}
  2939  				// We assume that both start and end boundaries are inclusive. This
  2940  				// should be the case for integer and date columns (due to
  2941  				// normalization by constraint.PreferInclusive).
  2942  				if c.Columns.Get(col).Ascending() {
  2943  					distinctCount += end - start
  2944  				} else {
  2945  					distinctCount += start - end
  2946  				}
  2947  			}
  2948  			if i != 0 && val != nil {
  2949  				compare := startVal.Compare(sb.evalCtx, val)
  2950  				ascending := c.Columns.Get(col).Ascending()
  2951  				if (compare > 0 && ascending) || (compare < 0 && !ascending) {
  2952  					// This check is needed to ensure that we calculate the correct distinct
  2953  					// value count for constraints such as:
  2954  					//   /a/b: [/1/2 - /1/2] [/1/4 - /1/4] [/2 - /2]
  2955  					// We should only increment the distinct count for column "a" once we
  2956  					// reach the third span.
  2957  					distinctCount++
  2958  				} else if compare != 0 {
  2959  					// This can happen if we have a prefix, but not an exact prefix. For
  2960  					// example:
  2961  					//   /a/b: [/1/2 - /1/4] [/3/2 - /3/5] [/6/0 - /6/0]
  2962  					// In this case, /a is a prefix, but not an exact prefix. Trying to
  2963  					// figure out the distinct count for column b may be more trouble
  2964  					// than it's worth. For now, don't bother trying.
  2965  					countable = false
  2966  					continue
  2967  				}
  2968  			}
  2969  			val = endVal
  2970  		}
  2971  
  2972  		if !countable {
  2973  			// The last column was not fully applied since there was at least one
  2974  			// uncountable span. The calculated distinct count will be used as a
  2975  			// lower bound for updateDistinctCountFromUnappliedConjuncts.
  2976  			return applied, distinctCount
  2977  		}
  2978  
  2979  		colID := c.Columns.Get(col).ID()
  2980  		sb.ensureColStat(opt.MakeColSet(colID), distinctCount, e, relProps)
  2981  		applied = col + 1
  2982  	}
  2983  
  2984  	return applied, 0
  2985  }
  2986  
  2987  // updateDistinctCountFromUnappliedConjuncts is used to update the distinct
  2988  // count for a constrained column when the exact count cannot be determined.
  2989  // The provided lowerBound serves as a lower bound on the calculated distinct
  2990  // count.
  2991  func (sb *statisticsBuilder) updateDistinctCountFromUnappliedConjuncts(
  2992  	colID opt.ColumnID, e RelExpr, relProps *props.Relational, numConjuncts, lowerBound float64,
  2993  ) {
  2994  	colSet := opt.MakeColSet(colID)
  2995  	inputStat, _ := sb.colStatFromInput(colSet, e)
  2996  	distinctCount := inputStat.DistinctCount * math.Pow(unknownFilterSelectivity, numConjuncts)
  2997  	distinctCount = max(distinctCount, lowerBound)
  2998  	sb.ensureColStat(colSet, distinctCount, e, relProps)
  2999  }
  3000  
  3001  // updateDistinctCountFromHistogram updates the distinct count for the given
  3002  // column statistic based on the estimated number of distinct values in the
  3003  // histogram. The updated count will be no larger than the provided value for
  3004  // maxDistinctCount.
  3005  //
  3006  // This function should be called after updateDistinctCountsFromConstraint
  3007  // and/or updateDistinctCountFromUnappliedConjuncts.
  3008  func (sb *statisticsBuilder) updateDistinctCountFromHistogram(
  3009  	colStat *props.ColumnStatistic, maxDistinctCount float64,
  3010  ) {
  3011  	distinct := colStat.Histogram.DistinctValuesCount()
  3012  	if distinct == 0 {
  3013  		// Make sure the count is non-zero. The stats may be stale, and we
  3014  		// can end up with weird and inefficient plans if we estimate 0 rows.
  3015  		distinct = min(1, colStat.DistinctCount)
  3016  	}
  3017  
  3018  	// The distinct count estimate from the histogram should always be more
  3019  	// accurate than the distinct count analysis performed in
  3020  	// updateDistinctCountsFromConstraint, so use the histogram estimate to
  3021  	// replace the distinct count value.
  3022  	colStat.DistinctCount = min(distinct, maxDistinctCount)
  3023  
  3024  	// The histogram does not include null values, so add 1 if there are any
  3025  	// null values.
  3026  	if colStat.NullCount > 0 {
  3027  		colStat.DistinctCount++
  3028  	}
  3029  }
  3030  
  3031  func (sb *statisticsBuilder) applyEquivalencies(
  3032  	equivReps opt.ColSet, filterFD *props.FuncDepSet, e RelExpr, relProps *props.Relational,
  3033  ) {
  3034  	equivReps.ForEach(func(i opt.ColumnID) {
  3035  		equivGroup := filterFD.ComputeEquivGroup(i)
  3036  		sb.updateDistinctNullCountsFromEquivalency(equivGroup, e, relProps)
  3037  	})
  3038  }
  3039  
  3040  func (sb *statisticsBuilder) updateDistinctNullCountsFromEquivalency(
  3041  	equivGroup opt.ColSet, e RelExpr, relProps *props.Relational,
  3042  ) {
  3043  	s := &relProps.Stats
  3044  
  3045  	// Find the minimum distinct and null counts for all columns in this equivalency
  3046  	// group.
  3047  	minDistinctCount := s.RowCount
  3048  	minNullCount := s.RowCount
  3049  	equivGroup.ForEach(func(i opt.ColumnID) {
  3050  		colSet := opt.MakeColSet(i)
  3051  		colStat, ok := s.ColStats.Lookup(colSet)
  3052  		if !ok {
  3053  			colStat, _ = sb.colStatFromInput(colSet, e)
  3054  			colStat = sb.copyColStat(colSet, s, colStat)
  3055  			if colStat.NullCount > 0 && colSet.Intersects(relProps.NotNullCols) {
  3056  				colStat.NullCount = 0
  3057  				colStat.DistinctCount = max(colStat.DistinctCount-1, epsilon)
  3058  			}
  3059  		}
  3060  		if colStat.DistinctCount < minDistinctCount {
  3061  			minDistinctCount = colStat.DistinctCount
  3062  		}
  3063  		if colStat.NullCount < minNullCount {
  3064  			minNullCount = colStat.NullCount
  3065  		}
  3066  	})
  3067  
  3068  	// Set the distinct and null counts to the minimum for all columns in this
  3069  	// equivalency group.
  3070  	equivGroup.ForEach(func(i opt.ColumnID) {
  3071  		colStat, _ := s.ColStats.Lookup(opt.MakeColSet(i))
  3072  		colStat.DistinctCount = minDistinctCount
  3073  		colStat.NullCount = minNullCount
  3074  	})
  3075  }
  3076  
  3077  // selectivityFromMultiColDistinctCounts calculates the selectivity of a filter
  3078  // by using estimated distinct counts of each constrained column before
  3079  // and after the filter was applied. We can perform this calculation in
  3080  // two different ways: (1) by treating the columns as completely independent,
  3081  // or (2) by assuming they are correlated.
  3082  //
  3083  // (1) Assuming independence between columns, we can calculate the selectivity
  3084  //     by taking the product of selectivities of each constrained column. In
  3085  //     the general case, this can be represented by the formula:
  3086  //
  3087  //                      ┬-┬ ⎛ new_distinct(i) ⎞
  3088  //       selectivity =  │ │ ⎜ --------------- ⎟
  3089  //                      ┴ ┴ ⎝ old_distinct(i) ⎠
  3090  //                     i in
  3091  //                  {constrained
  3092  //                    columns}
  3093  //
  3094  // (2) If we instead assume there is some correlation between columns, we
  3095  //     calculate the selectivity using multi-column statistics.
  3096  //
  3097  //                     ⎛ new_distinct({constrained columns}) ⎞
  3098  //       selectivity = ⎜ ----------------------------------- ⎟
  3099  //                     ⎝ old_distinct({constrained columns}) ⎠
  3100  //
  3101  //     This formula looks simple, but the challenge is that it is difficult
  3102  //     to determine the correct value for new_distinct({constrained columns})
  3103  //     if each column is not constrained to a single value. For example, if
  3104  //     new_distinct(x)=2 and new_distinct(y)=2, new_distinct({x,y}) could be 2,
  3105  //     3 or 4. We estimate the new distinct count as follows, using the concept
  3106  //     of "soft functional dependency (FD) strength" as defined in [1]:
  3107  //
  3108  //       new_distinct({x,y}) = min_value + range * (1 - FD_strength_scaled)
  3109  //
  3110  //     where
  3111  //
  3112  //       min_value = max(new_distinct(x), new_distinct(y))
  3113  //       max_value = new_distinct(x) * new_distinct(y)
  3114  //       range     = max_value - min_value
  3115  //
  3116  //                     ⎛ max(old_distinct(x),old_distinct(y)) ⎞
  3117  //       FD_strength = ⎜ ------------------------------------ ⎟
  3118  //                     ⎝         old_distinct({x,y})          ⎠
  3119  //
  3120  //                         ⎛ max(old_distinct(x), old_distinct(y)) ⎞
  3121  //       min_FD_strength = ⎜ ------------------------------------- ⎟
  3122  //                         ⎝   old_distinct(x) * old_distinct(y)   ⎠
  3123  //
  3124  //                            ⎛ FD_strength - min_FD_strength ⎞  // scales FD_strength
  3125  //       FD_strength_scaled = ⎜ ----------------------------- ⎟  // to be between
  3126  //                            ⎝      1 - min_FD_strength      ⎠  // 0 and 1
  3127  //
  3128  //     Suppose that old_distinct(x)=100 and old_distinct(y)=10. If x and y are
  3129  //     perfectly correlated, old_distinct({x,y})=100. Using the example from
  3130  //     above, new_distinct(x)=2 and new_distinct(y)=2. Plugging in the values
  3131  //     into the equation, we get:
  3132  //
  3133  //       FD_strength_scaled  = 1
  3134  //       new_distinct({x,y}) = 2 + (4 - 2) * (1 - 1) = 2
  3135  //
  3136  //     If x and y are completely independent, however, old_distinct({x,y})=1000.
  3137  //     In this case, we get:
  3138  //
  3139  //       FD_strength_scaled  = 0
  3140  //       new_distinct({x,y}) = 2 + (4 - 2) * (1 - 0) = 4
  3141  //
  3142  // Note that even if we calculate the selectivity based on equation (2) above,
  3143  // we still want to take equation (1) into account. This is because it is
  3144  // possible that there are two predicates that each have selectivity s, but the
  3145  // multi-column selectivity is also s. In order to ensure that the cost model
  3146  // considers the two predicates combined to be more selective than either one
  3147  // individually, we must give some weight to equation (1). Therefore, instead
  3148  // of equation (2) we actually return the following selectivity:
  3149  //
  3150  //   selectivity = (1 - w) * (equation 1) + w * (equation 2)
  3151  //
  3152  // where w is the constant multiColWeight.
  3153  //
  3154  // This selectivity will be used later to update the row count and the
  3155  // distinct count for the unconstrained columns.
  3156  //
  3157  // [1] Ilyas, Ihab F., et al. "CORDS: automatic discovery of correlations and
  3158  //     soft functional dependencies." SIGMOD 2004.
  3159  //
  3160  func (sb *statisticsBuilder) selectivityFromMultiColDistinctCounts(
  3161  	cols opt.ColSet, e RelExpr, s *props.Statistics,
  3162  ) (selectivity float64) {
  3163  	// Respect the session setting OptimizerUseMultiColStats.
  3164  	if !sb.evalCtx.SessionData.OptimizerUseMultiColStats {
  3165  		return sb.selectivityFromSingleColDistinctCounts(cols, e, s)
  3166  	}
  3167  
  3168  	// Make a copy of cols so we can remove columns that are not constrained.
  3169  	multiColSet := cols.Copy()
  3170  
  3171  	// First calculate the selectivity from equation (1) (see function comment),
  3172  	// and collect the inputs to equation (2).
  3173  	singleColSelectivity := 1.0
  3174  	newDistinctProduct, oldDistinctProduct := 1.0, 1.0
  3175  	maxNewDistinct, maxOldDistinct := float64(0), float64(0)
  3176  	multiColNullCount := -1.0
  3177  	minLocalSel := math.MaxFloat64
  3178  	for col, ok := cols.Next(0); ok; col, ok = cols.Next(col + 1) {
  3179  		colStat, ok := s.ColStats.Lookup(opt.MakeColSet(col))
  3180  		if !ok {
  3181  			multiColSet.Remove(col)
  3182  			continue
  3183  		}
  3184  
  3185  		inputColStat, inputStats := sb.colStatFromInput(colStat.Cols, e)
  3186  		localSel := sb.selectivityFromDistinctCount(colStat, inputColStat, inputStats.RowCount)
  3187  		singleColSelectivity *= localSel
  3188  
  3189  		// Don't bother including columns in the multi-column calculation that
  3190  		// don't contribute to the selectivity.
  3191  		if localSel == 1 {
  3192  			multiColSet.Remove(col)
  3193  			continue
  3194  		}
  3195  
  3196  		// Calculate values needed for the multi-column stats calculation below.
  3197  		newDistinctProduct *= colStat.DistinctCount
  3198  		oldDistinctProduct *= inputColStat.DistinctCount
  3199  		if colStat.DistinctCount > maxNewDistinct {
  3200  			maxNewDistinct = colStat.DistinctCount
  3201  		}
  3202  		if inputColStat.DistinctCount > maxOldDistinct {
  3203  			maxOldDistinct = inputColStat.DistinctCount
  3204  		}
  3205  		if localSel < minLocalSel {
  3206  			minLocalSel = localSel
  3207  		}
  3208  		if multiColNullCount < 0 {
  3209  			multiColNullCount = inputStats.RowCount
  3210  		}
  3211  		// Multiply by the expected chance of collisions with nulls already
  3212  		// collected.
  3213  		multiColNullCount *= colStat.NullCount / inputStats.RowCount
  3214  	}
  3215  
  3216  	// If we don't need to use a multi-column statistic, we're done.
  3217  	if multiColSet.Len() <= 1 {
  3218  		return singleColSelectivity
  3219  	}
  3220  
  3221  	// Otherwise, calculate the selectivity using multi-column stats from
  3222  	// equation (2). See the comment above the function definition for details
  3223  	// about the formula.
  3224  	inputColStat, inputStats := sb.colStatFromInput(multiColSet, e)
  3225  	fdStrength := min(maxOldDistinct/inputColStat.DistinctCount, 1.0)
  3226  	maxMutiColOldDistinct := min(oldDistinctProduct, inputStats.RowCount)
  3227  	minFdStrength := min(maxOldDistinct/maxMutiColOldDistinct, fdStrength)
  3228  	if minFdStrength < 1 {
  3229  		// Scale the fdStrength so it ranges between 0 and 1.
  3230  		fdStrength = (fdStrength - minFdStrength) / (1 - minFdStrength)
  3231  	}
  3232  	distinctCountRange := max(newDistinctProduct-maxNewDistinct, 0)
  3233  
  3234  	colStat, _ := s.ColStats.Add(multiColSet)
  3235  	colStat.DistinctCount = maxNewDistinct + distinctCountRange*(1-fdStrength)
  3236  	colStat.NullCount = multiColNullCount
  3237  	multiColSelectivity := sb.selectivityFromDistinctCount(colStat, inputColStat, inputStats.RowCount)
  3238  
  3239  	// Now, we must adjust multiColSelectivity so that it is not greater than
  3240  	// the selectivity of any subset of the columns in multiColSet. This would
  3241  	// be internally inconsistent and could lead to bad plans. For example,
  3242  	// x=1 AND y=1 should always be considered more selective (i.e., with lower
  3243  	// selectivity) than x=1 alone.
  3244  	//
  3245  	// We have already found the minimum selectivity of all the individual
  3246  	// columns (subsets of size 1) above and stored it in minLocalSel. It's not
  3247  	// practical, however, to calculate the minimum selectivity for all subsets
  3248  	// larger than size 1.
  3249  	//
  3250  	// Instead, we focus on a specific case known to occasionally have this
  3251  	// problem: when multiColSet contains 3 or more columns and at least one has
  3252  	// distinct count greater than 1, the subset of columns that have distinct
  3253  	// count less than or equal to 1 may have a smaller selectivity according to
  3254  	// equation (2).
  3255  	//
  3256  	// In this case, update minLocalSel and adjust multiColSelectivity as needed.
  3257  	//
  3258  	if maxNewDistinct > 1 && multiColSet.Len() > 2 {
  3259  		var lowDistinctCountCols opt.ColSet
  3260  		multiColSet.ForEach(func(col opt.ColumnID) {
  3261  			// We already know the column stat exists if it's in multiColSet.
  3262  			colStat, _ := s.ColStats.Lookup(opt.MakeColSet(col))
  3263  			if colStat.DistinctCount <= 1 {
  3264  				lowDistinctCountCols.Add(col)
  3265  			}
  3266  		})
  3267  
  3268  		if lowDistinctCountCols.Len() > 1 {
  3269  			selLowDistinctCountCols := sb.selectivityFromMultiColDistinctCounts(
  3270  				lowDistinctCountCols, e, s,
  3271  			)
  3272  			if selLowDistinctCountCols < minLocalSel {
  3273  				minLocalSel = selLowDistinctCountCols
  3274  			}
  3275  		}
  3276  	}
  3277  	multiColSelectivity = min(multiColSelectivity, minLocalSel)
  3278  
  3279  	// As described in the function comment, we actually return a weighted sum
  3280  	// of multi-column and single-column selectivity estimates.
  3281  	return (1-multiColWeight)*singleColSelectivity + multiColWeight*multiColSelectivity
  3282  }
  3283  
  3284  // selectivityFromSingleColDistinctCounts calculates the selectivity of a
  3285  // filter by using estimated distinct counts of each constrained column before
  3286  // and after the filter was applied. It assumes independence between columns,
  3287  // so it uses equation (1) from selectivityFromMultiColDistinctCounts. See the
  3288  // comment above that function for details.
  3289  func (sb *statisticsBuilder) selectivityFromSingleColDistinctCounts(
  3290  	cols opt.ColSet, e RelExpr, s *props.Statistics,
  3291  ) (selectivity float64) {
  3292  	selectivity = 1.0
  3293  	for col, ok := cols.Next(0); ok; col, ok = cols.Next(col + 1) {
  3294  		colStat, ok := s.ColStats.Lookup(opt.MakeColSet(col))
  3295  		if !ok {
  3296  			continue
  3297  		}
  3298  
  3299  		inputColStat, inputStats := sb.colStatFromInput(colStat.Cols, e)
  3300  		selectivity *= sb.selectivityFromDistinctCount(colStat, inputColStat, inputStats.RowCount)
  3301  	}
  3302  
  3303  	return selectivity
  3304  }
  3305  
  3306  // selectivityFromDistinctCount calculates the selectivity of a filter by using
  3307  // the estimated distinct count of a single constrained column or set of
  3308  // columns before and after the filter was applied.
  3309  func (sb *statisticsBuilder) selectivityFromDistinctCount(
  3310  	colStat, inputColStat *props.ColumnStatistic, inputRowCount float64,
  3311  ) float64 {
  3312  	newDistinct := colStat.DistinctCount
  3313  	oldDistinct := inputColStat.DistinctCount
  3314  
  3315  	// Nulls are included in the distinct count, so remove 1 from the
  3316  	// distinct counts if needed.
  3317  	if inputColStat.NullCount > 0 {
  3318  		oldDistinct = max(oldDistinct-1, 0)
  3319  	}
  3320  	if colStat.NullCount > 0 {
  3321  		newDistinct = max(newDistinct-1, 0)
  3322  	}
  3323  
  3324  	// Calculate the selectivity of the predicate.
  3325  	nonNullSelectivity := fraction(newDistinct, oldDistinct)
  3326  	nullSelectivity := fraction(colStat.NullCount, inputColStat.NullCount)
  3327  	return sb.predicateSelectivity(
  3328  		nonNullSelectivity, nullSelectivity, inputColStat.NullCount, inputRowCount,
  3329  	)
  3330  }
  3331  
  3332  // selectivityFromHistograms is similar to selectivityFromSingleColDistinctCounts,
  3333  // in that it calculates the selectivity of a filter by taking the product of
  3334  // selectivities of each constrained column.
  3335  //
  3336  // For histograms, the selectivity of a constrained column is calculated as
  3337  // (# values in histogram after filter) / (# values in histogram before filter).
  3338  func (sb *statisticsBuilder) selectivityFromHistograms(
  3339  	cols opt.ColSet, e RelExpr, s *props.Statistics,
  3340  ) (selectivity float64) {
  3341  	selectivity = 1.0
  3342  	for col, ok := cols.Next(0); ok; col, ok = cols.Next(col + 1) {
  3343  		colStat, ok := s.ColStats.Lookup(opt.MakeColSet(col))
  3344  		if !ok {
  3345  			continue
  3346  		}
  3347  
  3348  		inputColStat, inputStats := sb.colStatFromInput(colStat.Cols, e)
  3349  		newHist := colStat.Histogram
  3350  		oldHist := inputColStat.Histogram
  3351  		if newHist == nil || oldHist == nil {
  3352  			continue
  3353  		}
  3354  
  3355  		newCount := newHist.ValuesCount()
  3356  		oldCount := oldHist.ValuesCount()
  3357  
  3358  		// Calculate the selectivity of the predicate.
  3359  		nonNullSelectivity := fraction(newCount, oldCount)
  3360  		nullSelectivity := fraction(colStat.NullCount, inputColStat.NullCount)
  3361  		selectivity *= sb.predicateSelectivity(
  3362  			nonNullSelectivity, nullSelectivity, inputColStat.NullCount, inputStats.RowCount,
  3363  		)
  3364  	}
  3365  	return selectivity
  3366  }
  3367  
  3368  // selectivityFromNullsRemoved calculates the selectivity from null-rejecting
  3369  // filters that were not already accounted for in selectivityFromMultiColDistinctCounts
  3370  // or selectivityFromHistograms. The columns for filters already accounted for
  3371  // should be designated by ignoreCols.
  3372  func (sb *statisticsBuilder) selectivityFromNullsRemoved(
  3373  	e RelExpr, relProps *props.Relational, ignoreCols opt.ColSet,
  3374  ) (selectivity float64) {
  3375  	selectivity = 1.0
  3376  	relProps.NotNullCols.ForEach(func(col opt.ColumnID) {
  3377  		if !ignoreCols.Contains(col) {
  3378  			inputColStat, inputStats := sb.colStatFromInput(opt.MakeColSet(col), e)
  3379  			selectivity *= sb.predicateSelectivity(
  3380  				1, /* nonNullSelectivity */
  3381  				0, /* nullSelectivity */
  3382  				inputColStat.NullCount,
  3383  				inputStats.RowCount,
  3384  			)
  3385  		}
  3386  	})
  3387  
  3388  	return selectivity
  3389  }
  3390  
  3391  // predicateSelectivity calculates the selectivity of a predicate, using the
  3392  // following formula:
  3393  //
  3394  //   sel = (output row count) / (input row count)
  3395  //
  3396  // where
  3397  //
  3398  //   output row count =
  3399  //     (fraction of non-null values preserved) * (number of non-null input rows) +
  3400  //     (fraction of null values preserved) * (number of null input rows)
  3401  //
  3402  func (sb *statisticsBuilder) predicateSelectivity(
  3403  	nonNullSelectivity, nullSelectivity, inputNullCount, inputRowCount float64,
  3404  ) float64 {
  3405  	outRowCount := nonNullSelectivity*(inputRowCount-inputNullCount) + nullSelectivity*inputNullCount
  3406  	sel := outRowCount / inputRowCount
  3407  
  3408  	// Avoid setting selectivity to 0. The stats may be stale, and we
  3409  	// can end up with weird and inefficient plans if we estimate 0 rows.
  3410  	return max(sel, epsilon)
  3411  }
  3412  
  3413  // selectivityFromEquivalencies determines the selectivity of equality
  3414  // constraints. It must be called before applyEquivalencies.
  3415  func (sb *statisticsBuilder) selectivityFromEquivalencies(
  3416  	equivReps opt.ColSet, filterFD *props.FuncDepSet, e RelExpr, s *props.Statistics,
  3417  ) (selectivity float64) {
  3418  	selectivity = 1.0
  3419  	equivReps.ForEach(func(i opt.ColumnID) {
  3420  		equivGroup := filterFD.ComputeEquivGroup(i)
  3421  		selectivity *= sb.selectivityFromEquivalency(equivGroup, e, s)
  3422  	})
  3423  	return selectivity
  3424  }
  3425  
  3426  func (sb *statisticsBuilder) selectivityFromEquivalency(
  3427  	equivGroup opt.ColSet, e RelExpr, s *props.Statistics,
  3428  ) (selectivity float64) {
  3429  	// Find the maximum input distinct count for all columns in this equivalency
  3430  	// group.
  3431  	maxDistinctCount := float64(0)
  3432  	equivGroup.ForEach(func(i opt.ColumnID) {
  3433  		// If any of the distinct counts were updated by the filter, we want to use
  3434  		// the updated value.
  3435  		colSet := opt.MakeColSet(i)
  3436  		colStat, ok := s.ColStats.Lookup(colSet)
  3437  		if !ok {
  3438  			colStat, _ = sb.colStatFromInput(colSet, e)
  3439  		}
  3440  		if maxDistinctCount < colStat.DistinctCount {
  3441  			maxDistinctCount = colStat.DistinctCount
  3442  		}
  3443  	})
  3444  	if maxDistinctCount > s.RowCount {
  3445  		maxDistinctCount = s.RowCount
  3446  	}
  3447  
  3448  	// The selectivity of an equality condition var1=var2 is
  3449  	// 1/max(distinct(var1), distinct(var2)).
  3450  	return fraction(1, maxDistinctCount)
  3451  }
  3452  
  3453  // selectivityFromEquivalenciesSemiJoin determines the selectivity of equality
  3454  // constraints on a semi join. It must be called before applyEquivalencies.
  3455  func (sb *statisticsBuilder) selectivityFromEquivalenciesSemiJoin(
  3456  	equivReps, leftOutputCols, rightOutputCols opt.ColSet,
  3457  	filterFD *props.FuncDepSet,
  3458  	e RelExpr,
  3459  	s *props.Statistics,
  3460  ) (selectivity float64) {
  3461  	selectivity = 1.0
  3462  	equivReps.ForEach(func(i opt.ColumnID) {
  3463  		equivGroup := filterFD.ComputeEquivGroup(i)
  3464  		selectivity *= sb.selectivityFromEquivalencySemiJoin(
  3465  			equivGroup, leftOutputCols, rightOutputCols, e, s,
  3466  		)
  3467  	})
  3468  	return selectivity
  3469  }
  3470  
  3471  func (sb *statisticsBuilder) selectivityFromEquivalencySemiJoin(
  3472  	equivGroup, leftOutputCols, rightOutputCols opt.ColSet, e RelExpr, s *props.Statistics,
  3473  ) (selectivity float64) {
  3474  	// Find the minimum (maximum) input distinct count for all columns in this
  3475  	// equivalency group from the right (left).
  3476  	minDistinctCountRight := math.MaxFloat64
  3477  	maxDistinctCountLeft := float64(0)
  3478  	equivGroup.ForEach(func(i opt.ColumnID) {
  3479  		// If any of the distinct counts were updated by the filter, we want to use
  3480  		// the updated value.
  3481  		colSet := opt.MakeColSet(i)
  3482  		colStat, ok := s.ColStats.Lookup(colSet)
  3483  		if !ok {
  3484  			colStat, _ = sb.colStatFromInput(colSet, e)
  3485  		}
  3486  		if leftOutputCols.Contains(i) {
  3487  			if maxDistinctCountLeft < colStat.DistinctCount {
  3488  				maxDistinctCountLeft = colStat.DistinctCount
  3489  			}
  3490  		} else if rightOutputCols.Contains(i) {
  3491  			if minDistinctCountRight > colStat.DistinctCount {
  3492  				minDistinctCountRight = colStat.DistinctCount
  3493  			}
  3494  		}
  3495  	})
  3496  	if maxDistinctCountLeft > s.RowCount {
  3497  		maxDistinctCountLeft = s.RowCount
  3498  	}
  3499  
  3500  	return fraction(minDistinctCountRight, maxDistinctCountLeft)
  3501  }
  3502  
  3503  func (sb *statisticsBuilder) selectivityFromGeoRelationship(
  3504  	e RelExpr, s *props.Statistics,
  3505  ) (selectivity float64) {
  3506  	return unknownGeoRelationshipSelectivity
  3507  }
  3508  
  3509  func (sb *statisticsBuilder) selectivityFromUnappliedConjuncts(
  3510  	numUnappliedConjuncts float64,
  3511  ) (selectivity float64) {
  3512  	return math.Pow(unknownFilterSelectivity, numUnappliedConjuncts)
  3513  }
  3514  
  3515  // tryReduceCols is used to determine which columns to use for selectivity
  3516  // calculation.
  3517  //
  3518  // When columns in the colStats are functionally determined by other columns,
  3519  // and the determinant columns each have distinctCount = 1, we should consider
  3520  // the implied correlations for selectivity calculation. Consider the query:
  3521  //
  3522  //   SELECT * FROM customer WHERE id = 123 and name = 'John Smith'
  3523  //
  3524  // If id is the primary key of customer, then name is functionally determined
  3525  // by id. We only need to consider the selectivity of id, not name, since id
  3526  // and name are fully correlated. To determine if we have a case such as this
  3527  // one, we functionally reduce the set of columns which have column statistics,
  3528  // eliminating columns that can be functionally determined by other columns.
  3529  // If the distinct count on all of these reduced columns is one, then we return
  3530  // this reduced column set to be used for selectivity calculation.
  3531  //
  3532  func (sb *statisticsBuilder) tryReduceCols(
  3533  	cols opt.ColSet, s *props.Statistics, fd *props.FuncDepSet,
  3534  ) opt.ColSet {
  3535  	reducedCols := fd.ReduceCols(cols)
  3536  	if reducedCols.Empty() {
  3537  		// There are no reduced columns so we return the original column set.
  3538  		return cols
  3539  	}
  3540  
  3541  	for i, ok := reducedCols.Next(0); ok; i, ok = reducedCols.Next(i + 1) {
  3542  		colStat, ok := s.ColStats.Lookup(opt.MakeColSet(i))
  3543  		if !ok || colStat.DistinctCount != 1 {
  3544  			// The reduced columns are not all constant, so return the original
  3545  			// column set.
  3546  			return cols
  3547  		}
  3548  	}
  3549  
  3550  	return reducedCols
  3551  }
  3552  
  3553  // tryReduceJoinCols is used to determine which columns to use for join ON
  3554  // condition selectivity calculation. See tryReduceCols.
  3555  func (sb *statisticsBuilder) tryReduceJoinCols(
  3556  	cols opt.ColSet,
  3557  	s *props.Statistics,
  3558  	leftCols, rightCols opt.ColSet,
  3559  	leftFD, rightFD *props.FuncDepSet,
  3560  ) opt.ColSet {
  3561  	leftCols = sb.tryReduceCols(leftCols.Intersection(cols), s, leftFD)
  3562  	rightCols = sb.tryReduceCols(rightCols.Intersection(cols), s, rightFD)
  3563  	return leftCols.Union(rightCols)
  3564  }
  3565  
  3566  func isEqualityWithTwoVars(cond opt.ScalarExpr) bool {
  3567  	if eq, ok := cond.(*EqExpr); ok {
  3568  		return eq.Left.Op() == opt.VariableOp && eq.Right.Op() == opt.VariableOp
  3569  	}
  3570  	return false
  3571  }
  3572  
  3573  // numConjunctsInConstraint returns a rough estimate of the number of conjuncts
  3574  // used to build the given constraint for the column at position nth.
  3575  func (sb *statisticsBuilder) numConjunctsInConstraint(
  3576  	c *constraint.Constraint, nth int,
  3577  ) (numConjuncts float64) {
  3578  	if c.Spans.Count() == 0 {
  3579  		return 0 /* numConjuncts */
  3580  	}
  3581  
  3582  	numConjuncts = math.MaxFloat64
  3583  	for i := 0; i < c.Spans.Count(); i++ {
  3584  		span := c.Spans.Get(i)
  3585  		numSpanConjuncts := float64(0)
  3586  		if span.StartKey().Length() > nth {
  3587  			// Cases of NULL in a constraint should be ignored. For example,
  3588  			// without knowledge of the data distribution, /a: (/NULL - /10] should
  3589  			// have the same estimated selectivity as /a: [/10 - ]. Selectivity
  3590  			// of NULL constraints is handled in selectivityFromMultiColDistinctCounts,
  3591  			// selectivityFromHistograms, and selectivityFromNullsRemoved.
  3592  			if c.Columns.Get(nth).Descending() ||
  3593  				span.StartKey().Value(nth) != tree.DNull {
  3594  				numSpanConjuncts++
  3595  			}
  3596  		}
  3597  		if span.EndKey().Length() > nth {
  3598  			// Ignore cases of NULL in constraints. (see above comment).
  3599  			if !c.Columns.Get(nth).Descending() ||
  3600  				span.EndKey().Value(nth) != tree.DNull {
  3601  				numSpanConjuncts++
  3602  			}
  3603  		}
  3604  		if numSpanConjuncts < numConjuncts {
  3605  			numConjuncts = numSpanConjuncts
  3606  		}
  3607  	}
  3608  
  3609  	return numConjuncts
  3610  }
  3611  
  3612  // RequestColStat causes a column statistic to be calculated on the relational
  3613  // expression. This is used for testing.
  3614  func RequestColStat(evalCtx *tree.EvalContext, e RelExpr, cols opt.ColSet) {
  3615  	var sb statisticsBuilder
  3616  	sb.init(evalCtx, e.Memo().Metadata())
  3617  	sb.colStat(cols, e)
  3618  }