github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/opt/xform/coster.go

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package xform
    12  
    13  import (
    14  	"math"
    15  	"math/rand"
    16  
    17  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    18  	"github.com/cockroachdb/cockroach/pkg/sql/opt"
    19  	"github.com/cockroachdb/cockroach/pkg/sql/opt/cat"
    20  	"github.com/cockroachdb/cockroach/pkg/sql/opt/memo"
    21  	"github.com/cockroachdb/cockroach/pkg/sql/opt/ordering"
    22  	"github.com/cockroachdb/cockroach/pkg/sql/opt/props/physical"
    23  	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
    24  	"github.com/cockroachdb/cockroach/pkg/util"
    25  	"github.com/cockroachdb/cockroach/pkg/util/log"
    26  	"github.com/cockroachdb/errors"
    27  	"golang.org/x/tools/container/intsets"
    28  )
    29  
    30  // Coster is used by the optimizer to assign a cost to a candidate expression
    31  // that can provide a set of required physical properties. If a candidate
    32  // expression has a lower cost than any other expression in the memo group, then
    33  // it becomes the new best expression for the group.
    34  //
    35  // The set of costing formulas maintained by the coster for the set of all
    36  // operators constitute the "cost model". A given cost model can be designed to
    37  // maximize any optimization goal, such as:
    38  //
    39  //   1. Max aggregate cluster throughput (txns/sec across cluster)
    40  //   2. Min transaction latency (time to commit txns)
    41  //   3. Min latency to first row (time to get first row of txns)
    42  //   4. Min memory usage
    43  //   5. Some weighted combination of #1 - #4
    44  //
    45  // The cost model in this file targets #1 as the optimization goal. However,
    46  // note that #2 is implicitly important to that goal, since overall cluster
    47  // throughput will suffer if there are lots of pending transactions waiting on
    48  // I/O.
    49  //
    50  // Coster is an interface so that different costing algorithms can be used by
    51  // the optimizer. For example, the OptSteps command uses a custom coster that
    52  // assigns infinite costs to some expressions in order to prevent them from
    53  // being part of the lowest cost tree (for debugging purposes).
    54  type Coster interface {
    55  	// ComputeCost returns the estimated cost of executing the candidate
    56  	// expression. The optimizer does not expect the cost to correspond to any
    57  	// real-world metric, but does expect costs to be comparable to one another,
    58  	// as well as summable.
    59  	ComputeCost(candidate memo.RelExpr, required *physical.Required) memo.Cost
    60  }
    61  
    62  // coster encapsulates the default cost model for the optimizer. The coster
    63  // assigns an estimated cost to each expression in the memo so that the
    64  // optimizer can choose the lowest cost expression tree. The estimated cost is
    65  // a best-effort approximation of the actual cost of execution, based on table
    66  // and index statistics that are propagated throughout the logical expression
    67  // tree.
    68  type coster struct {
    69  	mem *memo.Memo
    70  
    71  	// locality gives the location of the current node as a set of user-defined
    72  	// key/value pairs, ordered from most inclusive to least inclusive. If there
    73  	// are no tiers, then the node's location is not known. Example:
    74  	//
    75  	//   [region=us,dc=east]
    76  	//
    77  	locality roachpb.Locality
    78  
    79  	// perturbation indicates how much to randomly perturb the cost. It is used
    80  	// to generate alternative plans for testing. For example, if perturbation is
    81  	// 0.5, and the estimated cost of an expression is c, the cost returned by
    82  	// ComputeCost will be in the range [c - 0.5 * c, c + 0.5 * c).
    83  	perturbation float64
    84  }
    85  
    86  var _ Coster = &coster{}
    87  
    88  // MakeDefaultCoster creates an instance of the default coster.
    89  func MakeDefaultCoster(mem *memo.Memo) Coster {
    90  	return &coster{mem: mem}
    91  }
    92  
    93  const (
    94  	// These costs have been copied from the Postgres optimizer:
    95  	// https://github.com/postgres/postgres/blob/master/src/include/optimizer/cost.h
    96  	// TODO(rytaft): "How Good are Query Optimizers, Really?" says that the
    97  	// PostgreSQL ratio between CPU and I/O is probably unrealistic in modern
    98  	// systems since much of the data can be cached in memory. Consider
    99  	// increasing the cpuCostFactor to account for this.
   100  	cpuCostFactor    = 0.01
   101  	seqIOCostFactor  = 1
   102  	randIOCostFactor = 4
   103  
   104  	// TODO(justin): make this more sophisticated.
   105  	// lookupJoinRetrieveRowCost is the cost to retrieve a single row during a
   106  	// lookup join.
   107  	// See https://github.com/cockroachdb/cockroach/pull/35561 for the initial
   108  	// justification for this constant.
   109  	lookupJoinRetrieveRowCost = 2 * seqIOCostFactor
   110  
   111  	// Input rows to a join are processed in batches of this size.
   112  	// See joinreader.go.
   113  	joinReaderBatchSize = 100.0
   114  
   115  	// In the case of a limit hint, a scan will read this multiple of the expected
   116  	// number of rows. See scanNode.limitHint.
   117  	scanSoftLimitMultiplier = 2.0
   118  
   119  	// latencyCostFactor represents the throughput impact of doing scans on an
   120  	// index that may be remotely located in a different locality. If latencies
   121  	// are higher, then overall cluster throughput will suffer somewhat, as there
   122  	// will be more queries in memory blocking on I/O. The impact on throughput
   123  	// is expected to be relatively low, so latencyCostFactor is set to a small
   124  	// value. However, even a low value will cause the optimizer to prefer
   125  	// indexes that are likely to be geographically closer, if they are otherwise
   126  	// the same cost to access.
   127  	// TODO(andyk): Need to do analysis to figure out right value and/or to come
   128  	// up with better way to incorporate latency into the coster.
   129  	latencyCostFactor = cpuCostFactor
   130  
   131  	// hugeCost is used with expressions we want to avoid; these are expressions
   132  	// that "violate" a hint like forcing a specific index or join algorithm.
   133  	// If the final expression has this cost or larger, it means that there was no
   134  	// plan that could satisfy the hints.
   135  	hugeCost memo.Cost = 1e100
   136  
   137  	// Benchmarks showed that some geo functions were at least 10 times
   138  	// slower than some float functions, so this is a somewhat data-backed
   139  	// guess.
   140  	geoFnCost = cpuCostFactor * 10
   141  )
   142  
   143  // Init initializes a new coster structure with the given memo.
   144  func (c *coster) Init(evalCtx *tree.EvalContext, mem *memo.Memo, perturbation float64) {
   145  	c.mem = mem
   146  	c.locality = evalCtx.Locality
   147  	c.perturbation = perturbation
   148  }
   149  
   150  // ComputeCost calculates the estimated cost of the top-level operator in a
   151  // candidate best expression, based on its logical properties and those of its
   152  // children.
   153  //
   154  // Note: each custom function to compute the cost of an operator calculates
   155  // the cost based on Big-O estimated complexity. Most constant factors are
   156  // ignored for now.
   157  func (c *coster) ComputeCost(candidate memo.RelExpr, required *physical.Required) memo.Cost {
   158  	var cost memo.Cost
   159  	switch candidate.Op() {
   160  	case opt.SortOp:
   161  		cost = c.computeSortCost(candidate.(*memo.SortExpr), required)
   162  
   163  	case opt.ScanOp:
   164  		cost = c.computeScanCost(candidate.(*memo.ScanExpr), required)
   165  
   166  	case opt.SelectOp:
   167  		cost = c.computeSelectCost(candidate.(*memo.SelectExpr))
   168  
   169  	case opt.ProjectOp:
   170  		cost = c.computeProjectCost(candidate.(*memo.ProjectExpr))
   171  
   172  	case opt.ValuesOp:
   173  		cost = c.computeValuesCost(candidate.(*memo.ValuesExpr))
   174  
   175  	case opt.InnerJoinOp, opt.LeftJoinOp, opt.RightJoinOp, opt.FullJoinOp,
   176  		opt.SemiJoinOp, opt.AntiJoinOp, opt.InnerJoinApplyOp, opt.LeftJoinApplyOp,
   177  		opt.SemiJoinApplyOp, opt.AntiJoinApplyOp:
   178  		// All join ops use hash join by default.
   179  		cost = c.computeHashJoinCost(candidate)
   180  
   181  	case opt.MergeJoinOp:
   182  		cost = c.computeMergeJoinCost(candidate.(*memo.MergeJoinExpr))
   183  
   184  	case opt.IndexJoinOp:
   185  		cost = c.computeIndexJoinCost(candidate.(*memo.IndexJoinExpr))
   186  
   187  	case opt.LookupJoinOp:
   188  		cost = c.computeLookupJoinCost(candidate.(*memo.LookupJoinExpr), required)
   189  
   190  	case opt.GeoLookupJoinOp:
   191  		cost = c.computeGeoLookupJoinCost(candidate.(*memo.GeoLookupJoinExpr), required)
   192  
   193  	case opt.ZigzagJoinOp:
   194  		cost = c.computeZigzagJoinCost(candidate.(*memo.ZigzagJoinExpr))
   195  
   196  	case opt.UnionOp, opt.IntersectOp, opt.ExceptOp,
   197  		opt.UnionAllOp, opt.IntersectAllOp, opt.ExceptAllOp:
   198  		cost = c.computeSetCost(candidate)
   199  
   200  	case opt.GroupByOp, opt.ScalarGroupByOp, opt.DistinctOnOp, opt.EnsureDistinctOnOp,
   201  		opt.UpsertDistinctOnOp, opt.EnsureUpsertDistinctOnOp:
   202  		cost = c.computeGroupingCost(candidate, required)
   203  
   204  	case opt.LimitOp:
   205  		cost = c.computeLimitCost(candidate.(*memo.LimitExpr))
   206  
   207  	case opt.OffsetOp:
   208  		cost = c.computeOffsetCost(candidate.(*memo.OffsetExpr))
   209  
   210  	case opt.OrdinalityOp:
   211  		cost = c.computeOrdinalityCost(candidate.(*memo.OrdinalityExpr))
   212  
   213  	case opt.ProjectSetOp:
   214  		cost = c.computeProjectSetCost(candidate.(*memo.ProjectSetExpr))
   215  
   216  	case opt.ExplainOp:
   217  		// Technically, the cost of an Explain operation is independent of the cost
   218  		// of the underlying plan. However, we want to explain the plan we would get
   219  		// without EXPLAIN, i.e. the lowest cost plan. So do nothing special to get
   220  		// default behavior.
   221  	}
   222  
   223  	// Add a one-time cost for any operator, meant to reflect the cost of setting
   224  	// up execution for the operator. This makes plans with fewer operators
   225  	// preferable, all else being equal.
   226  	cost += cpuCostFactor
   227  
   228  	if !cost.Less(memo.MaxCost) {
   229  		// Optsteps uses MaxCost to suppress nodes in the memo. When a node with
   230  		// MaxCost is added to the memo, it can lead to an obscure crash with an
   231  		// unknown node. We'd rather detect this early.
   232  		panic(errors.AssertionFailedf("node %s with MaxCost added to the memo", log.Safe(candidate.Op())))
   233  	}
   234  
   235  	if c.perturbation != 0 {
   236  		// Don't perturb the cost if we are forcing an index.
   237  		if cost < hugeCost {
   238  			// Get a random value in the range [-1.0, 1.0)
   239  			multiplier := 2*rand.Float64() - 1
   240  
   241  			// If perturbation is p, and the estimated cost of an expression is c,
   242  			// the new cost is in the range [max(0, c - pc), c + pc). For example,
   243  			// if p=1.5, the new cost is in the range [0, c + 1.5 * c).
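        			// As a hypothetical illustration: with p=0.5 and c=10, multiplier
        			// falls in [-1.0, 1.0), so the perturbed cost lands in [5, 15).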
   244  			cost += cost * memo.Cost(c.perturbation*multiplier)
   245  			// The cost must always be >= 0.
   246  			if cost < 0 {
   247  				cost = 0
   248  			}
   249  		}
   250  	}
   251  
   252  	return cost
   253  }
   254  
   255  func (c *coster) computeSortCost(sort *memo.SortExpr, required *physical.Required) memo.Cost {
   256  	// We calculate the cost of a segmented sort. We assume each segment has
   257  	// the same size, (rowCount / numSegments). We also calculate the
   258  	// per-row cost. The cost of the sort is:
   259  	//
   260  	// perRowCost * (rowCount + (segmentSize * log2(segmentSize) * numOrderedSegments))
   261  	//
   262  	// The constant term is necessary for cases where the estimated row count is
   263  	// very small.
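        	// As a rough, hypothetical illustration: with rowCount=1000 and no input
        	// ordering (numSegments=1, segmentSize=1000), the parenthesized term is
        	// 1000 + 1000*log2(1000) ~= 10966, which is then scaled by perRowCost.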
   264  	// TODO(rytaft): This is the cost of a local, in-memory sort. When a
   265  	// certain amount of memory is used, distsql switches to a disk-based sort
   266  	// with a temp RocksDB store.
   267  
   268  	numSegments := c.countSegments(sort)
   269  	cost := memo.Cost(0)
   270  	stats := sort.Relational().Stats
   271  	rowCount := stats.RowCount
   272  	perRowCost := c.rowSortCost(len(required.Ordering.Columns) - len(sort.InputOrdering.Columns))
   273  
   274  	if !sort.InputOrdering.Any() {
   275  		// Add the cost for finding the segments.
   276  		cost += memo.Cost(float64(len(sort.InputOrdering.Columns))*rowCount) * cpuCostFactor
   277  	}
   278  
   279  	segmentSize := rowCount / numSegments
   280  	if segmentSize > 1 {
   281  		cost += memo.Cost(segmentSize) * (memo.Cost(math.Log2(segmentSize)) * memo.Cost(numSegments))
   282  	}
   283  	cost = perRowCost * (memo.Cost(rowCount) + cost)
   284  	return cost
   285  }
   286  
   287  func (c *coster) computeScanCost(scan *memo.ScanExpr, required *physical.Required) memo.Cost {
   288  	// Scanning an index with a few columns is faster than scanning an index with
   289  	// many columns. Ideally, we would want to use statistics about the size of
   290  	// each column. In lieu of that, use the number of columns.
   291  	if scan.Flags.ForceIndex && scan.Flags.Index != scan.Index {
   292  		// If we are forcing an index, any other index has a very high cost. In
   293  		// practice, this will only happen when this is a primary index scan.
   294  		return hugeCost
   295  	}
   296  	rowCount := scan.Relational().Stats.RowCount
   297  	perRowCost := c.rowScanCost(scan.Table, scan.Index, scan.Cols.Len())
   298  
   299  	if required.LimitHint != 0 {
   300  		rowCount = math.Min(rowCount, required.LimitHint*scanSoftLimitMultiplier)
   301  	}
   302  
   303  	if ordering.ScanIsReverse(scan, &required.Ordering) {
   304  		if rowCount > 1 {
   305  			// Need to do binary search to seek to the previous row.
   306  			perRowCost += memo.Cost(math.Log2(rowCount)) * cpuCostFactor
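        			// For example (hypothetical): rowCount=1024 adds log2(1024) = 10
        			// comparisons, i.e. 10*cpuCostFactor = 0.1 per row.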
   307  		}
   308  	}
   309  
   310  	// Add a small cost if the scan is unconstrained, so all else being equal, we
   311  	// will prefer a constrained scan. This is important if our row count
   312  	// estimate turns out to be smaller than the actual row count.
   313  	var preferConstrainedScanCost memo.Cost
   314  	if scan.Constraint == nil || scan.Constraint.IsUnconstrained() {
   315  		preferConstrainedScanCost = cpuCostFactor
   316  	}
   317  	return memo.Cost(rowCount)*(seqIOCostFactor+perRowCost) + preferConstrainedScanCost
   318  }
   319  
   320  func (c *coster) computeSelectCost(sel *memo.SelectExpr) memo.Cost {
   321  	// The filter has to be evaluated on each input row.
   322  	inputRowCount := sel.Input.Relational().Stats.RowCount
   323  	cost := memo.Cost(inputRowCount) * cpuCostFactor
   324  	return cost
   325  }
   326  
   327  func (c *coster) computeProjectCost(prj *memo.ProjectExpr) memo.Cost {
   328  	// Each synthesized column causes an expression to be evaluated on each row.
   329  	rowCount := prj.Relational().Stats.RowCount
   330  	synthesizedColCount := len(prj.Projections)
   331  	cost := memo.Cost(rowCount) * memo.Cost(synthesizedColCount) * cpuCostFactor
   332  
   333  	// Add the CPU cost of emitting the rows.
   334  	cost += memo.Cost(rowCount) * cpuCostFactor
   335  	return cost
   336  }
   337  
   338  func (c *coster) computeValuesCost(values *memo.ValuesExpr) memo.Cost {
   339  	return memo.Cost(values.Relational().Stats.RowCount) * cpuCostFactor
   340  }
   341  
   342  func (c *coster) computeHashJoinCost(join memo.RelExpr) memo.Cost {
   343  	if !join.Private().(*memo.JoinPrivate).Flags.Has(memo.AllowHashJoinStoreRight) {
   344  		return hugeCost
   345  	}
   346  	leftRowCount := join.Child(0).(memo.RelExpr).Relational().Stats.RowCount
   347  	rightRowCount := join.Child(1).(memo.RelExpr).Relational().Stats.RowCount
   348  
   349  	// A hash join must process every row from both tables once.
   350  	//
   351  	// We add some factors to account for the hashtable build and lookups. The
   352  	// right side is the one stored in the hashtable, so we use a larger factor
   353  	// for that side. This ensures that a join with the smaller right side is
   354  	// preferred to the symmetric join.
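        	// As a hypothetical illustration: with leftRowCount=1000 and
        	// rightRowCount=500, this term is (1.25*1000 + 1.75*500)*cpuCostFactor =
        	// 21.25, versus 23.75 with the sides swapped, so storing the smaller
        	// side on the right wins.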
   355  	//
   356  	// TODO(rytaft): This is the cost of an in-memory hash join. When a certain
   357  	// amount of memory is used, distsql switches to a disk-based hash join with
   358  	// a temp RocksDB store.
   359  	cost := memo.Cost(1.25*leftRowCount+1.75*rightRowCount) * cpuCostFactor
   360  
   361  	// Add the CPU cost of emitting the rows.
   362  	rowsProcessed, ok := c.mem.RowsProcessed(join)
   363  	if !ok {
   364  		// This can happen as part of testing. In this case just return the number
   365  		// of rows.
   366  		rowsProcessed = join.Relational().Stats.RowCount
   367  	}
   368  	cost += memo.Cost(rowsProcessed) * cpuCostFactor
   369  
   370  	// Compute filter cost. Fetch the equality columns so they can be
   371  	// ignored later.
   372  	on := join.Child(2).(*memo.FiltersExpr)
   373  	leftEq, rightEq := memo.ExtractJoinEqualityColumns(
   374  		join.Child(0).(memo.RelExpr).Relational().OutputCols,
   375  		join.Child(1).(memo.RelExpr).Relational().OutputCols,
   376  		*on,
   377  	)
   378  	// Generate a quick way to lookup if two columns are join equality
   379  	// columns. We add in both directions because we don't know which way
   380  	// the equality filters will be defined.
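        	// For example, for ON a=x the map holds both a->x and x->a, so the
        	// corresponding EqOp filter is skipped in computeFiltersCost regardless
        	// of which side each variable appears on.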
   381  	eqMap := util.FastIntMap{}
   382  	for i := range leftEq {
   383  		left := int(leftEq[i])
   384  		right := int(rightEq[i])
   385  		eqMap.Set(left, right)
   386  		eqMap.Set(right, left)
   387  	}
   388  	cost += c.computeFiltersCost(*on, eqMap)
   389  
   390  	return cost
   391  }
   392  
   393  func (c *coster) computeMergeJoinCost(join *memo.MergeJoinExpr) memo.Cost {
   394  	leftRowCount := join.Left.Relational().Stats.RowCount
   395  	rightRowCount := join.Right.Relational().Stats.RowCount
   396  
   397  	cost := memo.Cost(leftRowCount+rightRowCount) * cpuCostFactor
   398  
   399  	// Add the CPU cost of emitting the rows.
   400  	rowsProcessed, ok := c.mem.RowsProcessed(join)
   401  	if !ok {
   402  		// We shouldn't ever get here. Since we don't allow the memo
   403  		// to be optimized twice, the coster should never be used after
   404  		// logPropsBuilder.clear() is called.
   405  		panic(errors.AssertionFailedf("could not get rows processed for merge join"))
   406  	}
   407  	cost += memo.Cost(rowsProcessed) * cpuCostFactor
   408  
   409  	cost += c.computeFiltersCost(join.On, util.FastIntMap{})
   410  	return cost
   411  }
   412  
   413  func (c *coster) computeIndexJoinCost(join *memo.IndexJoinExpr) memo.Cost {
   414  	leftRowCount := join.Input.Relational().Stats.RowCount
   415  
   416  	// The rows in the (left) input are used to probe into the (right) table.
   417  	// Since the matching rows in the table may not all be in the same range, this
   418  	// counts as random I/O.
   419  	perRowCost := cpuCostFactor + randIOCostFactor +
   420  		c.rowScanCost(join.Table, cat.PrimaryIndex, join.Cols.Len())
   421  	return memo.Cost(leftRowCount) * perRowCost
   422  }
   423  
   424  func (c *coster) computeLookupJoinCost(
   425  	join *memo.LookupJoinExpr, required *physical.Required,
   426  ) memo.Cost {
   427  	lookupCount := join.Input.Relational().Stats.RowCount
   428  
   429  	// Take into account that the "internal" row count is higher, according to
   430  	// the selectivities of the conditions. In particular, we need to ignore
   431  	// left-over conditions that are not selective.
   432  	// For example:
   433  	//   ab JOIN xy ON a=x AND x=10
   434  	// becomes (during normalization):
   435  	//   ab JOIN xy ON a=x AND a=10 AND x=10
   436  	// which can become a lookup join with left-over condition x=10 which doesn't
   437  	// actually filter anything.
   438  	rowsProcessed, ok := c.mem.RowsProcessed(join)
   439  	if !ok {
   440  		// We shouldn't ever get here. Since we don't allow the memo
   441  		// to be optimized twice, the coster should never be used after
   442  		// logPropsBuilder.clear() is called.
   443  		panic(errors.AssertionFailedf("could not get rows processed for lookup join"))
   444  	}
   445  
   446  	// Lookup joins can return early if enough rows have been found. An otherwise
   447  	// expensive lookup join might have a lower cost if its limit hint estimates
   448  	// that most rows will not be needed.
   449  	if required.LimitHint != 0 && lookupCount > 0 {
   450  		outputRows := join.Relational().Stats.RowCount
   451  		unlimitedLookupCount := lookupCount
   452  		lookupCount = lookupJoinInputLimitHint(unlimitedLookupCount, outputRows, required.LimitHint)
   453  		// We scale the number of rows processed by the same factor (we are
   454  		// calculating the average number of rows processed per lookup and
   455  		// multiplying by the new lookup count).
   456  		rowsProcessed = (rowsProcessed / unlimitedLookupCount) * lookupCount
   457  	}
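        	// As a hypothetical illustration: with unlimitedLookupCount=1000,
        	// outputRows=10000, and LimitHint=100, lookupJoinInputLimitHint returns
        	// ceil((100*1000/10000)/100)*100 = 100 lookups, and rowsProcessed is
        	// scaled by 100/1000 = 0.1.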
   458  
   459  	// The rows in the (left) input are used to probe into the (right) table.
   460  	// Since the matching rows in the table may not all be in the same range, this
   461  	// counts as random I/O.
   462  	perLookupCost := memo.Cost(randIOCostFactor)
   463  	if !join.LookupColsAreTableKey {
   464  		// If the lookup columns don't form a key, execution will have to limit
   465  		// KV batches which prevents running requests to multiple nodes in parallel.
   466  		// An experiment on a 4 node cluster with a table with 100k rows split into
   467  		// 100 ranges showed that a "non-parallel" lookup join is about 5 times
   468  		// slower.
   469  		perLookupCost *= 5
   470  	}
   471  	cost := memo.Cost(lookupCount) * perLookupCost
   472  
   473  	// Each lookup might retrieve many rows; add the IO cost of retrieving the
   474  	// rows (relevant when we expect many resulting rows per lookup) and the CPU
   475  	// cost of emitting the rows.
   476  	numLookupCols := join.Cols.Difference(join.Input.Relational().OutputCols).Len()
   477  	perRowCost := lookupJoinRetrieveRowCost +
   478  		c.rowScanCost(join.Table, join.Index, numLookupCols)
   479  
   480  	cost += memo.Cost(rowsProcessed) * perRowCost
   481  
   482  	cost += c.computeFiltersCost(join.On, util.FastIntMap{})
   483  	return cost
   484  }
   485  
   486  func (c *coster) computeGeoLookupJoinCost(
   487  	join *memo.GeoLookupJoinExpr, required *physical.Required,
   488  ) memo.Cost {
   489  	lookupCount := join.Input.Relational().Stats.RowCount
   490  
   491  	// Take into account that the "internal" row count is higher, according to
   492  	// the selectivities of the conditions. In particular, we need to ignore
   493  	// the conditions that don't affect the number of rows processed.
   494  	// A contrived example, where gid is a SERIAL PK:
   495  	//   nyc_census_blocks c JOIN nyc_neighborhoods n ON
   496  	//   ST_Intersects(c.geom, n.geom) AND c.gid < n.gid
   497  	// which can become a lookup join with left-over condition c.gid <
   498  	// n.gid.
   499  	rowsProcessed, ok := c.mem.RowsProcessed(join)
   500  	if !ok {
   501  		// We shouldn't ever get here. Since we don't allow the memo
   502  		// to be optimized twice, the coster should never be used after
   503  		// logPropsBuilder.clear() is called.
   504  		panic(errors.AssertionFailedf("could not get rows processed for geolookup join"))
   505  	}
   506  
   507  	// Lookup joins can return early if enough rows have been found. An otherwise
   508  	// expensive lookup join might have a lower cost if its limit hint estimates
   509  	// that most rows will not be needed.
   510  	if required.LimitHint != 0 && lookupCount > 0 {
   511  		outputRows := join.Relational().Stats.RowCount
   512  		unlimitedLookupCount := lookupCount
   513  		lookupCount = lookupJoinInputLimitHint(unlimitedLookupCount, outputRows, required.LimitHint)
   514  		// We scale the number of rows processed by the same factor (we are
   515  		// calculating the average number of rows processed per lookup and
   516  		// multiplying by the new lookup count).
   517  		rowsProcessed = (rowsProcessed / unlimitedLookupCount) * lookupCount
   518  	}
   519  
   520  	// The rows in the (left) input are used to probe into the (right) table.
   521  	// Since the matching rows in the table may not all be in the same range, this
   522  	// counts as random I/O.
   523  	perLookupCost := memo.Cost(randIOCostFactor)
   524  	// Since inverted indexes can't form a key, execution will have to
   525  	// limit KV batches which prevents running requests to multiple nodes
   526  	// in parallel.  An experiment on a 4 node cluster with a table with
   527  	// 100k rows split into 100 ranges showed that a "non-parallel" lookup
   528  	// join is about 5 times slower.
   529  	perLookupCost *= 5
   530  	cost := memo.Cost(lookupCount) * perLookupCost
   531  
   532  	// Each lookup might retrieve many rows; add the IO cost of retrieving the
   533  	// rows (relevant when we expect many resulting rows per lookup) and the CPU
   534  	// cost of emitting the rows.
   535  	numLookupCols := join.Cols.Difference(join.Input.Relational().OutputCols).Len()
   536  	perRowCost := lookupJoinRetrieveRowCost +
   537  		c.rowScanCost(join.Table, join.Index, numLookupCols)
   538  	cost += memo.Cost(rowsProcessed) * perRowCost
   539  
   540  	// We don't add the result of computeFiltersCost to perRowCost because
   541  	// otherwise the 1 that is added to rowsProcessed would either have
   542  	// to be removed or be multiplied by all of the other various costs in
   543  	// perRowCost. To be consistent with other joins, keep it separate.
   544  	cost += c.computeFiltersCost(join.On, util.FastIntMap{}) * memo.Cost(1+rowsProcessed)
   545  	return cost
   546  }
   547  
   548  // computeFiltersCost returns the per-row cost of executing a filter. Callers
   549  // of this function should multiply its output by the number of rows expected
   550  // to be filtered + 1. The + 1 accounts for a setup cost and is useful for
   551  // comparing costs of filters with very low row counts.
   552  // TODO: account for per-row costs in all callers.
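        // For example, computeGeoLookupJoinCost multiplies the result by
        // (1 + rowsProcessed); most other callers currently just add the result
        // (see the TODO above).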
   553  func (c *coster) computeFiltersCost(filters memo.FiltersExpr, eqMap util.FastIntMap) memo.Cost {
   554  	var cost memo.Cost
   555  	for i := range filters {
   556  		f := &filters[i]
   557  		switch f.Condition.Op() {
   558  		case opt.EqOp:
   559  			eq := f.Condition.(*memo.EqExpr)
   560  			leftVar, ok := eq.Left.(*memo.VariableExpr)
   561  			if !ok {
   562  				break
   563  			}
   564  			rightVar, ok := eq.Right.(*memo.VariableExpr)
   565  			if !ok {
   566  				break
   567  			}
   568  			if val, ok := eqMap.Get(int(leftVar.Col)); ok && val == int(rightVar.Col) {
   569  				// Equality filters on some joins are still in
   570  				// filters, while others have already removed
   571  				// them. They do not cost anything.
   572  				continue
   573  			}
   574  		case opt.FunctionOp:
   575  			if IsGeoIndexFunction(f.Condition) {
   576  				cost += geoFnCost
   577  			}
   578  			// TODO(mjibson): do we need to cost other functions?
   579  		}
   580  
   581  		// Add a constant "setup" cost per ON condition to account for the fact that
   582  		// the rowsProcessed estimate alone cannot effectively discriminate between
   583  		// plans when RowCount is too small.
   584  		// TODO: perhaps separate the one-time and per-row costs and
   585  		// return them separately.
   586  		cost += cpuCostFactor
   587  	}
   588  	return cost
   589  }
   590  
   591  func (c *coster) computeZigzagJoinCost(join *memo.ZigzagJoinExpr) memo.Cost {
   592  	rowCount := join.Relational().Stats.RowCount
   593  
   594  	// Assume the upper bound on scan cost to be the sum of the cost of
   595  	// scanning the two constituent indexes. To determine how many columns
   596  	// are returned from each scan, intersect the output column set join.Cols
   597  	// with each side's IndexColumns. Columns present in both indexes are
   598  	// projected from the left side only.
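        	// As a hypothetical illustration: if join.Cols = {a,b,c}, the left index
        	// covers {a,b}, and the right index covers {b,c}, then leftCols = {a,b}
        	// and rightCols = {c} (b is projected from the left side only).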
   599  	md := c.mem.Metadata()
   600  	leftCols := md.TableMeta(join.LeftTable).IndexColumns(join.LeftIndex)
   601  	leftCols.IntersectionWith(join.Cols)
   602  	rightCols := md.TableMeta(join.RightTable).IndexColumns(join.RightIndex)
   603  	rightCols.IntersectionWith(join.Cols)
   604  	rightCols.DifferenceWith(leftCols)
   605  	scanCost := c.rowScanCost(join.LeftTable, join.LeftIndex, leftCols.Len())
   606  	scanCost += c.rowScanCost(join.RightTable, join.RightIndex, rightCols.Len())
   607  
   608  	// Double the cost of emitting rows as well as the cost of seeking rows,
   609  	// given two indexes will be accessed.
   610  	cost := memo.Cost(rowCount) * (2*(cpuCostFactor+seqIOCostFactor) + scanCost)
   611  
   612  	cost += c.computeFiltersCost(join.On, util.FastIntMap{})
   613  	return cost
   614  }
   615  
   616  func (c *coster) computeSetCost(set memo.RelExpr) memo.Cost {
   617  	// Add the CPU cost of emitting the rows.
   618  	cost := memo.Cost(set.Relational().Stats.RowCount) * cpuCostFactor
   619  
   620  	// A set operation must process every row from both tables once.
   621  	// UnionAll can avoid any extra computation, but all other set operations
   622  	// must perform a hash table lookup or update for each input row.
   623  	if set.Op() != opt.UnionAllOp {
   624  		leftRowCount := set.Child(0).(memo.RelExpr).Relational().Stats.RowCount
   625  		rightRowCount := set.Child(1).(memo.RelExpr).Relational().Stats.RowCount
   626  		cost += memo.Cost(leftRowCount+rightRowCount) * cpuCostFactor
   627  	}
   628  
   629  	return cost
   630  }
   631  
   632  func (c *coster) computeGroupingCost(grouping memo.RelExpr, required *physical.Required) memo.Cost {
   633  	// Start with some extra fixed overhead, since the grouping operators have
   634  	// setup overhead that is greater than other operators like Project. This
   635  	// can matter for rules like ReplaceMaxWithLimit.
   636  	cost := memo.Cost(cpuCostFactor)
   637  
   638  	// Add the CPU cost of emitting the rows.
   639  	cost += memo.Cost(grouping.Relational().Stats.RowCount) * cpuCostFactor
   640  
   641  	// GroupBy must process each input row once. Cost per row depends on the
   642  	// number of grouping columns and the number of aggregates.
   643  	inputRowCount := grouping.Child(0).(memo.RelExpr).Relational().Stats.RowCount
   644  	aggsCount := grouping.Child(1).ChildCount()
   645  	private := grouping.Private().(*memo.GroupingPrivate)
   646  	groupingColCount := private.GroupingCols.Len()
   647  	cost += memo.Cost(inputRowCount) * memo.Cost(aggsCount+groupingColCount) * cpuCostFactor
   648  
   649  	if groupingColCount > 0 {
   650  		// Add a cost that reflects the use of a hash table - unless we are doing a
   651  		// streaming aggregation where all the grouping columns are ordered; we
   652  		// interpolate linearly if only part of the grouping columns are ordered.
   653  		//
   654  		// The cost is chosen so that it's always less than the cost to sort the
   655  		// input.
   656  		hashCost := memo.Cost(inputRowCount) * cpuCostFactor
   657  		n := len(ordering.StreamingGroupingColOrdering(private, &required.Ordering))
   658  		// n = 0:                factor = 1
   659  		// n = groupingColCount: factor = 0
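        		// For example (hypothetical): with inputRowCount=1000,
        		// groupingColCount=4, and n=1, the factor is 0.75 and
        		// hashCost = 1000*cpuCostFactor*0.75 = 7.5.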
   660  		hashCost *= 1 - memo.Cost(n)/memo.Cost(groupingColCount)
   661  		cost += hashCost
   662  	}
   663  
   664  	return cost
   665  }
   666  
   667  func (c *coster) computeLimitCost(limit *memo.LimitExpr) memo.Cost {
   668  	// Add the CPU cost of emitting the rows.
   669  	cost := memo.Cost(limit.Relational().Stats.RowCount) * cpuCostFactor
   670  	return cost
   671  }
   672  
   673  func (c *coster) computeOffsetCost(offset *memo.OffsetExpr) memo.Cost {
   674  	// Add the CPU cost of emitting the rows.
   675  	cost := memo.Cost(offset.Relational().Stats.RowCount) * cpuCostFactor
   676  	return cost
   677  }
   678  
   679  func (c *coster) computeOrdinalityCost(ord *memo.OrdinalityExpr) memo.Cost {
   680  	// Add the CPU cost of emitting the rows.
   681  	cost := memo.Cost(ord.Relational().Stats.RowCount) * cpuCostFactor
   682  	return cost
   683  }
   684  
   685  func (c *coster) computeProjectSetCost(projectSet *memo.ProjectSetExpr) memo.Cost {
   686  	// Add the CPU cost of emitting the rows.
   687  	cost := memo.Cost(projectSet.Relational().Stats.RowCount) * cpuCostFactor
   688  	return cost
   689  }
   690  
   691  // countSegments calculates the number of segments that will be used to execute
   692  // the sort. If no input ordering is provided, there's only one segment.
   693  func (c *coster) countSegments(sort *memo.SortExpr) float64 {
   694  	if sort.InputOrdering.Any() {
   695  		return 1
   696  	}
   697  	stats := sort.Relational().Stats
   698  	orderedCols := sort.InputOrdering.ColSet()
   699  	orderedStats, ok := stats.ColStats.Lookup(orderedCols)
   700  	if !ok {
   701  		orderedStats, ok = c.mem.RequestColStat(sort, orderedCols)
   702  		if !ok {
   703  			// I don't think we can ever get here. Since we don't allow the memo
   704  			// to be optimized twice, the coster should never be used after
   705  			// logPropsBuilder.clear() is called.
   706  			panic(errors.AssertionFailedf("could not request the stats for ColSet %v", orderedCols))
   707  		}
   708  	}
   709  
   710  	return orderedStats.DistinctCount
   711  }
   712  
   713  // rowSortCost is the CPU cost to sort one row, which depends on the number of
   714  // columns in the sort key.
   715  func (c *coster) rowSortCost(numKeyCols int) memo.Cost {
   716  	// Sorting involves comparisons on the key columns, but the cost isn't
   717  	// directly proportional: we only compare the second column if the rows are
   718  	// equal on the first column; and so on. We also account for a fixed
   719  	// "non-comparison" cost related to processing the
   720  	// row. The formula is:
   721  	//
   722  	//   cpuCostFactor * [ 1 + Sum eqProb^(i-1) with i=1 to numKeyCols ]
   723  	//
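        	// For example (hypothetical): with numKeyCols=3 this comes to
        	// cpuCostFactor*(1 + 1 + 0.1 + 0.01) = 0.0211 per row.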
   724  	const eqProb = 0.1
   725  	cost := cpuCostFactor
   726  	for i, c := 0, cpuCostFactor; i < numKeyCols; i, c = i+1, c*eqProb {
   727  		// c is cpuCostFactor * eqProb^i.
   728  		cost += c
   729  	}
   730  
   731  	// There is a fixed "non-comparison" cost and a comparison cost proportional
   732  	// to the key columns. Note that the cost has to be high enough so that a
   733  	// sort is almost always more expensive than a reverse scan or an index scan.
   734  	return memo.Cost(cost)
   735  }
   736  
   737  // rowScanCost is the CPU cost to scan one row, which depends on the number of
   738  // columns in the index and (to a lesser extent) on the number of columns we are
   739  // scanning.
   740  func (c *coster) rowScanCost(tabID opt.TableID, idxOrd int, numScannedCols int) memo.Cost {
   741  	md := c.mem.Metadata()
   742  	tab := md.Table(tabID)
   743  	idx := tab.Index(idxOrd)
   744  	numCols := idx.ColumnCount()
   745  
   746  	// Adjust cost based on how well the current locality matches the index's
   747  	// zone constraints.
   748  	var costFactor memo.Cost = cpuCostFactor
   749  	if !tab.IsVirtualTable() && len(c.locality.Tiers) != 0 {
   750  		// If 0% of locality tiers have matching constraints, then add additional
   751  		// cost. If 100% of locality tiers have matching constraints, then add no
   752  		// additional cost. Anything in between is proportional to the number of
   753  		// matches.
   754  		adjustment := 1.0 - localityMatchScore(idx.Zone(), c.locality)
   755  		costFactor += latencyCostFactor * memo.Cost(adjustment)
   756  	}
   757  
   758  	// The number of the columns in the index matter because more columns means
   759  	// more data to scan. The number of columns we actually return also matters
   760  	// because that is the amount of data that we could potentially transfer over
   761  	// the network.
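        	// As a hypothetical illustration: a 7-column index scanned for 4 columns
        	// with no locality adjustment costs (7+4)*cpuCostFactor = 0.11 per row.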
   762  	return memo.Cost(numCols+numScannedCols) * costFactor
   763  }
   764  
   765  // localityMatchScore returns a number from 0.0 to 1.0 that describes how well
   766  // the current node's locality matches the given zone constraints and
   767  // leaseholder preferences, with 0.0 indicating 0% and 1.0 indicating 100%. This
   768  // is the basic algorithm:
   769  //
   770  //   t = total # of locality tiers
   771  //
   772  //   Match each locality tier against the constraint set, and compute a value
   773  //   for each tier:
   774  //
   775  //      0 = key not present in constraint set or key matches prohibited
   776  //          constraint, but value doesn't match
   777  //     +1 = key matches required constraint, and value does match
   778  //     -1 = otherwise
   779  //
   780  //   m = length of longest locality prefix that ends in a +1 value and doesn't
   781  //       contain a -1 value.
   782  //
   783  //   Compute "m" for both the ReplicaConstraints constraints set, as well as for
   784  //   the LeasePreferences constraints set:
   785  //
   786  //     constraint-score = m / t
   787  //     lease-pref-score = m / t
   788  //
   789  //   if there are no lease preferences, then final-score = constraint-score
   790  //   else final-score = (constraint-score * 2 + lease-pref-score) / 3
   791  //
   792  // Here are some scoring examples:
   793  //
   794  //   Locality = region=us,dc=east
   795  //   0.0 = []                     // No constraints to match
   796  //   0.0 = [+region=eu,+dc=uk]    // None of the tiers match
   797  //   0.0 = [+region=eu,+dc=east]  // 2nd tier matches, but 1st tier doesn't
   798  //   0.0 = [-region=us,+dc=east]  // 1st tier matches PROHIBITED constraint
   799  //   0.0 = [-region=eu]           // 1st tier PROHIBITED and non-matching
   800  //   0.5 = [+region=us]           // 1st tier matches
   801  //   0.5 = [+region=us,-dc=east]  // 1st tier matches, 2nd tier PROHIBITED
   802  //   0.5 = [+region=us,+dc=west]  // 1st tier matches, but 2nd tier doesn't
   803  //   1.0 = [+region=us,+dc=east]  // Both tiers match
   804  //   1.0 = [+dc=east]             // 2nd tier matches, no constraints for 1st
   805  //   1.0 = [+region=us,+dc=east,+rack=1,-ssd]  // Extra constraints ignored
   806  //
   807  // Note that constraints need not be specified in any particular order, so all
   808  // constraints are scanned when matching each locality tier. In cases where
   809  // there are multiple replica constraint groups (i.e. where a subset of replicas
   810  // can have different constraints than another subset), the minimum constraint
   811  // score among the groups is used.
   812  //
   813  // While matching leaseholder preferences are considered in the final score,
   814  // leaseholder preferences are not guaranteed, so its score is weighted at half
   815  // of the replica constraint score, in order to reflect the possibility that the
   816  // leaseholder has moved from the preferred location.
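        // For example (hypothetical), with Locality = region=us,dc=east, replica
        // constraints [+region=us], and lease preference [+region=us,+dc=east]:
        // constraint-score = 1/2 and lease-pref-score = 2/2, so the final score is
        // (0.5*2 + 1.0)/3 ~= 0.67.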
   817  func localityMatchScore(zone cat.Zone, locality roachpb.Locality) float64 {
   818  	// Fast path: if there are no constraints or leaseholder preferences, then
   819  	// locality can't match.
   820  	if zone.ReplicaConstraintsCount() == 0 && zone.LeasePreferenceCount() == 0 {
   821  		return 0.0
   822  	}
   823  
   824  	// matchTier matches a tier to a set of constraints and returns:
   825  	//
   826  	//    0 = key not present in constraint set or key only matches prohibited
   827  	//        constraints where value doesn't match
   828  	//   +1 = key matches any required constraint key + value
   829  	//   -1 = otherwise
   830  	//
   831  	matchTier := func(tier roachpb.Tier, set cat.ConstraintSet) int {
   832  		foundNoMatch := false
   833  		for j, n := 0, set.ConstraintCount(); j < n; j++ {
   834  			con := set.Constraint(j)
   835  			if con.GetKey() != tier.Key {
   836  				// Ignore constraints that don't have matching key.
   837  				continue
   838  			}
   839  
   840  			if con.GetValue() == tier.Value {
   841  				if !con.IsRequired() {
   842  					// Matching prohibited constraint, so result is -1.
   843  					return -1
   844  				}
   845  
   846  				// Matching required constraint, so result is +1.
   847  				return +1
   848  			}
   849  
   850  			if con.IsRequired() {
   851  				// Remember that non-matching required constraint was found.
   852  				foundNoMatch = true
   853  			}
   854  		}
   855  
   856  		if foundNoMatch {
   857  			// At least one non-matching required constraint was found, and no
   858  			// matching constraints.
   859  			return -1
   860  		}
   861  
   862  		// Key not present in constraint set, or key only matches prohibited
   863  		// constraints where value doesn't match.
   864  		return 0
   865  	}
   866  
   867  	// matchConstraints returns the number of tiers that match the given
   868  	// constraint set ("m" in algorithm described above).
   869  	matchConstraints := func(set cat.ConstraintSet) int {
   870  		matchCount := 0
   871  		for i, tier := range locality.Tiers {
   872  			switch matchTier(tier, set) {
   873  			case +1:
   874  				matchCount = i + 1
   875  			case -1:
   876  				return matchCount
   877  			}
   878  		}
   879  		return matchCount
   880  	}
   881  
   882  	// Score any replica constraints.
   883  	var constraintScore float64
   884  	if zone.ReplicaConstraintsCount() != 0 {
   885  		// Iterate over the replica constraints and determine the minimum value
   886  		// returned by matchConstraints for any replica. For example:
   887  		//
   888  		//   3: [+region=us,+dc=east]
   889  		//   2: [+region=us]
   890  		//
   891  		// For the [region=us,dc=east] locality, the result is min(2, 1).
   892  		minCount := intsets.MaxInt
   893  		for i := 0; i < zone.ReplicaConstraintsCount(); i++ {
   894  			matchCount := matchConstraints(zone.ReplicaConstraints(i))
   895  			if matchCount < minCount {
   896  				minCount = matchCount
   897  			}
   898  		}
   899  
   900  		constraintScore = float64(minCount) / float64(len(locality.Tiers))
   901  	}
   902  
   903  	// If there are no lease preferences, then use replica constraint score.
   904  	if zone.LeasePreferenceCount() == 0 {
   905  		return constraintScore
   906  	}
   907  
   908  	// Score the first lease preference, if one is available. Ignore subsequent
   909  	// lease preferences, since they only apply in edge cases.
   910  	matchCount := matchConstraints(zone.LeasePreference(0))
   911  	leaseScore := float64(matchCount) / float64(len(locality.Tiers))
   912  
   913  	// Weight the constraintScore twice as much as the lease score.
   914  	return (constraintScore*2 + leaseScore) / 3
   915  }
   916  
   917  // lookupJoinInputLimitHint calculates an appropriate limit hint for the input
   918  // to a lookup join.
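        // For example (hypothetical): with inputRowCount=500, outputRowCount=2000,
        // and outputLimitHint=20, the estimated lookup count is 20*500/2000 = 5,
        // which rounds up to one joinReaderBatchSize batch of 100, capped at
        // min(500, 100) = 100.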
   919  func lookupJoinInputLimitHint(inputRowCount, outputRowCount, outputLimitHint float64) float64 {
   920  	if outputRowCount == 0 {
   921  		return 0
   922  	}
   923  
   924  	// Estimate the number of lookups needed to output LimitHint rows.
   925  	expectedLookupCount := outputLimitHint * inputRowCount / outputRowCount
   926  
   927  	// Round up to the nearest multiple of a batch.
   928  	expectedLookupCount = math.Ceil(expectedLookupCount/joinReaderBatchSize) * joinReaderBatchSize
   929  	return math.Min(inputRowCount, expectedLookupCount)
   930  }