github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/opt/xform/coster.go

// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package xform

import (
	"math"
	"math/rand"

	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/sql/opt"
	"github.com/cockroachdb/cockroach/pkg/sql/opt/cat"
	"github.com/cockroachdb/cockroach/pkg/sql/opt/memo"
	"github.com/cockroachdb/cockroach/pkg/sql/opt/ordering"
	"github.com/cockroachdb/cockroach/pkg/sql/opt/props/physical"
	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
	"github.com/cockroachdb/cockroach/pkg/util"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/errors"
	"golang.org/x/tools/container/intsets"
)

// Coster is used by the optimizer to assign a cost to a candidate expression
// that can provide a set of required physical properties. If a candidate
// expression has a lower cost than any other expression in the memo group, then
// it becomes the new best expression for the group.
//
// The set of costing formulas maintained by the coster for the set of all
// operators constitute the "cost model". A given cost model can be designed to
// maximize any optimization goal, such as:
//
//   1. Max aggregate cluster throughput (txns/sec across cluster)
//   2. Min transaction latency (time to commit txns)
//   3. Min latency to first row (time to get first row of txns)
//   4. Min memory usage
//   5. Some weighted combination of #1 - #4
//
// The cost model in this file targets #1 as the optimization goal. However,
// note that #2 is implicitly important to that goal, since overall cluster
// throughput will suffer if there are lots of pending transactions waiting on
// I/O.
//
// Coster is an interface so that different costing algorithms can be used by
// the optimizer. For example, the OptSteps command uses a custom coster that
// assigns infinite costs to some expressions in order to prevent them from
// being part of the lowest cost tree (for debugging purposes).
type Coster interface {
	// ComputeCost returns the estimated cost of executing the candidate
	// expression. The optimizer does not expect the cost to correspond to any
	// real-world metric, but does expect costs to be comparable to one another,
	// as well as summable.
	ComputeCost(candidate memo.RelExpr, required *physical.Required) memo.Cost
}

// coster encapsulates the default cost model for the optimizer. The coster
// assigns an estimated cost to each expression in the memo so that the
// optimizer can choose the lowest cost expression tree. The estimated cost is
// a best-effort approximation of the actual cost of execution, based on table
// and index statistics that are propagated throughout the logical expression
// tree.
type coster struct {
	mem *memo.Memo

	// locality gives the location of the current node as a set of user-defined
	// key/value pairs, ordered from most inclusive to least inclusive. If there
	// are no tiers, then the node's location is not known. Example:
	//
	//   [region=us,dc=east]
	//
	locality roachpb.Locality

	// perturbation indicates how much to randomly perturb the cost. It is used
	// to generate alternative plans for testing. For example, if perturbation is
	// 0.5, and the estimated cost of an expression is c, the cost returned by
	// ComputeCost will be in the range [c - 0.5 * c, c + 0.5 * c).
	perturbation float64
}

var _ Coster = &coster{}

// MakeDefaultCoster creates an instance of the default coster.
func MakeDefaultCoster(mem *memo.Memo) Coster {
	return &coster{mem: mem}
}
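// A minimal sketch of how a coster is typically wired up (hypothetical caller;
// evalCtx, mem, and candidate are assumed to be in scope — the optimizer does
// this internally via Init and ComputeCost):
//
//   var c coster
//   c.Init(evalCtx, mem, 0 /* perturbation */)
//   cost := c.ComputeCost(candidate, &physical.Required{})
//
// Costs are only meaningful relative to one another, so callers compare the
// returned values rather than interpreting them in absolute terms.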
const (
	// These costs have been copied from the Postgres optimizer:
	// https://github.com/postgres/postgres/blob/master/src/include/optimizer/cost.h
	// TODO(rytaft): "How Good are Query Optimizers, Really?" says that the
	// PostgreSQL ratio between CPU and I/O is probably unrealistic in modern
	// systems since much of the data can be cached in memory. Consider
	// increasing the cpuCostFactor to account for this.
	cpuCostFactor    = 0.01
	seqIOCostFactor  = 1
	randIOCostFactor = 4

	// TODO(justin): make this more sophisticated.
	// lookupJoinRetrieveRowCost is the cost to retrieve a single row during a
	// lookup join.
	// See https://github.com/cockroachdb/cockroach/pull/35561 for the initial
	// justification for this constant.
	lookupJoinRetrieveRowCost = 2 * seqIOCostFactor

	// Input rows to a join are processed in batches of this size.
	// See joinreader.go.
	joinReaderBatchSize = 100.0

	// In the case of a limit hint, a scan will read this multiple of the expected
	// number of rows. See scanNode.limitHint.
	scanSoftLimitMultiplier = 2.0

	// latencyCostFactor represents the throughput impact of doing scans on an
	// index that may be remotely located in a different locality. If latencies
	// are higher, then overall cluster throughput will suffer somewhat, as there
	// will be more queries in memory blocking on I/O. The impact on throughput
	// is expected to be relatively low, so latencyCostFactor is set to a small
	// value. However, even a low value will cause the optimizer to prefer
	// indexes that are likely to be geographically closer, if they are otherwise
	// the same cost to access.
	// TODO(andyk): Need to do analysis to figure out right value and/or to come
	// up with better way to incorporate latency into the coster.
	latencyCostFactor = cpuCostFactor

	// hugeCost is used with expressions we want to avoid; these are expressions
	// that "violate" a hint like forcing a specific index or join algorithm.
	// If the final expression has this cost or larger, it means that there was no
	// plan that could satisfy the hints.
	hugeCost memo.Cost = 1e100

	// Some benchmarks showed that some geo functions were at least 10 times
	// slower than some float functions, so this is a somewhat data-backed
	// guess.
	geoFnCost = cpuCostFactor * 10
)
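// To make the relative magnitudes concrete: with these factors, reading a row
// sequentially is 100x more expensive than one unit of CPU work
// (seqIOCostFactor / cpuCostFactor = 1 / 0.01), and a random seek costs 4x a
// sequential read. These ratios, rather than the absolute values, are what
// drive plan choices.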
// Init initializes a new coster structure with the given memo.
func (c *coster) Init(evalCtx *tree.EvalContext, mem *memo.Memo, perturbation float64) {
	c.mem = mem
	c.locality = evalCtx.Locality
	c.perturbation = perturbation
}

// ComputeCost calculates the estimated cost of the top-level operator in a
// candidate best expression, based on its logical properties and those of its
// children.
//
// Note: each custom function to compute the cost of an operator calculates
// the cost based on Big-O estimated complexity. Most constant factors are
// ignored for now.
func (c *coster) ComputeCost(candidate memo.RelExpr, required *physical.Required) memo.Cost {
	var cost memo.Cost
	switch candidate.Op() {
	case opt.SortOp:
		cost = c.computeSortCost(candidate.(*memo.SortExpr), required)

	case opt.ScanOp:
		cost = c.computeScanCost(candidate.(*memo.ScanExpr), required)

	case opt.SelectOp:
		cost = c.computeSelectCost(candidate.(*memo.SelectExpr))

	case opt.ProjectOp:
		cost = c.computeProjectCost(candidate.(*memo.ProjectExpr))

	case opt.ValuesOp:
		cost = c.computeValuesCost(candidate.(*memo.ValuesExpr))

	case opt.InnerJoinOp, opt.LeftJoinOp, opt.RightJoinOp, opt.FullJoinOp,
		opt.SemiJoinOp, opt.AntiJoinOp, opt.InnerJoinApplyOp, opt.LeftJoinApplyOp,
		opt.SemiJoinApplyOp, opt.AntiJoinApplyOp:
		// All join ops use hash join by default.
		cost = c.computeHashJoinCost(candidate)

	case opt.MergeJoinOp:
		cost = c.computeMergeJoinCost(candidate.(*memo.MergeJoinExpr))

	case opt.IndexJoinOp:
		cost = c.computeIndexJoinCost(candidate.(*memo.IndexJoinExpr))

	case opt.LookupJoinOp:
		cost = c.computeLookupJoinCost(candidate.(*memo.LookupJoinExpr), required)

	case opt.GeoLookupJoinOp:
		cost = c.computeGeoLookupJoinCost(candidate.(*memo.GeoLookupJoinExpr), required)

	case opt.ZigzagJoinOp:
		cost = c.computeZigzagJoinCost(candidate.(*memo.ZigzagJoinExpr))

	case opt.UnionOp, opt.IntersectOp, opt.ExceptOp,
		opt.UnionAllOp, opt.IntersectAllOp, opt.ExceptAllOp:
		cost = c.computeSetCost(candidate)

	case opt.GroupByOp, opt.ScalarGroupByOp, opt.DistinctOnOp, opt.EnsureDistinctOnOp,
		opt.UpsertDistinctOnOp, opt.EnsureUpsertDistinctOnOp:
		cost = c.computeGroupingCost(candidate, required)

	case opt.LimitOp:
		cost = c.computeLimitCost(candidate.(*memo.LimitExpr))

	case opt.OffsetOp:
		cost = c.computeOffsetCost(candidate.(*memo.OffsetExpr))

	case opt.OrdinalityOp:
		cost = c.computeOrdinalityCost(candidate.(*memo.OrdinalityExpr))

	case opt.ProjectSetOp:
		cost = c.computeProjectSetCost(candidate.(*memo.ProjectSetExpr))

	case opt.ExplainOp:
		// Technically, the cost of an Explain operation is independent of the cost
		// of the underlying plan. However, we want to explain the plan we would get
		// without EXPLAIN, i.e. the lowest cost plan. So do nothing special to get
		// default behavior.
	}

	// Add a one-time cost for any operator, meant to reflect the cost of setting
	// up execution for the operator. This makes plans with fewer operators
	// preferable, all else being equal.
	cost += cpuCostFactor

	if !cost.Less(memo.MaxCost) {
		// OptSteps uses MaxCost to suppress nodes in the memo. When a node with
		// MaxCost is added to the memo, it can lead to an obscure crash with an
		// unknown node. We'd rather detect this early.
		panic(errors.AssertionFailedf("node %s with MaxCost added to the memo", log.Safe(candidate.Op())))
	}

	if c.perturbation != 0 {
		// Don't perturb the cost if we are forcing an index.
		if cost < hugeCost {
			// Get a random value in the range [-1.0, 1.0).
			multiplier := 2*rand.Float64() - 1

			// If perturbation is p, and the estimated cost of an expression is c,
			// the new cost is in the range [max(0, c - pc), c + pc). For example,
			// if p=1.5, the new cost is in the range [0, c + 1.5 * c).
			cost += cost * memo.Cost(c.perturbation*multiplier)
			// The cost must always be >= 0.
			if cost < 0 {
				cost = 0
			}
		}
	}

	return cost
}
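// A concrete illustration of the perturbation arithmetic above (numbers chosen
// for the example): with perturbation p = 0.5 and an estimated cost c = 100,
// multiplier lands in [-1, 1), so the perturbed cost is
// 100 + 100*0.5*multiplier, i.e. somewhere in [50, 150). The clamp at zero
// only matters when p > 1.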
func (c *coster) computeSortCost(sort *memo.SortExpr, required *physical.Required) memo.Cost {
	// We calculate the cost of a segmented sort. We assume each segment has the
	// same size, (rowCount / numSegments). We also calculate the per-row cost.
	// The cost of the sort is:
	//
	//   perRowCost * (rowCount + (segmentSize * log2(segmentSize) * numSegments))
	//
	// The constant term is necessary for cases where the estimated row count is
	// very small.
	// TODO(rytaft): This is the cost of a local, in-memory sort. When a
	// certain amount of memory is used, distsql switches to a disk-based sort
	// with a temp RocksDB store.

	numSegments := c.countSegments(sort)
	cost := memo.Cost(0)
	stats := sort.Relational().Stats
	rowCount := stats.RowCount
	perRowCost := c.rowSortCost(len(required.Ordering.Columns) - len(sort.InputOrdering.Columns))

	if !sort.InputOrdering.Any() {
		// Add the cost for finding the segments.
		cost += memo.Cost(float64(len(sort.InputOrdering.Columns))*rowCount) * cpuCostFactor
	}

	segmentSize := rowCount / numSegments
	if segmentSize > 1 {
		cost += memo.Cost(segmentSize) * (memo.Cost(math.Log2(segmentSize)) * memo.Cost(numSegments))
	}
	cost = perRowCost * (memo.Cost(rowCount) + cost)
	return cost
}
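// A worked example of the segmented sort cost (illustrative numbers): with
// rowCount = 1000 and an input ordering that splits the input into
// numSegments = 10 ordered segments, segmentSize = 100, so the log-linear term
// is 100 * log2(100) * 10 ≈ 6644. Each segment is sorted independently, which
// is why a partially ordered input is cheaper than a full sort of 1000 rows
// (1000 * log2(1000) ≈ 9966).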
func (c *coster) computeScanCost(scan *memo.ScanExpr, required *physical.Required) memo.Cost {
	// Scanning an index with a few columns is faster than scanning an index with
	// many columns. Ideally, we would want to use statistics about the size of
	// each column. In lieu of that, use the number of columns.
	if scan.Flags.ForceIndex && scan.Flags.Index != scan.Index {
		// If we are forcing an index, any other index has a very high cost. In
		// practice, this will only happen when this is a primary index scan.
		return hugeCost
	}
	rowCount := scan.Relational().Stats.RowCount
	perRowCost := c.rowScanCost(scan.Table, scan.Index, scan.Cols.Len())

	if required.LimitHint != 0 {
		rowCount = math.Min(rowCount, required.LimitHint*scanSoftLimitMultiplier)
	}

	if ordering.ScanIsReverse(scan, &required.Ordering) {
		if rowCount > 1 {
			// Need to do binary search to seek to the previous row.
			perRowCost += memo.Cost(math.Log2(rowCount)) * cpuCostFactor
		}
	}

	// Add a small cost if the scan is unconstrained, so all else being equal, we
	// will prefer a constrained scan. This is important if our row count
	// estimate turns out to be smaller than the actual row count.
	var preferConstrainedScanCost memo.Cost
	if scan.Constraint == nil || scan.Constraint.IsUnconstrained() {
		preferConstrainedScanCost = cpuCostFactor
	}
	return memo.Cost(rowCount)*(seqIOCostFactor+perRowCost) + preferConstrainedScanCost
}
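// For intuition (illustrative numbers, assuming the locality fully matches the
// index's zone so no latency adjustment applies): a forward scan of 1000 rows
// over a 3-column index reading all 3 columns costs roughly
// 1000 * (seqIOCostFactor + (3+3)*cpuCostFactor) = 1000 * 1.06 = 1060, while
// the reverse variant adds log2(1000) * cpuCostFactor ≈ 0.1 per row for the
// backward seek.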
func (c *coster) computeSelectCost(sel *memo.SelectExpr) memo.Cost {
	// The filter has to be evaluated on each input row.
	inputRowCount := sel.Input.Relational().Stats.RowCount
	cost := memo.Cost(inputRowCount) * cpuCostFactor
	return cost
}

func (c *coster) computeProjectCost(prj *memo.ProjectExpr) memo.Cost {
	// Each synthesized column causes an expression to be evaluated on each row.
	rowCount := prj.Relational().Stats.RowCount
	synthesizedColCount := len(prj.Projections)
	cost := memo.Cost(rowCount) * memo.Cost(synthesizedColCount) * cpuCostFactor

	// Add the CPU cost of emitting the rows.
	cost += memo.Cost(rowCount) * cpuCostFactor
	return cost
}

func (c *coster) computeValuesCost(values *memo.ValuesExpr) memo.Cost {
	return memo.Cost(values.Relational().Stats.RowCount) * cpuCostFactor
}

func (c *coster) computeHashJoinCost(join memo.RelExpr) memo.Cost {
	if !join.Private().(*memo.JoinPrivate).Flags.Has(memo.AllowHashJoinStoreRight) {
		return hugeCost
	}
	leftRowCount := join.Child(0).(memo.RelExpr).Relational().Stats.RowCount
	rightRowCount := join.Child(1).(memo.RelExpr).Relational().Stats.RowCount

	// A hash join must process every row from both tables once.
	//
	// We add some factors to account for the hashtable build and lookups. The
	// right side is the one stored in the hashtable, so we use a larger factor
	// for that side. This ensures that a join with the smaller right side is
	// preferred to the symmetric join.
	//
	// TODO(rytaft): This is the cost of an in-memory hash join. When a certain
	// amount of memory is used, distsql switches to a disk-based hash join with
	// a temp RocksDB store.
	cost := memo.Cost(1.25*leftRowCount+1.75*rightRowCount) * cpuCostFactor

	// Add the CPU cost of emitting the rows.
	rowsProcessed, ok := c.mem.RowsProcessed(join)
	if !ok {
		// This can happen as part of testing. In this case just return the number
		// of rows.
		rowsProcessed = join.Relational().Stats.RowCount
	}
	cost += memo.Cost(rowsProcessed) * cpuCostFactor

	// Compute filter cost. Fetch the equality columns so they can be
	// ignored later.
	on := join.Child(2).(*memo.FiltersExpr)
	leftEq, rightEq := memo.ExtractJoinEqualityColumns(
		join.Child(0).(memo.RelExpr).Relational().OutputCols,
		join.Child(1).(memo.RelExpr).Relational().OutputCols,
		*on,
	)
	// Generate a quick way to look up whether two columns are join equality
	// columns. We add in both directions because we don't know which way
	// the equality filters will be defined.
	eqMap := util.FastIntMap{}
	for i := range leftEq {
		left := int(leftEq[i])
		right := int(rightEq[i])
		eqMap.Set(left, right)
		eqMap.Set(right, left)
	}
	cost += c.computeFiltersCost(*on, eqMap)

	return cost
}
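// The asymmetric 1.25/1.75 factors encode a preference for storing the smaller
// input in the hash table. For example (illustrative numbers), joining
// left = 10000 rows with right = 100 rows gives a base cost of
// (1.25*10000 + 1.75*100) * cpuCostFactor = 126.75, whereas the mirrored join
// costs (1.25*100 + 1.75*10000) * cpuCostFactor = 176.25, so the plan that
// stores the small side wins.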
func (c *coster) computeMergeJoinCost(join *memo.MergeJoinExpr) memo.Cost {
	leftRowCount := join.Left.Relational().Stats.RowCount
	rightRowCount := join.Right.Relational().Stats.RowCount

	cost := memo.Cost(leftRowCount+rightRowCount) * cpuCostFactor

	// Add the CPU cost of emitting the rows.
	rowsProcessed, ok := c.mem.RowsProcessed(join)
	if !ok {
		// We shouldn't ever get here. Since we don't allow the memo
		// to be optimized twice, the coster should never be used after
		// logPropsBuilder.clear() is called.
		panic(errors.AssertionFailedf("could not get rows processed for merge join"))
	}
	cost += memo.Cost(rowsProcessed) * cpuCostFactor

	cost += c.computeFiltersCost(join.On, util.FastIntMap{})
	return cost
}

func (c *coster) computeIndexJoinCost(join *memo.IndexJoinExpr) memo.Cost {
	leftRowCount := join.Input.Relational().Stats.RowCount

	// The rows in the (left) input are used to probe into the (right) table.
	// Since the matching rows in the table may not all be in the same range, this
	// counts as random I/O.
	perRowCost := cpuCostFactor + randIOCostFactor +
		c.rowScanCost(join.Table, cat.PrimaryIndex, join.Cols.Len())
	return memo.Cost(leftRowCount) * perRowCost
}

func (c *coster) computeLookupJoinCost(
	join *memo.LookupJoinExpr, required *physical.Required,
) memo.Cost {
	lookupCount := join.Input.Relational().Stats.RowCount

	// Take into account that the "internal" row count is higher, according to
	// the selectivities of the conditions. In particular, we need to ignore
	// left-over conditions that are not selective.
	// For example:
	//   ab JOIN xy ON a=x AND x=10
	// becomes (during normalization):
	//   ab JOIN xy ON a=x AND a=10 AND x=10
	// which can become a lookup join with left-over condition x=10 which doesn't
	// actually filter anything.
	rowsProcessed, ok := c.mem.RowsProcessed(join)
	if !ok {
		// We shouldn't ever get here. Since we don't allow the memo
		// to be optimized twice, the coster should never be used after
		// logPropsBuilder.clear() is called.
		panic(errors.AssertionFailedf("could not get rows processed for lookup join"))
	}

	// Lookup joins can return early if enough rows have been found. An otherwise
	// expensive lookup join might have a lower cost if its limit hint estimates
	// that most rows will not be needed.
	if required.LimitHint != 0 && lookupCount > 0 {
		outputRows := join.Relational().Stats.RowCount
		unlimitedLookupCount := lookupCount
		lookupCount = lookupJoinInputLimitHint(unlimitedLookupCount, outputRows, required.LimitHint)
		// We scale the number of rows processed by the same factor (we are
		// calculating the average number of rows processed per lookup and
		// multiplying by the new lookup count).
		rowsProcessed = (rowsProcessed / unlimitedLookupCount) * lookupCount
	}

	// The rows in the (left) input are used to probe into the (right) table.
	// Since the matching rows in the table may not all be in the same range, this
	// counts as random I/O.
	perLookupCost := memo.Cost(randIOCostFactor)
	if !join.LookupColsAreTableKey {
		// If the lookup columns don't form a key, execution will have to limit
		// KV batches which prevents running requests to multiple nodes in parallel.
		// An experiment on a 4 node cluster with a table with 100k rows split into
		// 100 ranges showed that a "non-parallel" lookup join is about 5 times
		// slower.
		perLookupCost *= 5
	}
	cost := memo.Cost(lookupCount) * perLookupCost

	// Each lookup might retrieve many rows; add the IO cost of retrieving the
	// rows (relevant when we expect many resulting rows per lookup) and the CPU
	// cost of emitting the rows.
	numLookupCols := join.Cols.Difference(join.Input.Relational().OutputCols).Len()
	perRowCost := lookupJoinRetrieveRowCost +
		c.rowScanCost(join.Table, join.Index, numLookupCols)

	cost += memo.Cost(rowsProcessed) * perRowCost

	cost += c.computeFiltersCost(join.On, util.FastIntMap{})
	return cost
}
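// An illustration of the limit-hint scaling above (made-up numbers): with
// lookupCount = 1000 input rows producing outputRows = 5000 and a limit hint
// of 50, lookupJoinInputLimitHint estimates 50*1000/5000 = 10 lookups, rounds
// up to one joinReaderBatchSize batch (100), and rowsProcessed is scaled by
// the same 100/1000 factor, so only a tenth of the per-row cost is charged.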
func (c *coster) computeGeoLookupJoinCost(
	join *memo.GeoLookupJoinExpr, required *physical.Required,
) memo.Cost {
	lookupCount := join.Input.Relational().Stats.RowCount

	// Take into account that the "internal" row count is higher, according to
	// the selectivities of the conditions. In particular, we need to ignore
	// the conditions that don't affect the number of rows processed.
	// A contrived example, where gid is a SERIAL PK:
	//   nyc_census_blocks c JOIN nyc_neighborhoods n ON
	//   ST_Intersects(c.geom, n.geom) AND c.gid < n.gid
	// which can become a lookup join with left-over condition c.gid < n.gid.
	rowsProcessed, ok := c.mem.RowsProcessed(join)
	if !ok {
		// We shouldn't ever get here. Since we don't allow the memo
		// to be optimized twice, the coster should never be used after
		// logPropsBuilder.clear() is called.
		panic(errors.AssertionFailedf("could not get rows processed for geolookup join"))
	}

	// Lookup joins can return early if enough rows have been found. An otherwise
	// expensive lookup join might have a lower cost if its limit hint estimates
	// that most rows will not be needed.
	if required.LimitHint != 0 && lookupCount > 0 {
		outputRows := join.Relational().Stats.RowCount
		unlimitedLookupCount := lookupCount
		lookupCount = lookupJoinInputLimitHint(unlimitedLookupCount, outputRows, required.LimitHint)
		// We scale the number of rows processed by the same factor (we are
		// calculating the average number of rows processed per lookup and
		// multiplying by the new lookup count).
		rowsProcessed = (rowsProcessed / unlimitedLookupCount) * lookupCount
	}

	// The rows in the (left) input are used to probe into the (right) table.
	// Since the matching rows in the table may not all be in the same range, this
	// counts as random I/O.
	perLookupCost := memo.Cost(randIOCostFactor)
	// Since inverted indexes can't form a key, execution will have to
	// limit KV batches which prevents running requests to multiple nodes
	// in parallel. An experiment on a 4 node cluster with a table with
	// 100k rows split into 100 ranges showed that a "non-parallel" lookup
	// join is about 5 times slower.
	perLookupCost *= 5
	cost := memo.Cost(lookupCount) * perLookupCost

	// Each lookup might retrieve many rows; add the IO cost of retrieving the
	// rows (relevant when we expect many resulting rows per lookup) and the CPU
	// cost of emitting the rows.
	numLookupCols := join.Cols.Difference(join.Input.Relational().OutputCols).Len()
	perRowCost := lookupJoinRetrieveRowCost +
		c.rowScanCost(join.Table, join.Index, numLookupCols)
	cost += memo.Cost(rowsProcessed) * perRowCost

	// We don't add the result of computeFiltersCost to perRowCost because
	// otherwise the 1 that is added to rowsProcessed would either have
	// to be removed or be multiplied by all of the other various costs in
	// perRowCost. To be consistent with other joins, keep it separate.
	cost += c.computeFiltersCost(join.On, util.FastIntMap{}) * memo.Cost(1+rowsProcessed)
	return cost
}

// computeFiltersCost returns the per-row cost of executing a filter. Callers
// of this function should multiply its output by the number of rows expected
// to be filtered + 1. The + 1 accounts for a setup cost and is useful for
// comparing costs of filters with very low row counts.
// TODO: account for per-row costs in all callers.
func (c *coster) computeFiltersCost(filters memo.FiltersExpr, eqMap util.FastIntMap) memo.Cost {
	var cost memo.Cost
	for i := range filters {
		f := &filters[i]
		switch f.Condition.Op() {
		case opt.EqOp:
			eq := f.Condition.(*memo.EqExpr)
			leftVar, ok := eq.Left.(*memo.VariableExpr)
			if !ok {
				break
			}
			rightVar, ok := eq.Right.(*memo.VariableExpr)
			if !ok {
				break
			}
			if val, ok := eqMap.Get(int(leftVar.Col)); ok && val == int(rightVar.Col) {
				// Equality filters on some joins are still in
				// filters, while others have already removed
				// them. They do not cost anything.
				continue
			}
		case opt.FunctionOp:
			if IsGeoIndexFunction(f.Condition) {
				cost += geoFnCost
			}
			// TODO(mjibson): do we need to cost other functions?
		}

		// Add a constant "setup" cost per ON condition to account for the fact that
		// the rowsProcessed estimate alone cannot effectively discriminate between
		// plans when RowCount is too small.
		// TODO: perhaps separate the one-time and per-row costs and
		// return them separately.
		cost += cpuCostFactor
	}
	return cost
}
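// For example, given ON a = x with equality columns (a, x), the eqMap built by
// computeHashJoinCost contains both a->x and x->a, so the EqOp branch above
// recognizes the condition as an already-costed join equality and charges
// nothing for it; a leftover variable-to-constant filter like x = 10 falls
// through the cast checks and still pays the per-condition cpuCostFactor.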
func (c *coster) computeZigzagJoinCost(join *memo.ZigzagJoinExpr) memo.Cost {
	rowCount := join.Relational().Stats.RowCount

	// Assume the upper bound on scan cost to be the sum of the cost of
	// scanning the two constituent indexes. To determine how many columns
	// are returned from each scan, intersect the output column set join.Cols
	// with each side's IndexColumns. Columns present in both indexes are
	// projected from the left side only.
	md := c.mem.Metadata()
	leftCols := md.TableMeta(join.LeftTable).IndexColumns(join.LeftIndex)
	leftCols.IntersectionWith(join.Cols)
	rightCols := md.TableMeta(join.RightTable).IndexColumns(join.RightIndex)
	rightCols.IntersectionWith(join.Cols)
	rightCols.DifferenceWith(leftCols)
	scanCost := c.rowScanCost(join.LeftTable, join.LeftIndex, leftCols.Len())
	scanCost += c.rowScanCost(join.RightTable, join.RightIndex, rightCols.Len())

	// Double the cost of emitting rows as well as the cost of seeking rows,
	// given two indexes will be accessed.
	cost := memo.Cost(rowCount) * (2*(cpuCostFactor+seqIOCostFactor) + scanCost)

	cost += c.computeFiltersCost(join.On, util.FastIntMap{})
	return cost
}

func (c *coster) computeSetCost(set memo.RelExpr) memo.Cost {
	// Add the CPU cost of emitting the rows.
	cost := memo.Cost(set.Relational().Stats.RowCount) * cpuCostFactor

	// A set operation must process every row from both tables once.
	// UnionAll can avoid any extra computation, but all other set operations
	// must perform a hash table lookup or update for each input row.
	if set.Op() != opt.UnionAllOp {
		leftRowCount := set.Child(0).(memo.RelExpr).Relational().Stats.RowCount
		rightRowCount := set.Child(1).(memo.RelExpr).Relational().Stats.RowCount
		cost += memo.Cost(leftRowCount+rightRowCount) * cpuCostFactor
	}

	return cost
}

func (c *coster) computeGroupingCost(grouping memo.RelExpr, required *physical.Required) memo.Cost {
	// Start with some extra fixed overhead, since the grouping operators have
	// setup overhead that is greater than other operators like Project. This
	// can matter for rules like ReplaceMaxWithLimit.
	cost := memo.Cost(cpuCostFactor)

	// Add the CPU cost of emitting the rows.
	cost += memo.Cost(grouping.Relational().Stats.RowCount) * cpuCostFactor

	// GroupBy must process each input row once. Cost per row depends on the
	// number of grouping columns and the number of aggregates.
	inputRowCount := grouping.Child(0).(memo.RelExpr).Relational().Stats.RowCount
	aggsCount := grouping.Child(1).ChildCount()
	private := grouping.Private().(*memo.GroupingPrivate)
	groupingColCount := private.GroupingCols.Len()
	cost += memo.Cost(inputRowCount) * memo.Cost(aggsCount+groupingColCount) * cpuCostFactor

	if groupingColCount > 0 {
		// Add a cost that reflects the use of a hash table - unless we are doing a
		// streaming aggregation where all the grouping columns are ordered; we
		// interpolate linearly if only part of the grouping columns are ordered.
		//
		// The cost is chosen so that it's always less than the cost to sort the
		// input.
		hashCost := memo.Cost(inputRowCount) * cpuCostFactor
		n := len(ordering.StreamingGroupingColOrdering(private, &required.Ordering))
		// n = 0:                factor = 1
		// n = groupingColCount: factor = 0
		hashCost *= 1 - memo.Cost(n)/memo.Cost(groupingColCount)
		cost += hashCost
	}

	return cost
}
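// Example of the streaming interpolation (illustrative numbers): with
// groupingColCount = 4 and an input ordering that covers n = 2 of the grouping
// columns, the hash-table cost is scaled by 1 - 2/4 = 0.5; a fully ordered
// input (n = 4) makes the aggregation purely streaming and the hash cost drops
// to zero.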
func (c *coster) computeLimitCost(limit *memo.LimitExpr) memo.Cost {
	// Add the CPU cost of emitting the rows.
	cost := memo.Cost(limit.Relational().Stats.RowCount) * cpuCostFactor
	return cost
}

func (c *coster) computeOffsetCost(offset *memo.OffsetExpr) memo.Cost {
	// Add the CPU cost of emitting the rows.
	cost := memo.Cost(offset.Relational().Stats.RowCount) * cpuCostFactor
	return cost
}

func (c *coster) computeOrdinalityCost(ord *memo.OrdinalityExpr) memo.Cost {
	// Add the CPU cost of emitting the rows.
	cost := memo.Cost(ord.Relational().Stats.RowCount) * cpuCostFactor
	return cost
}

func (c *coster) computeProjectSetCost(projectSet *memo.ProjectSetExpr) memo.Cost {
	// Add the CPU cost of emitting the rows.
	cost := memo.Cost(projectSet.Relational().Stats.RowCount) * cpuCostFactor
	return cost
}

// countSegments calculates the number of segments that will be used to execute
// the sort. If no input ordering is provided, there's only one segment.
func (c *coster) countSegments(sort *memo.SortExpr) float64 {
	if sort.InputOrdering.Any() {
		return 1
	}
	stats := sort.Relational().Stats
	orderedCols := sort.InputOrdering.ColSet()
	orderedStats, ok := stats.ColStats.Lookup(orderedCols)
	if !ok {
		orderedStats, ok = c.mem.RequestColStat(sort, orderedCols)
		if !ok {
			// I don't think we can ever get here. Since we don't allow the memo
			// to be optimized twice, the coster should never be used after
			// logPropsBuilder.clear() is called.
			panic(errors.AssertionFailedf("could not request the stats for ColSet %v", orderedCols))
		}
	}

	return orderedStats.DistinctCount
}

// rowSortCost is the CPU cost to sort one row, which depends on the number of
// columns in the sort key.
func (c *coster) rowSortCost(numKeyCols int) memo.Cost {
	// Sorting involves comparisons on the key columns, but the cost isn't
	// directly proportional: we only compare the second column if the rows are
	// equal on the first column; and so on. We also account for a fixed
	// "non-comparison" cost related to processing the row. The formula is:
	//
	//   cpuCostFactor * [ 1 + Sum eqProb^(i-1) with i=1 to numKeyCols ]
	//
	const eqProb = 0.1
	cost := cpuCostFactor
	for i, c := 0, cpuCostFactor; i < numKeyCols; i, c = i+1, c*eqProb {
		// c is cpuCostFactor * eqProb^i.
		cost += c
	}

	// There is a fixed "non-comparison" cost and a comparison cost proportional
	// to the key columns. Note that the cost has to be high enough so that a
	// sort is almost always more expensive than a reverse scan or an index scan.
	return memo.Cost(cost)
}
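// Evaluating the formula for numKeyCols = 3:
//
//   cost = cpuCostFactor * (1 + 1 + 0.1 + 0.01) = 0.0211 per row
//
// i.e. the second and third key columns contribute only 10% and 1% as much as
// the first, since they are compared only when all earlier columns are equal.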
// rowScanCost is the CPU cost to scan one row, which depends on the number of
// columns in the index and (to a lesser extent) on the number of columns we are
// scanning.
func (c *coster) rowScanCost(tabID opt.TableID, idxOrd int, numScannedCols int) memo.Cost {
	md := c.mem.Metadata()
	tab := md.Table(tabID)
	idx := tab.Index(idxOrd)
	numCols := idx.ColumnCount()

	// Adjust cost based on how well the current locality matches the index's
	// zone constraints.
	var costFactor memo.Cost = cpuCostFactor
	if !tab.IsVirtualTable() && len(c.locality.Tiers) != 0 {
		// If 0% of locality tiers have matching constraints, then add additional
		// cost. If 100% of locality tiers have matching constraints, then add no
		// additional cost. Anything in between is proportional to the number of
		// matches.
		adjustment := 1.0 - localityMatchScore(idx.Zone(), c.locality)
		costFactor += latencyCostFactor * memo.Cost(adjustment)
	}

	// The number of the columns in the index matter because more columns means
	// more data to scan. The number of columns we actually return also matters
	// because that is the amount of data that we could potentially transfer over
	// the network.
	return memo.Cost(numCols+numScannedCols) * costFactor
}

// localityMatchScore returns a number from 0.0 to 1.0 that describes how well
// the current node's locality matches the given zone constraints and
// leaseholder preferences, with 0.0 indicating 0% and 1.0 indicating 100%. This
// is the basic algorithm:
//
//   t = total # of locality tiers
//
//   Match each locality tier against the constraint set, and compute a value
//   for each tier:
//
//      0 = key not present in constraint set or key matches prohibited
//          constraint, but value doesn't match
//     +1 = key matches required constraint, and value does match
//     -1 = otherwise
//
//   m = length of longest locality prefix that ends in a +1 value and doesn't
//       contain a -1 value.
//
//   Compute "m" for both the ReplicaConstraints constraints set, as well as for
//   the LeasePreferences constraints set:
//
//     constraint-score = m / t
//     lease-pref-score = m / t
//
//   if there are no lease preferences, then final-score = constraint-score
//   else final-score = (constraint-score * 2 + lease-pref-score) / 3
//
// Here are some scoring examples:
//
//   Locality = region=us,dc=east
//
//   0.0 = []                     // No constraints to match
//   0.0 = [+region=eu,+dc=uk]    // None of the tiers match
//   0.0 = [+region=eu,+dc=east]  // 2nd tier matches, but 1st tier doesn't
//   0.0 = [-region=us,+dc=east]  // 1st tier matches PROHIBITED constraint
//   0.0 = [-region=eu]           // 1st tier PROHIBITED and non-matching
//   0.5 = [+region=us]           // 1st tier matches
//   0.5 = [+region=us,-dc=east]  // 1st tier matches, 2nd tier PROHIBITED
//   0.5 = [+region=us,+dc=west]  // 1st tier matches, but 2nd tier doesn't
//   1.0 = [+region=us,+dc=east]  // Both tiers match
//   1.0 = [+dc=east]             // 2nd tier matches, no constraints for 1st
//   1.0 = [+region=us,+dc=east,+rack=1,-ssd]  // Extra constraints ignored
//
// Note that constraints need not be specified in any particular order, so all
// constraints are scanned when matching each locality tier. In cases where
// there are multiple replica constraint groups (i.e. where a subset of replicas
// can have different constraints than another subset), the minimum constraint
// score among the groups is used.
//
// While matching leaseholder preferences are considered in the final score,
// leaseholder preferences are not guaranteed, so their score is weighted at
// half of the replica constraint score, in order to reflect the possibility
// that the leaseholder has moved from the preferred location.
func localityMatchScore(zone cat.Zone, locality roachpb.Locality) float64 {
	// Fast path: if there are no constraints or leaseholder preferences, then
	// locality can't match.
	if zone.ReplicaConstraintsCount() == 0 && zone.LeasePreferenceCount() == 0 {
		return 0.0
	}

	// matchTier matches a tier to a set of constraints and returns:
	//
	//    0 = key not present in constraint set or key only matches prohibited
	//        constraints where value doesn't match
	//   +1 = key matches any required constraint key + value
	//   -1 = otherwise
	//
	matchTier := func(tier roachpb.Tier, set cat.ConstraintSet) int {
		foundNoMatch := false
		for j, n := 0, set.ConstraintCount(); j < n; j++ {
			con := set.Constraint(j)
			if con.GetKey() != tier.Key {
				// Ignore constraints that don't have matching key.
				continue
			}

			if con.GetValue() == tier.Value {
				if !con.IsRequired() {
					// Matching prohibited constraint, so result is -1.
					return -1
				}

				// Matching required constraint, so result is +1.
				return +1
			}

			if con.IsRequired() {
				// Remember that non-matching required constraint was found.
				foundNoMatch = true
			}
		}

		if foundNoMatch {
			// At least one non-matching required constraint was found, and no
			// matching constraints.
			return -1
		}

		// Key not present in constraint set, or key only matches prohibited
		// constraints where value doesn't match.
		return 0
	}

	// matchConstraints returns the number of tiers that match the given
	// constraint set ("m" in the algorithm described above).
	matchConstraints := func(set cat.ConstraintSet) int {
		matchCount := 0
		for i, tier := range locality.Tiers {
			switch matchTier(tier, set) {
			case +1:
				matchCount = i + 1
			case -1:
				return matchCount
			}
		}
		return matchCount
	}

	// Score any replica constraints.
	var constraintScore float64
	if zone.ReplicaConstraintsCount() != 0 {
		// Iterate over the replica constraints and determine the minimum value
		// returned by matchConstraints for any replica. For example:
		//
		//   3: [+region=us,+dc=east]
		//   2: [+region=us]
		//
		// For the [region=us,dc=east] locality, the result is min(2, 1).
		minCount := intsets.MaxInt
		for i := 0; i < zone.ReplicaConstraintsCount(); i++ {
			matchCount := matchConstraints(zone.ReplicaConstraints(i))
			if matchCount < minCount {
				minCount = matchCount
			}
		}

		constraintScore = float64(minCount) / float64(len(locality.Tiers))
	}

	// If there are no lease preferences, then use replica constraint score.
	if zone.LeasePreferenceCount() == 0 {
		return constraintScore
	}

	// Score the first lease preference, if one is available. Ignore subsequent
	// lease preferences, since they only apply in edge cases.
	matchCount := matchConstraints(zone.LeasePreference(0))
	leaseScore := float64(matchCount) / float64(len(locality.Tiers))

	// Weight the constraintScore twice as much as the lease score.
	return (constraintScore*2 + leaseScore) / 3
}
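// Putting the pieces together for locality [region=us,dc=east] (an
// illustrative zone): replica constraints [+region=us] match m = 1 of t = 2
// tiers, so constraint-score = 0.5; a first lease preference of
// [+region=us,+dc=east] matches both tiers, so lease-pref-score = 1.0; the
// final score is (0.5*2 + 1.0) / 3 ≈ 0.67.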
// lookupJoinInputLimitHint calculates an appropriate limit hint for the input
// to a lookup join.
func lookupJoinInputLimitHint(inputRowCount, outputRowCount, outputLimitHint float64) float64 {
	if outputRowCount == 0 {
		return 0
	}

	// Estimate the number of lookups needed to output LimitHint rows.
	expectedLookupCount := outputLimitHint * inputRowCount / outputRowCount

	// Round up to the nearest multiple of a batch.
	expectedLookupCount = math.Ceil(expectedLookupCount/joinReaderBatchSize) * joinReaderBatchSize
	return math.Min(inputRowCount, expectedLookupCount)
}
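// For instance (illustrative numbers): inputRowCount = 1000,
// outputRowCount = 5000, and outputLimitHint = 25 give
// expectedLookupCount = 25 * 1000 / 5000 = 5, which rounds up to a full
// joinReaderBatchSize batch of 100; the hint returned is min(1000, 100) = 100
// input rows.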