github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/distsql_physical_planner.go (about)

// Copyright 2016 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package sql

import (
    "context"
    "fmt"
    "reflect"
    "sort"
    "strings"

    "github.com/cockroachdb/cockroach/pkg/gossip"
    "github.com/cockroachdb/cockroach/pkg/keys"
    "github.com/cockroachdb/cockroach/pkg/kv"
    "github.com/cockroachdb/cockroach/pkg/kv/kvclient/kvcoord"
    "github.com/cockroachdb/cockroach/pkg/roachpb"
    "github.com/cockroachdb/cockroach/pkg/rpc"
    "github.com/cockroachdb/cockroach/pkg/rpc/nodedialer"
    "github.com/cockroachdb/cockroach/pkg/settings/cluster"
    "github.com/cockroachdb/cockroach/pkg/sql/distsql"
    "github.com/cockroachdb/cockroach/pkg/sql/execinfra"
    "github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
    "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
    "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
    "github.com/cockroachdb/cockroach/pkg/sql/physicalplan"
    "github.com/cockroachdb/cockroach/pkg/sql/physicalplan/replicaoracle"
    "github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
    "github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    "github.com/cockroachdb/cockroach/pkg/sql/types"
    "github.com/cockroachdb/cockroach/pkg/util"
    "github.com/cockroachdb/cockroach/pkg/util/encoding"
    "github.com/cockroachdb/cockroach/pkg/util/envutil"
    "github.com/cockroachdb/cockroach/pkg/util/log"
    "github.com/cockroachdb/cockroach/pkg/util/stop"
    "github.com/cockroachdb/cockroach/pkg/util/uuid"
    "github.com/cockroachdb/errors"
)

// DistSQLPlanner is used to generate distributed plans from logical
// plans. A rough overview of the process:
//
// - the plan is based on a planNode tree (in the future it will be based on an
//   intermediate representation tree). Only a subset of the possible trees is
//   supported (this can be checked via CheckSupport).
//
// - we generate a PhysicalPlan for the planNode tree recursively. The
//   PhysicalPlan consists of a network of processors and streams, with a set
//   of unconnected "result routers". The PhysicalPlan also has information on
//   ordering and on the mapping of planNode columns to columns in the result
//   streams (all result routers output streams with the same schema).
//
//   The PhysicalPlan for a scanNode leaf consists of TableReaders, one for each
//   node that has one or more ranges.
//
// - for each internal planNode we start with the plan of the child node(s)
//   and add processing stages (connected to the result routers of the child
//   nodes).
type DistSQLPlanner struct {
    // planVersion is the version of DistSQL targeted by the plan we're building.
    // This is currently only assigned to the node's current DistSQL version and
    // is used to skip incompatible nodes when mapping spans.
    planVersion execinfrapb.DistSQLVersion

    st *cluster.Settings
    // The node descriptor for the gateway node that initiated this query.
74 nodeDesc roachpb.NodeDescriptor 75 stopper *stop.Stopper 76 distSQLSrv *distsql.ServerImpl 77 spanResolver physicalplan.SpanResolver 78 79 // metadataTestTolerance is the minimum level required to plan metadata test 80 // processors. 81 metadataTestTolerance execinfra.MetadataTestLevel 82 83 // runnerChan is used to send out requests (for running SetupFlow RPCs) to a 84 // pool of workers. 85 runnerChan chan runnerRequest 86 87 // gossip handle used to check node version compatibility and to construct 88 // the spanResolver. 89 gossip gossip.DeprecatedGossip 90 91 nodeDialer *nodedialer.Dialer 92 93 // nodeHealth encapsulates the various node health checks to avoid planning 94 // on unhealthy nodes. 95 nodeHealth distSQLNodeHealth 96 97 // distSender is used to construct the spanResolver upon SetNodeDesc. 98 distSender *kvcoord.DistSender 99 // rpcCtx is used to construct the spanResolver upon SetNodeDesc. 100 rpcCtx *rpc.Context 101 } 102 103 // ReplicaOraclePolicy controls which policy the physical planner uses to choose 104 // a replica for a given range. It is exported so that it may be overwritten 105 // during initialization by CCL code to enable follower reads. 106 var ReplicaOraclePolicy = replicaoracle.BinPackingChoice 107 108 // If true, the plan diagram (in JSON) is logged for each plan (used for 109 // debugging). 110 var logPlanDiagram = envutil.EnvOrDefaultBool("COCKROACH_DISTSQL_LOG_PLAN", false) 111 112 // NewDistSQLPlanner initializes a DistSQLPlanner. 113 // 114 // nodeDesc is the descriptor of the node on which this planner runs. It is used 115 // to favor itself and other close-by nodes when planning. An empty descriptor 116 // can be passed to aid bootstrapping, but then SetNodeDesc() needs to be called 117 // before this planner is used. 118 func NewDistSQLPlanner( 119 ctx context.Context, 120 planVersion execinfrapb.DistSQLVersion, 121 st *cluster.Settings, 122 nodeDesc roachpb.NodeDescriptor, 123 rpcCtx *rpc.Context, 124 distSQLSrv *distsql.ServerImpl, 125 distSender *kvcoord.DistSender, 126 gw gossip.DeprecatedGossip, 127 stopper *stop.Stopper, 128 isLive func(roachpb.NodeID) (bool, error), 129 nodeDialer *nodedialer.Dialer, 130 ) *DistSQLPlanner { 131 dsp := &DistSQLPlanner{ 132 planVersion: planVersion, 133 st: st, 134 nodeDesc: nodeDesc, 135 stopper: stopper, 136 distSQLSrv: distSQLSrv, 137 gossip: gw, 138 nodeDialer: nodeDialer, 139 nodeHealth: distSQLNodeHealth{ 140 gossip: gw, 141 connHealth: nodeDialer.ConnHealth, 142 isLive: isLive, 143 }, 144 distSender: distSender, 145 rpcCtx: rpcCtx, 146 metadataTestTolerance: execinfra.NoExplain, 147 } 148 149 dsp.initRunners() 150 return dsp 151 } 152 153 func (dsp *DistSQLPlanner) shouldPlanTestMetadata() bool { 154 return dsp.distSQLSrv.TestingKnobs.MetadataTestLevel >= dsp.metadataTestTolerance 155 } 156 157 // SetNodeDesc sets the planner's node descriptor. 158 // The first call to SetNodeDesc leads to the construction of the SpanResolver. 159 func (dsp *DistSQLPlanner) SetNodeDesc(desc roachpb.NodeDescriptor) { 160 dsp.nodeDesc = desc 161 if dsp.spanResolver == nil { 162 sr := physicalplan.NewSpanResolver(dsp.st, dsp.distSender, dsp.gossip, desc, 163 dsp.rpcCtx, ReplicaOraclePolicy) 164 dsp.SetSpanResolver(sr) 165 } 166 } 167 168 // SetSpanResolver switches to a different SpanResolver. It is the caller's 169 // responsibility to make sure the DistSQLPlanner is not in use. 
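// Illustrative sketch (not part of the original source): the constructor
// contract described above allows an empty node descriptor during bootstrap,
// as long as SetNodeDesc is called before the planner is used; the first
// SetNodeDesc call is also what builds the SpanResolver. All variable names
// below (planVersion, st, rpcCtx, distSQLSrv, distSender, gw, stopper, isLive,
// nodeDialer) are placeholders for whatever the caller has on hand:
//
//     dsp := NewDistSQLPlanner(
//         ctx, planVersion, st,
//         roachpb.NodeDescriptor{}, // empty descriptor while bootstrapping
//         rpcCtx, distSQLSrv, distSender, gw, stopper, isLive, nodeDialer,
//     )
//     // ... once the node's own descriptor is known:
//     dsp.SetNodeDesc(nodeDesc) // first call also constructs the SpanResolver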
170 func (dsp *DistSQLPlanner) SetSpanResolver(spanResolver physicalplan.SpanResolver) { 171 dsp.spanResolver = spanResolver 172 } 173 174 // distSQLExprCheckVisitor is a tree.Visitor that checks if expressions 175 // contain things not supported by distSQL, like distSQL-blacklisted functions. 176 type distSQLExprCheckVisitor struct { 177 err error 178 } 179 180 var _ tree.Visitor = &distSQLExprCheckVisitor{} 181 182 func (v *distSQLExprCheckVisitor) VisitPre(expr tree.Expr) (recurse bool, newExpr tree.Expr) { 183 if v.err != nil { 184 return false, expr 185 } 186 switch t := expr.(type) { 187 case *tree.FuncExpr: 188 if t.IsDistSQLBlacklist() { 189 v.err = newQueryNotSupportedErrorf("function %s cannot be executed with distsql", t) 190 return false, expr 191 } 192 case *tree.DOid: 193 v.err = newQueryNotSupportedError("OID expressions are not supported by distsql") 194 return false, expr 195 case *tree.CastExpr: 196 // TODO (rohany): I'm not sure why this CastExpr doesn't have a type 197 // annotation at this stage of processing... 198 if typ, ok := tree.GetStaticallyKnownType(t.Type); ok && typ.Family() == types.OidFamily { 199 v.err = newQueryNotSupportedErrorf("cast to %s is not supported by distsql", t.Type) 200 return false, expr 201 } 202 } 203 return true, expr 204 } 205 206 func (v *distSQLExprCheckVisitor) VisitPost(expr tree.Expr) tree.Expr { return expr } 207 208 // checkExpr verifies that an expression doesn't contain things that are not yet 209 // supported by distSQL, like distSQL-blacklisted functions. 210 func checkExpr(expr tree.Expr) error { 211 if expr == nil { 212 return nil 213 } 214 v := distSQLExprCheckVisitor{} 215 tree.WalkExprConst(&v, expr) 216 return v.err 217 } 218 219 type distRecommendation int 220 221 const ( 222 // cannotDistribute indicates that a plan cannot be distributed. 223 cannotDistribute distRecommendation = iota 224 225 // shouldNotDistribute indicates that a plan could suffer if distributed. 226 shouldNotDistribute 227 228 // canDistribute indicates that a plan will probably not benefit but will 229 // probably not suffer if distributed. 230 canDistribute 231 232 // shouldDistribute indicates that a plan will likely benefit if distributed. 233 shouldDistribute 234 ) 235 236 // compose returns the recommendation for a plan given recommendations for two 237 // parts of it: if we shouldNotDistribute either part, then we 238 // shouldNotDistribute the overall plan either. 239 func (a distRecommendation) compose(b distRecommendation) distRecommendation { 240 if a == cannotDistribute || b == cannotDistribute { 241 return cannotDistribute 242 } 243 if a == shouldNotDistribute || b == shouldNotDistribute { 244 return shouldNotDistribute 245 } 246 if a == shouldDistribute || b == shouldDistribute { 247 return shouldDistribute 248 } 249 return canDistribute 250 } 251 252 type queryNotSupportedError struct { 253 msg string 254 } 255 256 func (e *queryNotSupportedError) Error() string { 257 return e.msg 258 } 259 260 func newQueryNotSupportedError(msg string) error { 261 return &queryNotSupportedError{msg: msg} 262 } 263 264 func newQueryNotSupportedErrorf(format string, args ...interface{}) error { 265 return &queryNotSupportedError{msg: fmt.Sprintf(format, args...)} 266 } 267 268 // planNodeNotSupportedErr is the catch-all error value returned from 269 // checkSupportForPlanNode when a planNode type does not support distributed 270 // execution. 
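// Illustrative sketch (not part of the original source) of how the compose
// method defined above combines the recommendations for two parts of a plan:
// a hard "cannot" dominates everything, "should not" dominates the positive
// signals, and "should" dominates the neutral "can":
//
//     shouldDistribute.compose(canDistribute)       // == shouldDistribute
//     shouldDistribute.compose(shouldNotDistribute) // == shouldNotDistribute
//     canDistribute.compose(cannotDistribute)       // == cannotDistribute
//     canDistribute.compose(canDistribute)          // == canDistribute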
271 var planNodeNotSupportedErr = newQueryNotSupportedError("unsupported node") 272 273 var cannotDistributeRowLevelLockingErr = newQueryNotSupportedError( 274 "scans with row-level locking are not supported by distsql", 275 ) 276 277 // mustWrapNode returns true if a node has no DistSQL-processor equivalent. 278 // This must be kept in sync with createPhysPlanForPlanNode. 279 // TODO(jordan): refactor these to use the observer pattern to avoid duplication. 280 func (dsp *DistSQLPlanner) mustWrapNode(planCtx *PlanningCtx, node planNode) bool { 281 switch n := node.(type) { 282 // Keep these cases alphabetized, please! 283 case *distinctNode: 284 case *exportNode: 285 case *filterNode: 286 case *groupNode: 287 case *indexJoinNode: 288 case *joinNode: 289 case *limitNode: 290 case *lookupJoinNode: 291 case *ordinalityNode: 292 case *projectSetNode: 293 case *renderNode: 294 case *scanNode: 295 case *sortNode: 296 case *unaryNode: 297 case *unionNode: 298 case *valuesNode: 299 // This is unfortunately duplicated by createPhysPlanForPlanNode, and must be kept 300 // in sync with its implementation. 301 if !n.specifiedInQuery || planCtx.isLocal || planCtx.noEvalSubqueries { 302 return true 303 } 304 return false 305 case *windowNode: 306 case *zeroNode: 307 case *zigzagJoinNode: 308 default: 309 return true 310 } 311 return false 312 } 313 314 // checkSupportForPlanNode returns a distRecommendation (as described above) or 315 // cannotDistribute and an error if the plan subtree is not distributable. 316 // The error doesn't indicate complete failure - it's instead the reason that 317 // this plan couldn't be distributed. 318 // TODO(radu): add tests for this. 319 func checkSupportForPlanNode(node planNode) (distRecommendation, error) { 320 switch n := node.(type) { 321 // Keep these cases alphabetized, please! 322 case *distinctNode: 323 return checkSupportForPlanNode(n.plan) 324 325 case *exportNode: 326 return checkSupportForPlanNode(n.source) 327 328 case *filterNode: 329 if err := checkExpr(n.filter); err != nil { 330 return cannotDistribute, err 331 } 332 return checkSupportForPlanNode(n.source.plan) 333 334 case *groupNode: 335 rec, err := checkSupportForPlanNode(n.plan) 336 if err != nil { 337 return cannotDistribute, err 338 } 339 // Distribute aggregations if possible. 340 return rec.compose(shouldDistribute), nil 341 342 case *indexJoinNode: 343 // n.table doesn't have meaningful spans, but we need to check support (e.g. 344 // for any filtering expression). 345 if _, err := checkSupportForPlanNode(n.table); err != nil { 346 return cannotDistribute, err 347 } 348 return checkSupportForPlanNode(n.input) 349 350 case *joinNode: 351 if err := checkExpr(n.pred.onCond); err != nil { 352 return cannotDistribute, err 353 } 354 recLeft, err := checkSupportForPlanNode(n.left.plan) 355 if err != nil { 356 return cannotDistribute, err 357 } 358 recRight, err := checkSupportForPlanNode(n.right.plan) 359 if err != nil { 360 return cannotDistribute, err 361 } 362 // If either the left or the right side can benefit from distribution, we 363 // should distribute. 364 rec := recLeft.compose(recRight) 365 // If we can do a hash join, we distribute if possible. 
366 if len(n.pred.leftEqualityIndices) > 0 { 367 rec = rec.compose(shouldDistribute) 368 } 369 return rec, nil 370 371 case *limitNode: 372 if err := checkExpr(n.countExpr); err != nil { 373 return cannotDistribute, err 374 } 375 if err := checkExpr(n.offsetExpr); err != nil { 376 return cannotDistribute, err 377 } 378 return checkSupportForPlanNode(n.plan) 379 380 case *lookupJoinNode: 381 if err := checkExpr(n.onCond); err != nil { 382 return cannotDistribute, err 383 } 384 if _, err := checkSupportForPlanNode(n.input); err != nil { 385 return cannotDistribute, err 386 } 387 return shouldDistribute, nil 388 389 case *projectSetNode: 390 return checkSupportForPlanNode(n.source) 391 392 case *renderNode: 393 for _, e := range n.render { 394 if err := checkExpr(e); err != nil { 395 return cannotDistribute, err 396 } 397 } 398 return checkSupportForPlanNode(n.source.plan) 399 400 case *scanNode: 401 if n.lockingStrength != sqlbase.ScanLockingStrength_FOR_NONE { 402 // Scans that are performing row-level locking cannot currently be 403 // distributed because their locks would not be propagated back to 404 // the root transaction coordinator. 405 // TODO(nvanbenschoten): lift this restriction. 406 return cannotDistribute, cannotDistributeRowLevelLockingErr 407 } 408 409 // Although we don't yet recommend distributing plans where soft limits 410 // propagate to scan nodes because we don't have infrastructure to only 411 // plan for a few ranges at a time, the propagation of the soft limits 412 // to scan nodes has been added in 20.1 release, so to keep the 413 // previous behavior we continue to ignore the soft limits for now. 414 // TODO(yuzefovich): pay attention to the soft limits. 415 rec := canDistribute 416 // We recommend running scans distributed if we have a filtering 417 // expression or if we have a full table scan. 418 if n.filter != nil { 419 if err := checkExpr(n.filter); err != nil { 420 return cannotDistribute, err 421 } 422 rec = rec.compose(shouldDistribute) 423 } 424 // Check if we are doing a full scan. 425 if n.isFull { 426 rec = rec.compose(shouldDistribute) 427 } 428 return rec, nil 429 430 case *sortNode: 431 rec, err := checkSupportForPlanNode(n.plan) 432 if err != nil { 433 return cannotDistribute, err 434 } 435 // If we have to sort, distribute the query. 436 rec = rec.compose(shouldDistribute) 437 return rec, nil 438 439 case *unaryNode: 440 return canDistribute, nil 441 442 case *unionNode: 443 recLeft, err := checkSupportForPlanNode(n.left) 444 if err != nil { 445 return cannotDistribute, err 446 } 447 recRight, err := checkSupportForPlanNode(n.right) 448 if err != nil { 449 return cannotDistribute, err 450 } 451 return recLeft.compose(recRight), nil 452 453 case *valuesNode: 454 if !n.specifiedInQuery { 455 // This condition indicates that the valuesNode was created by planning, 456 // not by the user, like the way vtables are expanded into valuesNodes. We 457 // don't want to distribute queries like this across the network. 
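// Illustrative sketch (not part of the original source) of the scanNode rules
// above, phrased as the recommendation each informal, hypothetical scan shape
// would produce, assuming the predicates end up where indicated:
//
//     scan with a filter expression     -> shouldDistribute (after checkExpr passes)
//     full table scan (n.isFull)        -> shouldDistribute
//     constrained scan, no filter       -> canDistribute
//     scan with FOR UPDATE / FOR SHARE  -> cannotDistribute (cannotDistributeRowLevelLockingErr)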
458 return cannotDistribute, newQueryNotSupportedErrorf("unsupported valuesNode, not specified in query") 459 } 460 461 for _, tuple := range n.tuples { 462 for _, expr := range tuple { 463 if err := checkExpr(expr); err != nil { 464 return cannotDistribute, err 465 } 466 } 467 } 468 return canDistribute, nil 469 470 case *windowNode: 471 return checkSupportForPlanNode(n.plan) 472 473 case *zeroNode: 474 return canDistribute, nil 475 476 case *zigzagJoinNode: 477 if err := checkExpr(n.onCond); err != nil { 478 return cannotDistribute, err 479 } 480 return shouldDistribute, nil 481 482 default: 483 return cannotDistribute, planNodeNotSupportedErr 484 } 485 } 486 487 //go:generate stringer -type=NodeStatus 488 489 // NodeStatus represents a node's health and compatibility in the context of 490 // physical planning for a query. 491 type NodeStatus int 492 493 const ( 494 // NodeOK means that the node can be used for planning. 495 NodeOK NodeStatus = iota 496 // NodeUnhealthy means that the node should be avoided because 497 // it's not healthy. 498 NodeUnhealthy 499 // NodeDistSQLVersionIncompatible means that the node should be avoided 500 // because it's DistSQL version is not compatible. 501 NodeDistSQLVersionIncompatible 502 ) 503 504 // PlanningCtx contains data used and updated throughout the planning process of 505 // a single query. 506 type PlanningCtx struct { 507 ctx context.Context 508 ExtendedEvalCtx *extendedEvalContext 509 spanIter physicalplan.SpanResolverIterator 510 // NodesStatuses contains info for all NodeIDs that are referenced by any 511 // PhysicalPlan we generate with this context. 512 NodeStatuses map[roachpb.NodeID]NodeStatus 513 514 // isLocal is set to true if we're planning this query on a single node. 515 isLocal bool 516 planner *planner 517 // ignoreClose, when set to true, will prevent the closing of the planner's 518 // current plan. Only the top-level query needs to close it, but everything 519 // else (like sub- and postqueries, or EXPLAIN ANALYZE) should set this to 520 // true to avoid double closes of the planNode tree. 521 ignoreClose bool 522 stmtType tree.StatementType 523 // planDepth is set to the current depth of the planNode tree. It's used to 524 // keep track of whether it's valid to run a root node in a special fast path 525 // mode. 526 planDepth int 527 528 // noEvalSubqueries indicates that the plan expects any subqueries to not 529 // be replaced by evaluation. Should only be set by EXPLAIN. 530 noEvalSubqueries bool 531 532 // If set, a diagram for the plan will be generated and passed to this 533 // function. 534 saveDiagram func(execinfrapb.FlowDiagram) 535 // If set, the diagram passed to saveDiagram will show the types of each 536 // stream. 537 saveDiagramShowInputTypes bool 538 } 539 540 var _ physicalplan.ExprContext = &PlanningCtx{} 541 542 // EvalContext returns the associated EvalContext, or nil if there isn't one. 543 func (p *PlanningCtx) EvalContext() *tree.EvalContext { 544 if p.ExtendedEvalCtx == nil { 545 return nil 546 } 547 return &p.ExtendedEvalCtx.EvalContext 548 } 549 550 // IsLocal returns true if this PlanningCtx is being used to plan a query that 551 // has no remote flows. 552 func (p *PlanningCtx) IsLocal() bool { 553 return p.isLocal 554 } 555 556 // EvaluateSubqueries returns true if this plan requires subqueries be fully 557 // executed before trying to marshal. 
This is normally true except for in the 558 // case of EXPLAIN queries, which ultimately want to describe the subquery that 559 // will run, without actually running it. 560 func (p *PlanningCtx) EvaluateSubqueries() bool { 561 return !p.noEvalSubqueries 562 } 563 564 // PhysicalPlan is a partial physical plan which corresponds to a planNode 565 // (partial in that it can correspond to a planNode subtree and not necessarily 566 // to the entire planNode for a given query). 567 // 568 // It augments physicalplan.PhysicalPlan with information relating the physical 569 // plan to a planNode subtree. 570 // 571 // These plans are built recursively on a planNode tree. 572 type PhysicalPlan struct { 573 physicalplan.PhysicalPlan 574 575 // PlanToStreamColMap maps planNode columns (see planColumns()) to columns in 576 // the result streams. These stream indices correspond to the streams 577 // referenced in ResultTypes. 578 // 579 // Note that in some cases, not all columns in the result streams are 580 // referenced in the map; for example, columns that are only required for 581 // stream merges in downstream input synchronizers are not included here. 582 // (This is due to some processors not being configurable to output only 583 // certain columns and will be fixed.) 584 // 585 // Conversely, in some cases not all planNode columns have a corresponding 586 // result stream column (these map to index -1); this is the case for scanNode 587 // and indexJoinNode where not all columns in the table are actually used in 588 // the plan, but are kept for possible use downstream (e.g., sorting). 589 // 590 // When the query is run, the output processor's PlanToStreamColMap is used 591 // by DistSQLReceiver to create an implicit projection on the processor's 592 // output for client consumption (see DistSQLReceiver.Push()). Therefore, 593 // "invisible" columns (e.g., columns required for merge ordering) will not 594 // be output. 595 PlanToStreamColMap []int 596 } 597 598 // makePlanToStreamColMap initializes a new PhysicalPlan.PlanToStreamColMap. The 599 // columns that are present in the result stream(s) should be set in the map. 600 func makePlanToStreamColMap(numCols int) []int { 601 m := make([]int, numCols) 602 for i := 0; i < numCols; i++ { 603 m[i] = -1 604 } 605 return m 606 } 607 608 // identityMap returns the slice {0, 1, 2, ..., numCols-1}. 609 // buf can be optionally provided as a buffer. 610 func identityMap(buf []int, numCols int) []int { 611 buf = buf[:0] 612 for i := 0; i < numCols; i++ { 613 buf = append(buf, i) 614 } 615 return buf 616 } 617 618 // identityMapInPlace returns the modified slice such that it contains 619 // {0, 1, ..., len(slice)-1}. 620 func identityMapInPlace(slice []int) []int { 621 for i := range slice { 622 slice[i] = i 623 } 624 return slice 625 } 626 627 // SpanPartition is the intersection between a set of spans for a certain 628 // operation (e.g table scan) and the set of ranges owned by a given node. 629 type SpanPartition struct { 630 Node roachpb.NodeID 631 Spans roachpb.Spans 632 } 633 634 type distSQLNodeHealth struct { 635 gossip gossip.DeprecatedGossip 636 isLive func(roachpb.NodeID) (bool, error) 637 connHealth func(roachpb.NodeID, rpc.ConnectionClass) error 638 } 639 640 func (h *distSQLNodeHealth) check(ctx context.Context, nodeID roachpb.NodeID) error { 641 { 642 // NB: as of #22658, ConnHealth does not work as expected; see the 643 // comment within. 
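// Illustrative sketch (not part of the original source) of PlanToStreamColMap
// as documented above. For a planNode with columns (a, b, c) whose physical
// stage only outputs a and c, the map would be
//
//     PlanToStreamColMap = []int{0, -1, 1}
//
// i.e. plan column 0 is stream column 0, column b does not appear in the
// result streams (-1), and plan column 2 is stream column 1. The helpers
// defined above cover the common starting points:
//
//     m := makePlanToStreamColMap(3) // []int{-1, -1, -1}, to be filled in
//     id := identityMap(nil, 3)      // []int{0, 1, 2}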
We still keep this code for now because in 644 // practice, once the node is down it will prevent using this node 645 // 90% of the time (it gets used around once per second as an 646 // artifact of rpcContext's reconnection mechanism at the time of 647 // writing). This is better than having it used in 100% of cases 648 // (until the liveness check below kicks in). 649 err := h.connHealth(nodeID, rpc.DefaultClass) 650 if err != nil && !errors.Is(err, rpc.ErrNotHeartbeated) { 651 // This host is known to be unhealthy. Don't use it (use the gateway 652 // instead). Note: this can never happen for our nodeID (which 653 // always has its address in the nodeMap). 654 log.VEventf(ctx, 1, "marking n%d as unhealthy for this plan: %v", nodeID, err) 655 return err 656 } 657 } 658 { 659 live, err := h.isLive(nodeID) 660 if err == nil && !live { 661 err = pgerror.Newf(pgcode.CannotConnectNow, 662 "node n%d is not live", errors.Safe(nodeID)) 663 } 664 if err != nil { 665 return pgerror.Wrapf(err, pgcode.CannotConnectNow, 666 "not using n%d due to liveness", errors.Safe(nodeID)) 667 } 668 } 669 670 // Check that the node is not draining. 671 if g, ok := h.gossip.Optional(distsql.MultiTenancyIssueNo); ok { 672 drainingInfo := &execinfrapb.DistSQLDrainingInfo{} 673 if err := g.GetInfoProto(gossip.MakeDistSQLDrainingKey(nodeID), drainingInfo); err != nil { 674 // Because draining info has no expiration, an error 675 // implies that we have not yet received a node's 676 // draining information. Since this information is 677 // written on startup, the most likely scenario is 678 // that the node is ready. We therefore return no 679 // error. 680 // TODO(ajwerner): Determine the expected error types and only filter those. 681 return nil //nolint:returnerrcheck 682 } 683 684 if drainingInfo.Draining { 685 err := errors.Newf("not using n%d because it is draining", log.Safe(nodeID)) 686 log.VEventf(ctx, 1, "%v", err) 687 return err 688 } 689 } 690 691 return nil 692 } 693 694 // PartitionSpans finds out which nodes are owners for ranges touching the 695 // given spans, and splits the spans according to owning nodes. The result is a 696 // set of SpanPartitions (guaranteed one for each relevant node), which form a 697 // partitioning of the spans (i.e. they are non-overlapping and their union is 698 // exactly the original set of spans). 699 // 700 // PartitionSpans does its best to not assign ranges on nodes that are known to 701 // either be unhealthy or running an incompatible version. The ranges owned by 702 // such nodes are assigned to the gateway. 703 func (dsp *DistSQLPlanner) PartitionSpans( 704 planCtx *PlanningCtx, spans roachpb.Spans, 705 ) ([]SpanPartition, error) { 706 if len(spans) == 0 { 707 panic("no spans") 708 } 709 ctx := planCtx.ctx 710 partitions := make([]SpanPartition, 0, 1) 711 if planCtx.isLocal { 712 // If we're planning locally, map all spans to the local node. 713 partitions = append(partitions, 714 SpanPartition{dsp.nodeDesc.NodeID, spans}) 715 return partitions, nil 716 } 717 // nodeMap maps a nodeID to an index inside the partitions array. 718 nodeMap := make(map[roachpb.NodeID]int) 719 it := planCtx.spanIter 720 for _, span := range spans { 721 // rspan is the span we are currently partitioning. 
722 var rspan roachpb.RSpan 723 var err error 724 if rspan.Key, err = keys.Addr(span.Key); err != nil { 725 return nil, err 726 } 727 if rspan.EndKey, err = keys.Addr(span.EndKey); err != nil { 728 return nil, err 729 } 730 731 var lastNodeID roachpb.NodeID 732 // lastKey maintains the EndKey of the last piece of `span`. 733 lastKey := rspan.Key 734 if log.V(1) { 735 log.Infof(ctx, "partitioning span %s", span) 736 } 737 // We break up rspan into its individual ranges (which may or 738 // may not be on separate nodes). We then create "partitioned 739 // spans" using the end keys of these individual ranges. 740 for it.Seek(ctx, span, kvcoord.Ascending); ; it.Next(ctx) { 741 if !it.Valid() { 742 return nil, it.Error() 743 } 744 replDesc, err := it.ReplicaInfo(ctx) 745 if err != nil { 746 return nil, err 747 } 748 desc := it.Desc() 749 if log.V(1) { 750 descCpy := desc // don't let desc escape 751 log.Infof(ctx, "lastKey: %s desc: %s", lastKey, &descCpy) 752 } 753 754 if !desc.ContainsKey(lastKey) { 755 // This range must contain the last range's EndKey. 756 log.Fatalf( 757 ctx, "next range %v doesn't cover last end key %v. Partitions: %#v", 758 desc.RSpan(), lastKey, partitions, 759 ) 760 } 761 762 // Limit the end key to the end of the span we are resolving. 763 endKey := desc.EndKey 764 if rspan.EndKey.Less(endKey) { 765 endKey = rspan.EndKey 766 } 767 768 nodeID := replDesc.NodeID 769 partitionIdx, inNodeMap := nodeMap[nodeID] 770 if !inNodeMap { 771 // This is the first time we are seeing nodeID for these spans. Check 772 // its health. 773 status := dsp.CheckNodeHealthAndVersion(planCtx, nodeID) 774 // If the node is unhealthy or its DistSQL version is incompatible, use 775 // the gateway to process this span instead of the unhealthy host. 776 // An empty address indicates an unhealthy host. 777 if status != NodeOK { 778 log.Eventf(ctx, "not planning on node %d: %s", nodeID, status) 779 nodeID = dsp.nodeDesc.NodeID 780 partitionIdx, inNodeMap = nodeMap[nodeID] 781 } 782 783 if !inNodeMap { 784 partitionIdx = len(partitions) 785 partitions = append(partitions, SpanPartition{Node: nodeID}) 786 nodeMap[nodeID] = partitionIdx 787 } 788 } 789 partition := &partitions[partitionIdx] 790 791 if lastNodeID == nodeID { 792 // Two consecutive ranges on the same node, merge the spans. 793 partition.Spans[len(partition.Spans)-1].EndKey = endKey.AsRawKey() 794 } else { 795 partition.Spans = append(partition.Spans, roachpb.Span{ 796 Key: lastKey.AsRawKey(), 797 EndKey: endKey.AsRawKey(), 798 }) 799 } 800 801 if !endKey.Less(rspan.EndKey) { 802 // Done. 803 break 804 } 805 806 lastKey = endKey 807 lastNodeID = nodeID 808 } 809 } 810 return partitions, nil 811 } 812 813 // nodeVersionIsCompatible decides whether a particular node's DistSQL version 814 // is compatible with dsp.planVersion. It uses gossip to find out the node's 815 // version range. 
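// Illustrative sketch (not part of the original source) of a PartitionSpans
// result. Suppose the single span [b, f) is covered by ranges r1 [a, c) and
// r2 [c, e) whose chosen replicas are on node 1, and r3 [e, g) whose chosen
// replica is on node 2 (range names, keys, and node IDs are hypothetical),
// and all nodes are healthy and version-compatible:
//
//     []SpanPartition{
//         {Node: 1, Spans: {[b, e)}}, // r1 and r2 are consecutive on node 1 and get merged
//         {Node: 2, Spans: {[e, f)}}, // clipped to the query span's EndKey
//     }
//
// If node 2 were unhealthy or running an incompatible DistSQL version, its
// piece would instead be appended to the gateway node's partition.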
816 func (dsp *DistSQLPlanner) nodeVersionIsCompatible(nodeID roachpb.NodeID) bool { 817 g, ok := dsp.gossip.Optional(distsql.MultiTenancyIssueNo) 818 if !ok { 819 return true // no gossip - always compatible; only a single gateway running in Phase 2 820 } 821 var v execinfrapb.DistSQLVersionGossipInfo 822 if err := g.GetInfoProto(gossip.MakeDistSQLNodeVersionKey(nodeID), &v); err != nil { 823 return false 824 } 825 return distsql.FlowVerIsCompatible(dsp.planVersion, v.MinAcceptedVersion, v.Version) 826 } 827 828 func getIndexIdx( 829 index *sqlbase.IndexDescriptor, desc *sqlbase.ImmutableTableDescriptor, 830 ) (uint32, error) { 831 if index.ID == desc.PrimaryIndex.ID { 832 return 0, nil 833 } 834 for i := range desc.Indexes { 835 if index.ID == desc.Indexes[i].ID { 836 // IndexIdx is 1 based (0 means primary index). 837 return uint32(i + 1), nil 838 } 839 } 840 return 0, errors.Errorf("invalid index %v (table %s)", index, desc.Name) 841 } 842 843 // initTableReaderSpec initializes a TableReaderSpec/PostProcessSpec that 844 // corresponds to a scanNode, except for the Spans and OutputColumns. 845 func initTableReaderSpec( 846 n *scanNode, planCtx *PlanningCtx, indexVarMap []int, 847 ) (*execinfrapb.TableReaderSpec, execinfrapb.PostProcessSpec, error) { 848 s := physicalplan.NewTableReaderSpec() 849 *s = execinfrapb.TableReaderSpec{ 850 Table: *n.desc.TableDesc(), 851 Reverse: n.reverse, 852 IsCheck: n.isCheck, 853 Visibility: n.colCfg.visibility, 854 LockingStrength: n.lockingStrength, 855 LockingWaitPolicy: n.lockingWaitPolicy, 856 857 // Retain the capacity of the spans slice. 858 Spans: s.Spans[:0], 859 } 860 indexIdx, err := getIndexIdx(n.index, n.desc) 861 if err != nil { 862 return nil, execinfrapb.PostProcessSpec{}, err 863 } 864 s.IndexIdx = indexIdx 865 866 // When a TableReader is running scrub checks, do not allow a 867 // post-processor. This is because the outgoing stream is a fixed 868 // format (rowexec.ScrubTypes). 869 if n.isCheck { 870 return s, execinfrapb.PostProcessSpec{}, nil 871 } 872 873 filter, err := physicalplan.MakeExpression(n.filter, planCtx, indexVarMap) 874 if err != nil { 875 return nil, execinfrapb.PostProcessSpec{}, err 876 } 877 post := execinfrapb.PostProcessSpec{ 878 Filter: filter, 879 } 880 881 if n.hardLimit != 0 { 882 post.Limit = uint64(n.hardLimit) 883 } else if n.softLimit != 0 { 884 s.LimitHint = n.softLimit 885 } 886 return s, post, nil 887 } 888 889 // scanNodeOrdinal returns the index of a column with the given ID. 890 func tableOrdinal( 891 desc *sqlbase.ImmutableTableDescriptor, 892 colID sqlbase.ColumnID, 893 visibility execinfrapb.ScanVisibility, 894 ) int { 895 for i := range desc.Columns { 896 if desc.Columns[i].ID == colID { 897 return i 898 } 899 } 900 if visibility == execinfra.ScanVisibilityPublicAndNotPublic { 901 offset := len(desc.Columns) 902 for i, col := range desc.MutationColumns() { 903 if col.ID == colID { 904 return offset + i 905 } 906 } 907 } 908 panic(fmt.Sprintf("column %d not in desc.Columns", colID)) 909 } 910 911 // getScanNodeToTableOrdinalMap returns a map from scan node column ordinal to 912 // table reader column ordinal. Returns nil if the map is identity. 913 // 914 // scanNodes can have columns set up in a few different ways, depending on the 915 // colCfg. The heuristic planner always creates scanNodes with all public 916 // columns (even if some of them aren't even in the index we are scanning). 
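// Illustrative sketch (not part of the original source) of the IndexIdx
// convention implemented by getIndexIdx above: 0 means the primary index,
// and secondary indexes are numbered from 1 in the order they appear in
// desc.Indexes (desc here stands for any *sqlbase.ImmutableTableDescriptor):
//
//     getIndexIdx(&desc.PrimaryIndex, desc) // == 0
//     getIndexIdx(&desc.Indexes[0], desc)   // == 1
//     getIndexIdx(&desc.Indexes[1], desc)   // == 2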
917 // The optimizer creates scanNodes with a specific set of wanted columns; in 918 // this case we have to create a map from scanNode column ordinal to table 919 // column ordinal (which is what the TableReader uses). 920 func getScanNodeToTableOrdinalMap(n *scanNode) []int { 921 if n.colCfg.wantedColumns == nil { 922 return nil 923 } 924 if n.colCfg.addUnwantedAsHidden { 925 panic("addUnwantedAsHidden not supported") 926 } 927 res := make([]int, len(n.cols)) 928 for i := range res { 929 res[i] = tableOrdinal(n.desc, n.cols[i].ID, n.colCfg.visibility) 930 } 931 return res 932 } 933 934 // getOutputColumnsFromScanNode returns the indices of the columns that are 935 // returned by a scanNode. 936 // If remap is not nil, the column ordinals are remapped accordingly. 937 func getOutputColumnsFromScanNode(n *scanNode, remap []int) []uint32 { 938 outputColumns := make([]uint32, 0, len(n.cols)) 939 // TODO(radu): if we have a scan with a filter, cols will include the 940 // columns needed for the filter, even if they aren't needed for the next 941 // stage. 942 for i := 0; i < len(n.cols); i++ { 943 colIdx := i 944 if remap != nil { 945 colIdx = remap[i] 946 } 947 outputColumns = append(outputColumns, uint32(colIdx)) 948 } 949 return outputColumns 950 } 951 952 // convertOrdering maps the columns in props.ordering to the output columns of a 953 // processor. 954 func (dsp *DistSQLPlanner) convertOrdering( 955 reqOrdering ReqOrdering, planToStreamColMap []int, 956 ) execinfrapb.Ordering { 957 if len(reqOrdering) == 0 { 958 return execinfrapb.Ordering{} 959 } 960 result := execinfrapb.Ordering{ 961 Columns: make([]execinfrapb.Ordering_Column, len(reqOrdering)), 962 } 963 for i, o := range reqOrdering { 964 streamColIdx := o.ColIdx 965 if planToStreamColMap != nil { 966 streamColIdx = planToStreamColMap[o.ColIdx] 967 } 968 if streamColIdx == -1 { 969 panic("column in ordering not part of processor output") 970 } 971 result.Columns[i].ColIdx = uint32(streamColIdx) 972 dir := execinfrapb.Ordering_Column_ASC 973 if o.Direction == encoding.Descending { 974 dir = execinfrapb.Ordering_Column_DESC 975 } 976 result.Columns[i].Direction = dir 977 } 978 return result 979 } 980 981 // getNodeIDForScan retrieves the node ID where the single table reader should 982 // reside for a limited scan. Ideally this is the lease holder for the first 983 // range in the specified spans. But if that node is unhealthy or incompatible, 984 // we use the gateway node instead. 985 func (dsp *DistSQLPlanner) getNodeIDForScan( 986 planCtx *PlanningCtx, spans []roachpb.Span, reverse bool, 987 ) (roachpb.NodeID, error) { 988 if len(spans) == 0 { 989 panic("no spans") 990 } 991 992 // Determine the node ID for the first range to be scanned. 993 it := planCtx.spanIter 994 if reverse { 995 it.Seek(planCtx.ctx, spans[len(spans)-1], kvcoord.Descending) 996 } else { 997 it.Seek(planCtx.ctx, spans[0], kvcoord.Ascending) 998 } 999 if !it.Valid() { 1000 return 0, it.Error() 1001 } 1002 replDesc, err := it.ReplicaInfo(planCtx.ctx) 1003 if err != nil { 1004 return 0, err 1005 } 1006 1007 nodeID := replDesc.NodeID 1008 status := dsp.CheckNodeHealthAndVersion(planCtx, nodeID) 1009 if status != NodeOK { 1010 log.Eventf(planCtx.ctx, "not planning on node %d: %s", nodeID, status) 1011 return dsp.nodeDesc.NodeID, nil 1012 } 1013 return nodeID, nil 1014 } 1015 1016 // CheckNodeHealthAndVersion returns a information about a node's health and 1017 // compatibility. The info is also recorded in planCtx.Nodes. 
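// Illustrative sketch (not part of the original source) of convertOrdering
// above. With planToStreamColMap = []int{2, 0, -1} and a required ordering of
// plan column 1 ascending then plan column 0 descending, the result is
//
//     execinfrapb.Ordering{Columns: []execinfrapb.Ordering_Column{
//         {ColIdx: 0, Direction: execinfrapb.Ordering_Column_ASC},  // plan col 1 -> stream col 0
//         {ColIdx: 2, Direction: execinfrapb.Ordering_Column_DESC}, // plan col 0 -> stream col 2
//     }}
//
// A plan column that maps to -1 must not appear in the required ordering; the
// function panics in that case.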
1018 func (dsp *DistSQLPlanner) CheckNodeHealthAndVersion( 1019 planCtx *PlanningCtx, nodeID roachpb.NodeID, 1020 ) NodeStatus { 1021 if status, ok := planCtx.NodeStatuses[nodeID]; ok { 1022 return status 1023 } 1024 1025 var status NodeStatus 1026 if err := dsp.nodeHealth.check(planCtx.ctx, nodeID); err != nil { 1027 status = NodeUnhealthy 1028 } else if !dsp.nodeVersionIsCompatible(nodeID) { 1029 status = NodeDistSQLVersionIncompatible 1030 } else { 1031 status = NodeOK 1032 } 1033 planCtx.NodeStatuses[nodeID] = status 1034 return status 1035 } 1036 1037 // createTableReaders generates a plan consisting of table reader processors, 1038 // one for each node that has spans that we are reading. 1039 // overridesResultColumns is optional. 1040 func (dsp *DistSQLPlanner) createTableReaders( 1041 planCtx *PlanningCtx, n *scanNode, 1042 ) (*PhysicalPlan, error) { 1043 scanNodeToTableOrdinalMap := getScanNodeToTableOrdinalMap(n) 1044 spec, post, err := initTableReaderSpec(n, planCtx, scanNodeToTableOrdinalMap) 1045 if err != nil { 1046 return nil, err 1047 } 1048 1049 var spanPartitions []SpanPartition 1050 if planCtx.isLocal { 1051 spanPartitions = []SpanPartition{{dsp.nodeDesc.NodeID, n.spans}} 1052 } else if n.hardLimit == 0 { 1053 // No hard limit - plan all table readers where their data live. Note 1054 // that we're ignoring soft limits for now since the TableReader will 1055 // still read too eagerly in the soft limit case. To prevent this we'll 1056 // need a new mechanism on the execution side to modulate table reads. 1057 // TODO(yuzefovich): add that mechanism. 1058 spanPartitions, err = dsp.PartitionSpans(planCtx, n.spans) 1059 if err != nil { 1060 return nil, err 1061 } 1062 } else { 1063 // If the scan has a hard limit, use a single TableReader to avoid 1064 // reading more rows than necessary. 1065 nodeID, err := dsp.getNodeIDForScan(planCtx, n.spans, n.reverse) 1066 if err != nil { 1067 return nil, err 1068 } 1069 spanPartitions = []SpanPartition{{nodeID, n.spans}} 1070 } 1071 1072 var p PhysicalPlan 1073 stageID := p.NewStageID() 1074 1075 p.ResultRouters = make([]physicalplan.ProcessorIdx, len(spanPartitions)) 1076 p.Processors = make([]physicalplan.Processor, 0, len(spanPartitions)) 1077 1078 returnMutations := n.colCfg.visibility == execinfra.ScanVisibilityPublicAndNotPublic 1079 1080 for i, sp := range spanPartitions { 1081 var tr *execinfrapb.TableReaderSpec 1082 if i == 0 { 1083 // For the first span partition, we can just directly use the spec we made 1084 // above. 1085 tr = spec 1086 } else { 1087 // For the rest, we have to copy the spec into a fresh spec. 1088 tr = physicalplan.NewTableReaderSpec() 1089 // Grab the Spans field of the new spec, and reuse it in case the pooled 1090 // TableReaderSpec we got has pre-allocated Spans memory. 
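// In other words (an illustrative restatement, not part of the original
// source), the copy below follows the usual pattern for keeping a pooled
// slice's backing array across a whole-struct copy:
//
//     keep := dst.Spans // pooled slice, reset to length 0 but possibly with spare capacity
//     *dst = *src       // overwrites every field, including Spans
//     dst.Spans = keep  // restore the pooled backing array before appending spans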
1091 newSpansSlice := tr.Spans 1092 *tr = *spec 1093 tr.Spans = newSpansSlice 1094 } 1095 for j := range sp.Spans { 1096 tr.Spans = append(tr.Spans, execinfrapb.TableReaderSpan{Span: sp.Spans[j]}) 1097 } 1098 1099 tr.MaxResults = n.maxResults 1100 p.TotalEstimatedScannedRows += n.estimatedRowCount 1101 if n.estimatedRowCount > p.MaxEstimatedRowCount { 1102 p.MaxEstimatedRowCount = n.estimatedRowCount 1103 } 1104 1105 proc := physicalplan.Processor{ 1106 Node: sp.Node, 1107 Spec: execinfrapb.ProcessorSpec{ 1108 Core: execinfrapb.ProcessorCoreUnion{TableReader: tr}, 1109 Output: []execinfrapb.OutputRouterSpec{{Type: execinfrapb.OutputRouterSpec_PASS_THROUGH}}, 1110 StageID: stageID, 1111 }, 1112 } 1113 1114 pIdx := p.AddProcessor(proc) 1115 p.ResultRouters[i] = pIdx 1116 } 1117 1118 if len(p.ResultRouters) > 1 && len(n.reqOrdering) > 0 { 1119 // Make a note of the fact that we have to maintain a certain ordering 1120 // between the parallel streams. 1121 // 1122 // This information is taken into account by the AddProjection call below: 1123 // specifically, it will make sure these columns are kept even if they are 1124 // not in the projection (e.g. "SELECT v FROM kv ORDER BY k"). 1125 p.SetMergeOrdering(dsp.convertOrdering(n.reqOrdering, scanNodeToTableOrdinalMap)) 1126 } 1127 1128 var typs []*types.T 1129 if returnMutations { 1130 typs = make([]*types.T, 0, len(n.desc.Columns)+len(n.desc.MutationColumns())) 1131 } else { 1132 typs = make([]*types.T, 0, len(n.desc.Columns)) 1133 } 1134 for i := range n.desc.Columns { 1135 typs = append(typs, n.desc.Columns[i].Type) 1136 } 1137 if returnMutations { 1138 for _, col := range n.desc.MutationColumns() { 1139 typs = append(typs, col.Type) 1140 } 1141 } 1142 p.SetLastStagePost(post, typs) 1143 1144 outCols := getOutputColumnsFromScanNode(n, scanNodeToTableOrdinalMap) 1145 planToStreamColMap := make([]int, len(n.cols)) 1146 descColumnIDs := make([]sqlbase.ColumnID, 0, len(n.desc.Columns)) 1147 for i := range n.desc.Columns { 1148 descColumnIDs = append(descColumnIDs, n.desc.Columns[i].ID) 1149 } 1150 if returnMutations { 1151 for _, c := range n.desc.MutationColumns() { 1152 descColumnIDs = append(descColumnIDs, c.ID) 1153 } 1154 } 1155 for i := range planToStreamColMap { 1156 planToStreamColMap[i] = -1 1157 for j, c := range outCols { 1158 if descColumnIDs[c] == n.cols[i].ID { 1159 planToStreamColMap[i] = j 1160 break 1161 } 1162 } 1163 } 1164 p.AddProjection(outCols) 1165 1166 p.PlanToStreamColMap = planToStreamColMap 1167 return &p, nil 1168 } 1169 1170 // selectRenders takes a PhysicalPlan that produces the results corresponding to 1171 // the select data source (a n.source) and updates it to produce results 1172 // corresponding to the render node itself. An evaluator stage is added if the 1173 // render node has any expressions which are not just simple column references. 1174 func (dsp *DistSQLPlanner) selectRenders( 1175 p *PhysicalPlan, n *renderNode, planCtx *PlanningCtx, 1176 ) error { 1177 typs, err := getTypesForPlanResult(n, nil /* planToStreamColMap */) 1178 if err != nil { 1179 return err 1180 } 1181 err = p.AddRendering(n.render, planCtx, p.PlanToStreamColMap, typs) 1182 if err != nil { 1183 return err 1184 } 1185 p.PlanToStreamColMap = identityMap(p.PlanToStreamColMap, len(n.render)) 1186 return nil 1187 } 1188 1189 // addSorters adds sorters corresponding to a sortNode and updates the plan to 1190 // reflect the sort node. 
func (dsp *DistSQLPlanner) addSorters(p *PhysicalPlan, n *sortNode) {
    // Sorting is needed; we add a stage of sorting processors.
    ordering := execinfrapb.ConvertToMappedSpecOrdering(n.ordering, p.PlanToStreamColMap)

    p.AddNoGroupingStage(
        execinfrapb.ProcessorCoreUnion{
            Sorter: &execinfrapb.SorterSpec{
                OutputOrdering:   ordering,
                OrderingMatchLen: uint32(n.alreadyOrderedPrefix),
            },
        },
        execinfrapb.PostProcessSpec{},
        p.ResultTypes,
        ordering,
    )
}

// addAggregators adds aggregators corresponding to a groupNode and updates the plan to
// reflect the groupNode. An evaluator stage is added if necessary.
// Invariants assumed:
//  - There is strictly no "pre-evaluation" necessary. If the given query is
//    'SELECT COUNT(k), v + w FROM kv GROUP BY v + w', the evaluation of the first
//    'v + w' is done at the source of the groupNode.
//  - We only operate on the following expressions:
//     - ONLY aggregation functions, with arguments pre-evaluated. So for
//       COUNT(k + v), we assume a stream of evaluated 'k + v' values.
//     - Expressions that CONTAIN an aggregation function, e.g. 'COUNT(k) + 1'.
//       This is evaluated in the post aggregation evaluator attached after.
//     - Expressions that also appear verbatim in the GROUP BY expressions.
//       For 'SELECT k GROUP BY k', the aggregation function added is IDENT,
//       therefore k just passes through unchanged.
//    All other expressions simply pass through unchanged, for e.g. '1' in
//    'SELECT 1 GROUP BY k'.
func (dsp *DistSQLPlanner) addAggregators(
    planCtx *PlanningCtx, p *PhysicalPlan, n *groupNode,
) error {
    aggregations := make([]execinfrapb.AggregatorSpec_Aggregation, len(n.funcs))
    aggregationsColumnTypes := make([][]*types.T, len(n.funcs))
    for i, fholder := range n.funcs {
        // Convert the aggregate function to the enum value with the same string
        // representation.
1232 funcStr := strings.ToUpper(fholder.funcName) 1233 funcIdx, ok := execinfrapb.AggregatorSpec_Func_value[funcStr] 1234 if !ok { 1235 return errors.Errorf("unknown aggregate %s", funcStr) 1236 } 1237 aggregations[i].Func = execinfrapb.AggregatorSpec_Func(funcIdx) 1238 aggregations[i].Distinct = fholder.isDistinct() 1239 for _, renderIdx := range fholder.argRenderIdxs { 1240 aggregations[i].ColIdx = append(aggregations[i].ColIdx, uint32(p.PlanToStreamColMap[renderIdx])) 1241 } 1242 if fholder.hasFilter() { 1243 col := uint32(p.PlanToStreamColMap[fholder.filterRenderIdx]) 1244 aggregations[i].FilterColIdx = &col 1245 } 1246 aggregations[i].Arguments = make([]execinfrapb.Expression, len(fholder.arguments)) 1247 aggregationsColumnTypes[i] = make([]*types.T, len(fholder.arguments)) 1248 for j, argument := range fholder.arguments { 1249 var err error 1250 aggregations[i].Arguments[j], err = physicalplan.MakeExpression(argument, planCtx, nil) 1251 if err != nil { 1252 return err 1253 } 1254 aggregationsColumnTypes[i][j] = argument.ResolvedType() 1255 if err != nil { 1256 return err 1257 } 1258 } 1259 } 1260 1261 aggType := execinfrapb.AggregatorSpec_NON_SCALAR 1262 if n.isScalar { 1263 aggType = execinfrapb.AggregatorSpec_SCALAR 1264 } 1265 1266 inputTypes := p.ResultTypes 1267 1268 groupCols := make([]uint32, len(n.groupCols)) 1269 for i, idx := range n.groupCols { 1270 groupCols[i] = uint32(p.PlanToStreamColMap[idx]) 1271 } 1272 orderedGroupCols := make([]uint32, len(n.groupColOrdering)) 1273 var orderedGroupColSet util.FastIntSet 1274 for i, c := range n.groupColOrdering { 1275 orderedGroupCols[i] = uint32(p.PlanToStreamColMap[c.ColIdx]) 1276 orderedGroupColSet.Add(c.ColIdx) 1277 } 1278 1279 // We either have a local stage on each stream followed by a final stage, or 1280 // just a final stage. We only use a local stage if: 1281 // - the previous stage is distributed on multiple nodes, and 1282 // - all aggregation functions support it. TODO(radu): we could relax this by 1283 // splitting the aggregation into two different paths and joining on the 1284 // results. 1285 // - we have a mix of aggregations that use distinct and aggregations that 1286 // don't use distinct. TODO(arjun): This would require doing the same as 1287 // the todo as above. 1288 multiStage := false 1289 allDistinct := true 1290 anyDistinct := false 1291 1292 // Check if the previous stage is all on one node. 1293 prevStageNode := p.Processors[p.ResultRouters[0]].Node 1294 for i := 1; i < len(p.ResultRouters); i++ { 1295 if n := p.Processors[p.ResultRouters[i]].Node; n != prevStageNode { 1296 prevStageNode = 0 1297 break 1298 } 1299 } 1300 1301 if prevStageNode == 0 { 1302 // Check that all aggregation functions support a local stage. 1303 multiStage = true 1304 for _, e := range aggregations { 1305 if e.Distinct { 1306 // We can't do local aggregation for functions with distinct. 1307 multiStage = false 1308 anyDistinct = true 1309 } else { 1310 // We can't do local distinct if we have a mix of distinct and 1311 // non-distinct aggregations. 
1312 allDistinct = false 1313 } 1314 if _, ok := physicalplan.DistAggregationTable[e.Func]; !ok { 1315 multiStage = false 1316 break 1317 } 1318 } 1319 } 1320 if !anyDistinct { 1321 allDistinct = false 1322 } 1323 1324 var finalAggsSpec execinfrapb.AggregatorSpec 1325 var finalAggsPost execinfrapb.PostProcessSpec 1326 1327 if !multiStage && allDistinct { 1328 // We can't do local aggregation, but we can do local distinct processing 1329 // to reduce streaming duplicates, and aggregate on the final node. 1330 1331 ordering := dsp.convertOrdering(planReqOrdering(n.plan), p.PlanToStreamColMap).Columns 1332 orderedColsMap := make(map[uint32]struct{}) 1333 for _, ord := range ordering { 1334 orderedColsMap[ord.ColIdx] = struct{}{} 1335 } 1336 distinctColsMap := make(map[uint32]struct{}) 1337 for _, agg := range aggregations { 1338 for _, c := range agg.ColIdx { 1339 distinctColsMap[c] = struct{}{} 1340 } 1341 } 1342 orderedColumns := make([]uint32, len(orderedColsMap)) 1343 idx := 0 1344 for o := range orderedColsMap { 1345 orderedColumns[idx] = o 1346 idx++ 1347 } 1348 distinctColumns := make([]uint32, len(distinctColsMap)) 1349 idx = 0 1350 for o := range distinctColsMap { 1351 distinctColumns[idx] = o 1352 idx++ 1353 } 1354 1355 sort.Slice(orderedColumns, func(i, j int) bool { return orderedColumns[i] < orderedColumns[j] }) 1356 sort.Slice(distinctColumns, func(i, j int) bool { return distinctColumns[i] < distinctColumns[j] }) 1357 1358 distinctSpec := execinfrapb.ProcessorCoreUnion{ 1359 Distinct: &execinfrapb.DistinctSpec{ 1360 OrderedColumns: orderedColumns, 1361 DistinctColumns: distinctColumns, 1362 }, 1363 } 1364 1365 // Add distinct processors local to each existing current result processor. 1366 p.AddNoGroupingStage(distinctSpec, execinfrapb.PostProcessSpec{}, p.ResultTypes, p.MergeOrdering) 1367 } 1368 1369 // planToStreamMapSet keeps track of whether or not 1370 // p.PlanToStreamColMap has been set to its desired mapping or not. 1371 planToStreamMapSet := false 1372 if !multiStage { 1373 finalAggsSpec = execinfrapb.AggregatorSpec{ 1374 Type: aggType, 1375 Aggregations: aggregations, 1376 GroupCols: groupCols, 1377 OrderedGroupCols: orderedGroupCols, 1378 } 1379 } else { 1380 // Some aggregations might need multiple aggregation as part of 1381 // their local and final stages (along with a final render 1382 // expression to combine the multiple aggregations into a 1383 // single result). 1384 // 1385 // Count the total number of aggregation in the local/final 1386 // stages and keep track of whether any of them needs a final 1387 // rendering. 1388 nLocalAgg := 0 1389 nFinalAgg := 0 1390 needRender := false 1391 for _, e := range aggregations { 1392 info := physicalplan.DistAggregationTable[e.Func] 1393 nLocalAgg += len(info.LocalStage) 1394 nFinalAgg += len(info.FinalStage) 1395 if info.FinalRendering != nil { 1396 needRender = true 1397 } 1398 } 1399 1400 // We alloc the maximum possible number of unique local and final 1401 // aggregations but do not initialize any aggregations 1402 // since we can de-duplicate equivalent local and final aggregations. 1403 localAggs := make([]execinfrapb.AggregatorSpec_Aggregation, 0, nLocalAgg+len(groupCols)) 1404 intermediateTypes := make([]*types.T, 0, nLocalAgg+len(groupCols)) 1405 finalAggs := make([]execinfrapb.AggregatorSpec_Aggregation, 0, nFinalAgg) 1406 // finalIdxMap maps the index i of the final aggregation (with 1407 // respect to the i-th final aggregation out of all final 1408 // aggregations) to its index in the finalAggs slice. 
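// Illustrative sketch (not part of the original source) of the local/final
// split driven by physicalplan.DistAggregationTable, for a hypothetical query
// such as
//
//     SELECT sum(v), avg(v) FROM kv GROUP BY k
//
// when the input arrives on several streams: each stream first runs a local
// aggregation per group, and a final stage combines the per-stream partial
// results. An average-style aggregate is decomposed into partial sums and
// counts plus a final rendering that divides them, while a plain sum simply
// re-aggregates its partials; the exact decomposition for each function is
// whatever DistAggregationTable specifies, so the shape below is only meant
// to convey the idea:
//
//     local stage (per input stream): partial SUM(v), partial COUNT(v), ANY_NOT_NULL(k)
//     final stage:                    combine the partial sums and counts, grouped on k
//     final rendering:                sum = combined sum; avg = combined sum / combined count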
1409 finalIdxMap := make([]uint32, nFinalAgg) 1410 1411 // finalPreRenderTypes is passed to an IndexVarHelper which 1412 // helps type-check the indexed variables passed into 1413 // FinalRendering for some aggregations. 1414 // This has a 1-1 mapping to finalAggs 1415 var finalPreRenderTypes []*types.T 1416 if needRender { 1417 finalPreRenderTypes = make([]*types.T, 0, nFinalAgg) 1418 } 1419 1420 // Each aggregation can have multiple aggregations in the 1421 // local/final stages. We concatenate all these into 1422 // localAggs/finalAggs. 1423 // finalIdx is the index of the final aggregation with respect 1424 // to all final aggregations. 1425 finalIdx := 0 1426 for _, e := range aggregations { 1427 info := physicalplan.DistAggregationTable[e.Func] 1428 1429 // relToAbsLocalIdx maps each local stage for the given 1430 // aggregation e to its final index in localAggs. This 1431 // is necessary since we de-duplicate equivalent local 1432 // aggregations and need to correspond the one copy of 1433 // local aggregation required by the final stage to its 1434 // input, which is specified as a relative local stage 1435 // index (see `Aggregations` in aggregators_func.go). 1436 // We use a slice here instead of a map because we have 1437 // a small, bounded domain to map and runtime hash 1438 // operations are relatively expensive. 1439 relToAbsLocalIdx := make([]uint32, len(info.LocalStage)) 1440 // First prepare and spec local aggregations. 1441 // Note the planNode first feeds the input (inputTypes) 1442 // into the local aggregators. 1443 for i, localFunc := range info.LocalStage { 1444 localAgg := execinfrapb.AggregatorSpec_Aggregation{ 1445 Func: localFunc, 1446 ColIdx: e.ColIdx, 1447 FilterColIdx: e.FilterColIdx, 1448 } 1449 1450 isNewAgg := true 1451 for j, prevLocalAgg := range localAggs { 1452 if localAgg.Equals(prevLocalAgg) { 1453 // Found existing, equivalent local agg. 1454 // Map the relative index (i) 1455 // for the current local agg 1456 // to the absolute index (j) of 1457 // the existing local agg. 1458 relToAbsLocalIdx[i] = uint32(j) 1459 isNewAgg = false 1460 break 1461 } 1462 } 1463 1464 if isNewAgg { 1465 // Append the new local aggregation 1466 // and map to its index in localAggs. 1467 relToAbsLocalIdx[i] = uint32(len(localAggs)) 1468 localAggs = append(localAggs, localAgg) 1469 1470 // Keep track of the new local 1471 // aggregation's output type. 1472 argTypes := make([]*types.T, len(e.ColIdx)) 1473 for j, c := range e.ColIdx { 1474 argTypes[j] = inputTypes[c] 1475 } 1476 _, outputType, err := execinfrapb.GetAggregateInfo(localFunc, argTypes...) 1477 if err != nil { 1478 return err 1479 } 1480 intermediateTypes = append(intermediateTypes, outputType) 1481 } 1482 } 1483 1484 for _, finalInfo := range info.FinalStage { 1485 // The input of the final aggregators is 1486 // specified as the relative indices of the 1487 // local aggregation values. We need to map 1488 // these to the corresponding absolute indices 1489 // in localAggs. 1490 // argIdxs consists of the absolute indices 1491 // in localAggs. 1492 argIdxs := make([]uint32, len(finalInfo.LocalIdxs)) 1493 for i, relIdx := range finalInfo.LocalIdxs { 1494 argIdxs[i] = relToAbsLocalIdx[relIdx] 1495 } 1496 finalAgg := execinfrapb.AggregatorSpec_Aggregation{ 1497 Func: finalInfo.Fn, 1498 ColIdx: argIdxs, 1499 } 1500 1501 isNewAgg := true 1502 for i, prevFinalAgg := range finalAggs { 1503 if finalAgg.Equals(prevFinalAgg) { 1504 // Found existing, equivalent 1505 // final agg. 
Map the finalIdx 1506 // for the current final agg to 1507 // its index (i) in finalAggs. 1508 finalIdxMap[finalIdx] = uint32(i) 1509 isNewAgg = false 1510 break 1511 } 1512 } 1513 1514 // Append the final agg if there is no existing 1515 // equivalent. 1516 if isNewAgg { 1517 finalIdxMap[finalIdx] = uint32(len(finalAggs)) 1518 finalAggs = append(finalAggs, finalAgg) 1519 1520 if needRender { 1521 argTypes := make([]*types.T, len(finalInfo.LocalIdxs)) 1522 for i := range finalInfo.LocalIdxs { 1523 // Map the corresponding local 1524 // aggregation output types for 1525 // the current aggregation e. 1526 argTypes[i] = intermediateTypes[argIdxs[i]] 1527 } 1528 _, outputType, err := execinfrapb.GetAggregateInfo(finalInfo.Fn, argTypes...) 1529 if err != nil { 1530 return err 1531 } 1532 finalPreRenderTypes = append(finalPreRenderTypes, outputType) 1533 } 1534 } 1535 finalIdx++ 1536 } 1537 } 1538 1539 // In queries like SELECT min(v) FROM kv GROUP BY k, not all group columns 1540 // appear in the rendering. Add IDENT expressions for them, as they need to 1541 // be part of the output of the local stage for the final stage to know 1542 // about them. 1543 finalGroupCols := make([]uint32, len(groupCols)) 1544 finalOrderedGroupCols := make([]uint32, 0, len(orderedGroupCols)) 1545 for i, groupColIdx := range groupCols { 1546 agg := execinfrapb.AggregatorSpec_Aggregation{ 1547 Func: execinfrapb.AggregatorSpec_ANY_NOT_NULL, 1548 ColIdx: []uint32{groupColIdx}, 1549 } 1550 // See if there already is an aggregation like the one 1551 // we want to add. 1552 idx := -1 1553 for j := range localAggs { 1554 if localAggs[j].Equals(agg) { 1555 idx = j 1556 break 1557 } 1558 } 1559 if idx == -1 { 1560 // Not already there, add it. 1561 idx = len(localAggs) 1562 localAggs = append(localAggs, agg) 1563 intermediateTypes = append(intermediateTypes, inputTypes[groupColIdx]) 1564 } 1565 finalGroupCols[i] = uint32(idx) 1566 if orderedGroupColSet.Contains(n.groupCols[i]) { 1567 finalOrderedGroupCols = append(finalOrderedGroupCols, uint32(idx)) 1568 } 1569 } 1570 1571 // Create the merge ordering for the local stage (this will be maintained 1572 // for results going into the final stage). 1573 ordCols := make([]execinfrapb.Ordering_Column, len(n.groupColOrdering)) 1574 for i, o := range n.groupColOrdering { 1575 // Find the group column. 1576 found := false 1577 for j, col := range n.groupCols { 1578 if col == o.ColIdx { 1579 ordCols[i].ColIdx = finalGroupCols[j] 1580 found = true 1581 break 1582 } 1583 } 1584 if !found { 1585 return errors.AssertionFailedf("group column ordering contains non-grouping column %d", o.ColIdx) 1586 } 1587 if o.Direction == encoding.Descending { 1588 ordCols[i].Direction = execinfrapb.Ordering_Column_DESC 1589 } else { 1590 ordCols[i].Direction = execinfrapb.Ordering_Column_ASC 1591 } 1592 } 1593 1594 localAggsSpec := execinfrapb.AggregatorSpec{ 1595 Type: aggType, 1596 Aggregations: localAggs, 1597 GroupCols: groupCols, 1598 OrderedGroupCols: orderedGroupCols, 1599 } 1600 1601 p.AddNoGroupingStage( 1602 execinfrapb.ProcessorCoreUnion{Aggregator: &localAggsSpec}, 1603 execinfrapb.PostProcessSpec{}, 1604 intermediateTypes, 1605 execinfrapb.Ordering{Columns: ordCols}, 1606 ) 1607 1608 finalAggsSpec = execinfrapb.AggregatorSpec{ 1609 Type: aggType, 1610 Aggregations: finalAggs, 1611 GroupCols: finalGroupCols, 1612 OrderedGroupCols: finalOrderedGroupCols, 1613 } 1614 1615 if needRender { 1616 // Build rendering expressions. 
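// Illustrative sketch (not part of the original source) of the group-column
// handling above, for a hypothetical query such as
//
//     SELECT min(v) FROM kv GROUP BY k
//
// where k does not itself appear in an aggregation: the local stage computes
// MIN(v) per group and also emits ANY_NOT_NULL(k) so the grouping key survives
// into the intermediate schema; the final stage then computes MIN over the
// local minimums, grouping on the ANY_NOT_NULL(k) output column recorded in
// finalGroupCols.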
1617 renderExprs := make([]execinfrapb.Expression, len(aggregations)) 1618 h := tree.MakeTypesOnlyIndexedVarHelper(finalPreRenderTypes) 1619 // finalIdx is an index inside finalAggs. It is used to 1620 // keep track of the finalAggs results that correspond 1621 // to each aggregation. 1622 finalIdx := 0 1623 for i, e := range aggregations { 1624 info := physicalplan.DistAggregationTable[e.Func] 1625 if info.FinalRendering == nil { 1626 // mappedIdx corresponds to the index 1627 // location of the result for this 1628 // final aggregation in finalAggs. This 1629 // is necessary since we re-use final 1630 // aggregations if they are equivalent 1631 // across and within stages. 1632 mappedIdx := int(finalIdxMap[finalIdx]) 1633 var err error 1634 renderExprs[i], err = physicalplan.MakeExpression( 1635 h.IndexedVar(mappedIdx), planCtx, nil /* indexVarMap */) 1636 if err != nil { 1637 return err 1638 } 1639 } else { 1640 // We have multiple final aggregation 1641 // values that we need to be mapped to 1642 // their corresponding index in 1643 // finalAggs for FinalRendering. 1644 mappedIdxs := make([]int, len(info.FinalStage)) 1645 for j := range info.FinalStage { 1646 mappedIdxs[j] = int(finalIdxMap[finalIdx+j]) 1647 } 1648 // Map the final aggregation values 1649 // to their corresponding indices. 1650 expr, err := info.FinalRendering(&h, mappedIdxs) 1651 if err != nil { 1652 return err 1653 } 1654 renderExprs[i], err = physicalplan.MakeExpression( 1655 expr, planCtx, 1656 nil /* indexVarMap */) 1657 if err != nil { 1658 return err 1659 } 1660 } 1661 finalIdx += len(info.FinalStage) 1662 } 1663 finalAggsPost.RenderExprs = renderExprs 1664 } else if len(finalAggs) < len(aggregations) { 1665 // We want to ensure we map the streams properly now 1666 // that we've potential reduced the number of final 1667 // aggregation output streams. We use finalIdxMap to 1668 // create a 1-1 mapping from the final aggregators to 1669 // their corresponding column index in the map. 1670 p.PlanToStreamColMap = p.PlanToStreamColMap[:0] 1671 for _, idx := range finalIdxMap { 1672 p.PlanToStreamColMap = append(p.PlanToStreamColMap, int(idx)) 1673 } 1674 planToStreamMapSet = true 1675 } 1676 } 1677 1678 // Set up the final stage. 1679 1680 finalOutTypes := make([]*types.T, len(aggregations)) 1681 for i, agg := range aggregations { 1682 argTypes := make([]*types.T, len(agg.ColIdx)+len(agg.Arguments)) 1683 for j, c := range agg.ColIdx { 1684 argTypes[j] = inputTypes[c] 1685 } 1686 for j, argumentColumnType := range aggregationsColumnTypes[i] { 1687 argTypes[len(agg.ColIdx)+j] = argumentColumnType 1688 } 1689 var err error 1690 _, returnTyp, err := execinfrapb.GetAggregateInfo(agg.Func, argTypes...) 1691 if err != nil { 1692 return err 1693 } 1694 finalOutTypes[i] = returnTyp 1695 } 1696 1697 // Update p.PlanToStreamColMap; we will have a simple 1-to-1 mapping of 1698 // planNode columns to stream columns because the aggregator 1699 // has been programmed to produce the same columns as the groupNode. 1700 if !planToStreamMapSet { 1701 p.PlanToStreamColMap = identityMap(p.PlanToStreamColMap, len(aggregations)) 1702 } 1703 1704 if len(finalAggsSpec.GroupCols) == 0 || len(p.ResultRouters) == 1 { 1705 // No GROUP BY, or we have a single stream. Use a single final aggregator. 1706 // If the previous stage was all on a single node, put the final 1707 // aggregator there. Otherwise, bring the results back on this node. 
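// For example, a global aggregation such as SELECT count(*) FROM kv has no
// grouping columns, so every per-node partial result is routed to a single
// final aggregator here; a grouped aggregation reading from multiple streams
// takes the hash-distributed branch below instead.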
1708 node := dsp.nodeDesc.NodeID 1709 if prevStageNode != 0 { 1710 node = prevStageNode 1711 } 1712 p.AddSingleGroupStage( 1713 node, 1714 execinfrapb.ProcessorCoreUnion{Aggregator: &finalAggsSpec}, 1715 finalAggsPost, 1716 finalOutTypes, 1717 ) 1718 } else { 1719 // We distribute (by group columns) to multiple processors. 1720 1721 // Set up the output routers from the previous stage. 1722 for _, resultProc := range p.ResultRouters { 1723 p.Processors[resultProc].Spec.Output[0] = execinfrapb.OutputRouterSpec{ 1724 Type: execinfrapb.OutputRouterSpec_BY_HASH, 1725 HashColumns: finalAggsSpec.GroupCols, 1726 } 1727 } 1728 1729 stageID := p.NewStageID() 1730 1731 // We have one final stage processor for each result router. This is a 1732 // somewhat arbitrary decision; we could have a different number of nodes 1733 // working on the final stage. 1734 pIdxStart := physicalplan.ProcessorIdx(len(p.Processors)) 1735 for _, resultProc := range p.ResultRouters { 1736 proc := physicalplan.Processor{ 1737 Node: p.Processors[resultProc].Node, 1738 Spec: execinfrapb.ProcessorSpec{ 1739 Input: []execinfrapb.InputSyncSpec{{ 1740 // The other fields will be filled in by mergeResultStreams. 1741 ColumnTypes: p.ResultTypes, 1742 }}, 1743 Core: execinfrapb.ProcessorCoreUnion{Aggregator: &finalAggsSpec}, 1744 Post: finalAggsPost, 1745 Output: []execinfrapb.OutputRouterSpec{{ 1746 Type: execinfrapb.OutputRouterSpec_PASS_THROUGH, 1747 }}, 1748 StageID: stageID, 1749 }, 1750 } 1751 p.AddProcessor(proc) 1752 } 1753 1754 // Connect the streams. 1755 for bucket := 0; bucket < len(p.ResultRouters); bucket++ { 1756 pIdx := pIdxStart + physicalplan.ProcessorIdx(bucket) 1757 p.MergeResultStreams(p.ResultRouters, bucket, p.MergeOrdering, pIdx, 0) 1758 } 1759 1760 // Set the new result routers. 1761 for i := 0; i < len(p.ResultRouters); i++ { 1762 p.ResultRouters[i] = pIdxStart + physicalplan.ProcessorIdx(i) 1763 } 1764 1765 p.ResultTypes = finalOutTypes 1766 p.SetMergeOrdering(dsp.convertOrdering(n.reqOrdering, p.PlanToStreamColMap)) 1767 } 1768 1769 return nil 1770 } 1771 1772 func (dsp *DistSQLPlanner) createPlanForIndexJoin( 1773 planCtx *PlanningCtx, n *indexJoinNode, 1774 ) (*PhysicalPlan, error) { 1775 plan, err := dsp.createPhysPlanForPlanNode(planCtx, n.input) 1776 if err != nil { 1777 return nil, err 1778 } 1779 1780 // In "index-join mode", the join reader assumes that the PK cols are a prefix 1781 // of the input stream columns (see #40749). We need a projection to make that 1782 // happen. The other columns are not used by the join reader. 1783 pkCols := make([]uint32, len(n.keyCols)) 1784 for i := range n.keyCols { 1785 streamColOrd := plan.PlanToStreamColMap[n.keyCols[i]] 1786 if streamColOrd == -1 { 1787 panic("key column not in planToStreamColMap") 1788 } 1789 pkCols[i] = uint32(streamColOrd) 1790 } 1791 plan.AddProjection(pkCols) 1792 1793 joinReaderSpec := execinfrapb.JoinReaderSpec{ 1794 Table: *n.table.desc.TableDesc(), 1795 IndexIdx: 0, 1796 Visibility: n.table.colCfg.visibility, 1797 LockingStrength: n.table.lockingStrength, 1798 LockingWaitPolicy: n.table.lockingWaitPolicy, 1799 } 1800 1801 filter, err := physicalplan.MakeExpression( 1802 n.table.filter, planCtx, nil /* indexVarMap */) 1803 if err != nil { 1804 return nil, err 1805 } 1806 post := execinfrapb.PostProcessSpec{ 1807 Filter: filter, 1808 Projection: true, 1809 } 1810 1811 // Calculate the output columns from n.cols. 
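// For example, if the query only needs columns k and v of the table,
// post.OutputColumns below holds their table ordinals so that the join reader
// emits just those two columns, in the order n.cols lists them.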
1812 post.OutputColumns = make([]uint32, len(n.cols)) 1813 plan.PlanToStreamColMap = identityMap(plan.PlanToStreamColMap, len(n.cols)) 1814 1815 for i := range n.cols { 1816 ord := tableOrdinal(n.table.desc, n.cols[i].ID, n.table.colCfg.visibility) 1817 post.OutputColumns[i] = uint32(ord) 1818 } 1819 1820 types, err := getTypesForPlanResult(n, plan.PlanToStreamColMap) 1821 if err != nil { 1822 return nil, err 1823 } 1824 if len(plan.ResultRouters) > 1 { 1825 // Instantiate one join reader for every stream. 1826 plan.AddNoGroupingStage( 1827 execinfrapb.ProcessorCoreUnion{JoinReader: &joinReaderSpec}, 1828 post, 1829 types, 1830 dsp.convertOrdering(n.reqOrdering, plan.PlanToStreamColMap), 1831 ) 1832 } else { 1833 // We have a single stream, so use a single join reader on that node. 1834 plan.AddSingleGroupStage( 1835 plan.Processors[plan.ResultRouters[0]].Node, 1836 execinfrapb.ProcessorCoreUnion{JoinReader: &joinReaderSpec}, 1837 post, 1838 types, 1839 ) 1840 } 1841 return plan, nil 1842 } 1843 1844 // createPlanForLookupJoin creates a distributed plan for a lookupJoinNode. 1845 // Note that this is a separate code path from the experimental path which 1846 // converts joins to lookup joins. 1847 func (dsp *DistSQLPlanner) createPlanForLookupJoin( 1848 planCtx *PlanningCtx, n *lookupJoinNode, 1849 ) (*PhysicalPlan, error) { 1850 plan, err := dsp.createPhysPlanForPlanNode(planCtx, n.input) 1851 if err != nil { 1852 return nil, err 1853 } 1854 1855 joinReaderSpec := execinfrapb.JoinReaderSpec{ 1856 Table: *n.table.desc.TableDesc(), 1857 Type: n.joinType, 1858 Visibility: n.table.colCfg.visibility, 1859 LockingStrength: n.table.lockingStrength, 1860 LockingWaitPolicy: n.table.lockingWaitPolicy, 1861 MaintainOrdering: len(n.reqOrdering) > 0, 1862 } 1863 joinReaderSpec.IndexIdx, err = getIndexIdx(n.table.index, n.table.desc) 1864 if err != nil { 1865 return nil, err 1866 } 1867 joinReaderSpec.LookupColumns = make([]uint32, len(n.eqCols)) 1868 for i, col := range n.eqCols { 1869 if plan.PlanToStreamColMap[col] == -1 { 1870 panic("lookup column not in planToStreamColMap") 1871 } 1872 joinReaderSpec.LookupColumns[i] = uint32(plan.PlanToStreamColMap[col]) 1873 } 1874 joinReaderSpec.LookupColumnsAreKey = n.eqColsAreKey 1875 1876 // The n.table node can be configured with an arbitrary set of columns. Apply 1877 // the corresponding projection. 1878 // The internal schema of the join reader is: 1879 // <input columns>... <table columns>... 1880 numLeftCols := len(plan.ResultTypes) 1881 numOutCols := numLeftCols + len(n.table.cols) 1882 post := execinfrapb.PostProcessSpec{Projection: true} 1883 1884 post.OutputColumns = make([]uint32, numOutCols) 1885 types := make([]*types.T, numOutCols) 1886 1887 for i := 0; i < numLeftCols; i++ { 1888 types[i] = plan.ResultTypes[i] 1889 post.OutputColumns[i] = uint32(i) 1890 } 1891 for i := range n.table.cols { 1892 types[numLeftCols+i] = n.table.cols[i].Type 1893 ord := tableOrdinal(n.table.desc, n.table.cols[i].ID, n.table.colCfg.visibility) 1894 post.OutputColumns[numLeftCols+i] = uint32(numLeftCols + ord) 1895 } 1896 1897 // Map the columns of the lookupJoinNode to the result streams of the 1898 // JoinReader. 1899 numInputNodeCols := len(planColumns(n.input)) 1900 planToStreamColMap := makePlanToStreamColMap(numInputNodeCols + len(n.table.cols)) 1901 copy(planToStreamColMap, plan.PlanToStreamColMap) 1902 for i := range n.table.cols { 1903 planToStreamColMap[numInputNodeCols+i] = numLeftCols + i 1904 } 1905 1906 // Set the ON condition. 
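// For example, with two input columns and three looked-up table columns, the
// ON condition references variables 0 and 1 for the input and 2 through 4 for
// the table; the indexVarMap built below remaps those references onto the
// join reader's internal <input columns>... <table columns>... schema.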
1907 if n.onCond != nil { 1908 // Note that (regardless of the join type or the OutputColumns projection) 1909 // the ON condition refers to the input columns with var indexes 0 to 1910 // numInputNodeCols-1 and to table columns with var indexes starting from 1911 // numInputNodeCols. 1912 indexVarMap := makePlanToStreamColMap(numInputNodeCols + len(n.table.cols)) 1913 copy(indexVarMap, plan.PlanToStreamColMap) 1914 for i := range n.table.cols { 1915 indexVarMap[numInputNodeCols+i] = int(post.OutputColumns[numLeftCols+i]) 1916 } 1917 var err error 1918 joinReaderSpec.OnExpr, err = physicalplan.MakeExpression( 1919 n.onCond, planCtx, indexVarMap, 1920 ) 1921 if err != nil { 1922 return nil, err 1923 } 1924 } 1925 1926 if n.joinType == sqlbase.LeftSemiJoin || n.joinType == sqlbase.LeftAntiJoin { 1927 // For anti/semi join, we only produce the input columns. 1928 planToStreamColMap = planToStreamColMap[:numInputNodeCols] 1929 post.OutputColumns = post.OutputColumns[:numInputNodeCols] 1930 types = types[:numInputNodeCols] 1931 } 1932 1933 // Instantiate one join reader for every stream. 1934 plan.AddNoGroupingStage( 1935 execinfrapb.ProcessorCoreUnion{JoinReader: &joinReaderSpec}, 1936 post, 1937 types, 1938 dsp.convertOrdering(planReqOrdering(n), planToStreamColMap), 1939 ) 1940 plan.PlanToStreamColMap = planToStreamColMap 1941 return plan, nil 1942 } 1943 1944 // createPlanForZigzagJoin creates a distributed plan for a zigzagJoinNode. 1945 func (dsp *DistSQLPlanner) createPlanForZigzagJoin( 1946 planCtx *PlanningCtx, n *zigzagJoinNode, 1947 ) (plan *PhysicalPlan, err error) { 1948 plan = &PhysicalPlan{} 1949 1950 tables := make([]sqlbase.TableDescriptor, len(n.sides)) 1951 indexOrdinals := make([]uint32, len(n.sides)) 1952 cols := make([]execinfrapb.Columns, len(n.sides)) 1953 numStreamCols := 0 1954 for i, side := range n.sides { 1955 tables[i] = *side.scan.desc.TableDesc() 1956 indexOrdinals[i], err = getIndexIdx(side.scan.index, side.scan.desc) 1957 if err != nil { 1958 return nil, err 1959 } 1960 1961 cols[i].Columns = make([]uint32, len(side.eqCols)) 1962 for j, col := range side.eqCols { 1963 cols[i].Columns[j] = uint32(col) 1964 } 1965 1966 numStreamCols += len(side.scan.desc.Columns) 1967 } 1968 1969 // The zigzag join node only represents inner joins, so hardcode Type to 1970 // InnerJoin. 1971 zigzagJoinerSpec := execinfrapb.ZigzagJoinerSpec{ 1972 Tables: tables, 1973 IndexOrdinals: indexOrdinals, 1974 EqColumns: cols, 1975 Type: sqlbase.InnerJoin, 1976 } 1977 zigzagJoinerSpec.FixedValues = make([]*execinfrapb.ValuesCoreSpec, len(n.sides)) 1978 1979 // The fixed values are represented as a Values node with one tuple. 1980 for i := range n.sides { 1981 valuesPlan, err := dsp.createPlanForValues(planCtx, n.sides[i].fixedVals) 1982 if err != nil { 1983 return nil, err 1984 } 1985 zigzagJoinerSpec.FixedValues[i] = valuesPlan.PhysicalPlan.Processors[0].Spec.Core.Values 1986 } 1987 1988 // The internal schema of the zigzag joiner is: 1989 // <side 1 table columns> ... <side 2 table columns> ... 1990 // with only the columns in the specified index populated. 1991 // 1992 // The schema of the zigzagJoinNode is: 1993 // <side 1 index columns> ... <side 2 index columns> ... 1994 // so the planToStreamColMap has to basically map index ordinals 1995 // to table ordinals. 
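// For example, when zigzagging between two secondary indexes of the same
// table, the joiner's internal row lays out all columns of side 1's table
// followed by all columns of side 2's table; the projection built below picks
// out only the index columns that n.columns actually needs.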
1996 post := execinfrapb.PostProcessSpec{Projection: true} 1997 numOutCols := len(n.columns) 1998 1999 post.OutputColumns = make([]uint32, numOutCols) 2000 types := make([]*types.T, numOutCols) 2001 planToStreamColMap := makePlanToStreamColMap(numOutCols) 2002 colOffset := 0 2003 i := 0 2004 2005 // Populate post.OutputColumns (the implicit projection), result types, 2006 // and the planToStreamColMap for index columns from all sides. 2007 for _, side := range n.sides { 2008 // Note that the side's scanNode only contains the columns from that 2009 // index that are also in n.columns. This is because we generated 2010 // colCfg.wantedColumns for only the necessary columns in 2011 // opt/exec/execbuilder/relational_builder.go, similar to lookup joins. 2012 for colIdx := range side.scan.cols { 2013 ord := tableOrdinal(side.scan.desc, side.scan.cols[colIdx].ID, side.scan.colCfg.visibility) 2014 post.OutputColumns[i] = uint32(colOffset + ord) 2015 types[i] = side.scan.cols[colIdx].Type 2016 planToStreamColMap[i] = i 2017 2018 i++ 2019 } 2020 2021 colOffset += len(side.scan.desc.Columns) 2022 } 2023 2024 // Figure out the node where this zigzag joiner goes. 2025 // 2026 // TODO(itsbilal): Add support for restricting the Zigzag joiner 2027 // to a certain set of spans (similar to the InterleavedReaderJoiner) 2028 // on one side. Once that's done, we can split this processor across 2029 // multiple nodes here. Until then, schedule on the current node. 2030 nodeID := dsp.nodeDesc.NodeID 2031 2032 stageID := plan.NewStageID() 2033 // Set the ON condition. 2034 if n.onCond != nil { 2035 // Note that the ON condition refers to the *internal* columns of the 2036 // processor (before the OutputColumns projection). 2037 indexVarMap := makePlanToStreamColMap(len(n.columns)) 2038 for i := range n.columns { 2039 indexVarMap[i] = int(post.OutputColumns[i]) 2040 } 2041 zigzagJoinerSpec.OnExpr, err = physicalplan.MakeExpression( 2042 n.onCond, planCtx, indexVarMap, 2043 ) 2044 if err != nil { 2045 return nil, err 2046 } 2047 } 2048 2049 // Build the PhysicalPlan. 2050 proc := physicalplan.Processor{ 2051 Node: nodeID, 2052 Spec: execinfrapb.ProcessorSpec{ 2053 Core: execinfrapb.ProcessorCoreUnion{ZigzagJoiner: &zigzagJoinerSpec}, 2054 Post: post, 2055 Output: []execinfrapb.OutputRouterSpec{{Type: execinfrapb.OutputRouterSpec_PASS_THROUGH}}, 2056 StageID: stageID, 2057 }, 2058 } 2059 2060 plan.Processors = append(plan.Processors, proc) 2061 2062 // Each result router correspond to each of the processors we appended. 2063 plan.ResultRouters = []physicalplan.ProcessorIdx{physicalplan.ProcessorIdx(0)} 2064 2065 plan.PlanToStreamColMap = planToStreamColMap 2066 plan.ResultTypes = types 2067 2068 return plan, nil 2069 } 2070 2071 // getTypesForPlanResult returns the types of the elements in the result streams 2072 // of a plan that corresponds to a given planNode. If planToStreamColMap is nil, 2073 // a 1-1 mapping is assumed. 2074 func getTypesForPlanResult(node planNode, planToStreamColMap []int) ([]*types.T, error) { 2075 nodeColumns := planColumns(node) 2076 if planToStreamColMap == nil { 2077 // No remapping. 
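// Stream column i corresponds directly to planNode column i.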
2078 types := make([]*types.T, len(nodeColumns)) 2079 for i := range nodeColumns { 2080 types[i] = nodeColumns[i].Typ 2081 } 2082 return types, nil 2083 } 2084 numCols := 0 2085 for _, streamCol := range planToStreamColMap { 2086 if numCols <= streamCol { 2087 numCols = streamCol + 1 2088 } 2089 } 2090 types := make([]*types.T, numCols) 2091 for nodeCol, streamCol := range planToStreamColMap { 2092 if streamCol != -1 { 2093 types[streamCol] = nodeColumns[nodeCol].Typ 2094 } 2095 } 2096 return types, nil 2097 } 2098 2099 func (dsp *DistSQLPlanner) createPlanForJoin( 2100 planCtx *PlanningCtx, n *joinNode, 2101 ) (*PhysicalPlan, error) { 2102 // See if we can create an interleave join plan. 2103 if planInterleavedJoins.Get(&dsp.st.SV) { 2104 plan, ok, err := dsp.tryCreatePlanForInterleavedJoin(planCtx, n) 2105 if err != nil { 2106 return nil, err 2107 } 2108 // An interleave join plan could be used. Return it. 2109 if ok { 2110 return plan, nil 2111 } 2112 } 2113 2114 // Outline of the planning process for joins: 2115 // 2116 // - We create PhysicalPlans for the left and right side. Each plan has a set 2117 // of output routers with result that will serve as input for the join. 2118 // 2119 // - We merge the list of processors and streams into a single plan. We keep 2120 // track of the output routers for the left and right results. 2121 // 2122 // - We add a set of joiner processors (say K of them). 2123 // 2124 // - We configure the left and right output routers to send results to 2125 // these joiners, distributing rows by hash (on the join equality columns). 2126 // We are thus breaking up all input rows into K buckets such that rows 2127 // that match on the equality columns end up in the same bucket. If there 2128 // are no equality columns, we cannot distribute rows so we use a single 2129 // joiner. 2130 // 2131 // - The routers of the joiner processors are the result routers of the plan. 2132 2133 leftPlan, err := dsp.createPhysPlanForPlanNode(planCtx, n.left.plan) 2134 if err != nil { 2135 return nil, err 2136 } 2137 rightPlan, err := dsp.createPhysPlanForPlanNode(planCtx, n.right.plan) 2138 if err != nil { 2139 return nil, err 2140 } 2141 2142 // Nodes where we will run the join processors. 2143 var nodes []roachpb.NodeID 2144 2145 // We initialize these properties of the joiner. They will then be used to 2146 // fill in the processor spec. See descriptions for HashJoinerSpec. 2147 var leftEqCols, rightEqCols []uint32 2148 var leftMergeOrd, rightMergeOrd execinfrapb.Ordering 2149 joinType := n.joinType 2150 2151 // Figure out the left and right types. 2152 leftTypes := leftPlan.ResultTypes 2153 rightTypes := rightPlan.ResultTypes 2154 2155 // Set up the equality columns. 2156 if numEq := len(n.pred.leftEqualityIndices); numEq != 0 { 2157 leftEqCols = eqCols(n.pred.leftEqualityIndices, leftPlan.PlanToStreamColMap) 2158 rightEqCols = eqCols(n.pred.rightEqualityIndices, rightPlan.PlanToStreamColMap) 2159 } 2160 2161 var p PhysicalPlan 2162 var leftRouters, rightRouters []physicalplan.ProcessorIdx 2163 p.PhysicalPlan, leftRouters, rightRouters = physicalplan.MergePlans( 2164 &leftPlan.PhysicalPlan, &rightPlan.PhysicalPlan, 2165 ) 2166 2167 // Set up the output columns. 2168 if numEq := len(n.pred.leftEqualityIndices); numEq != 0 { 2169 nodes = findJoinProcessorNodes(leftRouters, rightRouters, p.Processors) 2170 2171 if len(n.mergeJoinOrdering) > 0 { 2172 // TODO(radu): we currently only use merge joins when we have an ordering on 2173 // all equality columns. 
We should relax this by either: 2174 // - implementing a hybrid hash/merge processor which implements merge 2175 // logic on the columns we have an ordering on, and within each merge 2176 // group uses a hashmap on the remaining columns 2177 // - or: adding a sort processor to complete the order 2178 if len(n.mergeJoinOrdering) == len(n.pred.leftEqualityIndices) { 2179 // Excellent! We can use the merge joiner. 2180 leftMergeOrd = distsqlOrdering(n.mergeJoinOrdering, leftEqCols) 2181 rightMergeOrd = distsqlOrdering(n.mergeJoinOrdering, rightEqCols) 2182 } 2183 } 2184 } else { 2185 // Without column equality, we cannot distribute the join. Run a 2186 // single processor. 2187 nodes = []roachpb.NodeID{dsp.nodeDesc.NodeID} 2188 2189 // If either side has a single stream, put the processor on that node. We 2190 // prefer the left side because that is processed first by the hash joiner. 2191 if len(leftRouters) == 1 { 2192 nodes[0] = p.Processors[leftRouters[0]].Node 2193 } else if len(rightRouters) == 1 { 2194 nodes[0] = p.Processors[rightRouters[0]].Node 2195 } 2196 } 2197 2198 rightMap := rightPlan.PlanToStreamColMap 2199 post, joinToStreamColMap := joinOutColumns(n, leftPlan.PlanToStreamColMap, rightMap) 2200 onExpr, err := remapOnExpr(planCtx, n, leftPlan.PlanToStreamColMap, rightMap) 2201 if err != nil { 2202 return nil, err 2203 } 2204 2205 // Create the Core spec. 2206 var core execinfrapb.ProcessorCoreUnion 2207 if leftMergeOrd.Columns == nil { 2208 core.HashJoiner = &execinfrapb.HashJoinerSpec{ 2209 LeftEqColumns: leftEqCols, 2210 RightEqColumns: rightEqCols, 2211 OnExpr: onExpr, 2212 Type: joinType, 2213 LeftEqColumnsAreKey: n.pred.leftEqKey, 2214 RightEqColumnsAreKey: n.pred.rightEqKey, 2215 } 2216 } else { 2217 core.MergeJoiner = &execinfrapb.MergeJoinerSpec{ 2218 LeftOrdering: leftMergeOrd, 2219 RightOrdering: rightMergeOrd, 2220 OnExpr: onExpr, 2221 Type: joinType, 2222 LeftEqColumnsAreKey: n.pred.leftEqKey, 2223 RightEqColumnsAreKey: n.pred.rightEqKey, 2224 } 2225 } 2226 2227 p.AddJoinStage( 2228 nodes, core, post, leftEqCols, rightEqCols, leftTypes, rightTypes, 2229 leftMergeOrd, rightMergeOrd, leftRouters, rightRouters, 2230 ) 2231 2232 p.PlanToStreamColMap = joinToStreamColMap 2233 p.ResultTypes, err = getTypesForPlanResult(n, joinToStreamColMap) 2234 if err != nil { 2235 return nil, err 2236 } 2237 2238 // Joiners may guarantee an ordering to outputs, so we ensure that 2239 // ordering is propagated through the input synchronizer of the next stage. 2240 // We can propagate the ordering from either side, we use the left side here. 2241 // Note that n.props only has a non-empty ordering for inner joins, where it 2242 // uses the mergeJoinOrdering. 2243 p.SetMergeOrdering(dsp.convertOrdering(n.reqOrdering, p.PlanToStreamColMap)) 2244 return &p, nil 2245 } 2246 2247 func (dsp *DistSQLPlanner) createPhysPlan( 2248 planCtx *PlanningCtx, plan planMaybePhysical, 2249 ) (physPlan *PhysicalPlan, err error) { 2250 if plan.isPhysicalPlan() { 2251 return plan.physPlan, nil 2252 } 2253 return dsp.createPhysPlanForPlanNode(planCtx, plan.planNode) 2254 } 2255 2256 func (dsp *DistSQLPlanner) createPhysPlanForPlanNode( 2257 planCtx *PlanningCtx, node planNode, 2258 ) (plan *PhysicalPlan, err error) { 2259 planCtx.planDepth++ 2260 2261 switch n := node.(type) { 2262 // Keep these cases alphabetized, please! 
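// Each case below either delegates to a createPlanFor* helper or recurses
// into the node's input and layers additional stages on top; nodes without a
// DistSQL representation fall through to wrapPlan in the default case.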
2263 case *distinctNode: 2264 plan, err = dsp.createPlanForDistinct(planCtx, n) 2265 2266 case *exportNode: 2267 plan, err = dsp.createPlanForExport(planCtx, n) 2268 2269 case *filterNode: 2270 plan, err = dsp.createPhysPlanForPlanNode(planCtx, n.source.plan) 2271 if err != nil { 2272 return nil, err 2273 } 2274 2275 if err := plan.AddFilter(n.filter, planCtx, plan.PlanToStreamColMap); err != nil { 2276 return nil, err 2277 } 2278 2279 case *groupNode: 2280 plan, err = dsp.createPhysPlanForPlanNode(planCtx, n.plan) 2281 if err != nil { 2282 return nil, err 2283 } 2284 2285 if err := dsp.addAggregators(planCtx, plan, n); err != nil { 2286 return nil, err 2287 } 2288 2289 case *indexJoinNode: 2290 plan, err = dsp.createPlanForIndexJoin(planCtx, n) 2291 2292 case *joinNode: 2293 plan, err = dsp.createPlanForJoin(planCtx, n) 2294 2295 case *limitNode: 2296 plan, err = dsp.createPhysPlanForPlanNode(planCtx, n.plan) 2297 if err != nil { 2298 return nil, err 2299 } 2300 if err := n.evalLimit(planCtx.EvalContext()); err != nil { 2301 return nil, err 2302 } 2303 if err := plan.AddLimit(n.count, n.offset, planCtx, dsp.nodeDesc.NodeID); err != nil { 2304 return nil, err 2305 } 2306 2307 case *lookupJoinNode: 2308 plan, err = dsp.createPlanForLookupJoin(planCtx, n) 2309 2310 case *ordinalityNode: 2311 plan, err = dsp.createPlanForOrdinality(planCtx, n) 2312 2313 case *projectSetNode: 2314 plan, err = dsp.createPlanForProjectSet(planCtx, n) 2315 2316 case *renderNode: 2317 plan, err = dsp.createPhysPlanForPlanNode(planCtx, n.source.plan) 2318 if err != nil { 2319 return nil, err 2320 } 2321 err = dsp.selectRenders(plan, n, planCtx) 2322 if err != nil { 2323 return nil, err 2324 } 2325 2326 case *scanNode: 2327 plan, err = dsp.createTableReaders(planCtx, n) 2328 2329 case *sortNode: 2330 plan, err = dsp.createPhysPlanForPlanNode(planCtx, n.plan) 2331 if err != nil { 2332 return nil, err 2333 } 2334 2335 dsp.addSorters(plan, n) 2336 2337 case *unaryNode: 2338 plan, err = dsp.createPlanForUnary(planCtx, n) 2339 2340 case *unionNode: 2341 plan, err = dsp.createPlanForSetOp(planCtx, n) 2342 2343 case *valuesNode: 2344 // Just like in checkSupportForPlanNode, if a valuesNode wasn't specified in 2345 // the query, it means that it was autogenerated for things that we don't 2346 // want to be distributing, like populating values from a virtual table. So, 2347 // we wrap the plan instead. 2348 // 2349 // If the plan is local, we also wrap the plan to avoid pointless 2350 // serialization of the values, and also to avoid situations in which 2351 // expressions within the valuesNode were not distributable in the first 2352 // place. 2353 // 2354 // Finally, if noEvalSubqueries is set, it means that nothing has replaced 2355 // the subqueries with their results yet, which again means that we can't 2356 // plan a DistSQL values node, which requires that all expressions be 2357 // evaluatable. 2358 // 2359 // NB: If you change this conditional, you must also change it in 2360 // checkSupportForPlanNode! 2361 if !n.specifiedInQuery || planCtx.isLocal || planCtx.noEvalSubqueries { 2362 plan, err = dsp.wrapPlan(planCtx, n) 2363 } else { 2364 plan, err = dsp.createPlanForValues(planCtx, n) 2365 } 2366 2367 case *windowNode: 2368 plan, err = dsp.createPlanForWindow(planCtx, n) 2369 2370 case *zeroNode: 2371 plan, err = dsp.createPlanForZero(planCtx, n) 2372 2373 case *zigzagJoinNode: 2374 plan, err = dsp.createPlanForZigzagJoin(planCtx, n) 2375 2376 default: 2377 // Can't handle a node? 
We wrap it and continue on our way. 2378 plan, err = dsp.wrapPlan(planCtx, n) 2379 } 2380 2381 if err != nil { 2382 return plan, err 2383 } 2384 2385 if dsp.shouldPlanTestMetadata() { 2386 if err := plan.CheckLastStagePost(); err != nil { 2387 log.Fatalf(planCtx.ctx, "%v", err) 2388 } 2389 plan.AddNoGroupingStageWithCoreFunc( 2390 func(_ int, _ *physicalplan.Processor) execinfrapb.ProcessorCoreUnion { 2391 return execinfrapb.ProcessorCoreUnion{ 2392 MetadataTestSender: &execinfrapb.MetadataTestSenderSpec{ 2393 ID: uuid.MakeV4().String(), 2394 }, 2395 } 2396 }, 2397 execinfrapb.PostProcessSpec{}, 2398 plan.ResultTypes, 2399 plan.MergeOrdering, 2400 ) 2401 } 2402 2403 return plan, err 2404 } 2405 2406 // wrapPlan produces a DistSQL processor for an arbitrary planNode. This is 2407 // invoked when a particular planNode can't be distributed for some reason. It 2408 // will create a planNodeToRowSource wrapper for the sub-tree that's not 2409 // plannable by DistSQL. If that sub-tree has DistSQL-plannable sources, they 2410 // will be planned by DistSQL and connected to the wrapper. 2411 func (dsp *DistSQLPlanner) wrapPlan(planCtx *PlanningCtx, n planNode) (*PhysicalPlan, error) { 2412 useFastPath := planCtx.planDepth == 1 && planCtx.stmtType == tree.RowsAffected 2413 2414 // First, we search the planNode tree we're trying to wrap for the first 2415 // DistSQL-enabled planNode in the tree. If we find one, we ask the planner to 2416 // continue the DistSQL planning recursion on that planNode. 2417 seenTop := false 2418 nParents := uint32(0) 2419 p := &PhysicalPlan{} 2420 // This will be set to first DistSQL-enabled planNode we find, if any. We'll 2421 // modify its parent later to connect its source to the DistSQL-planned 2422 // subtree. 2423 var firstNotWrapped planNode 2424 if err := walkPlan(planCtx.ctx, n, planObserver{ 2425 enterNode: func(ctx context.Context, nodeName string, plan planNode) (bool, error) { 2426 switch plan.(type) { 2427 case *explainDistSQLNode, *explainPlanNode, *explainVecNode: 2428 // Don't continue recursing into explain nodes - they need to be left 2429 // alone since they handle their own planning later. 2430 return false, nil 2431 } 2432 if !seenTop { 2433 // We know we're wrapping the first node, so ignore it. 2434 seenTop = true 2435 return true, nil 2436 } 2437 var err error 2438 // Continue walking until we find a node that has a DistSQL 2439 // representation - that's when we'll quit the wrapping process and hand 2440 // control of planning back to the DistSQL physical planner. 2441 if !dsp.mustWrapNode(planCtx, plan) { 2442 firstNotWrapped = plan 2443 p, err = dsp.createPhysPlanForPlanNode(planCtx, plan) 2444 if err != nil { 2445 return false, err 2446 } 2447 nParents++ 2448 return false, nil 2449 } 2450 return true, nil 2451 }, 2452 }); err != nil { 2453 return nil, err 2454 } 2455 if nParents > 1 { 2456 return nil, errors.Errorf("can't wrap plan %v %T with more than one input", n, n) 2457 } 2458 2459 // Copy the evalCtx. 2460 evalCtx := *planCtx.ExtendedEvalCtx 2461 // We permit the planNodeToRowSource to trigger the wrapped planNode's fast 2462 // path if its the very first node in the flow, and if the statement type we're 2463 // expecting is in fact RowsAffected. RowsAffected statements return a single 2464 // row with the number of rows affected by the statement, and are the only 2465 // types of statement where it's valid to invoke a plan's fast path. 
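// For example, a DELETE without a RETURNING clause only reports the number of
// rows affected, so it is the kind of statement whose wrapped planNode may
// legitimately take the fast path here.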
2466 wrapper, err := makePlanNodeToRowSource(n, 2467 runParams{ 2468 extendedEvalCtx: &evalCtx, 2469 p: planCtx.planner, 2470 }, 2471 useFastPath, 2472 ) 2473 if err != nil { 2474 return nil, err 2475 } 2476 wrapper.firstNotWrapped = firstNotWrapped 2477 2478 idx := uint32(len(p.LocalProcessors)) 2479 p.LocalProcessors = append(p.LocalProcessors, wrapper) 2480 p.LocalProcessorIndexes = append(p.LocalProcessorIndexes, &idx) 2481 var input []execinfrapb.InputSyncSpec 2482 if firstNotWrapped != nil { 2483 // We found a DistSQL-plannable subtree - create an input spec for it. 2484 input = []execinfrapb.InputSyncSpec{{ 2485 Type: execinfrapb.InputSyncSpec_UNORDERED, 2486 ColumnTypes: p.ResultTypes, 2487 }} 2488 } 2489 name := nodeName(n) 2490 proc := physicalplan.Processor{ 2491 Node: dsp.nodeDesc.NodeID, 2492 Spec: execinfrapb.ProcessorSpec{ 2493 Input: input, 2494 Core: execinfrapb.ProcessorCoreUnion{LocalPlanNode: &execinfrapb.LocalPlanNodeSpec{ 2495 RowSourceIdx: &idx, 2496 NumInputs: &nParents, 2497 Name: &name, 2498 }}, 2499 Post: execinfrapb.PostProcessSpec{}, 2500 Output: []execinfrapb.OutputRouterSpec{{ 2501 Type: execinfrapb.OutputRouterSpec_PASS_THROUGH, 2502 }}, 2503 StageID: p.NewStageID(), 2504 }, 2505 } 2506 pIdx := p.AddProcessor(proc) 2507 p.ResultTypes = wrapper.outputTypes 2508 p.PlanToStreamColMap = identityMapInPlace(make([]int, len(p.ResultTypes))) 2509 if firstNotWrapped != nil { 2510 // If we found a DistSQL-plannable subtree, we need to add a result stream 2511 // between it and the physicalPlan we're creating here. 2512 p.MergeResultStreams(p.ResultRouters, 0, p.MergeOrdering, pIdx, 0) 2513 } 2514 // ResultRouters gets overwritten each time we add a new PhysicalPlan. We will 2515 // just have a single result router, since local processors aren't 2516 // distributed, so make sure that p.ResultRouters has at least 1 slot and 2517 // write the new processor index there. 2518 if cap(p.ResultRouters) < 1 { 2519 p.ResultRouters = make([]physicalplan.ProcessorIdx, 1) 2520 } else { 2521 p.ResultRouters = p.ResultRouters[:1] 2522 } 2523 p.ResultRouters[0] = pIdx 2524 return p, nil 2525 } 2526 2527 // createValuesPlan creates a plan with a single Values processor 2528 // located on the gateway node and initialized with given numRows 2529 // and rawBytes that need to be precomputed beforehand. 
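// For example, createPlanForUnary below calls it with numRows=1 and no
// rawBytes to emit a single empty row, while createPlanForValues pre-encodes
// each row of a valuesNode into rawBytes first.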
2530 func (dsp *DistSQLPlanner) createValuesPlan( 2531 resultTypes []*types.T, numRows int, rawBytes [][]byte, 2532 ) (*PhysicalPlan, error) { 2533 numColumns := len(resultTypes) 2534 s := execinfrapb.ValuesCoreSpec{ 2535 Columns: make([]execinfrapb.DatumInfo, numColumns), 2536 } 2537 2538 for i, t := range resultTypes { 2539 s.Columns[i].Encoding = sqlbase.DatumEncoding_VALUE 2540 s.Columns[i].Type = t 2541 } 2542 2543 s.NumRows = uint64(numRows) 2544 s.RawBytes = rawBytes 2545 2546 plan := physicalplan.PhysicalPlan{ 2547 Processors: []physicalplan.Processor{{ 2548 // TODO: find a better node to place processor at 2549 Node: dsp.nodeDesc.NodeID, 2550 Spec: execinfrapb.ProcessorSpec{ 2551 Core: execinfrapb.ProcessorCoreUnion{Values: &s}, 2552 Output: []execinfrapb.OutputRouterSpec{{Type: execinfrapb.OutputRouterSpec_PASS_THROUGH}}, 2553 }, 2554 }}, 2555 ResultRouters: []physicalplan.ProcessorIdx{0}, 2556 ResultTypes: resultTypes, 2557 } 2558 2559 return &PhysicalPlan{ 2560 PhysicalPlan: plan, 2561 PlanToStreamColMap: identityMapInPlace(make([]int, numColumns)), 2562 }, nil 2563 } 2564 2565 func (dsp *DistSQLPlanner) createPlanForValues( 2566 planCtx *PlanningCtx, n *valuesNode, 2567 ) (*PhysicalPlan, error) { 2568 params := runParams{ 2569 ctx: planCtx.ctx, 2570 extendedEvalCtx: planCtx.ExtendedEvalCtx, 2571 } 2572 2573 types, err := getTypesForPlanResult(n, nil /* planToStreamColMap */) 2574 if err != nil { 2575 return nil, err 2576 } 2577 2578 if err := n.startExec(params); err != nil { 2579 return nil, err 2580 } 2581 defer n.Close(planCtx.ctx) 2582 2583 var a sqlbase.DatumAlloc 2584 2585 numRows := n.rows.Len() 2586 rawBytes := make([][]byte, numRows) 2587 for i := 0; i < numRows; i++ { 2588 if next, err := n.Next(runParams{ctx: planCtx.ctx}); !next { 2589 return nil, err 2590 } 2591 2592 var buf []byte 2593 datums := n.Values() 2594 for j := range n.columns { 2595 var err error 2596 datum := sqlbase.DatumToEncDatum(types[j], datums[j]) 2597 buf, err = datum.Encode(types[j], &a, sqlbase.DatumEncoding_VALUE, buf) 2598 if err != nil { 2599 return nil, err 2600 } 2601 } 2602 rawBytes[i] = buf 2603 } 2604 2605 return dsp.createValuesPlan(types, numRows, rawBytes) 2606 } 2607 2608 func (dsp *DistSQLPlanner) createPlanForUnary( 2609 planCtx *PlanningCtx, n *unaryNode, 2610 ) (*PhysicalPlan, error) { 2611 types, err := getTypesForPlanResult(n, nil /* planToStreamColMap */) 2612 if err != nil { 2613 return nil, err 2614 } 2615 2616 return dsp.createValuesPlan(types, 1 /* numRows */, nil /* rawBytes */) 2617 } 2618 2619 func (dsp *DistSQLPlanner) createPlanForZero( 2620 planCtx *PlanningCtx, n *zeroNode, 2621 ) (*PhysicalPlan, error) { 2622 types, err := getTypesForPlanResult(n, nil /* planToStreamColMap */) 2623 if err != nil { 2624 return nil, err 2625 } 2626 2627 return dsp.createValuesPlan(types, 0 /* numRows */, nil /* rawBytes */) 2628 } 2629 2630 func createDistinctSpec(n *distinctNode, cols []int) *execinfrapb.DistinctSpec { 2631 var orderedColumns []uint32 2632 if !n.columnsInOrder.Empty() { 2633 orderedColumns = make([]uint32, 0, n.columnsInOrder.Len()) 2634 for i, ok := n.columnsInOrder.Next(0); ok; i, ok = n.columnsInOrder.Next(i + 1) { 2635 orderedColumns = append(orderedColumns, uint32(cols[i])) 2636 } 2637 } 2638 2639 var distinctColumns []uint32 2640 if !n.distinctOnColIdxs.Empty() { 2641 for planCol, streamCol := range cols { 2642 if streamCol != -1 && n.distinctOnColIdxs.Contains(planCol) { 2643 distinctColumns = append(distinctColumns, uint32(streamCol)) 2644 } 2645 } 2646 } 
else { 2647 // If no distinct columns were specified, run distinct on the entire row. 2648 for _, streamCol := range cols { 2649 if streamCol != -1 { 2650 distinctColumns = append(distinctColumns, uint32(streamCol)) 2651 } 2652 } 2653 } 2654 2655 return &execinfrapb.DistinctSpec{ 2656 OrderedColumns: orderedColumns, 2657 DistinctColumns: distinctColumns, 2658 NullsAreDistinct: n.nullsAreDistinct, 2659 ErrorOnDup: n.errorOnDup, 2660 } 2661 } 2662 2663 func (dsp *DistSQLPlanner) createPlanForDistinct( 2664 planCtx *PlanningCtx, n *distinctNode, 2665 ) (*PhysicalPlan, error) { 2666 plan, err := dsp.createPhysPlanForPlanNode(planCtx, n.plan) 2667 if err != nil { 2668 return nil, err 2669 } 2670 currentResultRouters := plan.ResultRouters 2671 2672 distinctSpec := execinfrapb.ProcessorCoreUnion{ 2673 Distinct: createDistinctSpec(n, plan.PlanToStreamColMap), 2674 } 2675 2676 if len(currentResultRouters) == 1 { 2677 plan.AddNoGroupingStage(distinctSpec, execinfrapb.PostProcessSpec{}, plan.ResultTypes, plan.MergeOrdering) 2678 return plan, nil 2679 } 2680 2681 // TODO(arjun): This is potentially memory inefficient if we don't have any sorted columns. 2682 2683 // Add distinct processors local to each existing current result processor. 2684 plan.AddNoGroupingStage(distinctSpec, execinfrapb.PostProcessSpec{}, plan.ResultTypes, plan.MergeOrdering) 2685 2686 // TODO(arjun): We could distribute this final stage by hash. 2687 plan.AddSingleGroupStage(dsp.nodeDesc.NodeID, distinctSpec, execinfrapb.PostProcessSpec{}, plan.ResultTypes) 2688 2689 return plan, nil 2690 } 2691 2692 func (dsp *DistSQLPlanner) createPlanForOrdinality( 2693 planCtx *PlanningCtx, n *ordinalityNode, 2694 ) (*PhysicalPlan, error) { 2695 plan, err := dsp.createPhysPlanForPlanNode(planCtx, n.source) 2696 if err != nil { 2697 return nil, err 2698 } 2699 2700 ordinalitySpec := execinfrapb.ProcessorCoreUnion{ 2701 Ordinality: &execinfrapb.OrdinalitySpec{}, 2702 } 2703 2704 plan.PlanToStreamColMap = append(plan.PlanToStreamColMap, len(plan.ResultTypes)) 2705 outputTypes := append(plan.ResultTypes, types.Int) 2706 2707 // WITH ORDINALITY never gets distributed so that the gateway node can 2708 // always number each row in order. 
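// For example, SELECT * FROM t WITH ORDINALITY appends a column numbering the
// rows 1, 2, 3, ...; funneling the whole input through a single stage on the
// gateway keeps that numbering consistent.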
2709 plan.AddSingleGroupStage(dsp.nodeDesc.NodeID, ordinalitySpec, execinfrapb.PostProcessSpec{}, outputTypes) 2710 2711 return plan, nil 2712 } 2713 2714 func createProjectSetSpec( 2715 planCtx *PlanningCtx, n *projectSetNode, indexVarMap []int, 2716 ) (*execinfrapb.ProjectSetSpec, error) { 2717 spec := execinfrapb.ProjectSetSpec{ 2718 Exprs: make([]execinfrapb.Expression, len(n.exprs)), 2719 GeneratedColumns: make([]*types.T, len(n.columns)-n.numColsInSource), 2720 NumColsPerGen: make([]uint32, len(n.exprs)), 2721 } 2722 for i, expr := range n.exprs { 2723 var err error 2724 spec.Exprs[i], err = physicalplan.MakeExpression(expr, planCtx, indexVarMap) 2725 if err != nil { 2726 return nil, err 2727 } 2728 } 2729 for i, col := range n.columns[n.numColsInSource:] { 2730 spec.GeneratedColumns[i] = col.Typ 2731 } 2732 for i, n := range n.numColsPerGen { 2733 spec.NumColsPerGen[i] = uint32(n) 2734 } 2735 return &spec, nil 2736 } 2737 2738 func (dsp *DistSQLPlanner) createPlanForProjectSet( 2739 planCtx *PlanningCtx, n *projectSetNode, 2740 ) (*PhysicalPlan, error) { 2741 plan, err := dsp.createPhysPlanForPlanNode(planCtx, n.source) 2742 if err != nil { 2743 return nil, err 2744 } 2745 numResults := len(plan.ResultTypes) 2746 2747 indexVarMap := makePlanToStreamColMap(len(n.columns)) 2748 copy(indexVarMap, plan.PlanToStreamColMap) 2749 2750 // Create the project set processor spec. 2751 projectSetSpec, err := createProjectSetSpec(planCtx, n, indexVarMap) 2752 if err != nil { 2753 return nil, err 2754 } 2755 spec := execinfrapb.ProcessorCoreUnion{ 2756 ProjectSet: projectSetSpec, 2757 } 2758 2759 // Since ProjectSet tends to be a late stage which produces more rows than its 2760 // source, we opt to perform it only on the gateway node. If we encounter 2761 // cases in the future where this is non-optimal (perhaps if its output is 2762 // filtered), we could try to detect these cases and use AddNoGroupingStage 2763 // instead. 2764 outputTypes := append(plan.ResultTypes, projectSetSpec.GeneratedColumns...) 2765 plan.AddSingleGroupStage(dsp.nodeDesc.NodeID, spec, execinfrapb.PostProcessSpec{}, outputTypes) 2766 2767 // Add generated columns to PlanToStreamColMap. 2768 for i := range projectSetSpec.GeneratedColumns { 2769 plan.PlanToStreamColMap = append(plan.PlanToStreamColMap, numResults+i) 2770 } 2771 2772 return plan, nil 2773 } 2774 2775 // isOnlyOnGateway returns true if a physical plan is executed entirely on the 2776 // gateway node. 2777 func (dsp *DistSQLPlanner) isOnlyOnGateway(plan *PhysicalPlan) bool { 2778 if len(plan.ResultRouters) == 1 { 2779 processorIdx := plan.ResultRouters[0] 2780 if plan.Processors[processorIdx].Node == dsp.nodeDesc.NodeID { 2781 return true 2782 } 2783 } 2784 return false 2785 } 2786 2787 // TODO(abhimadan): Refactor this function to reduce the UNION vs 2788 // EXCEPT/INTERSECT and DISTINCT vs ALL branching. 2789 // 2790 // createPlanForSetOp creates a physical plan for "set operations". UNION plans 2791 // are created by merging the left and right plans together, and INTERSECT and 2792 // EXCEPT plans are created by performing a special type of join on the left and 2793 // right sides. In the UNION DISTINCT case, a distinct stage is placed after the 2794 // plans are merged, and in the INTERSECT/EXCEPT DISTINCT cases, distinct stages 2795 // are added as the inputs of the join stage. 
In all DISTINCT cases, an 2796 // additional distinct stage is placed at the end of the left and right plans if 2797 // there are multiple nodes involved in the query, to reduce the amount of 2798 // unnecessary network I/O. 2799 // 2800 // Examples (single node): 2801 // - Query: ( VALUES (1), (2), (2) ) UNION ( VALUES (2), (3) ) 2802 // Plan: 2803 // VALUES VALUES 2804 // | | 2805 // ------------- 2806 // | 2807 // DISTINCT 2808 // 2809 // - Query: ( VALUES (1), (2), (2) ) INTERSECT ALL ( VALUES (2), (3) ) 2810 // Plan: 2811 // VALUES VALUES 2812 // | | 2813 // ------------- 2814 // | 2815 // JOIN 2816 // 2817 // - Query: ( VALUES (1), (2), (2) ) EXCEPT ( VALUES (2), (3) ) 2818 // Plan: 2819 // VALUES VALUES 2820 // | | 2821 // DISTINCT DISTINCT 2822 // | | 2823 // ------------- 2824 // | 2825 // JOIN 2826 func (dsp *DistSQLPlanner) createPlanForSetOp( 2827 planCtx *PlanningCtx, n *unionNode, 2828 ) (*PhysicalPlan, error) { 2829 leftLogicalPlan := n.left 2830 leftPlan, err := dsp.createPhysPlanForPlanNode(planCtx, n.left) 2831 if err != nil { 2832 return nil, err 2833 } 2834 rightLogicalPlan := n.right 2835 rightPlan, err := dsp.createPhysPlanForPlanNode(planCtx, n.right) 2836 if err != nil { 2837 return nil, err 2838 } 2839 if n.inverted { 2840 leftPlan, rightPlan = rightPlan, leftPlan 2841 leftLogicalPlan, rightLogicalPlan = rightLogicalPlan, leftLogicalPlan 2842 } 2843 childPhysicalPlans := []*PhysicalPlan{leftPlan, rightPlan} 2844 2845 // Check that the left and right side PlanToStreamColMaps are equivalent. 2846 // TODO(solon): Are there any valid UNION/INTERSECT/EXCEPT cases where these 2847 // differ? If we encounter any, we could handle them by adding a projection on 2848 // the unioned columns on each side, similar to how we handle mismatched 2849 // ResultTypes. 2850 if !reflect.DeepEqual(leftPlan.PlanToStreamColMap, rightPlan.PlanToStreamColMap) { 2851 return nil, errors.Errorf( 2852 "planToStreamColMap mismatch: %v, %v", leftPlan.PlanToStreamColMap, 2853 rightPlan.PlanToStreamColMap) 2854 } 2855 planToStreamColMap := leftPlan.PlanToStreamColMap 2856 streamCols := make([]uint32, 0, len(planToStreamColMap)) 2857 for _, streamCol := range planToStreamColMap { 2858 if streamCol < 0 { 2859 continue 2860 } 2861 streamCols = append(streamCols, uint32(streamCol)) 2862 } 2863 2864 var distinctSpecs [2]execinfrapb.ProcessorCoreUnion 2865 2866 if !n.all { 2867 var distinctOrds [2]execinfrapb.Ordering 2868 distinctOrds[0] = execinfrapb.ConvertToMappedSpecOrdering( 2869 planReqOrdering(leftLogicalPlan), leftPlan.PlanToStreamColMap, 2870 ) 2871 distinctOrds[1] = execinfrapb.ConvertToMappedSpecOrdering( 2872 planReqOrdering(rightLogicalPlan), rightPlan.PlanToStreamColMap, 2873 ) 2874 2875 // Build distinct processor specs for the left and right child plans. 2876 // 2877 // Note there is the potential for further network I/O optimization here 2878 // in the UNION case, since rows are not deduplicated between left and right 2879 // until the single group stage. In the worst case (total duplication), this 2880 // causes double the amount of data to be streamed as necessary. 
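// For example, for (SELECT a FROM t1) UNION (SELECT a FROM t2) with both
// inputs spread across several nodes, the per-side distinct stages added
// below deduplicate locally first, so each result stream ships at most one
// copy of every distinct row to the later stages.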
2881 for side, plan := range childPhysicalPlans { 2882 sortCols := make([]uint32, len(distinctOrds[side].Columns)) 2883 for i, ord := range distinctOrds[side].Columns { 2884 sortCols[i] = ord.ColIdx 2885 } 2886 distinctSpec := &distinctSpecs[side] 2887 distinctSpec.Distinct = &execinfrapb.DistinctSpec{ 2888 DistinctColumns: streamCols, 2889 OrderedColumns: sortCols, 2890 } 2891 if !dsp.isOnlyOnGateway(plan) { 2892 // TODO(solon): We could skip this stage if there is a strong key on 2893 // the result columns. 2894 plan.AddNoGroupingStage( 2895 *distinctSpec, execinfrapb.PostProcessSpec{}, plan.ResultTypes, distinctOrds[side]) 2896 plan.AddProjection(streamCols) 2897 } 2898 } 2899 } 2900 2901 var p PhysicalPlan 2902 2903 // Merge the plans' PlanToStreamColMap, which we know are equivalent. 2904 p.PlanToStreamColMap = planToStreamColMap 2905 2906 // Merge the plans' result types and merge ordering. 2907 resultTypes, err := physicalplan.MergeResultTypes(leftPlan.ResultTypes, rightPlan.ResultTypes) 2908 if err != nil { 2909 return nil, err 2910 } 2911 2912 if len(leftPlan.MergeOrdering.Columns) != 0 || len(rightPlan.MergeOrdering.Columns) != 0 { 2913 return nil, errors.AssertionFailedf("set op inputs should have no orderings") 2914 } 2915 2916 // TODO(radu): for INTERSECT and EXCEPT, the mergeOrdering should be set when 2917 // we can use merge joiners below. The optimizer needs to be modified to take 2918 // advantage of this optimization and pass down merge orderings. Tracked by 2919 // #40797. 2920 var mergeOrdering execinfrapb.Ordering 2921 2922 // Merge processors, streams, result routers, and stage counter. 2923 var leftRouters, rightRouters []physicalplan.ProcessorIdx 2924 p.PhysicalPlan, leftRouters, rightRouters = physicalplan.MergePlans( 2925 &leftPlan.PhysicalPlan, &rightPlan.PhysicalPlan) 2926 2927 if n.unionType == tree.UnionOp { 2928 // We just need to append the left and right streams together, so append 2929 // the left and right output routers. 2930 p.ResultRouters = append(leftRouters, rightRouters...) 2931 2932 p.ResultTypes = resultTypes 2933 p.SetMergeOrdering(mergeOrdering) 2934 2935 if !n.all { 2936 // TODO(abhimadan): use columns from mergeOrdering to fill in the 2937 // OrderingColumns field in DistinctSpec once the unused columns 2938 // are projected out. 2939 distinctSpec := execinfrapb.ProcessorCoreUnion{ 2940 Distinct: &execinfrapb.DistinctSpec{DistinctColumns: streamCols}, 2941 } 2942 p.AddSingleGroupStage( 2943 dsp.nodeDesc.NodeID, distinctSpec, execinfrapb.PostProcessSpec{}, p.ResultTypes) 2944 } else { 2945 // With UNION ALL, we can end up with multiple streams on the same node. 2946 // We don't want to have unnecessary routers and cross-node streams, so 2947 // merge these streams now. 2948 // 2949 // More importantly, we need to guarantee that if everything is planned 2950 // on a single node (which is always the case when there are mutations), 2951 // we can fuse everything so there are no concurrent KV operations (see 2952 // #40487, #41307). 2953 p.EnsureSingleStreamPerNode() 2954 2955 // UNION ALL is special: it doesn't have any required downstream 2956 // processor, so its two inputs might have different post-processing 2957 // which would violate an assumption later down the line. Check for this 2958 // condition and add a no-op stage if it exists. 
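// For instance, one input's last stage may still carry a projection or render
// expressions in its PostProcessSpec while the other's does not; the no-op
// stage added below gives the merged plan a uniform final stage.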
2959 if err := p.CheckLastStagePost(); err != nil { 2960 p.AddSingleGroupStage( 2961 dsp.nodeDesc.NodeID, 2962 execinfrapb.ProcessorCoreUnion{Noop: &execinfrapb.NoopCoreSpec{}}, 2963 execinfrapb.PostProcessSpec{}, 2964 p.ResultTypes, 2965 ) 2966 } 2967 } 2968 } else { 2969 // We plan INTERSECT and EXCEPT queries with joiners. Get the appropriate 2970 // join type. 2971 joinType := distsqlSetOpJoinType(n.unionType) 2972 2973 // Nodes where we will run the join processors. 2974 nodes := findJoinProcessorNodes(leftRouters, rightRouters, p.Processors) 2975 2976 // Set up the equality columns. 2977 eqCols := streamCols 2978 2979 // Project the left-side columns only. 2980 post := execinfrapb.PostProcessSpec{Projection: true} 2981 post.OutputColumns = make([]uint32, len(streamCols)) 2982 copy(post.OutputColumns, streamCols) 2983 2984 // Create the Core spec. 2985 // 2986 // TODO(radu): we currently only use merge joins when we have an ordering on 2987 // all equality columns. We should relax this by either: 2988 // - implementing a hybrid hash/merge processor which implements merge 2989 // logic on the columns we have an ordering on, and within each merge 2990 // group uses a hashmap on the remaining columns 2991 // - or: adding a sort processor to complete the order 2992 var core execinfrapb.ProcessorCoreUnion 2993 if len(mergeOrdering.Columns) < len(streamCols) { 2994 core.HashJoiner = &execinfrapb.HashJoinerSpec{ 2995 LeftEqColumns: eqCols, 2996 RightEqColumns: eqCols, 2997 Type: joinType, 2998 } 2999 } else { 3000 core.MergeJoiner = &execinfrapb.MergeJoinerSpec{ 3001 LeftOrdering: mergeOrdering, 3002 RightOrdering: mergeOrdering, 3003 Type: joinType, 3004 NullEquality: true, 3005 } 3006 } 3007 3008 if n.all { 3009 p.AddJoinStage( 3010 nodes, core, post, eqCols, eqCols, 3011 leftPlan.ResultTypes, rightPlan.ResultTypes, 3012 leftPlan.MergeOrdering, rightPlan.MergeOrdering, 3013 leftRouters, rightRouters, 3014 ) 3015 } else { 3016 p.AddDistinctSetOpStage( 3017 nodes, core, distinctSpecs[:], post, eqCols, 3018 leftPlan.ResultTypes, rightPlan.ResultTypes, 3019 leftPlan.MergeOrdering, rightPlan.MergeOrdering, 3020 leftRouters, rightRouters, 3021 ) 3022 } 3023 3024 // An EXCEPT ALL is like a left outer join, so there is no guaranteed ordering. 3025 if n.unionType == tree.ExceptOp { 3026 mergeOrdering = execinfrapb.Ordering{} 3027 } 3028 3029 p.ResultTypes = resultTypes 3030 p.SetMergeOrdering(mergeOrdering) 3031 } 3032 3033 return &p, nil 3034 } 3035 3036 // createPlanForWindow creates a physical plan for computing window functions. 3037 // We add a new stage of windower processors for each different partitioning 3038 // scheme found in the query's window functions. 3039 func (dsp *DistSQLPlanner) createPlanForWindow( 3040 planCtx *PlanningCtx, n *windowNode, 3041 ) (*PhysicalPlan, error) { 3042 plan, err := dsp.createPhysPlanForPlanNode(planCtx, n.plan) 3043 if err != nil { 3044 return nil, err 3045 } 3046 3047 numWindowFuncProcessed := 0 3048 windowPlanState := createWindowPlanState(n, planCtx, plan) 3049 // Each iteration of this loop adds a new stage of windowers. The steps taken: 3050 // 1. find a set of unprocessed window functions that have the same PARTITION BY 3051 // clause. All of these will be computed using the single stage of windowers. 3052 // 2. a) populate output types of the current stage of windowers. All input 3053 // columns are being passed through, and windower will append output 3054 // columns for each window function processed at the stage. 
3055 // b) create specs for all window functions in the set. 3056 // 3. decide whether to put windowers on a single or on multiple nodes. 3057 // a) if we're putting windowers on multiple nodes, we'll put them onto 3058 // every node that participated in the previous stage. We leverage hash 3059 // routers to partition the data based on PARTITION BY clause of window 3060 // functions in the set. 3061 for numWindowFuncProcessed < len(n.funcs) { 3062 samePartitionFuncs, partitionIdxs := windowPlanState.findUnprocessedWindowFnsWithSamePartition() 3063 numWindowFuncProcessed += len(samePartitionFuncs) 3064 windowerSpec := execinfrapb.WindowerSpec{ 3065 PartitionBy: partitionIdxs, 3066 WindowFns: make([]execinfrapb.WindowerSpec_WindowFn, len(samePartitionFuncs)), 3067 } 3068 3069 newResultTypes := make([]*types.T, len(plan.ResultTypes)+len(samePartitionFuncs)) 3070 copy(newResultTypes, plan.ResultTypes) 3071 for windowFnSpecIdx, windowFn := range samePartitionFuncs { 3072 windowFnSpec, outputType, err := windowPlanState.createWindowFnSpec(windowFn) 3073 if err != nil { 3074 return nil, err 3075 } 3076 newResultTypes[windowFn.outputColIdx] = outputType 3077 windowerSpec.WindowFns[windowFnSpecIdx] = windowFnSpec 3078 } 3079 3080 // Check if the previous stage is all on one node. 3081 prevStageNode := plan.Processors[plan.ResultRouters[0]].Node 3082 for i := 1; i < len(plan.ResultRouters); i++ { 3083 if n := plan.Processors[plan.ResultRouters[i]].Node; n != prevStageNode { 3084 prevStageNode = 0 3085 break 3086 } 3087 } 3088 3089 // Get all nodes from the previous stage. 3090 nodes := getNodesOfRouters(plan.ResultRouters, plan.Processors) 3091 if len(partitionIdxs) == 0 || len(nodes) == 1 { 3092 // No PARTITION BY or we have a single node. Use a single windower. 3093 // If the previous stage was all on a single node, put the windower 3094 // there. Otherwise, bring the results back on this node. 3095 node := dsp.nodeDesc.NodeID 3096 if len(nodes) == 1 { 3097 node = nodes[0] 3098 } 3099 plan.AddSingleGroupStage( 3100 node, 3101 execinfrapb.ProcessorCoreUnion{Windower: &windowerSpec}, 3102 execinfrapb.PostProcessSpec{}, 3103 newResultTypes, 3104 ) 3105 } else { 3106 // Set up the output routers from the previous stage. 3107 // We use hash routers with hashing on the columns 3108 // from PARTITION BY clause of window functions 3109 // we're processing in the current stage. 3110 for _, resultProc := range plan.ResultRouters { 3111 plan.Processors[resultProc].Spec.Output[0] = execinfrapb.OutputRouterSpec{ 3112 Type: execinfrapb.OutputRouterSpec_BY_HASH, 3113 HashColumns: partitionIdxs, 3114 } 3115 } 3116 stageID := plan.NewStageID() 3117 3118 // We put a windower on each node and we connect it 3119 // with all hash routers from the previous stage in 3120 // a such way that each node has its designated 3121 // SourceRouterSlot - namely, position in which 3122 // a node appears in nodes. 
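// For example, with nodes = [n1, n2, n3], the windower placed on nodes[i]
// consumes SourceRouterSlot i of every hash router from the previous stage,
// so each PARTITION BY hash bucket is handled by exactly one node.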
3123 prevStageRouters := plan.ResultRouters 3124 plan.ResultRouters = make([]physicalplan.ProcessorIdx, 0, len(nodes)) 3125 for bucket, nodeID := range nodes { 3126 proc := physicalplan.Processor{ 3127 Node: nodeID, 3128 Spec: execinfrapb.ProcessorSpec{ 3129 Input: []execinfrapb.InputSyncSpec{{ 3130 Type: execinfrapb.InputSyncSpec_UNORDERED, 3131 ColumnTypes: plan.ResultTypes, 3132 }}, 3133 Core: execinfrapb.ProcessorCoreUnion{Windower: &windowerSpec}, 3134 Post: execinfrapb.PostProcessSpec{}, 3135 Output: []execinfrapb.OutputRouterSpec{{ 3136 Type: execinfrapb.OutputRouterSpec_PASS_THROUGH, 3137 }}, 3138 StageID: stageID, 3139 }, 3140 } 3141 pIdx := plan.AddProcessor(proc) 3142 3143 for _, router := range prevStageRouters { 3144 plan.Streams = append(plan.Streams, physicalplan.Stream{ 3145 SourceProcessor: router, 3146 SourceRouterSlot: bucket, 3147 DestProcessor: pIdx, 3148 DestInput: 0, 3149 }) 3150 } 3151 plan.ResultRouters = append(plan.ResultRouters, pIdx) 3152 } 3153 3154 plan.ResultTypes = newResultTypes 3155 } 3156 } 3157 3158 // We definitely added columns throughout all the stages of windowers, so we 3159 // need to update PlanToStreamColMap. We need to update the map before adding 3160 // rendering or projection because it is used there. 3161 plan.PlanToStreamColMap = identityMap(plan.PlanToStreamColMap, len(plan.ResultTypes)) 3162 3163 // windowers do not guarantee maintaining the order at the moment, so we 3164 // reset MergeOrdering. There shouldn't be an ordering here, but we reset it 3165 // defensively (see #35179). 3166 plan.SetMergeOrdering(execinfrapb.Ordering{}) 3167 3168 // After all window functions are computed, we need to add rendering or 3169 // projection. 3170 if err := windowPlanState.addRenderingOrProjection(); err != nil { 3171 return nil, err 3172 } 3173 3174 if len(plan.ResultTypes) != len(plan.PlanToStreamColMap) { 3175 // We added/removed columns while rendering or projecting, so we need to 3176 // update PlanToStreamColMap. 3177 plan.PlanToStreamColMap = identityMap(plan.PlanToStreamColMap, len(plan.ResultTypes)) 3178 } 3179 3180 return plan, nil 3181 } 3182 3183 // createPlanForExport creates a physical plan for EXPORT. 3184 // We add a new stage of CSVWriter processors to the input plan. 3185 func (dsp *DistSQLPlanner) createPlanForExport( 3186 planCtx *PlanningCtx, n *exportNode, 3187 ) (*PhysicalPlan, error) { 3188 plan, err := dsp.createPhysPlanForPlanNode(planCtx, n.source) 3189 if err != nil { 3190 return nil, err 3191 } 3192 3193 core := execinfrapb.ProcessorCoreUnion{CSVWriter: &execinfrapb.CSVWriterSpec{ 3194 Destination: n.fileName, 3195 NamePattern: exportFilePatternDefault, 3196 Options: n.csvOpts, 3197 ChunkRows: int64(n.chunkSize), 3198 CompressionCodec: n.fileCompression, 3199 }} 3200 3201 resTypes := make([]*types.T, len(sqlbase.ExportColumns)) 3202 for i := range sqlbase.ExportColumns { 3203 resTypes[i] = sqlbase.ExportColumns[i].Typ 3204 } 3205 plan.AddNoGroupingStage( 3206 core, execinfrapb.PostProcessSpec{}, resTypes, execinfrapb.Ordering{}, 3207 ) 3208 3209 // The CSVWriter produces the same columns as the EXPORT statement. 3210 plan.PlanToStreamColMap = identityMap(plan.PlanToStreamColMap, len(sqlbase.ExportColumns)) 3211 return plan, nil 3212 } 3213 3214 // NewPlanningCtx returns a new PlanningCtx. When distribute is false, a 3215 // lightweight version PlanningCtx is returned that can be used when the caller 3216 // knows plans will only be run on one node. 
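// A typical distributed invocation is NewPlanningCtx(ctx, evalCtx, txn,
// true /* distribute */); when distribute is false the returned context skips
// constructing the span resolver iterator and the node status map entirely.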
3217 func (dsp *DistSQLPlanner) NewPlanningCtx( 3218 ctx context.Context, evalCtx *extendedEvalContext, txn *kv.Txn, distribute bool, 3219 ) *PlanningCtx { 3220 planCtx := &PlanningCtx{ 3221 ctx: ctx, 3222 ExtendedEvalCtx: evalCtx, 3223 isLocal: !distribute, 3224 } 3225 if !distribute { 3226 return planCtx 3227 } 3228 planCtx.spanIter = dsp.spanResolver.NewSpanResolverIterator(txn) 3229 planCtx.NodeStatuses = make(map[roachpb.NodeID]NodeStatus) 3230 planCtx.NodeStatuses[dsp.nodeDesc.NodeID] = NodeOK 3231 return planCtx 3232 } 3233 3234 // FinalizePlan adds a final "result" stage if necessary and populates the 3235 // endpoints of the plan. 3236 func (dsp *DistSQLPlanner) FinalizePlan(planCtx *PlanningCtx, plan *PhysicalPlan) { 3237 // Find all MetadataTestSenders in the plan, so that the MetadataTestReceiver 3238 // knows how many sender IDs it should expect. 3239 var metadataSenders []string 3240 for _, proc := range plan.Processors { 3241 if proc.Spec.Core.MetadataTestSender != nil { 3242 metadataSenders = append(metadataSenders, proc.Spec.Core.MetadataTestSender.ID) 3243 } 3244 } 3245 thisNodeID := dsp.nodeDesc.NodeID 3246 // If we don't already have a single result router on this node, add a final 3247 // stage. 3248 if len(plan.ResultRouters) != 1 || 3249 plan.Processors[plan.ResultRouters[0]].Node != thisNodeID { 3250 plan.AddSingleGroupStage( 3251 thisNodeID, 3252 execinfrapb.ProcessorCoreUnion{Noop: &execinfrapb.NoopCoreSpec{}}, 3253 execinfrapb.PostProcessSpec{}, 3254 plan.ResultTypes, 3255 ) 3256 if len(plan.ResultRouters) != 1 { 3257 panic(fmt.Sprintf("%d results after single group stage", len(plan.ResultRouters))) 3258 } 3259 } 3260 3261 if len(metadataSenders) > 0 { 3262 plan.AddSingleGroupStage( 3263 thisNodeID, 3264 execinfrapb.ProcessorCoreUnion{ 3265 MetadataTestReceiver: &execinfrapb.MetadataTestReceiverSpec{ 3266 SenderIDs: metadataSenders, 3267 }, 3268 }, 3269 execinfrapb.PostProcessSpec{}, 3270 plan.ResultTypes, 3271 ) 3272 } 3273 3274 // Set up the endpoints for p.streams. 3275 plan.PopulateEndpoints() 3276 3277 // Set up the endpoint for the final result. 3278 finalOut := &plan.Processors[plan.ResultRouters[0]].Spec.Output[0] 3279 finalOut.Streams = append(finalOut.Streams, execinfrapb.StreamEndpointSpec{ 3280 Type: execinfrapb.StreamEndpointSpec_SYNC_RESPONSE, 3281 }) 3282 3283 // Assign processor IDs. 3284 for i := range plan.Processors { 3285 plan.Processors[i].Spec.ProcessorID = int32(i) 3286 } 3287 } 3288 3289 func makeTableReaderSpans(spans roachpb.Spans) []execinfrapb.TableReaderSpan { 3290 trSpans := make([]execinfrapb.TableReaderSpan, len(spans)) 3291 for i, span := range spans { 3292 trSpans[i].Span = span 3293 } 3294 3295 return trSpans 3296 }
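// runExamplePlan is a hypothetical helper (not used elsewhere in this file)
// that sketches how the planner entry points above fit together: build a
// PlanningCtx, recurse over the planNode tree to produce a PhysicalPlan, and
// finalize it before execution.
func runExamplePlan(
	ctx context.Context,
	dsp *DistSQLPlanner,
	evalCtx *extendedEvalContext,
	txn *kv.Txn,
	node planNode,
) (*PhysicalPlan, error) {
	// Ask for a distributed plan; passing false would yield a lightweight,
	// local-only PlanningCtx.
	planCtx := dsp.NewPlanningCtx(ctx, evalCtx, txn, true /* distribute */)
	physPlan, err := dsp.createPhysPlanForPlanNode(planCtx, node)
	if err != nil {
		return nil, err
	}
	// Add the final "result" stage if needed and populate the stream endpoints.
	dsp.FinalizePlan(planCtx, physPlan)
	return physPlan, nil
}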