github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/distsql_physical_planner.go (about)

     1  // Copyright 2016 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package sql
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"reflect"
    17  	"sort"
    18  	"strings"
    19  
    20  	"github.com/cockroachdb/cockroach/pkg/gossip"
    21  	"github.com/cockroachdb/cockroach/pkg/keys"
    22  	"github.com/cockroachdb/cockroach/pkg/kv"
    23  	"github.com/cockroachdb/cockroach/pkg/kv/kvclient/kvcoord"
    24  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    25  	"github.com/cockroachdb/cockroach/pkg/rpc"
    26  	"github.com/cockroachdb/cockroach/pkg/rpc/nodedialer"
    27  	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
    28  	"github.com/cockroachdb/cockroach/pkg/sql/distsql"
    29  	"github.com/cockroachdb/cockroach/pkg/sql/execinfra"
    30  	"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
    31  	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
    32  	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
    33  	"github.com/cockroachdb/cockroach/pkg/sql/physicalplan"
    34  	"github.com/cockroachdb/cockroach/pkg/sql/physicalplan/replicaoracle"
    35  	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
    36  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    37  	"github.com/cockroachdb/cockroach/pkg/sql/types"
    38  	"github.com/cockroachdb/cockroach/pkg/util"
    39  	"github.com/cockroachdb/cockroach/pkg/util/encoding"
    40  	"github.com/cockroachdb/cockroach/pkg/util/envutil"
    41  	"github.com/cockroachdb/cockroach/pkg/util/log"
    42  	"github.com/cockroachdb/cockroach/pkg/util/stop"
    43  	"github.com/cockroachdb/cockroach/pkg/util/uuid"
    44  	"github.com/cockroachdb/errors"
    45  )
    46  
    47  // DistSQLPlanner is used to generate distributed plans from logical
    48  // plans. A rough overview of the process:
    49  //
    50  //  - the plan is based on a planNode tree (in the future it will be based on an
    51  //    intermediate representation tree). Only a subset of the possible trees is
     52  //    supported (this can be checked via checkSupportForPlanNode).
    53  //
    54  //  - we generate a PhysicalPlan for the planNode tree recursively. The
    55  //    PhysicalPlan consists of a network of processors and streams, with a set
    56  //    of unconnected "result routers". The PhysicalPlan also has information on
     57  //    ordering and on the mapping of planNode columns to columns in the result
    58  //    streams (all result routers output streams with the same schema).
    59  //
    60  //    The PhysicalPlan for a scanNode leaf consists of TableReaders, one for each node
    61  //    that has one or more ranges.
    62  //
     63  //  - for each internal planNode we start with the plan of the child node(s)
     64  //    and add processing stages (connected to the result routers of the child
     65  //    nodes).
    66  type DistSQLPlanner struct {
    67  	// planVersion is the version of DistSQL targeted by the plan we're building.
     68  	// This is currently always set to the node's current DistSQL version and
    69  	// is used to skip incompatible nodes when mapping spans.
    70  	planVersion execinfrapb.DistSQLVersion
    71  
    72  	st *cluster.Settings
    73  	// The node descriptor for the gateway node that initiated this query.
    74  	nodeDesc     roachpb.NodeDescriptor
    75  	stopper      *stop.Stopper
    76  	distSQLSrv   *distsql.ServerImpl
    77  	spanResolver physicalplan.SpanResolver
    78  
    79  	// metadataTestTolerance is the minimum level required to plan metadata test
    80  	// processors.
    81  	metadataTestTolerance execinfra.MetadataTestLevel
    82  
    83  	// runnerChan is used to send out requests (for running SetupFlow RPCs) to a
    84  	// pool of workers.
    85  	runnerChan chan runnerRequest
    86  
    87  	// gossip handle used to check node version compatibility and to construct
    88  	// the spanResolver.
    89  	gossip gossip.DeprecatedGossip
    90  
    91  	nodeDialer *nodedialer.Dialer
    92  
    93  	// nodeHealth encapsulates the various node health checks to avoid planning
    94  	// on unhealthy nodes.
    95  	nodeHealth distSQLNodeHealth
    96  
    97  	// distSender is used to construct the spanResolver upon SetNodeDesc.
    98  	distSender *kvcoord.DistSender
    99  	// rpcCtx is used to construct the spanResolver upon SetNodeDesc.
   100  	rpcCtx *rpc.Context
   101  }
   102  
   103  // ReplicaOraclePolicy controls which policy the physical planner uses to choose
   104  // a replica for a given range. It is exported so that it may be overwritten
   105  // during initialization by CCL code to enable follower reads.
   106  var ReplicaOraclePolicy = replicaoracle.BinPackingChoice
   107  
   108  // If true, the plan diagram (in JSON) is logged for each plan (used for
   109  // debugging).
   110  var logPlanDiagram = envutil.EnvOrDefaultBool("COCKROACH_DISTSQL_LOG_PLAN", false)
   111  
   112  // NewDistSQLPlanner initializes a DistSQLPlanner.
   113  //
   114  // nodeDesc is the descriptor of the node on which this planner runs. It is used
   115  // to favor itself and other close-by nodes when planning. An empty descriptor
   116  // can be passed to aid bootstrapping, but then SetNodeDesc() needs to be called
   117  // before this planner is used.
   118  func NewDistSQLPlanner(
   119  	ctx context.Context,
   120  	planVersion execinfrapb.DistSQLVersion,
   121  	st *cluster.Settings,
   122  	nodeDesc roachpb.NodeDescriptor,
   123  	rpcCtx *rpc.Context,
   124  	distSQLSrv *distsql.ServerImpl,
   125  	distSender *kvcoord.DistSender,
   126  	gw gossip.DeprecatedGossip,
   127  	stopper *stop.Stopper,
   128  	isLive func(roachpb.NodeID) (bool, error),
   129  	nodeDialer *nodedialer.Dialer,
   130  ) *DistSQLPlanner {
   131  	dsp := &DistSQLPlanner{
   132  		planVersion: planVersion,
   133  		st:          st,
   134  		nodeDesc:    nodeDesc,
   135  		stopper:     stopper,
   136  		distSQLSrv:  distSQLSrv,
   137  		gossip:      gw,
   138  		nodeDialer:  nodeDialer,
   139  		nodeHealth: distSQLNodeHealth{
   140  			gossip:     gw,
   141  			connHealth: nodeDialer.ConnHealth,
   142  			isLive:     isLive,
   143  		},
   144  		distSender:            distSender,
   145  		rpcCtx:                rpcCtx,
   146  		metadataTestTolerance: execinfra.NoExplain,
   147  	}
   148  
   149  	dsp.initRunners()
   150  	return dsp
   151  }
   152  
   153  func (dsp *DistSQLPlanner) shouldPlanTestMetadata() bool {
   154  	return dsp.distSQLSrv.TestingKnobs.MetadataTestLevel >= dsp.metadataTestTolerance
   155  }
   156  
   157  // SetNodeDesc sets the planner's node descriptor.
   158  // The first call to SetNodeDesc leads to the construction of the SpanResolver.
   159  func (dsp *DistSQLPlanner) SetNodeDesc(desc roachpb.NodeDescriptor) {
   160  	dsp.nodeDesc = desc
   161  	if dsp.spanResolver == nil {
   162  		sr := physicalplan.NewSpanResolver(dsp.st, dsp.distSender, dsp.gossip, desc,
   163  			dsp.rpcCtx, ReplicaOraclePolicy)
   164  		dsp.SetSpanResolver(sr)
   165  	}
   166  }
   167  
   168  // SetSpanResolver switches to a different SpanResolver. It is the caller's
   169  // responsibility to make sure the DistSQLPlanner is not in use.
   170  func (dsp *DistSQLPlanner) SetSpanResolver(spanResolver physicalplan.SpanResolver) {
   171  	dsp.spanResolver = spanResolver
   172  }
   173  
   174  // distSQLExprCheckVisitor is a tree.Visitor that checks if expressions
   175  // contain things not supported by distSQL, like distSQL-blacklisted functions.
   176  type distSQLExprCheckVisitor struct {
   177  	err error
   178  }
   179  
   180  var _ tree.Visitor = &distSQLExprCheckVisitor{}
   181  
   182  func (v *distSQLExprCheckVisitor) VisitPre(expr tree.Expr) (recurse bool, newExpr tree.Expr) {
   183  	if v.err != nil {
   184  		return false, expr
   185  	}
   186  	switch t := expr.(type) {
   187  	case *tree.FuncExpr:
   188  		if t.IsDistSQLBlacklist() {
   189  			v.err = newQueryNotSupportedErrorf("function %s cannot be executed with distsql", t)
   190  			return false, expr
   191  		}
   192  	case *tree.DOid:
   193  		v.err = newQueryNotSupportedError("OID expressions are not supported by distsql")
   194  		return false, expr
   195  	case *tree.CastExpr:
   196  		// TODO (rohany): I'm not sure why this CastExpr doesn't have a type
   197  		//  annotation at this stage of processing...
   198  		if typ, ok := tree.GetStaticallyKnownType(t.Type); ok && typ.Family() == types.OidFamily {
   199  			v.err = newQueryNotSupportedErrorf("cast to %s is not supported by distsql", t.Type)
   200  			return false, expr
   201  		}
   202  	}
   203  	return true, expr
   204  }
   205  
   206  func (v *distSQLExprCheckVisitor) VisitPost(expr tree.Expr) tree.Expr { return expr }
   207  
   208  // checkExpr verifies that an expression doesn't contain things that are not yet
   209  // supported by distSQL, like distSQL-blacklisted functions.
   210  func checkExpr(expr tree.Expr) error {
   211  	if expr == nil {
   212  		return nil
   213  	}
   214  	v := distSQLExprCheckVisitor{}
   215  	tree.WalkExprConst(&v, expr)
   216  	return v.err
   217  }
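
// exampleCheckExprUsage is an illustrative sketch of the pattern used by
// checkSupportForPlanNode below: any expression rejected by checkExpr turns
// the whole plan subtree into cannotDistribute.
func exampleCheckExprUsage(expr tree.Expr) (distRecommendation, error) {
	if err := checkExpr(expr); err != nil {
		return cannotDistribute, err
	}
	return canDistribute, nil
}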
   218  
   219  type distRecommendation int
   220  
   221  const (
   222  	// cannotDistribute indicates that a plan cannot be distributed.
   223  	cannotDistribute distRecommendation = iota
   224  
   225  	// shouldNotDistribute indicates that a plan could suffer if distributed.
   226  	shouldNotDistribute
   227  
   228  	// canDistribute indicates that a plan will probably not benefit but will
   229  	// probably not suffer if distributed.
   230  	canDistribute
   231  
   232  	// shouldDistribute indicates that a plan will likely benefit if distributed.
   233  	shouldDistribute
   234  )
   235  
    236  // compose returns the recommendation for a plan given recommendations for two
    237  // parts of it: if either part cannotDistribute, so does the whole plan; if we
    238  // shouldNotDistribute either part, then we shouldNotDistribute the overall plan.
   239  func (a distRecommendation) compose(b distRecommendation) distRecommendation {
   240  	if a == cannotDistribute || b == cannotDistribute {
   241  		return cannotDistribute
   242  	}
   243  	if a == shouldNotDistribute || b == shouldNotDistribute {
   244  		return shouldNotDistribute
   245  	}
   246  	if a == shouldDistribute || b == shouldDistribute {
   247  		return shouldDistribute
   248  	}
   249  	return canDistribute
   250  }
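
// exampleComposeRecommendations is an illustrative sketch of how compose
// combines the recommendations defined above: cannotDistribute dominates,
// then shouldNotDistribute, then shouldDistribute; canDistribute is the
// neutral element.
func exampleComposeRecommendations() distRecommendation {
	rec := canDistribute.compose(shouldDistribute) // shouldDistribute
	rec = rec.compose(shouldNotDistribute)         // shouldNotDistribute
	return rec.compose(cannotDistribute)           // cannotDistribute
}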
   251  
   252  type queryNotSupportedError struct {
   253  	msg string
   254  }
   255  
   256  func (e *queryNotSupportedError) Error() string {
   257  	return e.msg
   258  }
   259  
   260  func newQueryNotSupportedError(msg string) error {
   261  	return &queryNotSupportedError{msg: msg}
   262  }
   263  
   264  func newQueryNotSupportedErrorf(format string, args ...interface{}) error {
   265  	return &queryNotSupportedError{msg: fmt.Sprintf(format, args...)}
   266  }
   267  
   268  // planNodeNotSupportedErr is the catch-all error value returned from
   269  // checkSupportForPlanNode when a planNode type does not support distributed
   270  // execution.
   271  var planNodeNotSupportedErr = newQueryNotSupportedError("unsupported node")
   272  
   273  var cannotDistributeRowLevelLockingErr = newQueryNotSupportedError(
   274  	"scans with row-level locking are not supported by distsql",
   275  )
   276  
   277  // mustWrapNode returns true if a node has no DistSQL-processor equivalent.
   278  // This must be kept in sync with createPhysPlanForPlanNode.
   279  // TODO(jordan): refactor these to use the observer pattern to avoid duplication.
   280  func (dsp *DistSQLPlanner) mustWrapNode(planCtx *PlanningCtx, node planNode) bool {
   281  	switch n := node.(type) {
   282  	// Keep these cases alphabetized, please!
   283  	case *distinctNode:
   284  	case *exportNode:
   285  	case *filterNode:
   286  	case *groupNode:
   287  	case *indexJoinNode:
   288  	case *joinNode:
   289  	case *limitNode:
   290  	case *lookupJoinNode:
   291  	case *ordinalityNode:
   292  	case *projectSetNode:
   293  	case *renderNode:
   294  	case *scanNode:
   295  	case *sortNode:
   296  	case *unaryNode:
   297  	case *unionNode:
   298  	case *valuesNode:
   299  		// This is unfortunately duplicated by createPhysPlanForPlanNode, and must be kept
   300  		// in sync with its implementation.
   301  		if !n.specifiedInQuery || planCtx.isLocal || planCtx.noEvalSubqueries {
   302  			return true
   303  		}
   304  		return false
   305  	case *windowNode:
   306  	case *zeroNode:
   307  	case *zigzagJoinNode:
   308  	default:
   309  		return true
   310  	}
   311  	return false
   312  }
   313  
   314  // checkSupportForPlanNode returns a distRecommendation (as described above) or
   315  // cannotDistribute and an error if the plan subtree is not distributable.
   316  // The error doesn't indicate complete failure - it's instead the reason that
   317  // this plan couldn't be distributed.
   318  // TODO(radu): add tests for this.
   319  func checkSupportForPlanNode(node planNode) (distRecommendation, error) {
   320  	switch n := node.(type) {
   321  	// Keep these cases alphabetized, please!
   322  	case *distinctNode:
   323  		return checkSupportForPlanNode(n.plan)
   324  
   325  	case *exportNode:
   326  		return checkSupportForPlanNode(n.source)
   327  
   328  	case *filterNode:
   329  		if err := checkExpr(n.filter); err != nil {
   330  			return cannotDistribute, err
   331  		}
   332  		return checkSupportForPlanNode(n.source.plan)
   333  
   334  	case *groupNode:
   335  		rec, err := checkSupportForPlanNode(n.plan)
   336  		if err != nil {
   337  			return cannotDistribute, err
   338  		}
   339  		// Distribute aggregations if possible.
   340  		return rec.compose(shouldDistribute), nil
   341  
   342  	case *indexJoinNode:
   343  		// n.table doesn't have meaningful spans, but we need to check support (e.g.
   344  		// for any filtering expression).
   345  		if _, err := checkSupportForPlanNode(n.table); err != nil {
   346  			return cannotDistribute, err
   347  		}
   348  		return checkSupportForPlanNode(n.input)
   349  
   350  	case *joinNode:
   351  		if err := checkExpr(n.pred.onCond); err != nil {
   352  			return cannotDistribute, err
   353  		}
   354  		recLeft, err := checkSupportForPlanNode(n.left.plan)
   355  		if err != nil {
   356  			return cannotDistribute, err
   357  		}
   358  		recRight, err := checkSupportForPlanNode(n.right.plan)
   359  		if err != nil {
   360  			return cannotDistribute, err
   361  		}
   362  		// If either the left or the right side can benefit from distribution, we
   363  		// should distribute.
   364  		rec := recLeft.compose(recRight)
   365  		// If we can do a hash join, we distribute if possible.
   366  		if len(n.pred.leftEqualityIndices) > 0 {
   367  			rec = rec.compose(shouldDistribute)
   368  		}
   369  		return rec, nil
   370  
   371  	case *limitNode:
   372  		if err := checkExpr(n.countExpr); err != nil {
   373  			return cannotDistribute, err
   374  		}
   375  		if err := checkExpr(n.offsetExpr); err != nil {
   376  			return cannotDistribute, err
   377  		}
   378  		return checkSupportForPlanNode(n.plan)
   379  
   380  	case *lookupJoinNode:
   381  		if err := checkExpr(n.onCond); err != nil {
   382  			return cannotDistribute, err
   383  		}
   384  		if _, err := checkSupportForPlanNode(n.input); err != nil {
   385  			return cannotDistribute, err
   386  		}
   387  		return shouldDistribute, nil
   388  
   389  	case *projectSetNode:
   390  		return checkSupportForPlanNode(n.source)
   391  
   392  	case *renderNode:
   393  		for _, e := range n.render {
   394  			if err := checkExpr(e); err != nil {
   395  				return cannotDistribute, err
   396  			}
   397  		}
   398  		return checkSupportForPlanNode(n.source.plan)
   399  
   400  	case *scanNode:
   401  		if n.lockingStrength != sqlbase.ScanLockingStrength_FOR_NONE {
   402  			// Scans that are performing row-level locking cannot currently be
   403  			// distributed because their locks would not be propagated back to
   404  			// the root transaction coordinator.
   405  			// TODO(nvanbenschoten): lift this restriction.
   406  			return cannotDistribute, cannotDistributeRowLevelLockingErr
   407  		}
   408  
    409  		// We don't yet recommend distributing plans where soft limits propagate
    410  		// to scan nodes, because we don't have the infrastructure to plan for
    411  		// only a few ranges at a time. However, since propagation of soft limits
    412  		// to scan nodes was added in the 20.1 release, we continue to ignore the
    413  		// soft limits for now to keep the previous behavior.
   414  		// TODO(yuzefovich): pay attention to the soft limits.
   415  		rec := canDistribute
   416  		// We recommend running scans distributed if we have a filtering
   417  		// expression or if we have a full table scan.
   418  		if n.filter != nil {
   419  			if err := checkExpr(n.filter); err != nil {
   420  				return cannotDistribute, err
   421  			}
   422  			rec = rec.compose(shouldDistribute)
   423  		}
   424  		// Check if we are doing a full scan.
   425  		if n.isFull {
   426  			rec = rec.compose(shouldDistribute)
   427  		}
   428  		return rec, nil
   429  
   430  	case *sortNode:
   431  		rec, err := checkSupportForPlanNode(n.plan)
   432  		if err != nil {
   433  			return cannotDistribute, err
   434  		}
   435  		// If we have to sort, distribute the query.
   436  		rec = rec.compose(shouldDistribute)
   437  		return rec, nil
   438  
   439  	case *unaryNode:
   440  		return canDistribute, nil
   441  
   442  	case *unionNode:
   443  		recLeft, err := checkSupportForPlanNode(n.left)
   444  		if err != nil {
   445  			return cannotDistribute, err
   446  		}
   447  		recRight, err := checkSupportForPlanNode(n.right)
   448  		if err != nil {
   449  			return cannotDistribute, err
   450  		}
   451  		return recLeft.compose(recRight), nil
   452  
   453  	case *valuesNode:
   454  		if !n.specifiedInQuery {
   455  			// This condition indicates that the valuesNode was created by planning,
   456  			// not by the user, like the way vtables are expanded into valuesNodes. We
   457  			// don't want to distribute queries like this across the network.
   458  			return cannotDistribute, newQueryNotSupportedErrorf("unsupported valuesNode, not specified in query")
   459  		}
   460  
   461  		for _, tuple := range n.tuples {
   462  			for _, expr := range tuple {
   463  				if err := checkExpr(expr); err != nil {
   464  					return cannotDistribute, err
   465  				}
   466  			}
   467  		}
   468  		return canDistribute, nil
   469  
   470  	case *windowNode:
   471  		return checkSupportForPlanNode(n.plan)
   472  
   473  	case *zeroNode:
   474  		return canDistribute, nil
   475  
   476  	case *zigzagJoinNode:
   477  		if err := checkExpr(n.onCond); err != nil {
   478  			return cannotDistribute, err
   479  		}
   480  		return shouldDistribute, nil
   481  
   482  	default:
   483  		return cannotDistribute, planNodeNotSupportedErr
   484  	}
   485  }
   486  
   487  //go:generate stringer -type=NodeStatus
   488  
   489  // NodeStatus represents a node's health and compatibility in the context of
   490  // physical planning for a query.
   491  type NodeStatus int
   492  
   493  const (
   494  	// NodeOK means that the node can be used for planning.
   495  	NodeOK NodeStatus = iota
   496  	// NodeUnhealthy means that the node should be avoided because
   497  	// it's not healthy.
   498  	NodeUnhealthy
   499  	// NodeDistSQLVersionIncompatible means that the node should be avoided
    500  	// because its DistSQL version is not compatible.
   501  	NodeDistSQLVersionIncompatible
   502  )
   503  
   504  // PlanningCtx contains data used and updated throughout the planning process of
   505  // a single query.
   506  type PlanningCtx struct {
   507  	ctx             context.Context
   508  	ExtendedEvalCtx *extendedEvalContext
   509  	spanIter        physicalplan.SpanResolverIterator
    510  	// NodeStatuses contains info for all NodeIDs that are referenced by any
   511  	// PhysicalPlan we generate with this context.
   512  	NodeStatuses map[roachpb.NodeID]NodeStatus
   513  
   514  	// isLocal is set to true if we're planning this query on a single node.
   515  	isLocal bool
   516  	planner *planner
   517  	// ignoreClose, when set to true, will prevent the closing of the planner's
   518  	// current plan. Only the top-level query needs to close it, but everything
   519  	// else (like sub- and postqueries, or EXPLAIN ANALYZE) should set this to
   520  	// true to avoid double closes of the planNode tree.
   521  	ignoreClose bool
   522  	stmtType    tree.StatementType
   523  	// planDepth is set to the current depth of the planNode tree. It's used to
   524  	// keep track of whether it's valid to run a root node in a special fast path
   525  	// mode.
   526  	planDepth int
   527  
   528  	// noEvalSubqueries indicates that the plan expects any subqueries to not
   529  	// be replaced by evaluation. Should only be set by EXPLAIN.
   530  	noEvalSubqueries bool
   531  
   532  	// If set, a diagram for the plan will be generated and passed to this
   533  	// function.
   534  	saveDiagram func(execinfrapb.FlowDiagram)
   535  	// If set, the diagram passed to saveDiagram will show the types of each
   536  	// stream.
   537  	saveDiagramShowInputTypes bool
   538  }
   539  
   540  var _ physicalplan.ExprContext = &PlanningCtx{}
   541  
   542  // EvalContext returns the associated EvalContext, or nil if there isn't one.
   543  func (p *PlanningCtx) EvalContext() *tree.EvalContext {
   544  	if p.ExtendedEvalCtx == nil {
   545  		return nil
   546  	}
   547  	return &p.ExtendedEvalCtx.EvalContext
   548  }
   549  
   550  // IsLocal returns true if this PlanningCtx is being used to plan a query that
   551  // has no remote flows.
   552  func (p *PlanningCtx) IsLocal() bool {
   553  	return p.isLocal
   554  }
   555  
    556  // EvaluateSubqueries returns true if this plan requires subqueries to be fully
    557  // executed before trying to marshal it. This is normally true except in the
   558  // case of EXPLAIN queries, which ultimately want to describe the subquery that
   559  // will run, without actually running it.
   560  func (p *PlanningCtx) EvaluateSubqueries() bool {
   561  	return !p.noEvalSubqueries
   562  }
   563  
   564  // PhysicalPlan is a partial physical plan which corresponds to a planNode
   565  // (partial in that it can correspond to a planNode subtree and not necessarily
   566  // to the entire planNode for a given query).
   567  //
   568  // It augments physicalplan.PhysicalPlan with information relating the physical
   569  // plan to a planNode subtree.
   570  //
   571  // These plans are built recursively on a planNode tree.
   572  type PhysicalPlan struct {
   573  	physicalplan.PhysicalPlan
   574  
   575  	// PlanToStreamColMap maps planNode columns (see planColumns()) to columns in
   576  	// the result streams. These stream indices correspond to the streams
   577  	// referenced in ResultTypes.
   578  	//
   579  	// Note that in some cases, not all columns in the result streams are
   580  	// referenced in the map; for example, columns that are only required for
   581  	// stream merges in downstream input synchronizers are not included here.
   582  	// (This is due to some processors not being configurable to output only
   583  	// certain columns and will be fixed.)
   584  	//
   585  	// Conversely, in some cases not all planNode columns have a corresponding
   586  	// result stream column (these map to index -1); this is the case for scanNode
   587  	// and indexJoinNode where not all columns in the table are actually used in
   588  	// the plan, but are kept for possible use downstream (e.g., sorting).
   589  	//
   590  	// When the query is run, the output processor's PlanToStreamColMap is used
   591  	// by DistSQLReceiver to create an implicit projection on the processor's
   592  	// output for client consumption (see DistSQLReceiver.Push()). Therefore,
   593  	// "invisible" columns (e.g., columns required for merge ordering) will not
   594  	// be output.
   595  	PlanToStreamColMap []int
   596  }
   597  
    598  // makePlanToStreamColMap initializes a new PhysicalPlan.PlanToStreamColMap with
    599  // all entries set to -1; callers then set the entries for result stream columns.
   600  func makePlanToStreamColMap(numCols int) []int {
   601  	m := make([]int, numCols)
   602  	for i := 0; i < numCols; i++ {
   603  		m[i] = -1
   604  	}
   605  	return m
   606  }
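
// examplePlanToStreamColMap is an illustrative sketch: a plan node with three
// columns where only the first and third are produced by the result streams
// (as stream columns 0 and 1); the middle plan column stays unmapped (-1).
func examplePlanToStreamColMap() []int {
	m := makePlanToStreamColMap(3) // [-1, -1, -1]
	m[0] = 0                       // plan column 0 -> stream column 0
	m[2] = 1                       // plan column 2 -> stream column 1
	return m                       // [0, -1, 1]
}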
   607  
   608  // identityMap returns the slice {0, 1, 2, ..., numCols-1}.
   609  // buf can be optionally provided as a buffer.
   610  func identityMap(buf []int, numCols int) []int {
   611  	buf = buf[:0]
   612  	for i := 0; i < numCols; i++ {
   613  		buf = append(buf, i)
   614  	}
   615  	return buf
   616  }
   617  
   618  // identityMapInPlace returns the modified slice such that it contains
   619  // {0, 1, ..., len(slice)-1}.
   620  func identityMapInPlace(slice []int) []int {
   621  	for i := range slice {
   622  		slice[i] = i
   623  	}
   624  	return slice
   625  }
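
// exampleIdentityMaps is an illustrative sketch of the two helpers above:
// both yield {0, 1, ..., n-1}. identityMap appends into the provided buffer
// (reusing its backing array when it has enough capacity), while
// identityMapInPlace overwrites the given slice.
func exampleIdentityMaps() (a, b []int) {
	a = identityMap(make([]int, 0, 3), 3)  // [0, 1, 2], no reallocation
	b = identityMapInPlace(make([]int, 3)) // [0, 1, 2]
	return a, b
}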
   626  
   627  // SpanPartition is the intersection between a set of spans for a certain
    628  // operation (e.g. a table scan) and the set of ranges owned by a given node.
   629  type SpanPartition struct {
   630  	Node  roachpb.NodeID
   631  	Spans roachpb.Spans
   632  }
   633  
   634  type distSQLNodeHealth struct {
   635  	gossip     gossip.DeprecatedGossip
   636  	isLive     func(roachpb.NodeID) (bool, error)
   637  	connHealth func(roachpb.NodeID, rpc.ConnectionClass) error
   638  }
   639  
   640  func (h *distSQLNodeHealth) check(ctx context.Context, nodeID roachpb.NodeID) error {
   641  	{
   642  		// NB: as of #22658, ConnHealth does not work as expected; see the
   643  		// comment within. We still keep this code for now because in
   644  		// practice, once the node is down it will prevent using this node
   645  		// 90% of the time (it gets used around once per second as an
   646  		// artifact of rpcContext's reconnection mechanism at the time of
   647  		// writing). This is better than having it used in 100% of cases
   648  		// (until the liveness check below kicks in).
   649  		err := h.connHealth(nodeID, rpc.DefaultClass)
   650  		if err != nil && !errors.Is(err, rpc.ErrNotHeartbeated) {
   651  			// This host is known to be unhealthy. Don't use it (use the gateway
   652  			// instead). Note: this can never happen for our nodeID (which
   653  			// always has its address in the nodeMap).
   654  			log.VEventf(ctx, 1, "marking n%d as unhealthy for this plan: %v", nodeID, err)
   655  			return err
   656  		}
   657  	}
   658  	{
   659  		live, err := h.isLive(nodeID)
   660  		if err == nil && !live {
   661  			err = pgerror.Newf(pgcode.CannotConnectNow,
   662  				"node n%d is not live", errors.Safe(nodeID))
   663  		}
   664  		if err != nil {
   665  			return pgerror.Wrapf(err, pgcode.CannotConnectNow,
   666  				"not using n%d due to liveness", errors.Safe(nodeID))
   667  		}
   668  	}
   669  
   670  	// Check that the node is not draining.
   671  	if g, ok := h.gossip.Optional(distsql.MultiTenancyIssueNo); ok {
   672  		drainingInfo := &execinfrapb.DistSQLDrainingInfo{}
   673  		if err := g.GetInfoProto(gossip.MakeDistSQLDrainingKey(nodeID), drainingInfo); err != nil {
   674  			// Because draining info has no expiration, an error
   675  			// implies that we have not yet received a node's
   676  			// draining information. Since this information is
   677  			// written on startup, the most likely scenario is
   678  			// that the node is ready. We therefore return no
   679  			// error.
   680  			// TODO(ajwerner): Determine the expected error types and only filter those.
   681  			return nil //nolint:returnerrcheck
   682  		}
   683  
   684  		if drainingInfo.Draining {
   685  			err := errors.Newf("not using n%d because it is draining", log.Safe(nodeID))
   686  			log.VEventf(ctx, 1, "%v", err)
   687  			return err
   688  		}
   689  	}
   690  
   691  	return nil
   692  }
   693  
   694  // PartitionSpans finds out which nodes are owners for ranges touching the
   695  // given spans, and splits the spans according to owning nodes. The result is a
   696  // set of SpanPartitions (guaranteed one for each relevant node), which form a
   697  // partitioning of the spans (i.e. they are non-overlapping and their union is
   698  // exactly the original set of spans).
   699  //
   700  // PartitionSpans does its best to not assign ranges on nodes that are known to
   701  // either be unhealthy or running an incompatible version. The ranges owned by
   702  // such nodes are assigned to the gateway.
   703  func (dsp *DistSQLPlanner) PartitionSpans(
   704  	planCtx *PlanningCtx, spans roachpb.Spans,
   705  ) ([]SpanPartition, error) {
   706  	if len(spans) == 0 {
   707  		panic("no spans")
   708  	}
   709  	ctx := planCtx.ctx
   710  	partitions := make([]SpanPartition, 0, 1)
   711  	if planCtx.isLocal {
   712  		// If we're planning locally, map all spans to the local node.
   713  		partitions = append(partitions,
   714  			SpanPartition{dsp.nodeDesc.NodeID, spans})
   715  		return partitions, nil
   716  	}
   717  	// nodeMap maps a nodeID to an index inside the partitions array.
   718  	nodeMap := make(map[roachpb.NodeID]int)
   719  	it := planCtx.spanIter
   720  	for _, span := range spans {
   721  		// rspan is the span we are currently partitioning.
   722  		var rspan roachpb.RSpan
   723  		var err error
   724  		if rspan.Key, err = keys.Addr(span.Key); err != nil {
   725  			return nil, err
   726  		}
   727  		if rspan.EndKey, err = keys.Addr(span.EndKey); err != nil {
   728  			return nil, err
   729  		}
   730  
   731  		var lastNodeID roachpb.NodeID
   732  		// lastKey maintains the EndKey of the last piece of `span`.
   733  		lastKey := rspan.Key
   734  		if log.V(1) {
   735  			log.Infof(ctx, "partitioning span %s", span)
   736  		}
   737  		// We break up rspan into its individual ranges (which may or
   738  		// may not be on separate nodes). We then create "partitioned
   739  		// spans" using the end keys of these individual ranges.
   740  		for it.Seek(ctx, span, kvcoord.Ascending); ; it.Next(ctx) {
   741  			if !it.Valid() {
   742  				return nil, it.Error()
   743  			}
   744  			replDesc, err := it.ReplicaInfo(ctx)
   745  			if err != nil {
   746  				return nil, err
   747  			}
   748  			desc := it.Desc()
   749  			if log.V(1) {
   750  				descCpy := desc // don't let desc escape
   751  				log.Infof(ctx, "lastKey: %s desc: %s", lastKey, &descCpy)
   752  			}
   753  
   754  			if !desc.ContainsKey(lastKey) {
   755  				// This range must contain the last range's EndKey.
   756  				log.Fatalf(
   757  					ctx, "next range %v doesn't cover last end key %v. Partitions: %#v",
   758  					desc.RSpan(), lastKey, partitions,
   759  				)
   760  			}
   761  
   762  			// Limit the end key to the end of the span we are resolving.
   763  			endKey := desc.EndKey
   764  			if rspan.EndKey.Less(endKey) {
   765  				endKey = rspan.EndKey
   766  			}
   767  
   768  			nodeID := replDesc.NodeID
   769  			partitionIdx, inNodeMap := nodeMap[nodeID]
   770  			if !inNodeMap {
   771  				// This is the first time we are seeing nodeID for these spans. Check
   772  				// its health.
   773  				status := dsp.CheckNodeHealthAndVersion(planCtx, nodeID)
   774  				// If the node is unhealthy or its DistSQL version is incompatible, use
   775  				// the gateway to process this span instead of the unhealthy host.
   776  				// An empty address indicates an unhealthy host.
   777  				if status != NodeOK {
   778  					log.Eventf(ctx, "not planning on node %d: %s", nodeID, status)
   779  					nodeID = dsp.nodeDesc.NodeID
   780  					partitionIdx, inNodeMap = nodeMap[nodeID]
   781  				}
   782  
   783  				if !inNodeMap {
   784  					partitionIdx = len(partitions)
   785  					partitions = append(partitions, SpanPartition{Node: nodeID})
   786  					nodeMap[nodeID] = partitionIdx
   787  				}
   788  			}
   789  			partition := &partitions[partitionIdx]
   790  
   791  			if lastNodeID == nodeID {
   792  				// Two consecutive ranges on the same node, merge the spans.
   793  				partition.Spans[len(partition.Spans)-1].EndKey = endKey.AsRawKey()
   794  			} else {
   795  				partition.Spans = append(partition.Spans, roachpb.Span{
   796  					Key:    lastKey.AsRawKey(),
   797  					EndKey: endKey.AsRawKey(),
   798  				})
   799  			}
   800  
   801  			if !endKey.Less(rspan.EndKey) {
   802  				// Done.
   803  				break
   804  			}
   805  
   806  			lastKey = endKey
   807  			lastNodeID = nodeID
   808  		}
   809  	}
   810  	return partitions, nil
   811  }
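
// exampleSpanPartitions is an illustrative sketch (with made-up keys and node
// IDs) of the result shape PartitionSpans produces for a single span [a,d)
// when range [a,b) is owned by node 1 and range [b,d) by node 2: the span is
// split at the range boundary and each piece is grouped under its owner.
func exampleSpanPartitions() []SpanPartition {
	a, b, d := roachpb.Key("a"), roachpb.Key("b"), roachpb.Key("d")
	return []SpanPartition{
		{Node: 1, Spans: roachpb.Spans{{Key: a, EndKey: b}}},
		{Node: 2, Spans: roachpb.Spans{{Key: b, EndKey: d}}},
	}
}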
   812  
   813  // nodeVersionIsCompatible decides whether a particular node's DistSQL version
   814  // is compatible with dsp.planVersion. It uses gossip to find out the node's
   815  // version range.
   816  func (dsp *DistSQLPlanner) nodeVersionIsCompatible(nodeID roachpb.NodeID) bool {
   817  	g, ok := dsp.gossip.Optional(distsql.MultiTenancyIssueNo)
   818  	if !ok {
   819  		return true // no gossip - always compatible; only a single gateway running in Phase 2
   820  	}
   821  	var v execinfrapb.DistSQLVersionGossipInfo
   822  	if err := g.GetInfoProto(gossip.MakeDistSQLNodeVersionKey(nodeID), &v); err != nil {
   823  		return false
   824  	}
   825  	return distsql.FlowVerIsCompatible(dsp.planVersion, v.MinAcceptedVersion, v.Version)
   826  }
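
// exampleVersionCompatibility is an illustrative sketch with made-up version
// numbers: a plan at DistSQL version 29 checked against a node that gossips
// MinAcceptedVersion 27 and Version 30.
func exampleVersionCompatibility() bool {
	return distsql.FlowVerIsCompatible(29, 27, 30)
}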
   827  
   828  func getIndexIdx(
   829  	index *sqlbase.IndexDescriptor, desc *sqlbase.ImmutableTableDescriptor,
   830  ) (uint32, error) {
   831  	if index.ID == desc.PrimaryIndex.ID {
   832  		return 0, nil
   833  	}
   834  	for i := range desc.Indexes {
   835  		if index.ID == desc.Indexes[i].ID {
   836  			// IndexIdx is 1 based (0 means primary index).
   837  			return uint32(i + 1), nil
   838  		}
   839  	}
   840  	return 0, errors.Errorf("invalid index %v (table %s)", index, desc.Name)
   841  }
   842  
   843  // initTableReaderSpec initializes a TableReaderSpec/PostProcessSpec that
   844  // corresponds to a scanNode, except for the Spans and OutputColumns.
   845  func initTableReaderSpec(
   846  	n *scanNode, planCtx *PlanningCtx, indexVarMap []int,
   847  ) (*execinfrapb.TableReaderSpec, execinfrapb.PostProcessSpec, error) {
   848  	s := physicalplan.NewTableReaderSpec()
   849  	*s = execinfrapb.TableReaderSpec{
   850  		Table:             *n.desc.TableDesc(),
   851  		Reverse:           n.reverse,
   852  		IsCheck:           n.isCheck,
   853  		Visibility:        n.colCfg.visibility,
   854  		LockingStrength:   n.lockingStrength,
   855  		LockingWaitPolicy: n.lockingWaitPolicy,
   856  
   857  		// Retain the capacity of the spans slice.
   858  		Spans: s.Spans[:0],
   859  	}
   860  	indexIdx, err := getIndexIdx(n.index, n.desc)
   861  	if err != nil {
   862  		return nil, execinfrapb.PostProcessSpec{}, err
   863  	}
   864  	s.IndexIdx = indexIdx
   865  
   866  	// When a TableReader is running scrub checks, do not allow a
   867  	// post-processor. This is because the outgoing stream is a fixed
   868  	// format (rowexec.ScrubTypes).
   869  	if n.isCheck {
   870  		return s, execinfrapb.PostProcessSpec{}, nil
   871  	}
   872  
   873  	filter, err := physicalplan.MakeExpression(n.filter, planCtx, indexVarMap)
   874  	if err != nil {
   875  		return nil, execinfrapb.PostProcessSpec{}, err
   876  	}
   877  	post := execinfrapb.PostProcessSpec{
   878  		Filter: filter,
   879  	}
   880  
   881  	if n.hardLimit != 0 {
   882  		post.Limit = uint64(n.hardLimit)
   883  	} else if n.softLimit != 0 {
   884  		s.LimitHint = n.softLimit
   885  	}
   886  	return s, post, nil
   887  }
   888  
    889  // tableOrdinal returns the index of the column with the given ID.
   890  func tableOrdinal(
   891  	desc *sqlbase.ImmutableTableDescriptor,
   892  	colID sqlbase.ColumnID,
   893  	visibility execinfrapb.ScanVisibility,
   894  ) int {
   895  	for i := range desc.Columns {
   896  		if desc.Columns[i].ID == colID {
   897  			return i
   898  		}
   899  	}
   900  	if visibility == execinfra.ScanVisibilityPublicAndNotPublic {
   901  		offset := len(desc.Columns)
   902  		for i, col := range desc.MutationColumns() {
   903  			if col.ID == colID {
   904  				return offset + i
   905  			}
   906  		}
   907  	}
   908  	panic(fmt.Sprintf("column %d not in desc.Columns", colID))
   909  }
   910  
   911  // getScanNodeToTableOrdinalMap returns a map from scan node column ordinal to
   912  // table reader column ordinal. Returns nil if the map is identity.
   913  //
   914  // scanNodes can have columns set up in a few different ways, depending on the
   915  // colCfg. The heuristic planner always creates scanNodes with all public
   916  // columns (even if some of them aren't even in the index we are scanning).
   917  // The optimizer creates scanNodes with a specific set of wanted columns; in
   918  // this case we have to create a map from scanNode column ordinal to table
   919  // column ordinal (which is what the TableReader uses).
   920  func getScanNodeToTableOrdinalMap(n *scanNode) []int {
   921  	if n.colCfg.wantedColumns == nil {
   922  		return nil
   923  	}
   924  	if n.colCfg.addUnwantedAsHidden {
   925  		panic("addUnwantedAsHidden not supported")
   926  	}
   927  	res := make([]int, len(n.cols))
   928  	for i := range res {
   929  		res[i] = tableOrdinal(n.desc, n.cols[i].ID, n.colCfg.visibility)
   930  	}
   931  	return res
   932  }
   933  
   934  // getOutputColumnsFromScanNode returns the indices of the columns that are
   935  // returned by a scanNode.
   936  // If remap is not nil, the column ordinals are remapped accordingly.
   937  func getOutputColumnsFromScanNode(n *scanNode, remap []int) []uint32 {
   938  	outputColumns := make([]uint32, 0, len(n.cols))
   939  	// TODO(radu): if we have a scan with a filter, cols will include the
   940  	// columns needed for the filter, even if they aren't needed for the next
   941  	// stage.
   942  	for i := 0; i < len(n.cols); i++ {
   943  		colIdx := i
   944  		if remap != nil {
   945  			colIdx = remap[i]
   946  		}
   947  		outputColumns = append(outputColumns, uint32(colIdx))
   948  	}
   949  	return outputColumns
   950  }
   951  
    952  // convertOrdering maps the columns in reqOrdering to the output columns of a
   953  // processor.
   954  func (dsp *DistSQLPlanner) convertOrdering(
   955  	reqOrdering ReqOrdering, planToStreamColMap []int,
   956  ) execinfrapb.Ordering {
   957  	if len(reqOrdering) == 0 {
   958  		return execinfrapb.Ordering{}
   959  	}
   960  	result := execinfrapb.Ordering{
   961  		Columns: make([]execinfrapb.Ordering_Column, len(reqOrdering)),
   962  	}
   963  	for i, o := range reqOrdering {
   964  		streamColIdx := o.ColIdx
   965  		if planToStreamColMap != nil {
   966  			streamColIdx = planToStreamColMap[o.ColIdx]
   967  		}
   968  		if streamColIdx == -1 {
   969  			panic("column in ordering not part of processor output")
   970  		}
   971  		result.Columns[i].ColIdx = uint32(streamColIdx)
   972  		dir := execinfrapb.Ordering_Column_ASC
   973  		if o.Direction == encoding.Descending {
   974  			dir = execinfrapb.Ordering_Column_DESC
   975  		}
   976  		result.Columns[i].Direction = dir
   977  	}
   978  	return result
   979  }
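
// exampleConvertOrdering is an illustrative sketch: a required ordering on
// plan column 2 (descending) is translated through a planToStreamColMap in
// which plan column 2 corresponds to stream column 1, yielding an
// execinfrapb.Ordering on stream column 1, DESC. The ReqOrdering literal
// assumes only the ColIdx/Direction fields already used by convertOrdering.
func exampleConvertOrdering(dsp *DistSQLPlanner) execinfrapb.Ordering {
	planToStreamColMap := []int{-1, 0, 1}
	reqOrdering := ReqOrdering{{ColIdx: 2, Direction: encoding.Descending}}
	return dsp.convertOrdering(reqOrdering, planToStreamColMap)
}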
   980  
   981  // getNodeIDForScan retrieves the node ID where the single table reader should
   982  // reside for a limited scan. Ideally this is the lease holder for the first
   983  // range in the specified spans. But if that node is unhealthy or incompatible,
   984  // we use the gateway node instead.
   985  func (dsp *DistSQLPlanner) getNodeIDForScan(
   986  	planCtx *PlanningCtx, spans []roachpb.Span, reverse bool,
   987  ) (roachpb.NodeID, error) {
   988  	if len(spans) == 0 {
   989  		panic("no spans")
   990  	}
   991  
   992  	// Determine the node ID for the first range to be scanned.
   993  	it := planCtx.spanIter
   994  	if reverse {
   995  		it.Seek(planCtx.ctx, spans[len(spans)-1], kvcoord.Descending)
   996  	} else {
   997  		it.Seek(planCtx.ctx, spans[0], kvcoord.Ascending)
   998  	}
   999  	if !it.Valid() {
  1000  		return 0, it.Error()
  1001  	}
  1002  	replDesc, err := it.ReplicaInfo(planCtx.ctx)
  1003  	if err != nil {
  1004  		return 0, err
  1005  	}
  1006  
  1007  	nodeID := replDesc.NodeID
  1008  	status := dsp.CheckNodeHealthAndVersion(planCtx, nodeID)
  1009  	if status != NodeOK {
  1010  		log.Eventf(planCtx.ctx, "not planning on node %d: %s", nodeID, status)
  1011  		return dsp.nodeDesc.NodeID, nil
  1012  	}
  1013  	return nodeID, nil
  1014  }
  1015  
   1016  // CheckNodeHealthAndVersion returns information about a node's health and
   1017  // DistSQL version compatibility. The info is also recorded in planCtx.NodeStatuses.
  1018  func (dsp *DistSQLPlanner) CheckNodeHealthAndVersion(
  1019  	planCtx *PlanningCtx, nodeID roachpb.NodeID,
  1020  ) NodeStatus {
  1021  	if status, ok := planCtx.NodeStatuses[nodeID]; ok {
  1022  		return status
  1023  	}
  1024  
  1025  	var status NodeStatus
  1026  	if err := dsp.nodeHealth.check(planCtx.ctx, nodeID); err != nil {
  1027  		status = NodeUnhealthy
  1028  	} else if !dsp.nodeVersionIsCompatible(nodeID) {
  1029  		status = NodeDistSQLVersionIncompatible
  1030  	} else {
  1031  		status = NodeOK
  1032  	}
  1033  	planCtx.NodeStatuses[nodeID] = status
  1034  	return status
  1035  }
  1036  
   1037  // createTableReaders generates a plan consisting of table reader processors,
   1038  // one for each node that has spans that we are reading.
  1040  func (dsp *DistSQLPlanner) createTableReaders(
  1041  	planCtx *PlanningCtx, n *scanNode,
  1042  ) (*PhysicalPlan, error) {
  1043  	scanNodeToTableOrdinalMap := getScanNodeToTableOrdinalMap(n)
  1044  	spec, post, err := initTableReaderSpec(n, planCtx, scanNodeToTableOrdinalMap)
  1045  	if err != nil {
  1046  		return nil, err
  1047  	}
  1048  
  1049  	var spanPartitions []SpanPartition
  1050  	if planCtx.isLocal {
  1051  		spanPartitions = []SpanPartition{{dsp.nodeDesc.NodeID, n.spans}}
  1052  	} else if n.hardLimit == 0 {
  1053  		// No hard limit - plan all table readers where their data live. Note
  1054  		// that we're ignoring soft limits for now since the TableReader will
  1055  		// still read too eagerly in the soft limit case. To prevent this we'll
  1056  		// need a new mechanism on the execution side to modulate table reads.
  1057  		// TODO(yuzefovich): add that mechanism.
  1058  		spanPartitions, err = dsp.PartitionSpans(planCtx, n.spans)
  1059  		if err != nil {
  1060  			return nil, err
  1061  		}
  1062  	} else {
  1063  		// If the scan has a hard limit, use a single TableReader to avoid
  1064  		// reading more rows than necessary.
  1065  		nodeID, err := dsp.getNodeIDForScan(planCtx, n.spans, n.reverse)
  1066  		if err != nil {
  1067  			return nil, err
  1068  		}
  1069  		spanPartitions = []SpanPartition{{nodeID, n.spans}}
  1070  	}
  1071  
  1072  	var p PhysicalPlan
  1073  	stageID := p.NewStageID()
  1074  
  1075  	p.ResultRouters = make([]physicalplan.ProcessorIdx, len(spanPartitions))
  1076  	p.Processors = make([]physicalplan.Processor, 0, len(spanPartitions))
  1077  
  1078  	returnMutations := n.colCfg.visibility == execinfra.ScanVisibilityPublicAndNotPublic
  1079  
  1080  	for i, sp := range spanPartitions {
  1081  		var tr *execinfrapb.TableReaderSpec
  1082  		if i == 0 {
  1083  			// For the first span partition, we can just directly use the spec we made
  1084  			// above.
  1085  			tr = spec
  1086  		} else {
  1087  			// For the rest, we have to copy the spec into a fresh spec.
  1088  			tr = physicalplan.NewTableReaderSpec()
  1089  			// Grab the Spans field of the new spec, and reuse it in case the pooled
  1090  			// TableReaderSpec we got has pre-allocated Spans memory.
  1091  			newSpansSlice := tr.Spans
  1092  			*tr = *spec
  1093  			tr.Spans = newSpansSlice
  1094  		}
  1095  		for j := range sp.Spans {
  1096  			tr.Spans = append(tr.Spans, execinfrapb.TableReaderSpan{Span: sp.Spans[j]})
  1097  		}
  1098  
  1099  		tr.MaxResults = n.maxResults
  1100  		p.TotalEstimatedScannedRows += n.estimatedRowCount
  1101  		if n.estimatedRowCount > p.MaxEstimatedRowCount {
  1102  			p.MaxEstimatedRowCount = n.estimatedRowCount
  1103  		}
  1104  
  1105  		proc := physicalplan.Processor{
  1106  			Node: sp.Node,
  1107  			Spec: execinfrapb.ProcessorSpec{
  1108  				Core:    execinfrapb.ProcessorCoreUnion{TableReader: tr},
  1109  				Output:  []execinfrapb.OutputRouterSpec{{Type: execinfrapb.OutputRouterSpec_PASS_THROUGH}},
  1110  				StageID: stageID,
  1111  			},
  1112  		}
  1113  
  1114  		pIdx := p.AddProcessor(proc)
  1115  		p.ResultRouters[i] = pIdx
  1116  	}
  1117  
  1118  	if len(p.ResultRouters) > 1 && len(n.reqOrdering) > 0 {
  1119  		// Make a note of the fact that we have to maintain a certain ordering
  1120  		// between the parallel streams.
  1121  		//
  1122  		// This information is taken into account by the AddProjection call below:
  1123  		// specifically, it will make sure these columns are kept even if they are
  1124  		// not in the projection (e.g. "SELECT v FROM kv ORDER BY k").
  1125  		p.SetMergeOrdering(dsp.convertOrdering(n.reqOrdering, scanNodeToTableOrdinalMap))
  1126  	}
  1127  
  1128  	var typs []*types.T
  1129  	if returnMutations {
  1130  		typs = make([]*types.T, 0, len(n.desc.Columns)+len(n.desc.MutationColumns()))
  1131  	} else {
  1132  		typs = make([]*types.T, 0, len(n.desc.Columns))
  1133  	}
  1134  	for i := range n.desc.Columns {
  1135  		typs = append(typs, n.desc.Columns[i].Type)
  1136  	}
  1137  	if returnMutations {
  1138  		for _, col := range n.desc.MutationColumns() {
  1139  			typs = append(typs, col.Type)
  1140  		}
  1141  	}
  1142  	p.SetLastStagePost(post, typs)
  1143  
  1144  	outCols := getOutputColumnsFromScanNode(n, scanNodeToTableOrdinalMap)
  1145  	planToStreamColMap := make([]int, len(n.cols))
  1146  	descColumnIDs := make([]sqlbase.ColumnID, 0, len(n.desc.Columns))
  1147  	for i := range n.desc.Columns {
  1148  		descColumnIDs = append(descColumnIDs, n.desc.Columns[i].ID)
  1149  	}
  1150  	if returnMutations {
  1151  		for _, c := range n.desc.MutationColumns() {
  1152  			descColumnIDs = append(descColumnIDs, c.ID)
  1153  		}
  1154  	}
  1155  	for i := range planToStreamColMap {
  1156  		planToStreamColMap[i] = -1
  1157  		for j, c := range outCols {
  1158  			if descColumnIDs[c] == n.cols[i].ID {
  1159  				planToStreamColMap[i] = j
  1160  				break
  1161  			}
  1162  		}
  1163  	}
  1164  	p.AddProjection(outCols)
  1165  
  1166  	p.PlanToStreamColMap = planToStreamColMap
  1167  	return &p, nil
  1168  }
  1169  
  1170  // selectRenders takes a PhysicalPlan that produces the results corresponding to
   1171  // the select data source (n.source) and updates it to produce results
  1172  // corresponding to the render node itself. An evaluator stage is added if the
  1173  // render node has any expressions which are not just simple column references.
  1174  func (dsp *DistSQLPlanner) selectRenders(
  1175  	p *PhysicalPlan, n *renderNode, planCtx *PlanningCtx,
  1176  ) error {
  1177  	typs, err := getTypesForPlanResult(n, nil /* planToStreamColMap */)
  1178  	if err != nil {
  1179  		return err
  1180  	}
  1181  	err = p.AddRendering(n.render, planCtx, p.PlanToStreamColMap, typs)
  1182  	if err != nil {
  1183  		return err
  1184  	}
  1185  	p.PlanToStreamColMap = identityMap(p.PlanToStreamColMap, len(n.render))
  1186  	return nil
  1187  }
  1188  
  1189  // addSorters adds sorters corresponding to a sortNode and updates the plan to
  1190  // reflect the sort node.
  1191  func (dsp *DistSQLPlanner) addSorters(p *PhysicalPlan, n *sortNode) {
  1192  	// Sorting is needed; we add a stage of sorting processors.
  1193  	ordering := execinfrapb.ConvertToMappedSpecOrdering(n.ordering, p.PlanToStreamColMap)
  1194  
  1195  	p.AddNoGroupingStage(
  1196  		execinfrapb.ProcessorCoreUnion{
  1197  			Sorter: &execinfrapb.SorterSpec{
  1198  				OutputOrdering:   ordering,
  1199  				OrderingMatchLen: uint32(n.alreadyOrderedPrefix),
  1200  			},
  1201  		},
  1202  		execinfrapb.PostProcessSpec{},
  1203  		p.ResultTypes,
  1204  		ordering,
  1205  	)
  1206  }
  1207  
  1208  // addAggregators adds aggregators corresponding to a groupNode and updates the plan to
  1209  // reflect the groupNode. An evaluator stage is added if necessary.
  1210  // Invariants assumed:
  1211  //  - There is strictly no "pre-evaluation" necessary. If the given query is
  1212  //  'SELECT COUNT(k), v + w FROM kv GROUP BY v + w', the evaluation of the first
  1213  //  'v + w' is done at the source of the groupNode.
  1214  //  - We only operate on the following expressions:
  1215  //      - ONLY aggregation functions, with arguments pre-evaluated. So for
  1216  //        COUNT(k + v), we assume a stream of evaluated 'k + v' values.
  1217  //      - Expressions that CONTAIN an aggregation function, e.g. 'COUNT(k) + 1'.
  1218  //        This is evaluated in the post aggregation evaluator attached after.
  1219  //      - Expressions that also appear verbatim in the GROUP BY expressions.
  1220  //        For 'SELECT k GROUP BY k', the aggregation function added is IDENT,
  1221  //        therefore k just passes through unchanged.
   1222  //    All other expressions simply pass through unchanged, e.g. '1' in
  1223  //    'SELECT 1 GROUP BY k'.
  1224  func (dsp *DistSQLPlanner) addAggregators(
  1225  	planCtx *PlanningCtx, p *PhysicalPlan, n *groupNode,
  1226  ) error {
  1227  	aggregations := make([]execinfrapb.AggregatorSpec_Aggregation, len(n.funcs))
  1228  	aggregationsColumnTypes := make([][]*types.T, len(n.funcs))
  1229  	for i, fholder := range n.funcs {
  1230  		// Convert the aggregate function to the enum value with the same string
  1231  		// representation.
  1232  		funcStr := strings.ToUpper(fholder.funcName)
  1233  		funcIdx, ok := execinfrapb.AggregatorSpec_Func_value[funcStr]
  1234  		if !ok {
  1235  			return errors.Errorf("unknown aggregate %s", funcStr)
  1236  		}
  1237  		aggregations[i].Func = execinfrapb.AggregatorSpec_Func(funcIdx)
  1238  		aggregations[i].Distinct = fholder.isDistinct()
  1239  		for _, renderIdx := range fholder.argRenderIdxs {
  1240  			aggregations[i].ColIdx = append(aggregations[i].ColIdx, uint32(p.PlanToStreamColMap[renderIdx]))
  1241  		}
  1242  		if fholder.hasFilter() {
  1243  			col := uint32(p.PlanToStreamColMap[fholder.filterRenderIdx])
  1244  			aggregations[i].FilterColIdx = &col
  1245  		}
  1246  		aggregations[i].Arguments = make([]execinfrapb.Expression, len(fholder.arguments))
  1247  		aggregationsColumnTypes[i] = make([]*types.T, len(fholder.arguments))
  1248  		for j, argument := range fholder.arguments {
  1249  			var err error
  1250  			aggregations[i].Arguments[j], err = physicalplan.MakeExpression(argument, planCtx, nil)
  1251  			if err != nil {
  1252  				return err
  1253  			}
  1254  			aggregationsColumnTypes[i][j] = argument.ResolvedType()
  1255  			if err != nil {
  1256  				return err
  1257  			}
  1258  		}
  1259  	}
  1260  
  1261  	aggType := execinfrapb.AggregatorSpec_NON_SCALAR
  1262  	if n.isScalar {
  1263  		aggType = execinfrapb.AggregatorSpec_SCALAR
  1264  	}
  1265  
  1266  	inputTypes := p.ResultTypes
  1267  
  1268  	groupCols := make([]uint32, len(n.groupCols))
  1269  	for i, idx := range n.groupCols {
  1270  		groupCols[i] = uint32(p.PlanToStreamColMap[idx])
  1271  	}
  1272  	orderedGroupCols := make([]uint32, len(n.groupColOrdering))
  1273  	var orderedGroupColSet util.FastIntSet
  1274  	for i, c := range n.groupColOrdering {
  1275  		orderedGroupCols[i] = uint32(p.PlanToStreamColMap[c.ColIdx])
  1276  		orderedGroupColSet.Add(c.ColIdx)
  1277  	}
  1278  
   1279  	// We either have a local stage on each stream followed by a final stage, or
   1280  	// just a final stage. We only use a local stage if:
   1281  	//  - the previous stage is distributed on multiple nodes, and
   1282  	//  - all aggregation functions support it, and
   1283  	//  - no aggregation function uses DISTINCT.
   1284  	// TODO(radu): we could relax the second restriction by splitting the
   1285  	// aggregation into two different paths and joining on the results.
   1286  	// TODO(arjun): relaxing the DISTINCT restriction would require doing the
   1287  	// same as the TODO above.
  1288  	multiStage := false
  1289  	allDistinct := true
  1290  	anyDistinct := false
  1291  
  1292  	// Check if the previous stage is all on one node.
  1293  	prevStageNode := p.Processors[p.ResultRouters[0]].Node
  1294  	for i := 1; i < len(p.ResultRouters); i++ {
  1295  		if n := p.Processors[p.ResultRouters[i]].Node; n != prevStageNode {
  1296  			prevStageNode = 0
  1297  			break
  1298  		}
  1299  	}
  1300  
  1301  	if prevStageNode == 0 {
  1302  		// Check that all aggregation functions support a local stage.
  1303  		multiStage = true
  1304  		for _, e := range aggregations {
  1305  			if e.Distinct {
  1306  				// We can't do local aggregation for functions with distinct.
  1307  				multiStage = false
  1308  				anyDistinct = true
  1309  			} else {
  1310  				// We can't do local distinct if we have a mix of distinct and
  1311  				// non-distinct aggregations.
  1312  				allDistinct = false
  1313  			}
  1314  			if _, ok := physicalplan.DistAggregationTable[e.Func]; !ok {
  1315  				multiStage = false
  1316  				break
  1317  			}
  1318  		}
  1319  	}
  1320  	if !anyDistinct {
  1321  		allDistinct = false
  1322  	}
  1323  
  1324  	var finalAggsSpec execinfrapb.AggregatorSpec
  1325  	var finalAggsPost execinfrapb.PostProcessSpec
  1326  
  1327  	if !multiStage && allDistinct {
  1328  		// We can't do local aggregation, but we can do local distinct processing
  1329  		// to reduce streaming duplicates, and aggregate on the final node.
  1330  
  1331  		ordering := dsp.convertOrdering(planReqOrdering(n.plan), p.PlanToStreamColMap).Columns
  1332  		orderedColsMap := make(map[uint32]struct{})
  1333  		for _, ord := range ordering {
  1334  			orderedColsMap[ord.ColIdx] = struct{}{}
  1335  		}
  1336  		distinctColsMap := make(map[uint32]struct{})
  1337  		for _, agg := range aggregations {
  1338  			for _, c := range agg.ColIdx {
  1339  				distinctColsMap[c] = struct{}{}
  1340  			}
  1341  		}
  1342  		orderedColumns := make([]uint32, len(orderedColsMap))
  1343  		idx := 0
  1344  		for o := range orderedColsMap {
  1345  			orderedColumns[idx] = o
  1346  			idx++
  1347  		}
  1348  		distinctColumns := make([]uint32, len(distinctColsMap))
  1349  		idx = 0
  1350  		for o := range distinctColsMap {
  1351  			distinctColumns[idx] = o
  1352  			idx++
  1353  		}
  1354  
  1355  		sort.Slice(orderedColumns, func(i, j int) bool { return orderedColumns[i] < orderedColumns[j] })
  1356  		sort.Slice(distinctColumns, func(i, j int) bool { return distinctColumns[i] < distinctColumns[j] })
  1357  
  1358  		distinctSpec := execinfrapb.ProcessorCoreUnion{
  1359  			Distinct: &execinfrapb.DistinctSpec{
  1360  				OrderedColumns:  orderedColumns,
  1361  				DistinctColumns: distinctColumns,
  1362  			},
  1363  		}
  1364  
  1365  		// Add distinct processors local to each existing current result processor.
  1366  		p.AddNoGroupingStage(distinctSpec, execinfrapb.PostProcessSpec{}, p.ResultTypes, p.MergeOrdering)
  1367  	}
  1368  
  1369  	// planToStreamMapSet keeps track of whether
  1370  	// p.PlanToStreamColMap has already been set to its desired mapping.
  1371  	planToStreamMapSet := false
  1372  	if !multiStage {
  1373  		finalAggsSpec = execinfrapb.AggregatorSpec{
  1374  			Type:             aggType,
  1375  			Aggregations:     aggregations,
  1376  			GroupCols:        groupCols,
  1377  			OrderedGroupCols: orderedGroupCols,
  1378  		}
  1379  	} else {
  1380  		// Some aggregations might need multiple aggregations as part of
  1381  		// their local and final stages (along with a final render
  1382  		// expression to combine the multiple aggregations into a
  1383  		// single result).
  1384  		//
  1385  		// Count the total number of aggregations in the local/final
  1386  		// stages and keep track of whether any of them needs a final
  1387  		// rendering.
  1388  		nLocalAgg := 0
  1389  		nFinalAgg := 0
  1390  		needRender := false
  1391  		for _, e := range aggregations {
  1392  			info := physicalplan.DistAggregationTable[e.Func]
  1393  			nLocalAgg += len(info.LocalStage)
  1394  			nFinalAgg += len(info.FinalStage)
  1395  			if info.FinalRendering != nil {
  1396  				needRender = true
  1397  			}
  1398  		}
  1399  
  1400  		// We allocate capacity for the maximum possible number of local and
  1401  		// final aggregations, but leave the slices empty since equivalent
  1402  		// local and final aggregations are de-duplicated as they are added.
  1403  		localAggs := make([]execinfrapb.AggregatorSpec_Aggregation, 0, nLocalAgg+len(groupCols))
  1404  		intermediateTypes := make([]*types.T, 0, nLocalAgg+len(groupCols))
  1405  		finalAggs := make([]execinfrapb.AggregatorSpec_Aggregation, 0, nFinalAgg)
  1406  		// finalIdxMap maps the index i of each final aggregation (its
  1407  		// ordinal among all final aggregations, before de-duplication) to
  1408  		// its index in the finalAggs slice.
  1409  		finalIdxMap := make([]uint32, nFinalAgg)
  1410  
  1411  		// finalPreRenderTypes is passed to an IndexedVarHelper which
  1412  		// helps type-check the indexed variables passed into
  1413  		// FinalRendering for some aggregations.
  1414  		// It has a 1-1 mapping to finalAggs.
  1415  		var finalPreRenderTypes []*types.T
  1416  		if needRender {
  1417  			finalPreRenderTypes = make([]*types.T, 0, nFinalAgg)
  1418  		}
  1419  
  1420  		// Each aggregation can have multiple aggregations in the
  1421  		// local/final stages. We concatenate all these into
  1422  		// localAggs/finalAggs.
  1423  		// finalIdx is the index of the final aggregation with respect
  1424  		// to all final aggregations.
  1425  		finalIdx := 0
  1426  		for _, e := range aggregations {
  1427  			info := physicalplan.DistAggregationTable[e.Func]
  1428  
  1429  			// relToAbsLocalIdx maps each local stage for the given
  1430  			// aggregation e to its final index in localAggs.  This
  1431  			// is necessary since we de-duplicate equivalent local
  1432  			// aggregations and need to correspond the one copy of
  1433  			// local aggregation required by the final stage to its
  1434  			// input, which is specified as a relative local stage
  1435  			// index (see `Aggregations` in aggregator_funcs.go).
  1436  			// We use a slice here instead of a map because we have
  1437  			// a small, bounded domain to map and runtime hash
  1438  			// operations are relatively expensive.
  1439  			relToAbsLocalIdx := make([]uint32, len(info.LocalStage))
  1440  			// First, prepare the specs for the local aggregations.
  1441  			// Note the planNode first feeds the input (inputTypes)
  1442  			// into the local aggregators.
  1443  			for i, localFunc := range info.LocalStage {
  1444  				localAgg := execinfrapb.AggregatorSpec_Aggregation{
  1445  					Func:         localFunc,
  1446  					ColIdx:       e.ColIdx,
  1447  					FilterColIdx: e.FilterColIdx,
  1448  				}
  1449  
  1450  				isNewAgg := true
  1451  				for j, prevLocalAgg := range localAggs {
  1452  					if localAgg.Equals(prevLocalAgg) {
  1453  						// Found existing, equivalent local agg.
  1454  						// Map the relative index (i)
  1455  						// for the current local agg
  1456  						// to the absolute index (j) of
  1457  						// the existing local agg.
  1458  						relToAbsLocalIdx[i] = uint32(j)
  1459  						isNewAgg = false
  1460  						break
  1461  					}
  1462  				}
  1463  
  1464  				if isNewAgg {
  1465  					// Append the new local aggregation
  1466  					// and map to its index in localAggs.
  1467  					relToAbsLocalIdx[i] = uint32(len(localAggs))
  1468  					localAggs = append(localAggs, localAgg)
  1469  
  1470  					// Keep track of the new local
  1471  					// aggregation's output type.
  1472  					argTypes := make([]*types.T, len(e.ColIdx))
  1473  					for j, c := range e.ColIdx {
  1474  						argTypes[j] = inputTypes[c]
  1475  					}
  1476  					_, outputType, err := execinfrapb.GetAggregateInfo(localFunc, argTypes...)
  1477  					if err != nil {
  1478  						return err
  1479  					}
  1480  					intermediateTypes = append(intermediateTypes, outputType)
  1481  				}
  1482  			}
  1483  
  1484  			for _, finalInfo := range info.FinalStage {
  1485  				// The input of the final aggregators is
  1486  				// specified as the relative indices of the
  1487  				// local aggregation values. We need to map
  1488  				// these to the corresponding absolute indices
  1489  				// in localAggs.
  1490  				// argIdxs consists of the absolute indices
  1491  				// in localAggs.
  1492  				argIdxs := make([]uint32, len(finalInfo.LocalIdxs))
  1493  				for i, relIdx := range finalInfo.LocalIdxs {
  1494  					argIdxs[i] = relToAbsLocalIdx[relIdx]
  1495  				}
  1496  				finalAgg := execinfrapb.AggregatorSpec_Aggregation{
  1497  					Func:   finalInfo.Fn,
  1498  					ColIdx: argIdxs,
  1499  				}
  1500  
  1501  				isNewAgg := true
  1502  				for i, prevFinalAgg := range finalAggs {
  1503  					if finalAgg.Equals(prevFinalAgg) {
  1504  						// Found existing, equivalent
  1505  						// final agg.  Map the finalIdx
  1506  						// for the current final agg to
  1507  						// its index (i) in finalAggs.
  1508  						finalIdxMap[finalIdx] = uint32(i)
  1509  						isNewAgg = false
  1510  						break
  1511  					}
  1512  				}
  1513  
  1514  				// Append the final agg if there is no existing
  1515  				// equivalent.
  1516  				if isNewAgg {
  1517  					finalIdxMap[finalIdx] = uint32(len(finalAggs))
  1518  					finalAggs = append(finalAggs, finalAgg)
  1519  
  1520  					if needRender {
  1521  						argTypes := make([]*types.T, len(finalInfo.LocalIdxs))
  1522  						for i := range finalInfo.LocalIdxs {
  1523  							// Map the corresponding local
  1524  							// aggregation output types for
  1525  							// the current aggregation e.
  1526  							argTypes[i] = intermediateTypes[argIdxs[i]]
  1527  						}
  1528  						_, outputType, err := execinfrapb.GetAggregateInfo(finalInfo.Fn, argTypes...)
  1529  						if err != nil {
  1530  							return err
  1531  						}
  1532  						finalPreRenderTypes = append(finalPreRenderTypes, outputType)
  1533  					}
  1534  				}
  1535  				finalIdx++
  1536  			}
  1537  		}
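        		// As a rough example of the de-duplication above (assuming, as in
        		// DistAggregationTable, that avg is decomposed into local sum/count
        		// aggregations combined by a final rendering): for
        		//   SELECT sum(a), avg(a) FROM t
        		// the local stage computes SUM(a) and COUNT(a) once each (the SUM
        		// needed by avg is shared with sum), the final stage sums those
        		// partial results, and avg's FinalRendering divides the two totals.
        		// finalIdxMap lets both aggregations find their (possibly shared)
        		// final aggregation outputs.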
  1538  
  1539  		// In queries like SELECT min(v) FROM kv GROUP BY k, not all group columns
  1540  		// appear in the rendering. Add ANY_NOT_NULL aggregations for them, as they need to
  1541  		// be part of the output of the local stage for the final stage to know
  1542  		// about them.
  1543  		finalGroupCols := make([]uint32, len(groupCols))
  1544  		finalOrderedGroupCols := make([]uint32, 0, len(orderedGroupCols))
  1545  		for i, groupColIdx := range groupCols {
  1546  			agg := execinfrapb.AggregatorSpec_Aggregation{
  1547  				Func:   execinfrapb.AggregatorSpec_ANY_NOT_NULL,
  1548  				ColIdx: []uint32{groupColIdx},
  1549  			}
  1550  			// See if there already is an aggregation like the one
  1551  			// we want to add.
  1552  			idx := -1
  1553  			for j := range localAggs {
  1554  				if localAggs[j].Equals(agg) {
  1555  					idx = j
  1556  					break
  1557  				}
  1558  			}
  1559  			if idx == -1 {
  1560  				// Not already there, add it.
  1561  				idx = len(localAggs)
  1562  				localAggs = append(localAggs, agg)
  1563  				intermediateTypes = append(intermediateTypes, inputTypes[groupColIdx])
  1564  			}
  1565  			finalGroupCols[i] = uint32(idx)
  1566  			if orderedGroupColSet.Contains(n.groupCols[i]) {
  1567  				finalOrderedGroupCols = append(finalOrderedGroupCols, uint32(idx))
  1568  			}
  1569  		}
  1570  
  1571  		// Create the merge ordering for the local stage (this will be maintained
  1572  		// for results going into the final stage).
  1573  		ordCols := make([]execinfrapb.Ordering_Column, len(n.groupColOrdering))
  1574  		for i, o := range n.groupColOrdering {
  1575  			// Find the group column.
  1576  			found := false
  1577  			for j, col := range n.groupCols {
  1578  				if col == o.ColIdx {
  1579  					ordCols[i].ColIdx = finalGroupCols[j]
  1580  					found = true
  1581  					break
  1582  				}
  1583  			}
  1584  			if !found {
  1585  				return errors.AssertionFailedf("group column ordering contains non-grouping column %d", o.ColIdx)
  1586  			}
  1587  			if o.Direction == encoding.Descending {
  1588  				ordCols[i].Direction = execinfrapb.Ordering_Column_DESC
  1589  			} else {
  1590  				ordCols[i].Direction = execinfrapb.Ordering_Column_ASC
  1591  			}
  1592  		}
  1593  
  1594  		localAggsSpec := execinfrapb.AggregatorSpec{
  1595  			Type:             aggType,
  1596  			Aggregations:     localAggs,
  1597  			GroupCols:        groupCols,
  1598  			OrderedGroupCols: orderedGroupCols,
  1599  		}
  1600  
  1601  		p.AddNoGroupingStage(
  1602  			execinfrapb.ProcessorCoreUnion{Aggregator: &localAggsSpec},
  1603  			execinfrapb.PostProcessSpec{},
  1604  			intermediateTypes,
  1605  			execinfrapb.Ordering{Columns: ordCols},
  1606  		)
  1607  
  1608  		finalAggsSpec = execinfrapb.AggregatorSpec{
  1609  			Type:             aggType,
  1610  			Aggregations:     finalAggs,
  1611  			GroupCols:        finalGroupCols,
  1612  			OrderedGroupCols: finalOrderedGroupCols,
  1613  		}
  1614  
  1615  		if needRender {
  1616  			// Build rendering expressions.
  1617  			renderExprs := make([]execinfrapb.Expression, len(aggregations))
  1618  			h := tree.MakeTypesOnlyIndexedVarHelper(finalPreRenderTypes)
  1619  			// finalIdx is an index into finalIdxMap (an ordinal among
  1620  			// all final aggregations). It is used to find the finalAggs
  1621  			// results that correspond to each aggregation.
  1622  			finalIdx := 0
  1623  			for i, e := range aggregations {
  1624  				info := physicalplan.DistAggregationTable[e.Func]
  1625  				if info.FinalRendering == nil {
  1626  					// mappedIdx corresponds to the index
  1627  					// location of the result for this
  1628  					// final aggregation in finalAggs. This
  1629  					// is necessary since we re-use final
  1630  					// aggregations if they are equivalent
  1631  					// across and within stages.
  1632  					mappedIdx := int(finalIdxMap[finalIdx])
  1633  					var err error
  1634  					renderExprs[i], err = physicalplan.MakeExpression(
  1635  						h.IndexedVar(mappedIdx), planCtx, nil /* indexVarMap */)
  1636  					if err != nil {
  1637  						return err
  1638  					}
  1639  				} else {
  1640  					// We have multiple final aggregation
  1641  					// values that we need to be mapped to
  1642  					// their corresponding index in
  1643  					// finalAggs for FinalRendering.
  1644  					mappedIdxs := make([]int, len(info.FinalStage))
  1645  					for j := range info.FinalStage {
  1646  						mappedIdxs[j] = int(finalIdxMap[finalIdx+j])
  1647  					}
  1648  					// Map the final aggregation values
  1649  					// to their corresponding indices.
  1650  					expr, err := info.FinalRendering(&h, mappedIdxs)
  1651  					if err != nil {
  1652  						return err
  1653  					}
  1654  					renderExprs[i], err = physicalplan.MakeExpression(
  1655  						expr, planCtx,
  1656  						nil /* indexVarMap */)
  1657  					if err != nil {
  1658  						return err
  1659  					}
  1660  				}
  1661  				finalIdx += len(info.FinalStage)
  1662  			}
  1663  			finalAggsPost.RenderExprs = renderExprs
  1664  		} else if len(finalAggs) < len(aggregations) {
  1665  			// We want to ensure we map the streams properly now
  1666  			// that we've potentially reduced the number of final
  1667  			// aggregation output streams. We use finalIdxMap to
  1668  			// create a 1-1 mapping from the final aggregators to
  1669  			// their corresponding column index in the map.
  1670  			p.PlanToStreamColMap = p.PlanToStreamColMap[:0]
  1671  			for _, idx := range finalIdxMap {
  1672  				p.PlanToStreamColMap = append(p.PlanToStreamColMap, int(idx))
  1673  			}
  1674  			planToStreamMapSet = true
  1675  		}
  1676  	}
  1677  
  1678  	// Set up the final stage.
  1679  
  1680  	finalOutTypes := make([]*types.T, len(aggregations))
  1681  	for i, agg := range aggregations {
  1682  		argTypes := make([]*types.T, len(agg.ColIdx)+len(agg.Arguments))
  1683  		for j, c := range agg.ColIdx {
  1684  			argTypes[j] = inputTypes[c]
  1685  		}
  1686  		for j, argumentColumnType := range aggregationsColumnTypes[i] {
  1687  			argTypes[len(agg.ColIdx)+j] = argumentColumnType
  1688  		}
  1689  		var err error
  1690  		_, returnTyp, err := execinfrapb.GetAggregateInfo(agg.Func, argTypes...)
  1691  		if err != nil {
  1692  			return err
  1693  		}
  1694  		finalOutTypes[i] = returnTyp
  1695  	}
  1696  
  1697  	// Update p.PlanToStreamColMap; we will have a simple 1-to-1 mapping of
  1698  	// planNode columns to stream columns because the aggregator
  1699  	// has been programmed to produce the same columns as the groupNode.
  1700  	if !planToStreamMapSet {
  1701  		p.PlanToStreamColMap = identityMap(p.PlanToStreamColMap, len(aggregations))
  1702  	}
  1703  
  1704  	if len(finalAggsSpec.GroupCols) == 0 || len(p.ResultRouters) == 1 {
  1705  		// No GROUP BY, or we have a single stream. Use a single final aggregator.
  1706  		// If the previous stage was all on a single node, put the final
  1707  		// aggregator there. Otherwise, bring the results back on this node.
  1708  		node := dsp.nodeDesc.NodeID
  1709  		if prevStageNode != 0 {
  1710  			node = prevStageNode
  1711  		}
  1712  		p.AddSingleGroupStage(
  1713  			node,
  1714  			execinfrapb.ProcessorCoreUnion{Aggregator: &finalAggsSpec},
  1715  			finalAggsPost,
  1716  			finalOutTypes,
  1717  		)
  1718  	} else {
  1719  		// We distribute (by group columns) to multiple processors.
  1720  
  1721  		// Set up the output routers from the previous stage.
  1722  		for _, resultProc := range p.ResultRouters {
  1723  			p.Processors[resultProc].Spec.Output[0] = execinfrapb.OutputRouterSpec{
  1724  				Type:        execinfrapb.OutputRouterSpec_BY_HASH,
  1725  				HashColumns: finalAggsSpec.GroupCols,
  1726  			}
  1727  		}
  1728  
  1729  		stageID := p.NewStageID()
  1730  
  1731  		// We have one final stage processor for each result router. This is a
  1732  		// somewhat arbitrary decision; we could have a different number of nodes
  1733  		// working on the final stage.
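        		// Roughly, with K result routers hash-partitioning on the group
        		// columns, each of the K final aggregators added below merges the
        		// streams for one hash bucket coming from all K routers:
        		//
        		//   router_1  ...  router_K      (previous stage, BY_HASH output)
        		//        \    ...    /
        		//   finalAgg_1 ... finalAgg_K    (bucket i -> final aggregator i)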
  1734  		pIdxStart := physicalplan.ProcessorIdx(len(p.Processors))
  1735  		for _, resultProc := range p.ResultRouters {
  1736  			proc := physicalplan.Processor{
  1737  				Node: p.Processors[resultProc].Node,
  1738  				Spec: execinfrapb.ProcessorSpec{
  1739  					Input: []execinfrapb.InputSyncSpec{{
  1740  						// The other fields will be filled in by mergeResultStreams.
  1741  						ColumnTypes: p.ResultTypes,
  1742  					}},
  1743  					Core: execinfrapb.ProcessorCoreUnion{Aggregator: &finalAggsSpec},
  1744  					Post: finalAggsPost,
  1745  					Output: []execinfrapb.OutputRouterSpec{{
  1746  						Type: execinfrapb.OutputRouterSpec_PASS_THROUGH,
  1747  					}},
  1748  					StageID: stageID,
  1749  				},
  1750  			}
  1751  			p.AddProcessor(proc)
  1752  		}
  1753  
  1754  		// Connect the streams.
  1755  		for bucket := 0; bucket < len(p.ResultRouters); bucket++ {
  1756  			pIdx := pIdxStart + physicalplan.ProcessorIdx(bucket)
  1757  			p.MergeResultStreams(p.ResultRouters, bucket, p.MergeOrdering, pIdx, 0)
  1758  		}
  1759  
  1760  		// Set the new result routers.
  1761  		for i := 0; i < len(p.ResultRouters); i++ {
  1762  			p.ResultRouters[i] = pIdxStart + physicalplan.ProcessorIdx(i)
  1763  		}
  1764  
  1765  		p.ResultTypes = finalOutTypes
  1766  		p.SetMergeOrdering(dsp.convertOrdering(n.reqOrdering, p.PlanToStreamColMap))
  1767  	}
  1768  
  1769  	return nil
  1770  }
  1771  
  1772  func (dsp *DistSQLPlanner) createPlanForIndexJoin(
  1773  	planCtx *PlanningCtx, n *indexJoinNode,
  1774  ) (*PhysicalPlan, error) {
  1775  	plan, err := dsp.createPhysPlanForPlanNode(planCtx, n.input)
  1776  	if err != nil {
  1777  		return nil, err
  1778  	}
  1779  
  1780  	// In "index-join mode", the join reader assumes that the PK cols are a prefix
  1781  	// of the input stream columns (see #40749). We need a projection to make that
  1782  	// happen. The other columns are not used by the join reader.
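        	//
        	// For example (hypothetical columns): if the input stream is
        	// (v, w, k1, k2) with an identity PlanToStreamColMap and the primary
        	// key is (k1, k2), then keyCols selects stream ordinals 2 and 3 and
        	// the projection below reduces the stream to (k1, k2), the prefix
        	// the join reader expects.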
  1783  	pkCols := make([]uint32, len(n.keyCols))
  1784  	for i := range n.keyCols {
  1785  		streamColOrd := plan.PlanToStreamColMap[n.keyCols[i]]
  1786  		if streamColOrd == -1 {
  1787  			panic("key column not in planToStreamColMap")
  1788  		}
  1789  		pkCols[i] = uint32(streamColOrd)
  1790  	}
  1791  	plan.AddProjection(pkCols)
  1792  
  1793  	joinReaderSpec := execinfrapb.JoinReaderSpec{
  1794  		Table:             *n.table.desc.TableDesc(),
  1795  		IndexIdx:          0,
  1796  		Visibility:        n.table.colCfg.visibility,
  1797  		LockingStrength:   n.table.lockingStrength,
  1798  		LockingWaitPolicy: n.table.lockingWaitPolicy,
  1799  	}
  1800  
  1801  	filter, err := physicalplan.MakeExpression(
  1802  		n.table.filter, planCtx, nil /* indexVarMap */)
  1803  	if err != nil {
  1804  		return nil, err
  1805  	}
  1806  	post := execinfrapb.PostProcessSpec{
  1807  		Filter:     filter,
  1808  		Projection: true,
  1809  	}
  1810  
  1811  	// Calculate the output columns from n.cols.
  1812  	post.OutputColumns = make([]uint32, len(n.cols))
  1813  	plan.PlanToStreamColMap = identityMap(plan.PlanToStreamColMap, len(n.cols))
  1814  
  1815  	for i := range n.cols {
  1816  		ord := tableOrdinal(n.table.desc, n.cols[i].ID, n.table.colCfg.visibility)
  1817  		post.OutputColumns[i] = uint32(ord)
  1818  	}
  1819  
  1820  	types, err := getTypesForPlanResult(n, plan.PlanToStreamColMap)
  1821  	if err != nil {
  1822  		return nil, err
  1823  	}
  1824  	if len(plan.ResultRouters) > 1 {
  1825  		// Instantiate one join reader for every stream.
  1826  		plan.AddNoGroupingStage(
  1827  			execinfrapb.ProcessorCoreUnion{JoinReader: &joinReaderSpec},
  1828  			post,
  1829  			types,
  1830  			dsp.convertOrdering(n.reqOrdering, plan.PlanToStreamColMap),
  1831  		)
  1832  	} else {
  1833  		// We have a single stream, so use a single join reader on that node.
  1834  		plan.AddSingleGroupStage(
  1835  			plan.Processors[plan.ResultRouters[0]].Node,
  1836  			execinfrapb.ProcessorCoreUnion{JoinReader: &joinReaderSpec},
  1837  			post,
  1838  			types,
  1839  		)
  1840  	}
  1841  	return plan, nil
  1842  }
  1843  
  1844  // createPlanForLookupJoin creates a distributed plan for a lookupJoinNode.
  1845  // Note that this is a separate code path from the experimental path which
  1846  // converts joins to lookup joins.
  1847  func (dsp *DistSQLPlanner) createPlanForLookupJoin(
  1848  	planCtx *PlanningCtx, n *lookupJoinNode,
  1849  ) (*PhysicalPlan, error) {
  1850  	plan, err := dsp.createPhysPlanForPlanNode(planCtx, n.input)
  1851  	if err != nil {
  1852  		return nil, err
  1853  	}
  1854  
  1855  	joinReaderSpec := execinfrapb.JoinReaderSpec{
  1856  		Table:             *n.table.desc.TableDesc(),
  1857  		Type:              n.joinType,
  1858  		Visibility:        n.table.colCfg.visibility,
  1859  		LockingStrength:   n.table.lockingStrength,
  1860  		LockingWaitPolicy: n.table.lockingWaitPolicy,
  1861  		MaintainOrdering:  len(n.reqOrdering) > 0,
  1862  	}
  1863  	joinReaderSpec.IndexIdx, err = getIndexIdx(n.table.index, n.table.desc)
  1864  	if err != nil {
  1865  		return nil, err
  1866  	}
  1867  	joinReaderSpec.LookupColumns = make([]uint32, len(n.eqCols))
  1868  	for i, col := range n.eqCols {
  1869  		if plan.PlanToStreamColMap[col] == -1 {
  1870  			panic("lookup column not in planToStreamColMap")
  1871  		}
  1872  		joinReaderSpec.LookupColumns[i] = uint32(plan.PlanToStreamColMap[col])
  1873  	}
  1874  	joinReaderSpec.LookupColumnsAreKey = n.eqColsAreKey
  1875  
  1876  	// The n.table node can be configured with an arbitrary set of columns. Apply
  1877  	// the corresponding projection.
  1878  	// The internal schema of the join reader is:
  1879  	//    <input columns>... <table columns>...
  1880  	numLeftCols := len(plan.ResultTypes)
  1881  	numOutCols := numLeftCols + len(n.table.cols)
  1882  	post := execinfrapb.PostProcessSpec{Projection: true}
  1883  
  1884  	post.OutputColumns = make([]uint32, numOutCols)
  1885  	types := make([]*types.T, numOutCols)
  1886  
  1887  	for i := 0; i < numLeftCols; i++ {
  1888  		types[i] = plan.ResultTypes[i]
  1889  		post.OutputColumns[i] = uint32(i)
  1890  	}
  1891  	for i := range n.table.cols {
  1892  		types[numLeftCols+i] = n.table.cols[i].Type
  1893  		ord := tableOrdinal(n.table.desc, n.table.cols[i].ID, n.table.colCfg.visibility)
  1894  		post.OutputColumns[numLeftCols+i] = uint32(numLeftCols + ord)
  1895  	}
  1896  
  1897  	// Map the columns of the lookupJoinNode to the result streams of the
  1898  	// JoinReader.
  1899  	numInputNodeCols := len(planColumns(n.input))
  1900  	planToStreamColMap := makePlanToStreamColMap(numInputNodeCols + len(n.table.cols))
  1901  	copy(planToStreamColMap, plan.PlanToStreamColMap)
  1902  	for i := range n.table.cols {
  1903  		planToStreamColMap[numInputNodeCols+i] = numLeftCols + i
  1904  	}
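        	// For example (hypothetical shapes): with 2 input stream columns and
        	// n.table.cols selecting the table columns with ordinals {0, 2} from a
        	// 3-column table, numLeftCols is 2, post.OutputColumns is [0 1 2 4]
        	// (input columns pass through, table columns are offset by
        	// numLeftCols), and the lookupJoinNode's table columns map to stream
        	// columns 2 and 3.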
  1905  
  1906  	// Set the ON condition.
  1907  	if n.onCond != nil {
  1908  		// Note that (regardless of the join type or the OutputColumns projection)
  1909  		// the ON condition refers to the input columns with var indexes 0 to
  1910  		// numInputNodeCols-1 and to table columns with var indexes starting from
  1911  		// numInputNodeCols.
  1912  		indexVarMap := makePlanToStreamColMap(numInputNodeCols + len(n.table.cols))
  1913  		copy(indexVarMap, plan.PlanToStreamColMap)
  1914  		for i := range n.table.cols {
  1915  			indexVarMap[numInputNodeCols+i] = int(post.OutputColumns[numLeftCols+i])
  1916  		}
  1917  		var err error
  1918  		joinReaderSpec.OnExpr, err = physicalplan.MakeExpression(
  1919  			n.onCond, planCtx, indexVarMap,
  1920  		)
  1921  		if err != nil {
  1922  			return nil, err
  1923  		}
  1924  	}
  1925  
  1926  	if n.joinType == sqlbase.LeftSemiJoin || n.joinType == sqlbase.LeftAntiJoin {
  1927  		// For anti/semi join, we only produce the input columns.
  1928  		planToStreamColMap = planToStreamColMap[:numInputNodeCols]
  1929  		post.OutputColumns = post.OutputColumns[:numInputNodeCols]
  1930  		types = types[:numInputNodeCols]
  1931  	}
  1932  
  1933  	// Instantiate one join reader for every stream.
  1934  	plan.AddNoGroupingStage(
  1935  		execinfrapb.ProcessorCoreUnion{JoinReader: &joinReaderSpec},
  1936  		post,
  1937  		types,
  1938  		dsp.convertOrdering(planReqOrdering(n), planToStreamColMap),
  1939  	)
  1940  	plan.PlanToStreamColMap = planToStreamColMap
  1941  	return plan, nil
  1942  }
  1943  
  1944  // createPlanForZigzagJoin creates a distributed plan for a zigzagJoinNode.
  1945  func (dsp *DistSQLPlanner) createPlanForZigzagJoin(
  1946  	planCtx *PlanningCtx, n *zigzagJoinNode,
  1947  ) (plan *PhysicalPlan, err error) {
  1948  	plan = &PhysicalPlan{}
  1949  
  1950  	tables := make([]sqlbase.TableDescriptor, len(n.sides))
  1951  	indexOrdinals := make([]uint32, len(n.sides))
  1952  	cols := make([]execinfrapb.Columns, len(n.sides))
  1953  	numStreamCols := 0
  1954  	for i, side := range n.sides {
  1955  		tables[i] = *side.scan.desc.TableDesc()
  1956  		indexOrdinals[i], err = getIndexIdx(side.scan.index, side.scan.desc)
  1957  		if err != nil {
  1958  			return nil, err
  1959  		}
  1960  
  1961  		cols[i].Columns = make([]uint32, len(side.eqCols))
  1962  		for j, col := range side.eqCols {
  1963  			cols[i].Columns[j] = uint32(col)
  1964  		}
  1965  
  1966  		numStreamCols += len(side.scan.desc.Columns)
  1967  	}
  1968  
  1969  	// The zigzag join node only represents inner joins, so hardcode Type to
  1970  	// InnerJoin.
  1971  	zigzagJoinerSpec := execinfrapb.ZigzagJoinerSpec{
  1972  		Tables:        tables,
  1973  		IndexOrdinals: indexOrdinals,
  1974  		EqColumns:     cols,
  1975  		Type:          sqlbase.InnerJoin,
  1976  	}
  1977  	zigzagJoinerSpec.FixedValues = make([]*execinfrapb.ValuesCoreSpec, len(n.sides))
  1978  
  1979  	// The fixed values are represented as a Values node with one tuple.
  1980  	for i := range n.sides {
  1981  		valuesPlan, err := dsp.createPlanForValues(planCtx, n.sides[i].fixedVals)
  1982  		if err != nil {
  1983  			return nil, err
  1984  		}
  1985  		zigzagJoinerSpec.FixedValues[i] = valuesPlan.PhysicalPlan.Processors[0].Spec.Core.Values
  1986  	}
  1987  
  1988  	// The internal schema of the zigzag joiner is:
  1989  	//    <side 1 table columns> ... <side 2 table columns> ...
  1990  	// with only the columns in the specified index populated.
  1991  	//
  1992  	// The schema of the zigzagJoinNode is:
  1993  	//    <side 1 index columns> ... <side 2 index columns> ...
  1994  	// so the planToStreamColMap has to basically map index ordinals
  1995  	// to table ordinals.
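        	//
        	// For example (hypothetical tables): if side 1's table has 3 columns
        	// and its scanNode reads the column with ordinal 1, while side 2's
        	// table has 2 columns and its scanNode reads the column with ordinal
        	// 0, then colOffset for side 2 is 3 and post.OutputColumns below ends
        	// up as [1 3], with planToStreamColMap being the identity [0 1].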
  1996  	post := execinfrapb.PostProcessSpec{Projection: true}
  1997  	numOutCols := len(n.columns)
  1998  
  1999  	post.OutputColumns = make([]uint32, numOutCols)
  2000  	types := make([]*types.T, numOutCols)
  2001  	planToStreamColMap := makePlanToStreamColMap(numOutCols)
  2002  	colOffset := 0
  2003  	i := 0
  2004  
  2005  	// Populate post.OutputColumns (the implicit projection), result types,
  2006  	// and the planToStreamColMap for index columns from all sides.
  2007  	for _, side := range n.sides {
  2008  		// Note that the side's scanNode only contains the columns from that
  2009  		// index that are also in n.columns. This is because we generated
  2010  		// colCfg.wantedColumns for only the necessary columns in
  2011  		// opt/exec/execbuilder/relational_builder.go, similar to lookup joins.
  2012  		for colIdx := range side.scan.cols {
  2013  			ord := tableOrdinal(side.scan.desc, side.scan.cols[colIdx].ID, side.scan.colCfg.visibility)
  2014  			post.OutputColumns[i] = uint32(colOffset + ord)
  2015  			types[i] = side.scan.cols[colIdx].Type
  2016  			planToStreamColMap[i] = i
  2017  
  2018  			i++
  2019  		}
  2020  
  2021  		colOffset += len(side.scan.desc.Columns)
  2022  	}
  2023  
  2024  	// Figure out the node where this zigzag joiner goes.
  2025  	//
  2026  	// TODO(itsbilal): Add support for restricting the Zigzag joiner
  2027  	// to a certain set of spans (similar to the InterleavedReaderJoiner)
  2028  	// on one side. Once that's done, we can split this processor across
  2029  	// multiple nodes here. Until then, schedule on the current node.
  2030  	nodeID := dsp.nodeDesc.NodeID
  2031  
  2032  	stageID := plan.NewStageID()
  2033  	// Set the ON condition.
  2034  	if n.onCond != nil {
  2035  		// Note that the ON condition refers to the *internal* columns of the
  2036  		// processor (before the OutputColumns projection).
  2037  		indexVarMap := makePlanToStreamColMap(len(n.columns))
  2038  		for i := range n.columns {
  2039  			indexVarMap[i] = int(post.OutputColumns[i])
  2040  		}
  2041  		zigzagJoinerSpec.OnExpr, err = physicalplan.MakeExpression(
  2042  			n.onCond, planCtx, indexVarMap,
  2043  		)
  2044  		if err != nil {
  2045  			return nil, err
  2046  		}
  2047  	}
  2048  
  2049  	// Build the PhysicalPlan.
  2050  	proc := physicalplan.Processor{
  2051  		Node: nodeID,
  2052  		Spec: execinfrapb.ProcessorSpec{
  2053  			Core:    execinfrapb.ProcessorCoreUnion{ZigzagJoiner: &zigzagJoinerSpec},
  2054  			Post:    post,
  2055  			Output:  []execinfrapb.OutputRouterSpec{{Type: execinfrapb.OutputRouterSpec_PASS_THROUGH}},
  2056  			StageID: stageID,
  2057  		},
  2058  	}
  2059  
  2060  	plan.Processors = append(plan.Processors, proc)
  2061  
  2062  	// Each result router corresponds to one of the processors we appended.
  2063  	plan.ResultRouters = []physicalplan.ProcessorIdx{physicalplan.ProcessorIdx(0)}
  2064  
  2065  	plan.PlanToStreamColMap = planToStreamColMap
  2066  	plan.ResultTypes = types
  2067  
  2068  	return plan, nil
  2069  }
  2070  
  2071  // getTypesForPlanResult returns the types of the elements in the result streams
  2072  // of a plan that corresponds to a given planNode. If planToStreamColMap is nil,
  2073  // a 1-1 mapping is assumed.
  2074  func getTypesForPlanResult(node planNode, planToStreamColMap []int) ([]*types.T, error) {
  2075  	nodeColumns := planColumns(node)
  2076  	if planToStreamColMap == nil {
  2077  		// No remapping.
  2078  		types := make([]*types.T, len(nodeColumns))
  2079  		for i := range nodeColumns {
  2080  			types[i] = nodeColumns[i].Typ
  2081  		}
  2082  		return types, nil
  2083  	}
  2084  	numCols := 0
  2085  	for _, streamCol := range planToStreamColMap {
  2086  		if numCols <= streamCol {
  2087  			numCols = streamCol + 1
  2088  		}
  2089  	}
  2090  	types := make([]*types.T, numCols)
  2091  	for nodeCol, streamCol := range planToStreamColMap {
  2092  		if streamCol != -1 {
  2093  			types[streamCol] = nodeColumns[nodeCol].Typ
  2094  		}
  2095  	}
  2096  	return types, nil
  2097  }
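
        // For example (hypothetical mapping): with planToStreamColMap
        // []int{1, 0, -1}, getTypesForPlanResult returns two types: stream
        // column 0 gets the type of plan column 1, stream column 1 gets the
        // type of plan column 0, and plan column 2 (mapped to -1) is not part
        // of any result stream.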
  2098  
  2099  func (dsp *DistSQLPlanner) createPlanForJoin(
  2100  	planCtx *PlanningCtx, n *joinNode,
  2101  ) (*PhysicalPlan, error) {
  2102  	// See if we can create an interleave join plan.
  2103  	if planInterleavedJoins.Get(&dsp.st.SV) {
  2104  		plan, ok, err := dsp.tryCreatePlanForInterleavedJoin(planCtx, n)
  2105  		if err != nil {
  2106  			return nil, err
  2107  		}
  2108  		// An interleave join plan could be used. Return it.
  2109  		if ok {
  2110  			return plan, nil
  2111  		}
  2112  	}
  2113  
  2114  	// Outline of the planning process for joins:
  2115  	//
  2116  	//  - We create PhysicalPlans for the left and right side. Each plan has a set
  2117  	//    of output routers with results that will serve as input for the join.
  2118  	//
  2119  	//  - We merge the list of processors and streams into a single plan. We keep
  2120  	//    track of the output routers for the left and right results.
  2121  	//
  2122  	//  - We add a set of joiner processors (say K of them).
  2123  	//
  2124  	//  - We configure the left and right output routers to send results to
  2125  	//    these joiners, distributing rows by hash (on the join equality columns).
  2126  	//    We are thus breaking up all input rows into K buckets such that rows
  2127  	//    that match on the equality columns end up in the same bucket. If there
  2128  	//    are no equality columns, we cannot distribute rows so we use a single
  2129  	//    joiner.
  2130  	//
  2131  	//  - The routers of the joiner processors are the result routers of the plan.
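        	//
        	// A rough sketch of the resulting topology (K joiners, hash-partitioned
        	// on the equality columns; K=1 when there are no equality columns):
        	//
        	//    left routers      right routers
        	//          \                /
        	//       (hash by equality columns)
        	//          /    |   ...    \
        	//     joiner_1 joiner_2 ... joiner_K   <- new result routers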
  2132  
  2133  	leftPlan, err := dsp.createPhysPlanForPlanNode(planCtx, n.left.plan)
  2134  	if err != nil {
  2135  		return nil, err
  2136  	}
  2137  	rightPlan, err := dsp.createPhysPlanForPlanNode(planCtx, n.right.plan)
  2138  	if err != nil {
  2139  		return nil, err
  2140  	}
  2141  
  2142  	// Nodes where we will run the join processors.
  2143  	var nodes []roachpb.NodeID
  2144  
  2145  	// We initialize these properties of the joiner. They will then be used to
  2146  	// fill in the processor spec. See descriptions for HashJoinerSpec.
  2147  	var leftEqCols, rightEqCols []uint32
  2148  	var leftMergeOrd, rightMergeOrd execinfrapb.Ordering
  2149  	joinType := n.joinType
  2150  
  2151  	// Figure out the left and right types.
  2152  	leftTypes := leftPlan.ResultTypes
  2153  	rightTypes := rightPlan.ResultTypes
  2154  
  2155  	// Set up the equality columns.
  2156  	if numEq := len(n.pred.leftEqualityIndices); numEq != 0 {
  2157  		leftEqCols = eqCols(n.pred.leftEqualityIndices, leftPlan.PlanToStreamColMap)
  2158  		rightEqCols = eqCols(n.pred.rightEqualityIndices, rightPlan.PlanToStreamColMap)
  2159  	}
  2160  
  2161  	var p PhysicalPlan
  2162  	var leftRouters, rightRouters []physicalplan.ProcessorIdx
  2163  	p.PhysicalPlan, leftRouters, rightRouters = physicalplan.MergePlans(
  2164  		&leftPlan.PhysicalPlan, &rightPlan.PhysicalPlan,
  2165  	)
  2166  
  2167  	// Set up the output columns.
  2168  	if numEq := len(n.pred.leftEqualityIndices); numEq != 0 {
  2169  		nodes = findJoinProcessorNodes(leftRouters, rightRouters, p.Processors)
  2170  
  2171  		if len(n.mergeJoinOrdering) > 0 {
  2172  			// TODO(radu): we currently only use merge joins when we have an ordering on
  2173  			// all equality columns. We should relax this by either:
  2174  			//  - implementing a hybrid hash/merge processor which implements merge
  2175  			//    logic on the columns we have an ordering on, and within each merge
  2176  			//    group uses a hashmap on the remaining columns
  2177  			//  - or: adding a sort processor to complete the order
  2178  			if len(n.mergeJoinOrdering) == len(n.pred.leftEqualityIndices) {
  2179  				// Excellent! We can use the merge joiner.
  2180  				leftMergeOrd = distsqlOrdering(n.mergeJoinOrdering, leftEqCols)
  2181  				rightMergeOrd = distsqlOrdering(n.mergeJoinOrdering, rightEqCols)
  2182  			}
  2183  		}
  2184  	} else {
  2185  		// Without column equality, we cannot distribute the join. Run a
  2186  		// single processor.
  2187  		nodes = []roachpb.NodeID{dsp.nodeDesc.NodeID}
  2188  
  2189  		// If either side has a single stream, put the processor on that node. We
  2190  		// prefer the left side because that is processed first by the hash joiner.
  2191  		if len(leftRouters) == 1 {
  2192  			nodes[0] = p.Processors[leftRouters[0]].Node
  2193  		} else if len(rightRouters) == 1 {
  2194  			nodes[0] = p.Processors[rightRouters[0]].Node
  2195  		}
  2196  	}
  2197  
  2198  	rightMap := rightPlan.PlanToStreamColMap
  2199  	post, joinToStreamColMap := joinOutColumns(n, leftPlan.PlanToStreamColMap, rightMap)
  2200  	onExpr, err := remapOnExpr(planCtx, n, leftPlan.PlanToStreamColMap, rightMap)
  2201  	if err != nil {
  2202  		return nil, err
  2203  	}
  2204  
  2205  	// Create the Core spec.
  2206  	var core execinfrapb.ProcessorCoreUnion
  2207  	if leftMergeOrd.Columns == nil {
  2208  		core.HashJoiner = &execinfrapb.HashJoinerSpec{
  2209  			LeftEqColumns:        leftEqCols,
  2210  			RightEqColumns:       rightEqCols,
  2211  			OnExpr:               onExpr,
  2212  			Type:                 joinType,
  2213  			LeftEqColumnsAreKey:  n.pred.leftEqKey,
  2214  			RightEqColumnsAreKey: n.pred.rightEqKey,
  2215  		}
  2216  	} else {
  2217  		core.MergeJoiner = &execinfrapb.MergeJoinerSpec{
  2218  			LeftOrdering:         leftMergeOrd,
  2219  			RightOrdering:        rightMergeOrd,
  2220  			OnExpr:               onExpr,
  2221  			Type:                 joinType,
  2222  			LeftEqColumnsAreKey:  n.pred.leftEqKey,
  2223  			RightEqColumnsAreKey: n.pred.rightEqKey,
  2224  		}
  2225  	}
  2226  
  2227  	p.AddJoinStage(
  2228  		nodes, core, post, leftEqCols, rightEqCols, leftTypes, rightTypes,
  2229  		leftMergeOrd, rightMergeOrd, leftRouters, rightRouters,
  2230  	)
  2231  
  2232  	p.PlanToStreamColMap = joinToStreamColMap
  2233  	p.ResultTypes, err = getTypesForPlanResult(n, joinToStreamColMap)
  2234  	if err != nil {
  2235  		return nil, err
  2236  	}
  2237  
  2238  	// Joiners may guarantee an ordering to outputs, so we ensure that
  2239  	// ordering is propagated through the input synchronizer of the next stage.
  2240  	// We can propagate the ordering from either side; we use the left side here.
  2241  	// Note that n.props only has a non-empty ordering for inner joins, where it
  2242  	// uses the mergeJoinOrdering.
  2243  	p.SetMergeOrdering(dsp.convertOrdering(n.reqOrdering, p.PlanToStreamColMap))
  2244  	return &p, nil
  2245  }
  2246  
  2247  func (dsp *DistSQLPlanner) createPhysPlan(
  2248  	planCtx *PlanningCtx, plan planMaybePhysical,
  2249  ) (physPlan *PhysicalPlan, err error) {
  2250  	if plan.isPhysicalPlan() {
  2251  		return plan.physPlan, nil
  2252  	}
  2253  	return dsp.createPhysPlanForPlanNode(planCtx, plan.planNode)
  2254  }
  2255  
  2256  func (dsp *DistSQLPlanner) createPhysPlanForPlanNode(
  2257  	planCtx *PlanningCtx, node planNode,
  2258  ) (plan *PhysicalPlan, err error) {
  2259  	planCtx.planDepth++
  2260  
  2261  	switch n := node.(type) {
  2262  	// Keep these cases alphabetized, please!
  2263  	case *distinctNode:
  2264  		plan, err = dsp.createPlanForDistinct(planCtx, n)
  2265  
  2266  	case *exportNode:
  2267  		plan, err = dsp.createPlanForExport(planCtx, n)
  2268  
  2269  	case *filterNode:
  2270  		plan, err = dsp.createPhysPlanForPlanNode(planCtx, n.source.plan)
  2271  		if err != nil {
  2272  			return nil, err
  2273  		}
  2274  
  2275  		if err := plan.AddFilter(n.filter, planCtx, plan.PlanToStreamColMap); err != nil {
  2276  			return nil, err
  2277  		}
  2278  
  2279  	case *groupNode:
  2280  		plan, err = dsp.createPhysPlanForPlanNode(planCtx, n.plan)
  2281  		if err != nil {
  2282  			return nil, err
  2283  		}
  2284  
  2285  		if err := dsp.addAggregators(planCtx, plan, n); err != nil {
  2286  			return nil, err
  2287  		}
  2288  
  2289  	case *indexJoinNode:
  2290  		plan, err = dsp.createPlanForIndexJoin(planCtx, n)
  2291  
  2292  	case *joinNode:
  2293  		plan, err = dsp.createPlanForJoin(planCtx, n)
  2294  
  2295  	case *limitNode:
  2296  		plan, err = dsp.createPhysPlanForPlanNode(planCtx, n.plan)
  2297  		if err != nil {
  2298  			return nil, err
  2299  		}
  2300  		if err := n.evalLimit(planCtx.EvalContext()); err != nil {
  2301  			return nil, err
  2302  		}
  2303  		if err := plan.AddLimit(n.count, n.offset, planCtx, dsp.nodeDesc.NodeID); err != nil {
  2304  			return nil, err
  2305  		}
  2306  
  2307  	case *lookupJoinNode:
  2308  		plan, err = dsp.createPlanForLookupJoin(planCtx, n)
  2309  
  2310  	case *ordinalityNode:
  2311  		plan, err = dsp.createPlanForOrdinality(planCtx, n)
  2312  
  2313  	case *projectSetNode:
  2314  		plan, err = dsp.createPlanForProjectSet(planCtx, n)
  2315  
  2316  	case *renderNode:
  2317  		plan, err = dsp.createPhysPlanForPlanNode(planCtx, n.source.plan)
  2318  		if err != nil {
  2319  			return nil, err
  2320  		}
  2321  		err = dsp.selectRenders(plan, n, planCtx)
  2322  		if err != nil {
  2323  			return nil, err
  2324  		}
  2325  
  2326  	case *scanNode:
  2327  		plan, err = dsp.createTableReaders(planCtx, n)
  2328  
  2329  	case *sortNode:
  2330  		plan, err = dsp.createPhysPlanForPlanNode(planCtx, n.plan)
  2331  		if err != nil {
  2332  			return nil, err
  2333  		}
  2334  
  2335  		dsp.addSorters(plan, n)
  2336  
  2337  	case *unaryNode:
  2338  		plan, err = dsp.createPlanForUnary(planCtx, n)
  2339  
  2340  	case *unionNode:
  2341  		plan, err = dsp.createPlanForSetOp(planCtx, n)
  2342  
  2343  	case *valuesNode:
  2344  		// Just like in checkSupportForPlanNode, if a valuesNode wasn't specified in
  2345  		// the query, it means that it was autogenerated for things that we don't
  2346  		// want to be distributing, like populating values from a virtual table. So,
  2347  		// we wrap the plan instead.
  2348  		//
  2349  		// If the plan is local, we also wrap the plan to avoid pointless
  2350  		// serialization of the values, and also to avoid situations in which
  2351  		// expressions within the valuesNode were not distributable in the first
  2352  		// place.
  2353  		//
  2354  		// Finally, if noEvalSubqueries is set, it means that nothing has replaced
  2355  		// the subqueries with their results yet, which again means that we can't
  2356  		// plan a DistSQL values node, which requires that all expressions be
  2357  		// evaluatable.
  2358  		//
  2359  		// NB: If you change this conditional, you must also change it in
  2360  		// checkSupportForPlanNode!
  2361  		if !n.specifiedInQuery || planCtx.isLocal || planCtx.noEvalSubqueries {
  2362  			plan, err = dsp.wrapPlan(planCtx, n)
  2363  		} else {
  2364  			plan, err = dsp.createPlanForValues(planCtx, n)
  2365  		}
  2366  
  2367  	case *windowNode:
  2368  		plan, err = dsp.createPlanForWindow(planCtx, n)
  2369  
  2370  	case *zeroNode:
  2371  		plan, err = dsp.createPlanForZero(planCtx, n)
  2372  
  2373  	case *zigzagJoinNode:
  2374  		plan, err = dsp.createPlanForZigzagJoin(planCtx, n)
  2375  
  2376  	default:
  2377  		// Can't handle a node? We wrap it and continue on our way.
  2378  		plan, err = dsp.wrapPlan(planCtx, n)
  2379  	}
  2380  
  2381  	if err != nil {
  2382  		return plan, err
  2383  	}
  2384  
  2385  	if dsp.shouldPlanTestMetadata() {
  2386  		if err := plan.CheckLastStagePost(); err != nil {
  2387  			log.Fatalf(planCtx.ctx, "%v", err)
  2388  		}
  2389  		plan.AddNoGroupingStageWithCoreFunc(
  2390  			func(_ int, _ *physicalplan.Processor) execinfrapb.ProcessorCoreUnion {
  2391  				return execinfrapb.ProcessorCoreUnion{
  2392  					MetadataTestSender: &execinfrapb.MetadataTestSenderSpec{
  2393  						ID: uuid.MakeV4().String(),
  2394  					},
  2395  				}
  2396  			},
  2397  			execinfrapb.PostProcessSpec{},
  2398  			plan.ResultTypes,
  2399  			plan.MergeOrdering,
  2400  		)
  2401  	}
  2402  
  2403  	return plan, err
  2404  }
  2405  
  2406  // wrapPlan produces a DistSQL processor for an arbitrary planNode. This is
  2407  // invoked when a particular planNode can't be distributed for some reason. It
  2408  // will create a planNodeToRowSource wrapper for the sub-tree that's not
  2409  // plannable by DistSQL. If that sub-tree has DistSQL-plannable sources, they
  2410  // will be planned by DistSQL and connected to the wrapper.
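        //
        // For example (hypothetical plan shape): for a tree like
        //
        //	someUnsupportedNode
        //	        |
        //	     scanNode
        //
        // the scanNode subtree is planned by DistSQL as usual, and its result
        // stream becomes the single input of the planNodeToRowSource wrapper
        // that runs someUnsupportedNode on the gateway node.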
  2411  func (dsp *DistSQLPlanner) wrapPlan(planCtx *PlanningCtx, n planNode) (*PhysicalPlan, error) {
  2412  	useFastPath := planCtx.planDepth == 1 && planCtx.stmtType == tree.RowsAffected
  2413  
  2414  	// First, we search the planNode tree we're trying to wrap for the first
  2415  	// DistSQL-enabled planNode in the tree. If we find one, we ask the planner to
  2416  	// continue the DistSQL planning recursion on that planNode.
  2417  	seenTop := false
  2418  	nParents := uint32(0)
  2419  	p := &PhysicalPlan{}
  2420  	// This will be set to the first DistSQL-enabled planNode we find, if any. We'll
  2421  	// modify its parent later to connect its source to the DistSQL-planned
  2422  	// subtree.
  2423  	var firstNotWrapped planNode
  2424  	if err := walkPlan(planCtx.ctx, n, planObserver{
  2425  		enterNode: func(ctx context.Context, nodeName string, plan planNode) (bool, error) {
  2426  			switch plan.(type) {
  2427  			case *explainDistSQLNode, *explainPlanNode, *explainVecNode:
  2428  				// Don't continue recursing into explain nodes - they need to be left
  2429  				// alone since they handle their own planning later.
  2430  				return false, nil
  2431  			}
  2432  			if !seenTop {
  2433  				// We know we're wrapping the first node, so ignore it.
  2434  				seenTop = true
  2435  				return true, nil
  2436  			}
  2437  			var err error
  2438  			// Continue walking until we find a node that has a DistSQL
  2439  			// representation - that's when we'll quit the wrapping process and hand
  2440  			// control of planning back to the DistSQL physical planner.
  2441  			if !dsp.mustWrapNode(planCtx, plan) {
  2442  				firstNotWrapped = plan
  2443  				p, err = dsp.createPhysPlanForPlanNode(planCtx, plan)
  2444  				if err != nil {
  2445  					return false, err
  2446  				}
  2447  				nParents++
  2448  				return false, nil
  2449  			}
  2450  			return true, nil
  2451  		},
  2452  	}); err != nil {
  2453  		return nil, err
  2454  	}
  2455  	if nParents > 1 {
  2456  		return nil, errors.Errorf("can't wrap plan %v %T with more than one input", n, n)
  2457  	}
  2458  
  2459  	// Copy the evalCtx.
  2460  	evalCtx := *planCtx.ExtendedEvalCtx
  2461  	// We permit the planNodeToRowSource to trigger the wrapped planNode's fast
  2462  	// path if it's the very first node in the flow, and if the statement type we're
  2463  	// expecting is in fact RowsAffected. RowsAffected statements return a single
  2464  	// row with the number of rows affected by the statement, and are the only
  2465  	// types of statement where it's valid to invoke a plan's fast path.
  2466  	wrapper, err := makePlanNodeToRowSource(n,
  2467  		runParams{
  2468  			extendedEvalCtx: &evalCtx,
  2469  			p:               planCtx.planner,
  2470  		},
  2471  		useFastPath,
  2472  	)
  2473  	if err != nil {
  2474  		return nil, err
  2475  	}
  2476  	wrapper.firstNotWrapped = firstNotWrapped
  2477  
  2478  	idx := uint32(len(p.LocalProcessors))
  2479  	p.LocalProcessors = append(p.LocalProcessors, wrapper)
  2480  	p.LocalProcessorIndexes = append(p.LocalProcessorIndexes, &idx)
  2481  	var input []execinfrapb.InputSyncSpec
  2482  	if firstNotWrapped != nil {
  2483  		// We found a DistSQL-plannable subtree - create an input spec for it.
  2484  		input = []execinfrapb.InputSyncSpec{{
  2485  			Type:        execinfrapb.InputSyncSpec_UNORDERED,
  2486  			ColumnTypes: p.ResultTypes,
  2487  		}}
  2488  	}
  2489  	name := nodeName(n)
  2490  	proc := physicalplan.Processor{
  2491  		Node: dsp.nodeDesc.NodeID,
  2492  		Spec: execinfrapb.ProcessorSpec{
  2493  			Input: input,
  2494  			Core: execinfrapb.ProcessorCoreUnion{LocalPlanNode: &execinfrapb.LocalPlanNodeSpec{
  2495  				RowSourceIdx: &idx,
  2496  				NumInputs:    &nParents,
  2497  				Name:         &name,
  2498  			}},
  2499  			Post: execinfrapb.PostProcessSpec{},
  2500  			Output: []execinfrapb.OutputRouterSpec{{
  2501  				Type: execinfrapb.OutputRouterSpec_PASS_THROUGH,
  2502  			}},
  2503  			StageID: p.NewStageID(),
  2504  		},
  2505  	}
  2506  	pIdx := p.AddProcessor(proc)
  2507  	p.ResultTypes = wrapper.outputTypes
  2508  	p.PlanToStreamColMap = identityMapInPlace(make([]int, len(p.ResultTypes)))
  2509  	if firstNotWrapped != nil {
  2510  		// If we found a DistSQL-plannable subtree, we need to add a result stream
  2511  		// between it and the physicalPlan we're creating here.
  2512  		p.MergeResultStreams(p.ResultRouters, 0, p.MergeOrdering, pIdx, 0)
  2513  	}
  2514  	// ResultRouters gets overwritten each time we add a new PhysicalPlan. We will
  2515  	// just have a single result router, since local processors aren't
  2516  	// distributed, so make sure that p.ResultRouters has at least 1 slot and
  2517  	// write the new processor index there.
  2518  	if cap(p.ResultRouters) < 1 {
  2519  		p.ResultRouters = make([]physicalplan.ProcessorIdx, 1)
  2520  	} else {
  2521  		p.ResultRouters = p.ResultRouters[:1]
  2522  	}
  2523  	p.ResultRouters[0] = pIdx
  2524  	return p, nil
  2525  }
  2526  
  2527  // createValuesPlan creates a plan with a single Values processor
  2528  // located on the gateway node and initialized with given numRows
  2529  // and rawBytes that need to be precomputed beforehand.
  2530  func (dsp *DistSQLPlanner) createValuesPlan(
  2531  	resultTypes []*types.T, numRows int, rawBytes [][]byte,
  2532  ) (*PhysicalPlan, error) {
  2533  	numColumns := len(resultTypes)
  2534  	s := execinfrapb.ValuesCoreSpec{
  2535  		Columns: make([]execinfrapb.DatumInfo, numColumns),
  2536  	}
  2537  
  2538  	for i, t := range resultTypes {
  2539  		s.Columns[i].Encoding = sqlbase.DatumEncoding_VALUE
  2540  		s.Columns[i].Type = t
  2541  	}
  2542  
  2543  	s.NumRows = uint64(numRows)
  2544  	s.RawBytes = rawBytes
  2545  
  2546  	plan := physicalplan.PhysicalPlan{
  2547  		Processors: []physicalplan.Processor{{
  2548  			// TODO: find a better node to place processor at
  2549  			Node: dsp.nodeDesc.NodeID,
  2550  			Spec: execinfrapb.ProcessorSpec{
  2551  				Core:   execinfrapb.ProcessorCoreUnion{Values: &s},
  2552  				Output: []execinfrapb.OutputRouterSpec{{Type: execinfrapb.OutputRouterSpec_PASS_THROUGH}},
  2553  			},
  2554  		}},
  2555  		ResultRouters: []physicalplan.ProcessorIdx{0},
  2556  		ResultTypes:   resultTypes,
  2557  	}
  2558  
  2559  	return &PhysicalPlan{
  2560  		PhysicalPlan:       plan,
  2561  		PlanToStreamColMap: identityMapInPlace(make([]int, numColumns)),
  2562  	}, nil
  2563  }
  2564  
  2565  func (dsp *DistSQLPlanner) createPlanForValues(
  2566  	planCtx *PlanningCtx, n *valuesNode,
  2567  ) (*PhysicalPlan, error) {
  2568  	params := runParams{
  2569  		ctx:             planCtx.ctx,
  2570  		extendedEvalCtx: planCtx.ExtendedEvalCtx,
  2571  	}
  2572  
  2573  	types, err := getTypesForPlanResult(n, nil /* planToStreamColMap */)
  2574  	if err != nil {
  2575  		return nil, err
  2576  	}
  2577  
  2578  	if err := n.startExec(params); err != nil {
  2579  		return nil, err
  2580  	}
  2581  	defer n.Close(planCtx.ctx)
  2582  
  2583  	var a sqlbase.DatumAlloc
  2584  
  2585  	numRows := n.rows.Len()
  2586  	rawBytes := make([][]byte, numRows)
  2587  	for i := 0; i < numRows; i++ {
  2588  		if next, err := n.Next(runParams{ctx: planCtx.ctx}); !next {
  2589  			return nil, err
  2590  		}
  2591  
  2592  		var buf []byte
  2593  		datums := n.Values()
  2594  		for j := range n.columns {
  2595  			var err error
  2596  			datum := sqlbase.DatumToEncDatum(types[j], datums[j])
  2597  			buf, err = datum.Encode(types[j], &a, sqlbase.DatumEncoding_VALUE, buf)
  2598  			if err != nil {
  2599  				return nil, err
  2600  			}
  2601  		}
  2602  		rawBytes[i] = buf
  2603  	}
  2604  
  2605  	return dsp.createValuesPlan(types, numRows, rawBytes)
  2606  }
  2607  
  2608  func (dsp *DistSQLPlanner) createPlanForUnary(
  2609  	planCtx *PlanningCtx, n *unaryNode,
  2610  ) (*PhysicalPlan, error) {
  2611  	types, err := getTypesForPlanResult(n, nil /* planToStreamColMap */)
  2612  	if err != nil {
  2613  		return nil, err
  2614  	}
  2615  
  2616  	return dsp.createValuesPlan(types, 1 /* numRows */, nil /* rawBytes */)
  2617  }
  2618  
  2619  func (dsp *DistSQLPlanner) createPlanForZero(
  2620  	planCtx *PlanningCtx, n *zeroNode,
  2621  ) (*PhysicalPlan, error) {
  2622  	types, err := getTypesForPlanResult(n, nil /* planToStreamColMap */)
  2623  	if err != nil {
  2624  		return nil, err
  2625  	}
  2626  
  2627  	return dsp.createValuesPlan(types, 0 /* numRows */, nil /* rawBytes */)
  2628  }
  2629  
  2630  func createDistinctSpec(n *distinctNode, cols []int) *execinfrapb.DistinctSpec {
  2631  	var orderedColumns []uint32
  2632  	if !n.columnsInOrder.Empty() {
  2633  		orderedColumns = make([]uint32, 0, n.columnsInOrder.Len())
  2634  		for i, ok := n.columnsInOrder.Next(0); ok; i, ok = n.columnsInOrder.Next(i + 1) {
  2635  			orderedColumns = append(orderedColumns, uint32(cols[i]))
  2636  		}
  2637  	}
  2638  
  2639  	var distinctColumns []uint32
  2640  	if !n.distinctOnColIdxs.Empty() {
  2641  		for planCol, streamCol := range cols {
  2642  			if streamCol != -1 && n.distinctOnColIdxs.Contains(planCol) {
  2643  				distinctColumns = append(distinctColumns, uint32(streamCol))
  2644  			}
  2645  		}
  2646  	} else {
  2647  		// If no distinct columns were specified, run distinct on the entire row.
  2648  		for _, streamCol := range cols {
  2649  			if streamCol != -1 {
  2650  				distinctColumns = append(distinctColumns, uint32(streamCol))
  2651  			}
  2652  		}
  2653  	}
  2654  
  2655  	return &execinfrapb.DistinctSpec{
  2656  		OrderedColumns:   orderedColumns,
  2657  		DistinctColumns:  distinctColumns,
  2658  		NullsAreDistinct: n.nullsAreDistinct,
  2659  		ErrorOnDup:       n.errorOnDup,
  2660  	}
  2661  }
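
        // For example (hypothetical inputs): with cols = []int{0, 1, 2},
        // columnsInOrder = {0}, and distinctOnColIdxs = {0, 1}, the spec above
        // gets OrderedColumns = [0] and DistinctColumns = [0 1]; with an empty
        // distinctOnColIdxs, DistinctColumns covers every mapped stream column.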
  2662  
  2663  func (dsp *DistSQLPlanner) createPlanForDistinct(
  2664  	planCtx *PlanningCtx, n *distinctNode,
  2665  ) (*PhysicalPlan, error) {
  2666  	plan, err := dsp.createPhysPlanForPlanNode(planCtx, n.plan)
  2667  	if err != nil {
  2668  		return nil, err
  2669  	}
  2670  	currentResultRouters := plan.ResultRouters
  2671  
  2672  	distinctSpec := execinfrapb.ProcessorCoreUnion{
  2673  		Distinct: createDistinctSpec(n, plan.PlanToStreamColMap),
  2674  	}
  2675  
  2676  	if len(currentResultRouters) == 1 {
  2677  		plan.AddNoGroupingStage(distinctSpec, execinfrapb.PostProcessSpec{}, plan.ResultTypes, plan.MergeOrdering)
  2678  		return plan, nil
  2679  	}
  2680  
  2681  	// TODO(arjun): This is potentially memory inefficient if we don't have any sorted columns.
  2682  
  2683  	// Add distinct processors local to each existing current result processor.
  2684  	plan.AddNoGroupingStage(distinctSpec, execinfrapb.PostProcessSpec{}, plan.ResultTypes, plan.MergeOrdering)
  2685  
  2686  	// TODO(arjun): We could distribute this final stage by hash.
  2687  	plan.AddSingleGroupStage(dsp.nodeDesc.NodeID, distinctSpec, execinfrapb.PostProcessSpec{}, plan.ResultTypes)
  2688  
  2689  	return plan, nil
  2690  }
  2691  
  2692  func (dsp *DistSQLPlanner) createPlanForOrdinality(
  2693  	planCtx *PlanningCtx, n *ordinalityNode,
  2694  ) (*PhysicalPlan, error) {
  2695  	plan, err := dsp.createPhysPlanForPlanNode(planCtx, n.source)
  2696  	if err != nil {
  2697  		return nil, err
  2698  	}
  2699  
  2700  	ordinalitySpec := execinfrapb.ProcessorCoreUnion{
  2701  		Ordinality: &execinfrapb.OrdinalitySpec{},
  2702  	}
  2703  
  2704  	plan.PlanToStreamColMap = append(plan.PlanToStreamColMap, len(plan.ResultTypes))
  2705  	outputTypes := append(plan.ResultTypes, types.Int)
  2706  
  2707  	// WITH ORDINALITY never gets distributed so that the gateway node can
  2708  	// always number each row in order.
  2709  	plan.AddSingleGroupStage(dsp.nodeDesc.NodeID, ordinalitySpec, execinfrapb.PostProcessSpec{}, outputTypes)
  2710  
  2711  	return plan, nil
  2712  }
  2713  
  2714  func createProjectSetSpec(
  2715  	planCtx *PlanningCtx, n *projectSetNode, indexVarMap []int,
  2716  ) (*execinfrapb.ProjectSetSpec, error) {
  2717  	spec := execinfrapb.ProjectSetSpec{
  2718  		Exprs:            make([]execinfrapb.Expression, len(n.exprs)),
  2719  		GeneratedColumns: make([]*types.T, len(n.columns)-n.numColsInSource),
  2720  		NumColsPerGen:    make([]uint32, len(n.exprs)),
  2721  	}
  2722  	for i, expr := range n.exprs {
  2723  		var err error
  2724  		spec.Exprs[i], err = physicalplan.MakeExpression(expr, planCtx, indexVarMap)
  2725  		if err != nil {
  2726  			return nil, err
  2727  		}
  2728  	}
  2729  	for i, col := range n.columns[n.numColsInSource:] {
  2730  		spec.GeneratedColumns[i] = col.Typ
  2731  	}
  2732  	for i, n := range n.numColsPerGen {
  2733  		spec.NumColsPerGen[i] = uint32(n)
  2734  	}
  2735  	return &spec, nil
  2736  }
  2737  
  2738  func (dsp *DistSQLPlanner) createPlanForProjectSet(
  2739  	planCtx *PlanningCtx, n *projectSetNode,
  2740  ) (*PhysicalPlan, error) {
  2741  	plan, err := dsp.createPhysPlanForPlanNode(planCtx, n.source)
  2742  	if err != nil {
  2743  		return nil, err
  2744  	}
  2745  	numResults := len(plan.ResultTypes)
  2746  
  2747  	indexVarMap := makePlanToStreamColMap(len(n.columns))
  2748  	copy(indexVarMap, plan.PlanToStreamColMap)
  2749  
  2750  	// Create the project set processor spec.
  2751  	projectSetSpec, err := createProjectSetSpec(planCtx, n, indexVarMap)
  2752  	if err != nil {
  2753  		return nil, err
  2754  	}
  2755  	spec := execinfrapb.ProcessorCoreUnion{
  2756  		ProjectSet: projectSetSpec,
  2757  	}
  2758  
  2759  	// Since ProjectSet tends to be a late stage which produces more rows than its
  2760  	// source, we opt to perform it only on the gateway node. If we encounter
  2761  	// cases in the future where this is non-optimal (perhaps if its output is
  2762  	// filtered), we could try to detect these cases and use AddNoGroupingStage
  2763  	// instead.
  2764  	outputTypes := append(plan.ResultTypes, projectSetSpec.GeneratedColumns...)
  2765  	plan.AddSingleGroupStage(dsp.nodeDesc.NodeID, spec, execinfrapb.PostProcessSpec{}, outputTypes)
  2766  
  2767  	// Add generated columns to PlanToStreamColMap.
  2768  	for i := range projectSetSpec.GeneratedColumns {
  2769  		plan.PlanToStreamColMap = append(plan.PlanToStreamColMap, numResults+i)
  2770  	}
  2771  
  2772  	return plan, nil
  2773  }
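
        // For illustration, a set-generating expression in the render list such as
        //
        //   SELECT x, generate_series(1, 3) FROM t
        //
        // emits three output rows for every input row, which is the sense in
        // which ProjectSet tends to produce more rows than its source.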
  2774  
  2775  // isOnlyOnGateway returns true if a physical plan is executed entirely on the
  2776  // gateway node.
  2777  func (dsp *DistSQLPlanner) isOnlyOnGateway(plan *PhysicalPlan) bool {
  2778  	if len(plan.ResultRouters) == 1 {
  2779  		processorIdx := plan.ResultRouters[0]
  2780  		if plan.Processors[processorIdx].Node == dsp.nodeDesc.NodeID {
  2781  			return true
  2782  		}
  2783  	}
  2784  	return false
  2785  }
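
        // A minimal usage sketch, mirroring the pattern in createPlanForSetOp
        // below (plan and distinctSpec stand in for values built by the caller):
        // an extra local stage is only worth adding when the plan is not already
        // confined to the gateway.
        //
        //	if !dsp.isOnlyOnGateway(plan) {
        //		plan.AddNoGroupingStage(
        //			distinctSpec, execinfrapb.PostProcessSpec{}, plan.ResultTypes, plan.MergeOrdering)
        //	}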
  2786  
  2787  // TODO(abhimadan): Refactor this function to reduce the UNION vs
  2788  // EXCEPT/INTERSECT and DISTINCT vs ALL branching.
  2789  //
  2790  // createPlanForSetOp creates a physical plan for "set operations". UNION plans
  2791  // are created by merging the left and right plans together, and INTERSECT and
  2792  // EXCEPT plans are created by performing a special type of join on the left and
  2793  // right sides. In the UNION DISTINCT case, a distinct stage is placed after the
  2794  // plans are merged, and in the INTERSECT/EXCEPT DISTINCT cases, distinct stages
  2795  // are added as the inputs of the join stage. In all DISTINCT cases, an
  2796  // additional distinct stage is placed at the end of the left and right plans if
  2797  // there are multiple nodes involved in the query, to reduce the amount of
  2798  // unnecessary network I/O.
  2799  //
  2800  // Examples (single node):
  2801  // - Query: ( VALUES (1), (2), (2) ) UNION ( VALUES (2), (3) )
  2802  //   Plan:
  2803  //   VALUES        VALUES
  2804  //     |             |
  2805  //      -------------
  2806  //            |
  2807  //        DISTINCT
  2808  //
  2809  // - Query: ( VALUES (1), (2), (2) ) INTERSECT ALL ( VALUES (2), (3) )
  2810  //   Plan:
  2811  //   VALUES        VALUES
  2812  //     |             |
  2813  //      -------------
  2814  //            |
  2815  //          JOIN
  2816  //
  2817  // - Query: ( VALUES (1), (2), (2) ) EXCEPT ( VALUES (2), (3) )
  2818  //   Plan:
  2819  //   VALUES        VALUES
  2820  //     |             |
  2821  //  DISTINCT       DISTINCT
  2822  //     |             |
  2823  //      -------------
  2824  //            |
  2825  //          JOIN
  2826  func (dsp *DistSQLPlanner) createPlanForSetOp(
  2827  	planCtx *PlanningCtx, n *unionNode,
  2828  ) (*PhysicalPlan, error) {
  2829  	leftLogicalPlan := n.left
  2830  	leftPlan, err := dsp.createPhysPlanForPlanNode(planCtx, n.left)
  2831  	if err != nil {
  2832  		return nil, err
  2833  	}
  2834  	rightLogicalPlan := n.right
  2835  	rightPlan, err := dsp.createPhysPlanForPlanNode(planCtx, n.right)
  2836  	if err != nil {
  2837  		return nil, err
  2838  	}
  2839  	if n.inverted {
  2840  		leftPlan, rightPlan = rightPlan, leftPlan
  2841  		leftLogicalPlan, rightLogicalPlan = rightLogicalPlan, leftLogicalPlan
  2842  	}
  2843  	childPhysicalPlans := []*PhysicalPlan{leftPlan, rightPlan}
  2844  
  2845  	// Check that the left and right side PlanToStreamColMaps are equivalent.
  2846  	// TODO(solon): Are there any valid UNION/INTERSECT/EXCEPT cases where these
  2847  	// differ? If we encounter any, we could handle them by adding a projection on
  2848  	// the unioned columns on each side, similar to how we handle mismatched
  2849  	// ResultTypes.
  2850  	if !reflect.DeepEqual(leftPlan.PlanToStreamColMap, rightPlan.PlanToStreamColMap) {
  2851  		return nil, errors.Errorf(
  2852  			"planToStreamColMap mismatch: %v, %v", leftPlan.PlanToStreamColMap,
  2853  			rightPlan.PlanToStreamColMap)
  2854  	}
  2855  	planToStreamColMap := leftPlan.PlanToStreamColMap
  2856  	streamCols := make([]uint32, 0, len(planToStreamColMap))
  2857  	for _, streamCol := range planToStreamColMap {
  2858  		if streamCol < 0 {
  2859  			continue
  2860  		}
  2861  		streamCols = append(streamCols, uint32(streamCol))
  2862  	}
  2863  
  2864  	var distinctSpecs [2]execinfrapb.ProcessorCoreUnion
  2865  
  2866  	if !n.all {
  2867  		var distinctOrds [2]execinfrapb.Ordering
  2868  		distinctOrds[0] = execinfrapb.ConvertToMappedSpecOrdering(
  2869  			planReqOrdering(leftLogicalPlan), leftPlan.PlanToStreamColMap,
  2870  		)
  2871  		distinctOrds[1] = execinfrapb.ConvertToMappedSpecOrdering(
  2872  			planReqOrdering(rightLogicalPlan), rightPlan.PlanToStreamColMap,
  2873  		)
  2874  
  2875  		// Build distinct processor specs for the left and right child plans.
  2876  		//
  2877  		// Note there is the potential for further network I/O optimization here
  2878  		// in the UNION case, since rows are not deduplicated between left and right
  2879  		// until the single group stage. In the worst case (total duplication),
  2880  		// this causes twice as much data as necessary to be streamed.
  2881  		for side, plan := range childPhysicalPlans {
  2882  			sortCols := make([]uint32, len(distinctOrds[side].Columns))
  2883  			for i, ord := range distinctOrds[side].Columns {
  2884  				sortCols[i] = ord.ColIdx
  2885  			}
  2886  			distinctSpec := &distinctSpecs[side]
  2887  			distinctSpec.Distinct = &execinfrapb.DistinctSpec{
  2888  				DistinctColumns: streamCols,
  2889  				OrderedColumns:  sortCols,
  2890  			}
  2891  			if !dsp.isOnlyOnGateway(plan) {
  2892  				// TODO(solon): We could skip this stage if there is a strong key on
  2893  				// the result columns.
  2894  				plan.AddNoGroupingStage(
  2895  					*distinctSpec, execinfrapb.PostProcessSpec{}, plan.ResultTypes, distinctOrds[side])
  2896  				plan.AddProjection(streamCols)
  2897  			}
  2898  		}
  2899  	}
  2900  
  2901  	var p PhysicalPlan
  2902  
  2903  	// Merge the plans' PlanToStreamColMaps, which we know are equivalent.
  2904  	p.PlanToStreamColMap = planToStreamColMap
  2905  
  2906  	// Merge the plans' result types and merge ordering.
  2907  	resultTypes, err := physicalplan.MergeResultTypes(leftPlan.ResultTypes, rightPlan.ResultTypes)
  2908  	if err != nil {
  2909  		return nil, err
  2910  	}
  2911  
  2912  	if len(leftPlan.MergeOrdering.Columns) != 0 || len(rightPlan.MergeOrdering.Columns) != 0 {
  2913  		return nil, errors.AssertionFailedf("set op inputs should have no orderings")
  2914  	}
  2915  
  2916  	// TODO(radu): for INTERSECT and EXCEPT, the mergeOrdering should be set when
  2917  	// we can use merge joiners below. The optimizer needs to be modified to take
  2918  	// advantage of this optimization and pass down merge orderings. Tracked by
  2919  	// #40797.
  2920  	var mergeOrdering execinfrapb.Ordering
  2921  
  2922  	// Merge processors, streams, result routers, and stage counter.
  2923  	var leftRouters, rightRouters []physicalplan.ProcessorIdx
  2924  	p.PhysicalPlan, leftRouters, rightRouters = physicalplan.MergePlans(
  2925  		&leftPlan.PhysicalPlan, &rightPlan.PhysicalPlan)
  2926  
  2927  	if n.unionType == tree.UnionOp {
  2928  		// We just need to append the left and right streams together, so append
  2929  		// the left and right output routers.
  2930  		p.ResultRouters = append(leftRouters, rightRouters...)
  2931  
  2932  		p.ResultTypes = resultTypes
  2933  		p.SetMergeOrdering(mergeOrdering)
  2934  
  2935  		if !n.all {
  2936  			// TODO(abhimadan): use columns from mergeOrdering to fill in the
  2937  			// OrderingColumns field in DistinctSpec once the unused columns
  2938  			// are projected out.
  2939  			distinctSpec := execinfrapb.ProcessorCoreUnion{
  2940  				Distinct: &execinfrapb.DistinctSpec{DistinctColumns: streamCols},
  2941  			}
  2942  			p.AddSingleGroupStage(
  2943  				dsp.nodeDesc.NodeID, distinctSpec, execinfrapb.PostProcessSpec{}, p.ResultTypes)
  2944  		} else {
  2945  			// With UNION ALL, we can end up with multiple streams on the same node.
  2946  			// We don't want to have unnecessary routers and cross-node streams, so
  2947  			// merge these streams now.
  2948  			//
  2949  			// More importantly, we need to guarantee that if everything is planned
  2950  			// on a single node (which is always the case when there are mutations),
  2951  			// we can fuse everything so there are no concurrent KV operations (see
  2952  			// #40487, #41307).
  2953  			p.EnsureSingleStreamPerNode()
  2954  
  2955  			// UNION ALL is special: it doesn't have any required downstream
  2956  			// processor, so its two inputs might have different post-processing,
  2957  			// which would violate an assumption further down the line. Check for
  2958  			// this condition and add a no-op stage if necessary.
  2959  			if err := p.CheckLastStagePost(); err != nil {
  2960  				p.AddSingleGroupStage(
  2961  					dsp.nodeDesc.NodeID,
  2962  					execinfrapb.ProcessorCoreUnion{Noop: &execinfrapb.NoopCoreSpec{}},
  2963  					execinfrapb.PostProcessSpec{},
  2964  					p.ResultTypes,
  2965  				)
  2966  			}
  2967  		}
  2968  	} else {
  2969  		// We plan INTERSECT and EXCEPT queries with joiners. Get the appropriate
  2970  		// join type.
  2971  		joinType := distsqlSetOpJoinType(n.unionType)
  2972  
  2973  		// Nodes where we will run the join processors.
  2974  		nodes := findJoinProcessorNodes(leftRouters, rightRouters, p.Processors)
  2975  
  2976  		// Set up the equality columns.
  2977  		eqCols := streamCols
  2978  
  2979  		// Project the left-side columns only.
  2980  		post := execinfrapb.PostProcessSpec{Projection: true}
  2981  		post.OutputColumns = make([]uint32, len(streamCols))
  2982  		copy(post.OutputColumns, streamCols)
  2983  
  2984  		// Create the Core spec.
  2985  		//
  2986  		// TODO(radu): we currently only use merge joins when we have an ordering on
  2987  		// all equality columns. We should relax this by either:
  2988  		//  - implementing a hybrid hash/merge processor which implements merge
  2989  		//    logic on the columns we have an ordering on, and within each merge
  2990  		//    group uses a hashmap on the remaining columns
  2991  		//  - or: adding a sort processor to complete the order
  2992  		var core execinfrapb.ProcessorCoreUnion
  2993  		if len(mergeOrdering.Columns) < len(streamCols) {
  2994  			core.HashJoiner = &execinfrapb.HashJoinerSpec{
  2995  				LeftEqColumns:  eqCols,
  2996  				RightEqColumns: eqCols,
  2997  				Type:           joinType,
  2998  			}
  2999  		} else {
  3000  			core.MergeJoiner = &execinfrapb.MergeJoinerSpec{
  3001  				LeftOrdering:  mergeOrdering,
  3002  				RightOrdering: mergeOrdering,
  3003  				Type:          joinType,
  3004  				NullEquality:  true,
  3005  			}
  3006  		}
  3007  
  3008  		if n.all {
  3009  			p.AddJoinStage(
  3010  				nodes, core, post, eqCols, eqCols,
  3011  				leftPlan.ResultTypes, rightPlan.ResultTypes,
  3012  				leftPlan.MergeOrdering, rightPlan.MergeOrdering,
  3013  				leftRouters, rightRouters,
  3014  			)
  3015  		} else {
  3016  			p.AddDistinctSetOpStage(
  3017  				nodes, core, distinctSpecs[:], post, eqCols,
  3018  				leftPlan.ResultTypes, rightPlan.ResultTypes,
  3019  				leftPlan.MergeOrdering, rightPlan.MergeOrdering,
  3020  				leftRouters, rightRouters,
  3021  			)
  3022  		}
  3023  
  3024  		// An EXCEPT ALL is like a left outer join, so there is no guaranteed ordering.
  3025  		if n.unionType == tree.ExceptOp {
  3026  			mergeOrdering = execinfrapb.Ordering{}
  3027  		}
  3028  
  3029  		p.ResultTypes = resultTypes
  3030  		p.SetMergeOrdering(mergeOrdering)
  3031  	}
  3032  
  3033  	return &p, nil
  3034  }
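
        // A rough sketch of the multi-node UNION DISTINCT case described in the
        // comment above, assuming each input spans two nodes: per-node DISTINCT
        // stages are added to both children before their streams are merged, and
        // a final DISTINCT on the gateway deduplicates across the two sides.
        //
        //   left(n1)   left(n2)   right(n1)   right(n2)
        //      |          |           |           |
        //   DISTINCT   DISTINCT    DISTINCT    DISTINCT
        //      |          |           |           |
        //       -----------------------------------
        //                       |
        //                DISTINCT (gateway)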
  3035  
  3036  // createPlanForWindow creates a physical plan for computing window functions.
  3037  // We add a new stage of windower processors for each different partitioning
  3038  // scheme found in the query's window functions.
  3039  func (dsp *DistSQLPlanner) createPlanForWindow(
  3040  	planCtx *PlanningCtx, n *windowNode,
  3041  ) (*PhysicalPlan, error) {
  3042  	plan, err := dsp.createPhysPlanForPlanNode(planCtx, n.plan)
  3043  	if err != nil {
  3044  		return nil, err
  3045  	}
  3046  
  3047  	numWindowFuncProcessed := 0
  3048  	windowPlanState := createWindowPlanState(n, planCtx, plan)
  3049  	// Each iteration of this loop adds a new stage of windowers. The steps taken:
  3050  	// 1. find a set of unprocessed window functions that have the same PARTITION BY
  3051  	//    clause. All of these will be computed using a single stage of windowers.
  3052  	// 2. a) populate the output types of the current stage of windowers. All input
  3053  	//       columns are passed through, and the windower appends an output
  3054  	//       column for each window function processed in this stage.
  3055  	//    b) create specs for all window functions in the set.
  3056  	// 3. decide whether to put the windowers on a single node or on multiple nodes.
  3057  	//    If we put them on multiple nodes, we put one onto every node that
  3058  	//    participated in the previous stage, and we leverage hash routers to
  3059  	//    partition the data based on the PARTITION BY clause of the window
  3060  	//    functions in the set.
  3061  	for numWindowFuncProcessed < len(n.funcs) {
  3062  		samePartitionFuncs, partitionIdxs := windowPlanState.findUnprocessedWindowFnsWithSamePartition()
  3063  		numWindowFuncProcessed += len(samePartitionFuncs)
  3064  		windowerSpec := execinfrapb.WindowerSpec{
  3065  			PartitionBy: partitionIdxs,
  3066  			WindowFns:   make([]execinfrapb.WindowerSpec_WindowFn, len(samePartitionFuncs)),
  3067  		}
  3068  
  3069  		newResultTypes := make([]*types.T, len(plan.ResultTypes)+len(samePartitionFuncs))
  3070  		copy(newResultTypes, plan.ResultTypes)
  3071  		for windowFnSpecIdx, windowFn := range samePartitionFuncs {
  3072  			windowFnSpec, outputType, err := windowPlanState.createWindowFnSpec(windowFn)
  3073  			if err != nil {
  3074  				return nil, err
  3075  			}
  3076  			newResultTypes[windowFn.outputColIdx] = outputType
  3077  			windowerSpec.WindowFns[windowFnSpecIdx] = windowFnSpec
  3078  		}
  3079  
  3080  		// Check if the previous stage is all on one node.
  3081  		prevStageNode := plan.Processors[plan.ResultRouters[0]].Node
  3082  		for i := 1; i < len(plan.ResultRouters); i++ {
  3083  			if n := plan.Processors[plan.ResultRouters[i]].Node; n != prevStageNode {
  3084  				prevStageNode = 0
  3085  				break
  3086  			}
  3087  		}
  3088  
  3089  		// Get all nodes from the previous stage.
  3090  		nodes := getNodesOfRouters(plan.ResultRouters, plan.Processors)
  3091  		if len(partitionIdxs) == 0 || len(nodes) == 1 {
  3092  			// No PARTITION BY or we have a single node. Use a single windower.
  3093  			// If the previous stage was all on a single node, put the windower
  3094  			// there. Otherwise, bring the results back to this node.
  3095  			node := dsp.nodeDesc.NodeID
  3096  			if len(nodes) == 1 {
  3097  				node = nodes[0]
  3098  			}
  3099  			plan.AddSingleGroupStage(
  3100  				node,
  3101  				execinfrapb.ProcessorCoreUnion{Windower: &windowerSpec},
  3102  				execinfrapb.PostProcessSpec{},
  3103  				newResultTypes,
  3104  			)
  3105  		} else {
  3106  			// Set up the output routers from the previous stage.
  3107  			// We use hash routers that hash on the columns from
  3108  			// the PARTITION BY clause of the window functions
  3109  			// we're processing in the current stage.
  3110  			for _, resultProc := range plan.ResultRouters {
  3111  				plan.Processors[resultProc].Spec.Output[0] = execinfrapb.OutputRouterSpec{
  3112  					Type:        execinfrapb.OutputRouterSpec_BY_HASH,
  3113  					HashColumns: partitionIdxs,
  3114  				}
  3115  			}
  3116  			stageID := plan.NewStageID()
  3117  
  3118  			// We put a windower on each node and connect it
  3119  			// to all hash routers from the previous stage in
  3120  			// such a way that each node has its designated
  3121  			// SourceRouterSlot - namely, the position at which
  3122  			// the node appears in nodes.
  3123  			prevStageRouters := plan.ResultRouters
  3124  			plan.ResultRouters = make([]physicalplan.ProcessorIdx, 0, len(nodes))
  3125  			for bucket, nodeID := range nodes {
  3126  				proc := physicalplan.Processor{
  3127  					Node: nodeID,
  3128  					Spec: execinfrapb.ProcessorSpec{
  3129  						Input: []execinfrapb.InputSyncSpec{{
  3130  							Type:        execinfrapb.InputSyncSpec_UNORDERED,
  3131  							ColumnTypes: plan.ResultTypes,
  3132  						}},
  3133  						Core: execinfrapb.ProcessorCoreUnion{Windower: &windowerSpec},
  3134  						Post: execinfrapb.PostProcessSpec{},
  3135  						Output: []execinfrapb.OutputRouterSpec{{
  3136  							Type: execinfrapb.OutputRouterSpec_PASS_THROUGH,
  3137  						}},
  3138  						StageID: stageID,
  3139  					},
  3140  				}
  3141  				pIdx := plan.AddProcessor(proc)
  3142  
  3143  				for _, router := range prevStageRouters {
  3144  					plan.Streams = append(plan.Streams, physicalplan.Stream{
  3145  						SourceProcessor:  router,
  3146  						SourceRouterSlot: bucket,
  3147  						DestProcessor:    pIdx,
  3148  						DestInput:        0,
  3149  					})
  3150  				}
  3151  				plan.ResultRouters = append(plan.ResultRouters, pIdx)
  3152  			}
  3153  
  3154  			plan.ResultTypes = newResultTypes
  3155  		}
  3156  	}
  3157  
  3158  	// We have added columns throughout all the stages of windowers, so we
  3159  	// need to update PlanToStreamColMap. The map must be updated before adding
  3160  	// the rendering or projection because it is used there.
  3161  	plan.PlanToStreamColMap = identityMap(plan.PlanToStreamColMap, len(plan.ResultTypes))
  3162  
  3163  	// Windowers currently do not guarantee maintaining the order, so we
  3164  	// reset MergeOrdering. There shouldn't be an ordering here, but we reset it
  3165  	// defensively (see #35179).
  3166  	plan.SetMergeOrdering(execinfrapb.Ordering{})
  3167  
  3168  	// After all window functions are computed, we need to add rendering or
  3169  	// projection.
  3170  	if err := windowPlanState.addRenderingOrProjection(); err != nil {
  3171  		return nil, err
  3172  	}
  3173  
  3174  	if len(plan.ResultTypes) != len(plan.PlanToStreamColMap) {
  3175  		// We added/removed columns while rendering or projecting, so we need to
  3176  		// update PlanToStreamColMap.
  3177  		plan.PlanToStreamColMap = identityMap(plan.PlanToStreamColMap, len(plan.ResultTypes))
  3178  	}
  3179  
  3180  	return plan, nil
  3181  }
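
        // A rough sketch of the streams built by the multi-node branch above,
        // assuming the previous stage ran on two nodes so that nodes = [n1, n2]:
        // every hash router feeds each windower through the bucket matching that
        // windower's position in nodes.
        //
        //   router@n1, slot 0 -> windower@n1
        //   router@n1, slot 1 -> windower@n2
        //   router@n2, slot 0 -> windower@n1
        //   router@n2, slot 1 -> windower@n2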
  3182  
  3183  // createPlanForExport creates a physical plan for EXPORT.
  3184  // We add a new stage of CSVWriter processors to the input plan.
  3185  func (dsp *DistSQLPlanner) createPlanForExport(
  3186  	planCtx *PlanningCtx, n *exportNode,
  3187  ) (*PhysicalPlan, error) {
  3188  	plan, err := dsp.createPhysPlanForPlanNode(planCtx, n.source)
  3189  	if err != nil {
  3190  		return nil, err
  3191  	}
  3192  
  3193  	core := execinfrapb.ProcessorCoreUnion{CSVWriter: &execinfrapb.CSVWriterSpec{
  3194  		Destination:      n.fileName,
  3195  		NamePattern:      exportFilePatternDefault,
  3196  		Options:          n.csvOpts,
  3197  		ChunkRows:        int64(n.chunkSize),
  3198  		CompressionCodec: n.fileCompression,
  3199  	}}
  3200  
  3201  	resTypes := make([]*types.T, len(sqlbase.ExportColumns))
  3202  	for i := range sqlbase.ExportColumns {
  3203  		resTypes[i] = sqlbase.ExportColumns[i].Typ
  3204  	}
  3205  	plan.AddNoGroupingStage(
  3206  		core, execinfrapb.PostProcessSpec{}, resTypes, execinfrapb.Ordering{},
  3207  	)
  3208  
  3209  	// The CSVWriter produces the same columns as the EXPORT statement.
  3210  	plan.PlanToStreamColMap = identityMap(plan.PlanToStreamColMap, len(sqlbase.ExportColumns))
  3211  	return plan, nil
  3212  }
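
        // For illustration, a statement such as
        //
        //   EXPORT INTO CSV 'nodelocal://1/out' FROM SELECT * FROM t
        //
        // is handled by the stage above: since the CSVWriter core is added with
        // AddNoGroupingStage, a writer runs next to each existing result
        // processor, so every participating node writes its own chunked files.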
  3213  
  3214  // NewPlanningCtx returns a new PlanningCtx. When distribute is false, a
  3215  // lightweight version of the PlanningCtx is returned that can be used when
  3216  // the caller knows plans will only be run on one node.
  3217  func (dsp *DistSQLPlanner) NewPlanningCtx(
  3218  	ctx context.Context, evalCtx *extendedEvalContext, txn *kv.Txn, distribute bool,
  3219  ) *PlanningCtx {
  3220  	planCtx := &PlanningCtx{
  3221  		ctx:             ctx,
  3222  		ExtendedEvalCtx: evalCtx,
  3223  		isLocal:         !distribute,
  3224  	}
  3225  	if !distribute {
  3226  		return planCtx
  3227  	}
  3228  	planCtx.spanIter = dsp.spanResolver.NewSpanResolverIterator(txn)
  3229  	planCtx.NodeStatuses = make(map[roachpb.NodeID]NodeStatus)
  3230  	planCtx.NodeStatuses[dsp.nodeDesc.NodeID] = NodeOK
  3231  	return planCtx
  3232  }
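
        // A minimal usage sketch (ctx, evalCtx, and txn supplied by a
        // hypothetical caller): a local-only context skips the span resolver
        // iterator and the node-status map entirely, while a distributed one
        // starts out knowing only that the gateway node is healthy.
        //
        //	localCtx := dsp.NewPlanningCtx(ctx, evalCtx, txn, false /* distribute */)
        //	distCtx := dsp.NewPlanningCtx(ctx, evalCtx, txn, true /* distribute */)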
  3233  
  3234  // FinalizePlan adds a final "result" stage if necessary and populates the
  3235  // endpoints of the plan.
  3236  func (dsp *DistSQLPlanner) FinalizePlan(planCtx *PlanningCtx, plan *PhysicalPlan) {
  3237  	// Find all MetadataTestSenders in the plan, so that the MetadataTestReceiver
  3238  	// knows how many sender IDs it should expect.
  3239  	var metadataSenders []string
  3240  	for _, proc := range plan.Processors {
  3241  		if proc.Spec.Core.MetadataTestSender != nil {
  3242  			metadataSenders = append(metadataSenders, proc.Spec.Core.MetadataTestSender.ID)
  3243  		}
  3244  	}
  3245  	thisNodeID := dsp.nodeDesc.NodeID
  3246  	// If we don't already have a single result router on this node, add a final
  3247  	// stage.
  3248  	if len(plan.ResultRouters) != 1 ||
  3249  		plan.Processors[plan.ResultRouters[0]].Node != thisNodeID {
  3250  		plan.AddSingleGroupStage(
  3251  			thisNodeID,
  3252  			execinfrapb.ProcessorCoreUnion{Noop: &execinfrapb.NoopCoreSpec{}},
  3253  			execinfrapb.PostProcessSpec{},
  3254  			plan.ResultTypes,
  3255  		)
  3256  		if len(plan.ResultRouters) != 1 {
  3257  			panic(fmt.Sprintf("%d results after single group stage", len(plan.ResultRouters)))
  3258  		}
  3259  	}
  3260  
  3261  	if len(metadataSenders) > 0 {
  3262  		plan.AddSingleGroupStage(
  3263  			thisNodeID,
  3264  			execinfrapb.ProcessorCoreUnion{
  3265  				MetadataTestReceiver: &execinfrapb.MetadataTestReceiverSpec{
  3266  					SenderIDs: metadataSenders,
  3267  				},
  3268  			},
  3269  			execinfrapb.PostProcessSpec{},
  3270  			plan.ResultTypes,
  3271  		)
  3272  	}
  3273  
  3274  	// Set up the endpoints for plan.Streams.
  3275  	plan.PopulateEndpoints()
  3276  
  3277  	// Set up the endpoint for the final result.
  3278  	finalOut := &plan.Processors[plan.ResultRouters[0]].Spec.Output[0]
  3279  	finalOut.Streams = append(finalOut.Streams, execinfrapb.StreamEndpointSpec{
  3280  		Type: execinfrapb.StreamEndpointSpec_SYNC_RESPONSE,
  3281  	})
  3282  
  3283  	// Assign processor IDs.
  3284  	for i := range plan.Processors {
  3285  		plan.Processors[i].Spec.ProcessorID = int32(i)
  3286  	}
  3287  }
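
        // A minimal sketch of how physical planning and finalization fit
        // together (node stands in for an already-built planNode):
        //
        //	planCtx := dsp.NewPlanningCtx(ctx, evalCtx, txn, true /* distribute */)
        //	plan, err := dsp.createPhysPlanForPlanNode(planCtx, node)
        //	if err != nil {
        //		return err
        //	}
        //	dsp.FinalizePlan(planCtx, plan)
        //
        // After FinalizePlan, the plan has a single result router on the gateway
        // whose output includes a SYNC_RESPONSE stream endpoint, and every
        // processor has been assigned a ProcessorID.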
  3288  
  3289  func makeTableReaderSpans(spans roachpb.Spans) []execinfrapb.TableReaderSpan {
  3290  	trSpans := make([]execinfrapb.TableReaderSpan, len(spans))
  3291  	for i, span := range spans {
  3292  		trSpans[i].Span = span
  3293  	}
  3294  
  3295  	return trSpans
  3296  }