github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/distsql_running.go

     1  // Copyright 2016 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package sql
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"math"
    17  	"sync"
    18  	"sync/atomic"
    19  
    20  	"github.com/cockroachdb/cockroach/pkg/kv"
    21  	"github.com/cockroachdb/cockroach/pkg/kv/kvclient/kvcoord"
    22  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    23  	"github.com/cockroachdb/cockroach/pkg/rpc"
    24  	"github.com/cockroachdb/cockroach/pkg/rpc/nodedialer"
    25  	"github.com/cockroachdb/cockroach/pkg/server/telemetry"
    26  	"github.com/cockroachdb/cockroach/pkg/sql/colflow"
    27  	"github.com/cockroachdb/cockroach/pkg/sql/distsql"
    28  	"github.com/cockroachdb/cockroach/pkg/sql/execinfra"
    29  	"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
    30  	"github.com/cockroachdb/cockroach/pkg/sql/flowinfra"
    31  	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
    32  	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
    33  	"github.com/cockroachdb/cockroach/pkg/sql/physicalplan"
    34  	"github.com/cockroachdb/cockroach/pkg/sql/rowcontainer"
    35  	"github.com/cockroachdb/cockroach/pkg/sql/rowexec"
    36  	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
    37  	"github.com/cockroachdb/cockroach/pkg/sql/sessiondata"
    38  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    39  	"github.com/cockroachdb/cockroach/pkg/sql/sqltelemetry"
    40  	"github.com/cockroachdb/cockroach/pkg/sql/types"
    41  	"github.com/cockroachdb/cockroach/pkg/util/errorutil"
    42  	"github.com/cockroachdb/cockroach/pkg/util/errorutil/unimplemented"
    43  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    44  	"github.com/cockroachdb/cockroach/pkg/util/log"
    45  	"github.com/cockroachdb/cockroach/pkg/util/mon"
    46  	"github.com/cockroachdb/cockroach/pkg/util/tracing"
    47  	"github.com/cockroachdb/cockroach/pkg/util/uuid"
    48  	"github.com/cockroachdb/errors"
    49  	opentracing "github.com/opentracing/opentracing-go"
    50  )
    51  
    52  // To allow queries to send out flow RPCs in parallel, we use a pool of workers
    53  // that can issue the RPCs on behalf of the running code. The pool is shared by
    54  // multiple queries.
    55  const numRunners = 16
    56  
    57  const clientRejectedMsg string = "client rejected when attempting to run DistSQL plan"
    58  
    59  // runnerRequest is the request that is sent (via a channel) to a worker.
    60  type runnerRequest struct {
    61  	ctx        context.Context
    62  	nodeDialer *nodedialer.Dialer
    63  	flowReq    *execinfrapb.SetupFlowRequest
    64  	nodeID     roachpb.NodeID
    65  	resultChan chan<- runnerResult
    66  }
    67  
    68  // runnerResult is returned by a worker (via a channel) for each received
    69  // request.
    70  type runnerResult struct {
    71  	nodeID roachpb.NodeID
    72  	err    error
    73  }
    74  
    75  func (req runnerRequest) run() {
    76  	res := runnerResult{nodeID: req.nodeID}
    77  
    78  	conn, err := req.nodeDialer.Dial(req.ctx, req.nodeID, rpc.DefaultClass)
    79  	if err != nil {
    80  		res.err = err
    81  	} else {
    82  		client := execinfrapb.NewDistSQLClient(conn)
    83  		// TODO(radu): do we want a timeout here?
    84  		resp, err := client.SetupFlow(req.ctx, req.flowReq)
    85  		if err != nil {
    86  			res.err = err
    87  		} else {
    88  			res.err = resp.Error.ErrorDetail(req.ctx)
    89  		}
    90  	}
    91  	req.resultChan <- res
    92  }
    93  
    94  func (dsp *DistSQLPlanner) initRunners() {
     95  	// This channel has to be unbuffered because we only want a send to succeed
     96  	// if a worker is actually there to receive the request.
    97  	dsp.runnerChan = make(chan runnerRequest)
    98  	for i := 0; i < numRunners; i++ {
    99  		dsp.stopper.RunWorker(context.TODO(), func(context.Context) {
   100  			runnerChan := dsp.runnerChan
   101  			stopChan := dsp.stopper.ShouldStop()
   102  			for {
   103  				select {
   104  				case req := <-runnerChan:
   105  					req.run()
   106  
   107  				case <-stopChan:
   108  					return
   109  				}
   110  			}
   111  		})
   112  	}
   113  }
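
// A self-contained sketch of the runner-pool shape above, using only the
// standard library: a fixed number of worker goroutines receive work from an
// unbuffered channel and exit when a stop channel closes. The names here
// (task, startWorkers, the pool size) are illustrative, not from this package.

package main

import (
	"fmt"
	"sync"
)

type task func()

// startWorkers launches numWorkers goroutines that run tasks from work until
// stop is closed, mirroring initRunners.
func startWorkers(numWorkers int, work <-chan task, stop <-chan struct{}) {
	for i := 0; i < numWorkers; i++ {
		go func() {
			for {
				select {
				case t := <-work:
					t()
				case <-stop:
					return
				}
			}
		}()
	}
}

func main() {
	work := make(chan task) // unbuffered: a send succeeds only if a worker is ready
	stop := make(chan struct{})
	startWorkers(4, work, stop)

	var wg sync.WaitGroup
	wg.Add(1)
	t := task(func() { fmt.Println("ran a task"); wg.Done() })
	select {
	case work <- t: // a worker picked it up
	default:
		t() // every worker busy (or not yet ready): run inline, as setupFlows does
	}
	wg.Wait()
	close(stop)
}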
   114  
   115  // setupFlows sets up all the flows specified in flows using the provided state.
   116  // It first attempts to set up all remote flows, using the dsp workers if
   117  // available or sequentially if not, and then sets up the gateway flow, whose
   118  // output is the provided DistSQLReceiver. That local flow is returned to be
   119  // run.
   120  func (dsp *DistSQLPlanner) setupFlows(
   121  	ctx context.Context,
   122  	evalCtx *extendedEvalContext,
   123  	leafInputState *roachpb.LeafTxnInputState,
   124  	flows map[roachpb.NodeID]*execinfrapb.FlowSpec,
   125  	recv *DistSQLReceiver,
   126  	localState distsql.LocalState,
   127  	vectorizeThresholdMet bool,
   128  ) (context.Context, flowinfra.Flow, error) {
   129  	thisNodeID := dsp.nodeDesc.NodeID
   130  	_, ok := flows[thisNodeID]
   131  	if !ok {
   132  		return nil, nil, errors.AssertionFailedf("missing gateway flow")
   133  	}
   134  	if localState.IsLocal && len(flows) != 1 {
   135  		return nil, nil, errors.AssertionFailedf("IsLocal set but there's multiple flows")
   136  	}
   137  
   138  	evalCtxProto := execinfrapb.MakeEvalContext(&evalCtx.EvalContext)
   139  	setupReq := execinfrapb.SetupFlowRequest{
   140  		LeafTxnInputState: leafInputState,
   141  		Version:           execinfra.Version,
   142  		EvalContext:       evalCtxProto,
   143  		TraceKV:           evalCtx.Tracing.KVTracingEnabled(),
   144  	}
   145  
   146  	// Start all the flows except the flow on this node (there is always a flow on
   147  	// this node).
   148  	var resultChan chan runnerResult
   149  	if len(flows) > 1 {
   150  		resultChan = make(chan runnerResult, len(flows)-1)
   151  	}
   152  
   153  	if evalCtx.SessionData.VectorizeMode != sessiondata.VectorizeOff {
   154  		if !vectorizeThresholdMet && (evalCtx.SessionData.VectorizeMode == sessiondata.Vectorize201Auto || evalCtx.SessionData.VectorizeMode == sessiondata.VectorizeOn) {
   155  			// Vectorization is not justified for this flow because the expected
   156  			// amount of data is too small and the overhead of pre-allocating data
   157  			// structures needed for the vectorized engine is expected to dominate
   158  			// the execution time.
   159  			setupReq.EvalContext.Vectorize = int32(sessiondata.VectorizeOff)
   160  		} else {
   161  			fuseOpt := flowinfra.FuseNormally
   162  			if localState.IsLocal {
   163  				fuseOpt = flowinfra.FuseAggressively
   164  			}
   165  			// Now we check whether to even try vectorizing the flow.
   166  			// The goal here is to determine up front whether all of the flows can be
   167  			// vectorized. If any of them can't, turn off the setting.
   168  			// TODO(yuzefovich): this is a safe but quite inefficient way of setting
   169  			// up vectorized flows since the flows will effectively be planned twice.
   170  			for _, spec := range flows {
   171  				if _, err := colflow.SupportsVectorized(
   172  					ctx, &execinfra.FlowCtx{
   173  						EvalCtx: &evalCtx.EvalContext,
   174  						Cfg: &execinfra.ServerConfig{
   175  							DiskMonitor:    &mon.BytesMonitor{},
   176  							Settings:       dsp.st,
   177  							ClusterID:      &dsp.rpcCtx.ClusterID,
   178  							VecFDSemaphore: dsp.distSQLSrv.VecFDSemaphore,
   179  						},
   180  						NodeID: evalCtx.NodeID,
   181  					}, spec.Processors, fuseOpt, recv,
   182  				); err != nil {
   183  					// Vectorization attempt failed with an error.
   184  					returnVectorizationSetupError := false
   185  					if evalCtx.SessionData.VectorizeMode == sessiondata.VectorizeExperimentalAlways {
   186  						returnVectorizationSetupError = true
   187  						// If running with VectorizeExperimentalAlways, this check makes sure
   188  						// that we can still run SET statements (mostly to set vectorize to
   189  						// off) and the like.
   190  						if len(spec.Processors) == 1 &&
   191  							spec.Processors[0].Core.LocalPlanNode != nil {
   192  							rsidx := spec.Processors[0].Core.LocalPlanNode.RowSourceIdx
   193  							if rsidx != nil {
   194  								lp := localState.LocalProcs[*rsidx]
   195  								if z, ok := lp.(colflow.VectorizeAlwaysException); ok {
   196  									if z.IsException() {
   197  										returnVectorizationSetupError = false
   198  									}
   199  								}
   200  							}
   201  						}
   202  					}
   203  					log.VEventf(ctx, 1, "failed to vectorize: %s", err)
   204  					if returnVectorizationSetupError {
   205  						return nil, nil, err
   206  					}
   207  					// Vectorization is not supported for this flow, so we override the
   208  					// setting.
   209  					setupReq.EvalContext.Vectorize = int32(sessiondata.VectorizeOff)
   210  					break
   211  				}
   212  			}
   213  		}
   214  	}
   215  	for nodeID, flowSpec := range flows {
   216  		if nodeID == thisNodeID {
   217  			// Skip this node.
   218  			continue
   219  		}
   220  		if !evalCtx.Codec.ForSystemTenant() {
   221  			// A tenant server should never find itself distributing flows.
   222  			// NB: we wouldn't hit this in practice, but if we did, the actual
   223  			// error would be opaque.
   224  			return nil, nil, errorutil.UnsupportedWithMultiTenancy(47900)
   225  		}
   226  		req := setupReq
   227  		req.Flow = *flowSpec
   228  		runReq := runnerRequest{
   229  			ctx:        ctx,
   230  			nodeDialer: dsp.nodeDialer,
   231  			flowReq:    &req,
   232  			nodeID:     nodeID,
   233  			resultChan: resultChan,
   234  		}
   235  		defer physicalplan.ReleaseSetupFlowRequest(&req)
   236  
   237  		// Send out a request to the workers; if no worker is available, run
   238  		// directly.
   239  		select {
   240  		case dsp.runnerChan <- runReq:
   241  		default:
   242  			runReq.run()
   243  		}
   244  	}
   245  
   246  	var firstErr error
   247  	// Now wait for all the flows to be scheduled on remote nodes. Note that we
   248  	// are not waiting for the flows themselves to complete.
   249  	for i := 0; i < len(flows)-1; i++ {
   250  		res := <-resultChan
   251  		if firstErr == nil {
   252  			firstErr = res.err
   253  		}
   254  		// TODO(radu): accumulate the flows that we failed to set up and move them
   255  		// into the local flow.
   256  	}
   257  	if firstErr != nil {
   258  		return nil, nil, firstErr
   259  	}
   260  
   261  	// Set up the flow on this node.
   262  	localReq := setupReq
   263  	localReq.Flow = *flows[thisNodeID]
   264  	defer physicalplan.ReleaseSetupFlowRequest(&localReq)
   265  	ctx, flow, err := dsp.distSQLSrv.SetupLocalSyncFlow(ctx, evalCtx.Mon, &localReq, recv, localState)
   266  	if err != nil {
   267  		return nil, nil, err
   268  	}
   269  
   270  	return ctx, flow, nil
   271  }
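
// A compact sketch of the fan-out/fan-in above, using only the standard
// library: results come back on a channel buffered for the number of remote
// requests, and only the first error is kept. For simplicity each request
// runs in its own goroutine here, whereas the real code hands work to the
// shared runner pool or runs it inline; node IDs and the error are made up.

package main

import (
	"errors"
	"fmt"
)

func main() {
	remoteNodes := []int{2, 3, 4} // the gateway flow is set up afterwards
	resultCh := make(chan error, len(remoteNodes))

	for _, n := range remoteNodes {
		n := n
		go func() {
			if n == 3 {
				resultCh <- errors.New("failed to set up flow on node 3")
				return
			}
			resultCh <- nil
		}()
	}

	// Wait for every remote setup to be acknowledged, keeping the first error.
	var firstErr error
	for range remoteNodes {
		if err := <-resultCh; firstErr == nil {
			firstErr = err
		}
	}
	fmt.Println(firstErr) // the local flow would only be set up when this is nil
}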
   272  
   273  // Run executes a physical plan. The plan should have been finalized using
   274  // FinalizePlan.
   275  //
   276  // All errors encountered are reported to the DistSQLReceiver's resultWriter.
   277  // Additionally, if the error is a "communication error" (an error encountered
   278  // while using that resultWriter), the error is also stored in
   279  // DistSQLReceiver.commErr. That can be tested to see if a client session needs
   280  // to be closed.
   281  //
   282  // Args:
   283  // - txn is the transaction in which the plan will run. If nil, the different
   284  // processors are expected to manage their own internal transactions.
   285  // - evalCtx is the evaluation context in which the plan will run. It might be
   286  // mutated.
   287  // - finishedSetupFn, if non-nil, is called synchronously after all the
   288  // processors have successfully started up.
   289  //
   290  // It returns a non-nil (although it can be a noop when an error is
   291  // encountered) cleanup function that must be called in order to release the
   292  // resources.
   293  func (dsp *DistSQLPlanner) Run(
   294  	planCtx *PlanningCtx,
   295  	txn *kv.Txn,
   296  	plan *PhysicalPlan,
   297  	recv *DistSQLReceiver,
   298  	evalCtx *extendedEvalContext,
   299  	finishedSetupFn func(),
   300  ) (cleanup func()) {
   301  	ctx := planCtx.ctx
   302  
   303  	var (
   304  		localState     distsql.LocalState
   305  		leafInputState *roachpb.LeafTxnInputState
   306  	)
   307  	// NB: putting part of evalCtx in localState means it might be mutated down
   308  	// the line.
   309  	localState.EvalContext = &evalCtx.EvalContext
   310  	localState.Txn = txn
   311  	if planCtx.isLocal {
   312  		localState.IsLocal = true
   313  		localState.LocalProcs = plan.LocalProcessors
   314  	} else if txn != nil {
   315  		// If the plan is not local, we will have to set up leaf txns using the
   316  		// LeafTxnInputState.
   317  		tis, err := txn.GetLeafTxnInputStateOrRejectClient(ctx)
   318  		if err != nil {
   319  			log.Infof(ctx, "%s: %s", clientRejectedMsg, err)
   320  			recv.SetError(err)
   321  			return func() {}
   322  		}
   323  		leafInputState = &tis
   324  	}
   325  
   326  	flows := plan.GenerateFlowSpecs(dsp.nodeDesc.NodeID /* gateway */)
   327  	if _, ok := flows[dsp.nodeDesc.NodeID]; !ok {
   328  		recv.SetError(errors.Errorf("expected to find gateway flow"))
   329  		return func() {}
   330  	}
   331  
   332  	if planCtx.saveDiagram != nil {
   333  		// Local flows might not have the UUID field set. We need it to be set to
   334  		// distinguish statistics for processors in subqueries vs the main query vs
   335  		// postqueries.
   336  		if len(flows) == 1 {
   337  			for _, f := range flows {
   338  				if f.FlowID == (execinfrapb.FlowID{}) {
   339  					f.FlowID.UUID = uuid.MakeV4()
   340  				}
   341  			}
   342  		}
   343  		log.VEvent(ctx, 1, "creating plan diagram")
   344  		var stmtStr string
   345  		if planCtx.planner != nil && planCtx.planner.stmt != nil {
   346  			stmtStr = planCtx.planner.stmt.String()
   347  		}
   348  		diagram, err := execinfrapb.GeneratePlanDiagram(
   349  			stmtStr, flows, planCtx.saveDiagramShowInputTypes,
   350  		)
   351  		if err != nil {
   352  			recv.SetError(err)
   353  			return func() {}
   354  		}
   355  		planCtx.saveDiagram(diagram)
   356  	}
   357  
   358  	if logPlanDiagram {
   359  		log.VEvent(ctx, 1, "creating plan diagram for logging")
   360  		var stmtStr string
   361  		if planCtx.planner != nil && planCtx.planner.stmt != nil {
   362  			stmtStr = planCtx.planner.stmt.String()
   363  		}
   364  		_, url, err := execinfrapb.GeneratePlanDiagramURL(stmtStr, flows, false /* showInputTypes */)
   365  		if err != nil {
   366  			log.Infof(ctx, "Error generating diagram: %s", err)
   367  		} else {
   368  			log.Infof(ctx, "Plan diagram URL:\n%s", url.String())
   369  		}
   370  	}
   371  
   372  	log.VEvent(ctx, 1, "running DistSQL plan")
   373  
   374  	dsp.distSQLSrv.ServerConfig.Metrics.QueryStart()
   375  	defer dsp.distSQLSrv.ServerConfig.Metrics.QueryStop()
   376  
   377  	recv.outputTypes = plan.ResultTypes
   378  	recv.resultToStreamColMap = plan.PlanToStreamColMap
   379  
   380  	vectorizedThresholdMet := plan.MaxEstimatedRowCount >= evalCtx.SessionData.VectorizeRowCountThreshold
   381  
   382  	if len(flows) == 1 {
   383  		// We ended up planning everything locally, regardless of whether we
   384  		// intended to distribute or not.
   385  		localState.IsLocal = true
   386  	}
   387  
   388  	ctx, flow, err := dsp.setupFlows(ctx, evalCtx, leafInputState, flows, recv, localState, vectorizedThresholdMet)
   389  	if err != nil {
   390  		recv.SetError(err)
   391  		return func() {}
   392  	}
   393  
   394  	if finishedSetupFn != nil {
   395  		finishedSetupFn()
   396  	}
   397  
   398  	// Check that flows that were forced to be planned locally also have no concurrency.
   399  	// This is important, since these flows are forced to use the RootTxn (since
   400  	// they might have mutations), and the RootTxn does not permit concurrency.
   401  	// For such flows, we were supposed to have fused everything.
   402  	if txn != nil && planCtx.isLocal && flow.ConcurrentExecution() {
   403  		recv.SetError(errors.AssertionFailedf(
   404  			"unexpected concurrency for a flow that was forced to be planned locally"))
   405  		return func() {}
   406  	}
   407  
   408  	// TODO(radu): this should go through the flow scheduler.
   409  	if err := flow.Run(ctx, func() {}); err != nil {
   410  		log.Fatalf(ctx, "unexpected error from syncFlow.Start(): %s "+
   411  			"The error should have gone to the consumer.", err)
   412  	}
   413  
   414  	// TODO(yuzefovich): it feels like this closing should happen after
   415  	// PlanAndRun. We should refactor this and get rid of the ignoreClose field.
   416  	if planCtx.planner != nil && !planCtx.ignoreClose {
   417  		// planCtx can change before the cleanup function is executed, so we
   418  		// capture the planner's current plan and bind it to the function.
   419  		curPlan := &planCtx.planner.curPlan
   420  		return func() {
   421  			// We need to close the planNode tree we translated into a DistSQL plan
   422  			// before flow.Cleanup, which closes memory accounts that expect to be
   423  			// emptied.
   424  			curPlan.execErr = recv.resultWriter.Err()
   425  			curPlan.close(ctx)
   426  			flow.Cleanup(ctx)
   427  		}
   428  	}
   429  
   430  	// ignoreClose is set to true, meaning that someone else will handle
   431  	// closing the current plan, so we simply clean up the flow.
   432  	return func() {
   433  		flow.Cleanup(ctx)
   434  	}
   435  }
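
// A minimal sketch of the cleanup-closure contract Run follows, with made-up
// names: the function always returns a non-nil cleanup, a no-op when setup
// fails early, so callers can defer or invoke it unconditionally.

package main

import "fmt"

func runJob(fail bool) (cleanup func()) {
	if fail {
		return func() {} // nothing was set up, so nothing to release
	}
	resource := "flow-like resource" // stand-in for whatever Run sets up
	return func() { fmt.Println("releasing", resource) }
}

func main() {
	cleanup := runJob(false)
	defer cleanup()
	fmt.Println("job ran")
}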
   436  
   437  // DistSQLReceiver is a RowReceiver that writes results to a rowResultWriter.
   438  // This is where the DistSQL execution meets the SQL Session - the RowContainer
   439  // comes from a client Session.
   440  //
   441  // DistSQLReceiver also updates the RangeDescriptorCache and the LeaseHolderCache
   442  // in response to DistSQL metadata about misplanned ranges.
   443  type DistSQLReceiver struct {
   444  	ctx context.Context
   445  
   446  	// resultWriter is the interface which we send results to.
   447  	resultWriter rowResultWriter
   448  
   449  	stmtType tree.StatementType
   450  
   451  	// outputTypes are the types of the result columns produced by the plan.
   452  	outputTypes []*types.T
   453  
   454  	// resultToStreamColMap maps result columns to columns in the rowexec results
   455  	// stream.
   456  	resultToStreamColMap []int
   457  
   458  	// noColsRequired indicates that the caller is only interested in whether
   459  	// any rows exist at all. Used by subqueries in EXISTS mode.
   460  	noColsRequired bool
   461  
   462  	// discardRows is set when we want to discard rows (for testing/benchmarks).
   463  	// See EXECUTE .. DISCARD ROWS.
   464  	discardRows bool
   465  
   466  	// commErr keeps track of the error received from interacting with the
   467  	// resultWriter. This represents a "communication error" and as such is unlike
   468  	// query execution errors: when the DistSQLReceiver is used within a SQL
   469  	// session, such errors mean that we have to bail on the session.
   470  	// Query execution errors are reported to the resultWriter. For some clients'
   471  	// convenience, communication errors are also reported to the resultWriter.
   472  	//
   473  	// Once set, no more rows are accepted.
   474  	commErr error
   475  
   476  	row    tree.Datums
   477  	status execinfra.ConsumerStatus
   478  	alloc  sqlbase.DatumAlloc
   479  	closed bool
   480  
   481  	rangeCache *kvcoord.RangeDescriptorCache
   482  	leaseCache *kvcoord.LeaseHolderCache
   483  	tracing    *SessionTracing
   484  	cleanup    func()
   485  
   486  	// The transaction in which the flow producing data for this
   487  	// receiver runs. The DistSQLReceiver updates the transaction in
   488  	// response to RetryableTxnErrors and when distributed processors
   489  	// pass back LeafTxnFinalState objects via ProducerMetas. Nil if no
   490  	// transaction should be updated on errors (i.e. if the flow overall
   491  	// doesn't run in a transaction).
   492  	txn *kv.Txn
   493  
   494  	// A handler for clock signals arriving from remote nodes. This should update
   495  	// this node's clock.
   496  	updateClock func(observedTs hlc.Timestamp)
   497  
   498  	// bytesRead and rowsRead track the corresponding metrics while executing the
   499  	// statement.
   500  	bytesRead int64
   501  	rowsRead  int64
   502  
   503  	expectedRowsRead int64
   504  	progressAtomic   *uint64
   505  }
   506  
   507  // rowResultWriter is a subset of CommandResult to be used with the
   508  // DistSQLReceiver. It's implemented by RowResultWriter.
   509  type rowResultWriter interface {
   510  	// AddRow writes a result row.
   511  	// Note that the caller owns the row slice and might reuse it.
   512  	AddRow(ctx context.Context, row tree.Datums) error
   513  	IncrementRowsAffected(n int)
   514  	SetError(error)
   515  	Err() error
   516  }
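
// A standalone analog of the rowResultWriter contract above, buffering rows
// in memory. Types are simplified (rows become []string) and every name here
// is hypothetical; it only illustrates the shape of the interface.

package main

import (
	"context"
	"fmt"
)

type bufferedWriter struct {
	rows         [][]string
	rowsAffected int
	err          error
}

// AddRow copies the row before buffering it, since the interface comment above
// warns that the caller owns the slice and might reuse it.
func (w *bufferedWriter) AddRow(_ context.Context, row []string) error {
	if w.err != nil {
		return w.err
	}
	w.rows = append(w.rows, append([]string(nil), row...))
	return nil
}

func (w *bufferedWriter) IncrementRowsAffected(n int) { w.rowsAffected += n }
func (w *bufferedWriter) SetError(err error)          { w.err = err }
func (w *bufferedWriter) Err() error                  { return w.err }

func main() {
	w := &bufferedWriter{}
	_ = w.AddRow(context.Background(), []string{"a", "1"})
	fmt.Println(len(w.rows), w.Err())
}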
   517  
   518  type metadataResultWriter interface {
   519  	AddMeta(ctx context.Context, meta *execinfrapb.ProducerMetadata)
   520  }
   521  
   522  type metadataCallbackWriter struct {
   523  	rowResultWriter
   524  	fn func(ctx context.Context, meta *execinfrapb.ProducerMetadata) error
   525  }
   526  
   527  func (w *metadataCallbackWriter) AddMeta(ctx context.Context, meta *execinfrapb.ProducerMetadata) {
   528  	if err := w.fn(ctx, meta); err != nil {
   529  		w.SetError(err)
   530  	}
   531  }
   532  
   533  // errOnlyResultWriter is a rowResultWriter that only supports receiving an
   534  // error. All other functions that deal with producing results panic.
   535  type errOnlyResultWriter struct {
   536  	err error
   537  }
   538  
   539  var _ rowResultWriter = &errOnlyResultWriter{}
   540  
   541  func (w *errOnlyResultWriter) SetError(err error) {
   542  	w.err = err
   543  }
   544  func (w *errOnlyResultWriter) Err() error {
   545  	return w.err
   546  }
   547  
   548  func (w *errOnlyResultWriter) AddRow(ctx context.Context, row tree.Datums) error {
   549  	panic("AddRow not supported by errOnlyResultWriter")
   550  }
   551  func (w *errOnlyResultWriter) IncrementRowsAffected(n int) {
   552  	panic("IncrementRowsAffected not supported by errOnlyResultWriter")
   553  }
   554  
   555  var _ execinfra.RowReceiver = &DistSQLReceiver{}
   556  
   557  var receiverSyncPool = sync.Pool{
   558  	New: func() interface{} {
   559  		return &DistSQLReceiver{}
   560  	},
   561  }
   562  
   563  // MakeDistSQLReceiver creates a DistSQLReceiver.
   564  //
   565  // ctx is the Context that the receiver will use throughout its
   566  // lifetime. resultWriter is the container where the results will be
   567  // stored. If only the row count is needed, this can be nil.
   568  //
   569  // txn is the transaction in which the producer flow runs; it will be updated
   570  // on errors. Nil if the flow overall doesn't run in a transaction.
   571  func MakeDistSQLReceiver(
   572  	ctx context.Context,
   573  	resultWriter rowResultWriter,
   574  	stmtType tree.StatementType,
   575  	rangeCache *kvcoord.RangeDescriptorCache,
   576  	leaseCache *kvcoord.LeaseHolderCache,
   577  	txn *kv.Txn,
   578  	updateClock func(observedTs hlc.Timestamp),
   579  	tracing *SessionTracing,
   580  ) *DistSQLReceiver {
   581  	consumeCtx, cleanup := tracing.TraceExecConsume(ctx)
   582  	r := receiverSyncPool.Get().(*DistSQLReceiver)
   583  	*r = DistSQLReceiver{
   584  		ctx:          consumeCtx,
   585  		cleanup:      cleanup,
   586  		resultWriter: resultWriter,
   587  		rangeCache:   rangeCache,
   588  		leaseCache:   leaseCache,
   589  		txn:          txn,
   590  		updateClock:  updateClock,
   591  		stmtType:     stmtType,
   592  		tracing:      tracing,
   593  	}
   594  	return r
   595  }
   596  
   597  // Release releases this DistSQLReceiver back to the pool.
   598  func (r *DistSQLReceiver) Release() {
   599  	*r = DistSQLReceiver{}
   600  	receiverSyncPool.Put(r)
   601  }
   602  
   603  // clone clones the receiver for running subqueries. Not all fields are cloned,
   604  // only those required for running subqueries.
   605  func (r *DistSQLReceiver) clone() *DistSQLReceiver {
   606  	ret := receiverSyncPool.Get().(*DistSQLReceiver)
   607  	*ret = DistSQLReceiver{
   608  		ctx:         r.ctx,
   609  		cleanup:     func() {},
   610  		rangeCache:  r.rangeCache,
   611  		leaseCache:  r.leaseCache,
   612  		txn:         r.txn,
   613  		updateClock: r.updateClock,
   614  		stmtType:    tree.Rows,
   615  		tracing:     r.tracing,
   616  	}
   617  	return ret
   618  }
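
// A minimal sketch of the sync.Pool recycle pattern used by
// MakeDistSQLReceiver, Release, and clone: Get an instance, overwrite every
// field with a fresh value so no state leaks between uses, and zero it before
// putting it back. The receiver type and its field are illustrative only.

package main

import (
	"fmt"
	"sync"
)

type receiver struct {
	rows int
}

var pool = sync.Pool{New: func() interface{} { return &receiver{} }}

func newReceiver() *receiver {
	r := pool.Get().(*receiver)
	*r = receiver{} // reset all fields, mirroring *r = DistSQLReceiver{...}
	return r
}

func (r *receiver) release() {
	*r = receiver{} // zero before returning to the pool
	pool.Put(r)
}

func main() {
	r := newReceiver()
	r.rows = 3
	fmt.Println(r.rows)
	r.release()
}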
   619  
   620  // SetError provides a convenient way for a client to pass in an error, thus
   621  // pretending that a query execution error happened. The error is passed along
   622  // to the resultWriter.
   623  func (r *DistSQLReceiver) SetError(err error) {
   624  	r.resultWriter.SetError(err)
   625  }
   626  
   627  // Push is part of the RowReceiver interface.
   628  func (r *DistSQLReceiver) Push(
   629  	row sqlbase.EncDatumRow, meta *execinfrapb.ProducerMetadata,
   630  ) execinfra.ConsumerStatus {
   631  	if meta != nil {
   632  		if meta.LeafTxnFinalState != nil {
   633  			if r.txn != nil {
   634  				if r.txn.ID() == meta.LeafTxnFinalState.Txn.ID {
   635  					if err := r.txn.UpdateRootWithLeafFinalState(r.ctx, meta.LeafTxnFinalState); err != nil {
   636  						r.resultWriter.SetError(err)
   637  					}
   638  				}
   639  			} else {
   640  				r.resultWriter.SetError(
   641  					errors.Errorf("received a leaf final state (%s); but have no root", meta.LeafTxnFinalState))
   642  			}
   643  		}
   644  		if meta.Err != nil {
   645  			// Check if the error we just received should take precedence over a
   646  			// previous error (if any).
   647  			if roachpb.ErrPriority(meta.Err) > roachpb.ErrPriority(r.resultWriter.Err()) {
   648  				if r.txn != nil {
   649  					if retryErr := (*roachpb.UnhandledRetryableError)(nil); errors.As(meta.Err, &retryErr) {
   650  						// Update the txn in response to remote errors. In the non-DistSQL
   651  						// world, the TxnCoordSender handles "unhandled" retryable errors,
   652  						// but this one is coming from a distributed SQL node, which has
   653  						// left the handling up to the root transaction.
   654  						meta.Err = r.txn.UpdateStateOnRemoteRetryableErr(r.ctx, &retryErr.PErr)
   655  						// Update the clock with information from the error. On non-DistSQL
   656  						// code paths, the DistSender does this.
   657  						// TODO(andrei): We don't propagate clock signals on success cases
   658  						// through DistSQL; we should. We also don't propagate them through
   659  						// non-retryable errors; we also should.
   660  						r.updateClock(retryErr.PErr.Now)
   661  					}
   662  				}
   663  				r.resultWriter.SetError(meta.Err)
   664  			}
   665  		}
   666  		if len(meta.Ranges) > 0 {
   667  			r.updateCaches(r.ctx, meta.Ranges)
   668  		}
   669  		if len(meta.TraceData) > 0 {
   670  			span := opentracing.SpanFromContext(r.ctx)
   671  			if span == nil {
   672  				r.resultWriter.SetError(
   673  					errors.New("trying to ingest remote spans but there is no recording span set up"))
   674  			} else if err := tracing.ImportRemoteSpans(span, meta.TraceData); err != nil {
   675  				r.resultWriter.SetError(errors.Errorf("error ingesting remote spans: %s", err))
   676  			}
   677  		}
   678  		if meta.Metrics != nil {
   679  			r.bytesRead += meta.Metrics.BytesRead
   680  			r.rowsRead += meta.Metrics.RowsRead
   681  			if r.progressAtomic != nil && r.expectedRowsRead != 0 {
   682  				progress := float64(r.rowsRead) / float64(r.expectedRowsRead)
   683  				atomic.StoreUint64(r.progressAtomic, math.Float64bits(progress))
   684  			}
   685  			meta.Metrics.Release()
   686  			meta.Release()
   687  		}
   688  		if metaWriter, ok := r.resultWriter.(metadataResultWriter); ok {
   689  			metaWriter.AddMeta(r.ctx, meta)
   690  		}
   691  		return r.status
   692  	}
   693  	if r.resultWriter.Err() == nil && r.ctx.Err() != nil {
   694  		r.resultWriter.SetError(r.ctx.Err())
   695  	}
   696  	if r.resultWriter.Err() != nil {
   697  		// TODO(andrei): We should drain here if we weren't canceled.
   698  		return execinfra.ConsumerClosed
   699  	}
   700  	if r.status != execinfra.NeedMoreRows {
   701  		return r.status
   702  	}
   703  
   704  	if r.stmtType != tree.Rows {
   705  		// We only need the row count. planNodeToRowSource is set up to ensure
   706  		// that the last stage in the pipeline will return a single-column
   707  		// row with the row count in it, so just grab that and exit.
   708  		r.resultWriter.IncrementRowsAffected(int(tree.MustBeDInt(row[0].Datum)))
   709  		return r.status
   710  	}
   711  
   712  	if r.discardRows {
   713  		// Discard rows.
   714  		return r.status
   715  	}
   716  
   717  	// If no columns are needed by the output, the consumer is only looking for
   718  	// whether a single row is pushed or not, so the contents do not matter, and
   719  	// planNodeToRowSource is not set up to handle decoding the row.
   720  	if r.noColsRequired {
   721  		r.row = []tree.Datum{}
   722  		r.status = execinfra.ConsumerClosed
   723  	} else {
   724  		if r.row == nil {
   725  			r.row = make(tree.Datums, len(r.resultToStreamColMap))
   726  		}
   727  		for i, resIdx := range r.resultToStreamColMap {
   728  			err := row[resIdx].EnsureDecoded(r.outputTypes[resIdx], &r.alloc)
   729  			if err != nil {
   730  				r.resultWriter.SetError(err)
   731  				r.status = execinfra.ConsumerClosed
   732  				return r.status
   733  			}
   734  			r.row[i] = row[resIdx].Datum
   735  		}
   736  	}
   737  	r.tracing.TraceExecRowsResult(r.ctx, r.row)
   738  	// Note that AddRow accounts for the memory used by the Datums.
   739  	if commErr := r.resultWriter.AddRow(r.ctx, r.row); commErr != nil {
   740  		// ErrLimitedResultClosed is not a real error; it is a
   741  		// signal to stop distsql and return success to the client.
   742  		if !errors.Is(commErr, ErrLimitedResultClosed) {
   743  			// Set the error on the resultWriter too, for the convenience of some of the
   744  			// clients. If clients don't care to differentiate between communication
   745  			// errors and query execution errors, they can simply inspect
   746  			// resultWriter.Err(). Also, this function itself doesn't care about the
   747  			// distinction and just uses resultWriter.Err() to see if we're still
   748  			// accepting results.
   749  			r.resultWriter.SetError(commErr)
   750  
   751  			// We don't need to shut down the connection
   752  			// if there's a portal-related error. This is
   753  			// definitely a layering violation, but is part
   754  			// of some accepted technical debt (see comments on
   755  			// sql/pgwire.limitedCommandResult.moreResultsNeeded).
   756  			// Instead of changing the signature of AddRow, we have
   757  			// a sentinel error that is handled specially here.
   758  			if !errors.Is(commErr, ErrLimitedResultNotSupported) {
   759  				r.commErr = commErr
   760  			}
   761  		}
   762  		// TODO(andrei): We should drain here. Metadata from this query would be
   763  		// useful, particularly as it was likely a large query (since AddRow()
   764  		// above failed, presumably with an out-of-memory error).
   765  		r.status = execinfra.ConsumerClosed
   766  	}
   767  	return r.status
   768  }
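
// The progress update in Push packs a float64 into a uint64 so it can be
// written atomically. A self-contained sketch of both sides of that exchange,
// with made-up row counts; a concurrent reader would decode it the same way.

package main

import (
	"fmt"
	"math"
	"sync/atomic"
)

func main() {
	var progress uint64

	// Writer side, as in Push: store the bit pattern of the fraction read.
	rowsRead, expectedRowsRead := int64(42), int64(100)
	atomic.StoreUint64(&progress, math.Float64bits(float64(rowsRead)/float64(expectedRowsRead)))

	// Reader side: load atomically and convert the bits back to a float64.
	fmt.Printf("%.2f\n", math.Float64frombits(atomic.LoadUint64(&progress)))
}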
   769  
   770  var (
   771  	// ErrLimitedResultNotSupported is an error produced by pgwire
   772  	// indicating an unsupported feature of row count limits was attempted.
   773  	ErrLimitedResultNotSupported = unimplemented.NewWithIssue(40195, "multiple active portals not supported")
   774  	// ErrLimitedResultClosed is a sentinel error produced by pgwire
   775  	// indicating the portal should be closed without error.
   776  	ErrLimitedResultClosed = errors.New("row count limit closed")
   777  )
   778  
   779  // ProducerDone is part of the RowReceiver interface.
   780  func (r *DistSQLReceiver) ProducerDone() {
   781  	if r.closed {
   782  		panic("double close")
   783  	}
   784  	r.closed = true
   785  	r.cleanup()
   786  }
   787  
   788  // Types is part of the RowReceiver interface.
   789  func (r *DistSQLReceiver) Types() []*types.T {
   790  	return r.outputTypes
   791  }
   792  
   793  // updateCaches takes information about some ranges that were mis-planned and
   794  // updates the range descriptor and lease-holder caches accordingly.
   795  //
   796  // TODO(andrei): updating these caches is not perfect: we can clobber newer
   797  // information that someone else has populated because there's no timing info
   798  // anywhere. We may also fail to remove stale info from the LeaseHolderCache if
   799  // the ids of the ranges that we get are different from the ids in that cache.
   800  func (r *DistSQLReceiver) updateCaches(ctx context.Context, ranges []roachpb.RangeInfo) {
   801  	// Update the RangeDescriptorCache.
   802  	rngDescs := make([]roachpb.RangeDescriptor, len(ranges))
   803  	for i, ri := range ranges {
   804  		rngDescs[i] = ri.Desc
   805  	}
   806  	r.rangeCache.InsertRangeDescriptors(ctx, rngDescs...)
   807  
   808  	// Update the LeaseHolderCache.
   809  	for _, ri := range ranges {
   810  		r.leaseCache.Update(ctx, ri.Desc.RangeID, ri.Lease.Replica.StoreID)
   811  	}
   812  }
   813  
   814  // PlanAndRunSubqueries returns false if an error was encountered and sets that
   815  // error in the provided receiver.
   816  func (dsp *DistSQLPlanner) PlanAndRunSubqueries(
   817  	ctx context.Context,
   818  	planner *planner,
   819  	evalCtxFactory func() *extendedEvalContext,
   820  	subqueryPlans []subquery,
   821  	recv *DistSQLReceiver,
   822  	maybeDistribute bool,
   823  ) bool {
   824  	for planIdx, subqueryPlan := range subqueryPlans {
   825  		if err := dsp.planAndRunSubquery(
   826  			ctx,
   827  			planIdx,
   828  			subqueryPlan,
   829  			planner,
   830  			evalCtxFactory(),
   831  			subqueryPlans,
   832  			recv,
   833  			maybeDistribute,
   834  		); err != nil {
   835  			recv.SetError(err)
   836  			return false
   837  		}
   838  	}
   839  
   840  	return true
   841  }
   842  
   843  func (dsp *DistSQLPlanner) planAndRunSubquery(
   844  	ctx context.Context,
   845  	planIdx int,
   846  	subqueryPlan subquery,
   847  	planner *planner,
   848  	evalCtx *extendedEvalContext,
   849  	subqueryPlans []subquery,
   850  	recv *DistSQLReceiver,
   851  	maybeDistribute bool,
   852  ) error {
   853  	subqueryMonitor := mon.MakeMonitor(
   854  		"subquery",
   855  		mon.MemoryResource,
   856  		dsp.distSQLSrv.Metrics.CurBytesCount,
   857  		dsp.distSQLSrv.Metrics.MaxBytesHist,
   858  		-1, /* use default block size */
   859  		noteworthyMemoryUsageBytes,
   860  		dsp.distSQLSrv.Settings,
   861  	)
   862  	subqueryMonitor.Start(ctx, evalCtx.Mon, mon.BoundAccount{})
   863  	defer subqueryMonitor.Stop(ctx)
   864  
   865  	subqueryMemAccount := subqueryMonitor.MakeBoundAccount()
   866  	defer subqueryMemAccount.Close(ctx)
   867  
   868  	var distributeSubquery bool
   869  	if maybeDistribute {
   870  		distributeSubquery = willDistributePlan(
   871  			ctx, planner.execCfg.NodeID, planner.SessionData().DistSQLMode, subqueryPlan.plan,
   872  		)
   873  	}
   874  	subqueryPlanCtx := dsp.NewPlanningCtx(ctx, evalCtx, planner.txn, distributeSubquery)
   875  	subqueryPlanCtx.planner = planner
   876  	subqueryPlanCtx.stmtType = tree.Rows
   877  	if planner.collectBundle {
   878  		subqueryPlanCtx.saveDiagram = func(diagram execinfrapb.FlowDiagram) {
   879  			planner.curPlan.distSQLDiagrams = append(planner.curPlan.distSQLDiagrams, diagram)
   880  		}
   881  	}
   882  	// Don't close the top-level plan from subqueries - someone else will handle
   883  	// that.
   884  	subqueryPlanCtx.ignoreClose = true
   885  	subqueryPhysPlan, err := dsp.createPhysPlan(subqueryPlanCtx, subqueryPlan.plan)
   886  	if err != nil {
   887  		return err
   888  	}
   889  	dsp.FinalizePlan(subqueryPlanCtx, subqueryPhysPlan)
   890  
   891  	// TODO(arjun): #28264: We set up a row container, wrap it in a row
   892  	// receiver, and use it to serialize the results of the subquery. The type
   893  	// of the results stored in the container depends on the type of the subquery.
   894  	subqueryRecv := recv.clone()
   895  	var typ sqlbase.ColTypeInfo
   896  	var rows *rowcontainer.RowContainer
   897  	if subqueryPlan.execMode == rowexec.SubqueryExecModeExists {
   898  		subqueryRecv.noColsRequired = true
   899  		typ = sqlbase.ColTypeInfoFromColTypes([]*types.T{})
   900  	} else {
   901  		// Apply the PlanToStreamColMap projection to the ResultTypes to get the
   902  		// final set of output types for the subquery. The reason this is necessary
   903  		// is that the output schema of a query sometimes contains columns necessary
   904  		// to merge the streams, but that aren't required by the final output of the
   905  		// query. These get projected out, so we need to similarly adjust the
   906  		// expected result types of the subquery here.
   907  		colTypes := make([]*types.T, len(subqueryPhysPlan.PlanToStreamColMap))
   908  		for i, resIdx := range subqueryPhysPlan.PlanToStreamColMap {
   909  			colTypes[i] = subqueryPhysPlan.ResultTypes[resIdx]
   910  		}
   911  		typ = sqlbase.ColTypeInfoFromColTypes(colTypes)
   912  	}
   913  	rows = rowcontainer.NewRowContainer(subqueryMemAccount, typ, 0)
   914  	defer rows.Close(ctx)
   915  
   916  	subqueryRowReceiver := NewRowResultWriter(rows)
   917  	subqueryRecv.resultWriter = subqueryRowReceiver
   918  	subqueryPlans[planIdx].started = true
   919  	dsp.Run(subqueryPlanCtx, planner.txn, subqueryPhysPlan, subqueryRecv, evalCtx, nil /* finishedSetupFn */)()
   920  	if subqueryRecv.commErr != nil {
   921  		return subqueryRecv.commErr
   922  	}
   923  	if err := subqueryRowReceiver.Err(); err != nil {
   924  		return err
   925  	}
   926  	switch subqueryPlan.execMode {
   927  	case rowexec.SubqueryExecModeExists:
   928  		// For EXISTS expressions, all we want to know is whether there is at least one row.
   929  		hasRows := rows.Len() != 0
   930  		subqueryPlans[planIdx].result = tree.MakeDBool(tree.DBool(hasRows))
   931  	case rowexec.SubqueryExecModeAllRows, rowexec.SubqueryExecModeAllRowsNormalized:
   932  		var result tree.DTuple
   933  		for rows.Len() > 0 {
   934  			row := rows.At(0)
   935  			rows.PopFirst()
   936  			if row.Len() == 1 {
   937  				// This seems hokey, but if we don't do this then the subquery expands
   938  				// to a tuple of tuples instead of a tuple of values and an expression
   939  				// like "k IN (SELECT foo FROM bar)" will fail because we're comparing
   940  				// a single value against a tuple.
   941  				result.D = append(result.D, row[0])
   942  			} else {
   943  				result.D = append(result.D, &tree.DTuple{D: row})
   944  			}
   945  		}
   946  
   947  		if subqueryPlan.execMode == rowexec.SubqueryExecModeAllRowsNormalized {
   948  			result.Normalize(&evalCtx.EvalContext)
   949  		}
   950  		subqueryPlans[planIdx].result = &result
   951  	case rowexec.SubqueryExecModeOneRow:
   952  		switch rows.Len() {
   953  		case 0:
   954  			subqueryPlans[planIdx].result = tree.DNull
   955  		case 1:
   956  			row := rows.At(0)
   957  			switch row.Len() {
   958  			case 1:
   959  				subqueryPlans[planIdx].result = row[0]
   960  			default:
   961  				subqueryPlans[planIdx].result = &tree.DTuple{D: rows.At(0)}
   962  			}
   963  		default:
   964  			return pgerror.Newf(pgcode.CardinalityViolation,
   965  				"more than one row returned by a subquery used as an expression")
   966  		}
   967  	default:
   968  		return fmt.Errorf("unexpected subqueryExecMode: %d", subqueryPlan.execMode)
   969  	}
   970  	return nil
   971  }
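
// A tiny standalone illustration of the flattening rule applied to subquery
// results above: single-column rows become scalar elements so that an
// expression like "k IN (SELECT foo FROM bar)" compares values, while
// multi-column rows stay wrapped as tuples. Types are simplified and the
// function name is made up.

package main

import "fmt"

func flatten(rows [][]interface{}) []interface{} {
	var out []interface{}
	for _, row := range rows {
		if len(row) == 1 {
			out = append(out, row[0]) // scalar element
		} else {
			out = append(out, row) // keep the whole row as a tuple
		}
	}
	return out
}

func main() {
	fmt.Println(flatten([][]interface{}{{1}, {2}}))           // [1 2]
	fmt.Println(flatten([][]interface{}{{1, "a"}, {2, "b"}})) // [[1 a] [2 b]]
}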
   972  
   973  // PlanAndRun generates a physical plan from a planNode tree and executes it. It
   974  // assumes that the tree is supported (see CheckSupport).
   975  //
   976  // All errors encountered are reported to the DistSQLReceiver's resultWriter.
   977  // Additionally, if the error is a "communication error" (an error encountered
   978  // while using that resultWriter), the error is also stored in
   979  // DistSQLReceiver.commErr. That can be tested to see if a client session needs
   980  // to be closed.
   981  //
   982  // It returns a non-nil (although it can be a noop when an error is
   983  // encountered) cleanup function that must be called once the planTop AST is no
   984  // longer needed and can be closed. Note that this function also cleans up the
   985  // flow which is unfortunate but is caused by the sharing of memory monitors
   986  // between planning and execution - cleaning up the flow wants to close the
   987  // monitor, but it cannot do so because the AST needs to live longer and still
   988  // uses the same monitor. That's why we end up in a situation where, in order to
   989  // clean up the flow, we need to close the AST first, but we can only do that
   990  // after PlanAndRun returns.
   991  func (dsp *DistSQLPlanner) PlanAndRun(
   992  	ctx context.Context,
   993  	evalCtx *extendedEvalContext,
   994  	planCtx *PlanningCtx,
   995  	txn *kv.Txn,
   996  	plan planMaybePhysical,
   997  	recv *DistSQLReceiver,
   998  ) (cleanup func()) {
   999  	log.VEventf(ctx, 1, "creating DistSQL plan with isLocal=%v", planCtx.isLocal)
  1000  
  1001  	physPlan, err := dsp.createPhysPlan(planCtx, plan)
  1002  	if err != nil {
  1003  		recv.SetError(err)
  1004  		return func() {}
  1005  	}
  1006  	dsp.FinalizePlan(planCtx, physPlan)
  1007  	recv.expectedRowsRead = int64(physPlan.TotalEstimatedScannedRows)
  1008  	return dsp.Run(planCtx, txn, physPlan, recv, evalCtx, nil /* finishedSetupFn */)
  1009  }
  1010  
  1011  // PlanAndRunCascadesAndChecks runs any cascade and check queries.
  1012  //
  1013  // Because cascades can themselves generate more cascades or check queries, this
  1014  // method can append to plan.cascades and plan.checkPlans (and all these plans
  1015  // must be closed later).
  1016  //
  1017  // Returns false if an error was encountered and sets that error in the provided
  1018  // receiver.
  1019  func (dsp *DistSQLPlanner) PlanAndRunCascadesAndChecks(
  1020  	ctx context.Context,
  1021  	planner *planner,
  1022  	evalCtxFactory func() *extendedEvalContext,
  1023  	plan *planComponents,
  1024  	recv *DistSQLReceiver,
  1025  	maybeDistribute bool,
  1026  ) bool {
  1027  	if len(plan.cascades) == 0 && len(plan.checkPlans) == 0 {
  1028  		return false
  1029  	}
  1030  
  1031  	prevSteppingMode := planner.Txn().ConfigureStepping(ctx, kv.SteppingEnabled)
  1032  	defer func() { _ = planner.Txn().ConfigureStepping(ctx, prevSteppingMode) }()
  1033  
  1034  	// We treat plan.cascades as a queue.
  1035  	for i := 0; i < len(plan.cascades); i++ {
  1036  		// The original bufferNode is stored in c.Buffer; we can refer to it
  1037  		// directly.
  1038  		// TODO(radu): this requires keeping all previous plans "alive" until the
  1039  		// very end. We may want to make copies of the buffer nodes and clean up
  1040  		// everything else.
  1041  		buf := plan.cascades[i].Buffer.(*bufferNode)
  1042  		if buf.bufferedRows.Len() == 0 {
  1043  			// No rows were actually modified.
  1044  			continue
  1045  		}
  1046  
  1047  		log.VEventf(ctx, 1, "executing cascade for constraint %s", plan.cascades[i].FKName)
  1048  
  1049  		// We place a sequence point before every cascade, so
  1050  		// that each subsequent cascade can observe the writes
  1051  		// by the previous step.
  1052  		// TODO(radu): the cascades themselves can have more cascades; if any of
  1053  		// those fall back to legacy cascades code, it will disable stepping. So we
  1054  		// have to reenable stepping each time.
  1055  		_ = planner.Txn().ConfigureStepping(ctx, kv.SteppingEnabled)
  1056  		if err := planner.Txn().Step(ctx); err != nil {
  1057  			recv.SetError(err)
  1058  			return false
  1059  		}
  1060  
  1061  		evalCtx := evalCtxFactory()
  1062  		execFactory := newExecFactory(planner)
  1063  		// The cascading query is allowed to autocommit only if it is the last
  1064  		// cascade and there are no check queries to run.
  1065  		if len(plan.checkPlans) > 0 || i < len(plan.cascades)-1 {
  1066  			execFactory.disableAutoCommit()
  1067  		}
  1068  		cascadePlan, err := plan.cascades[i].PlanFn(
  1069  			ctx, &planner.semaCtx, &evalCtx.EvalContext, execFactory, buf, buf.bufferedRows.Len(),
  1070  		)
  1071  		if err != nil {
  1072  			recv.SetError(err)
  1073  			return false
  1074  		}
  1075  		cp := cascadePlan.(*planTop)
  1076  		plan.cascades[i].plan = cp.main
  1077  		if len(cp.subqueryPlans) > 0 {
  1078  			recv.SetError(errors.AssertionFailedf("cascades should not have subqueries"))
  1079  			return false
  1080  		}
  1081  
  1082  		// Queue any new cascades.
  1083  		if len(cp.cascades) > 0 {
  1084  			plan.cascades = append(plan.cascades, cp.cascades...)
  1085  		}
  1086  
  1087  		// Collect any new checks.
  1088  		if len(cp.checkPlans) > 0 {
  1089  			plan.checkPlans = append(plan.checkPlans, cp.checkPlans...)
  1090  		}
  1091  
  1092  		// In cyclical reference situations, the number of cascading operations can
  1093  		// be arbitrarily large. To avoid OOM, we enforce a limit. This is also a
  1094  		// safeguard in case we have a bug that results in an infinite cascade loop.
  1095  		if limit := evalCtx.SessionData.OptimizerFKCascadesLimit; len(plan.cascades) > limit {
  1096  			telemetry.Inc(sqltelemetry.CascadesLimitReached)
  1097  			err := pgerror.Newf(pgcode.TriggeredActionException, "cascades limit (%d) reached", limit)
  1098  			recv.SetError(err)
  1099  			return false
  1100  		}
  1101  
  1102  		if err := dsp.planAndRunPostquery(
  1103  			ctx,
  1104  			cp.main,
  1105  			planner,
  1106  			evalCtx,
  1107  			recv,
  1108  			maybeDistribute,
  1109  		); err != nil {
  1110  			recv.SetError(err)
  1111  			return false
  1112  		}
  1113  	}
  1114  
  1115  	if len(plan.checkPlans) == 0 {
  1116  		return true
  1117  	}
  1118  
  1119  	// We place a sequence point before the checks, so that they observe the
  1120  	// writes of the main query and/or any cascades.
  1121  	// TODO(radu): the cascades themselves can have more cascades; if any of
  1122  	// those fall back to legacy cascades code, it will disable stepping. So we
  1123  	// have to reenable stepping each time.
  1124  	_ = planner.Txn().ConfigureStepping(ctx, kv.SteppingEnabled)
  1125  	if err := planner.Txn().Step(ctx); err != nil {
  1126  		recv.SetError(err)
  1127  		return false
  1128  	}
  1129  
  1130  	for i := range plan.checkPlans {
  1131  		log.VEventf(ctx, 1, "executing check query %d out of %d", i+1, len(plan.checkPlans))
  1132  		if err := dsp.planAndRunPostquery(
  1133  			ctx,
  1134  			plan.checkPlans[i].plan,
  1135  			planner,
  1136  			evalCtxFactory(),
  1137  			recv,
  1138  			maybeDistribute,
  1139  		); err != nil {
  1140  			recv.SetError(err)
  1141  			return false
  1142  		}
  1143  	}
  1144  
  1145  	return true
  1146  }
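
// A minimal sketch of the queue discipline used for plan.cascades above:
// iterate by index while appending newly generated work, and stop once a
// limit is exceeded to guard against cyclic references. All names and the
// limit are illustrative.

package main

import (
	"errors"
	"fmt"
)

func runCascades(queue []string, expand func(string) []string, limit int) error {
	for i := 0; i < len(queue); i++ {
		// Running one cascade may enqueue more cascades.
		queue = append(queue, expand(queue[i])...)
		if len(queue) > limit {
			return errors.New("cascades limit reached")
		}
		fmt.Println("ran cascade", queue[i])
	}
	return nil
}

func main() {
	expand := func(name string) []string {
		if name == "parent_fk" {
			return []string{"child_fk"}
		}
		return nil
	}
	fmt.Println(runCascades([]string{"parent_fk"}, expand, 10))
}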
  1147  
  1148  // planAndRunPostquery runs a cascade or check query.
  1149  func (dsp *DistSQLPlanner) planAndRunPostquery(
  1150  	ctx context.Context,
  1151  	postqueryPlan planMaybePhysical,
  1152  	planner *planner,
  1153  	evalCtx *extendedEvalContext,
  1154  	recv *DistSQLReceiver,
  1155  	maybeDistribute bool,
  1156  ) error {
  1157  	postqueryMonitor := mon.MakeMonitor(
  1158  		"postquery",
  1159  		mon.MemoryResource,
  1160  		dsp.distSQLSrv.Metrics.CurBytesCount,
  1161  		dsp.distSQLSrv.Metrics.MaxBytesHist,
  1162  		-1, /* use default block size */
  1163  		noteworthyMemoryUsageBytes,
  1164  		dsp.distSQLSrv.Settings,
  1165  	)
  1166  	postqueryMonitor.Start(ctx, evalCtx.Mon, mon.BoundAccount{})
  1167  	defer postqueryMonitor.Stop(ctx)
  1168  
  1169  	postqueryMemAccount := postqueryMonitor.MakeBoundAccount()
  1170  	defer postqueryMemAccount.Close(ctx)
  1171  
  1172  	var distributePostquery bool
  1173  	if maybeDistribute {
  1174  		distributePostquery = willDistributePlan(
  1175  			ctx, planner.execCfg.NodeID, planner.SessionData().DistSQLMode, postqueryPlan,
  1176  		)
  1177  	}
  1178  	postqueryPlanCtx := dsp.NewPlanningCtx(ctx, evalCtx, planner.txn, distributePostquery)
  1179  	postqueryPlanCtx.planner = planner
  1180  	postqueryPlanCtx.stmtType = tree.Rows
  1181  	postqueryPlanCtx.ignoreClose = true
  1182  	if planner.collectBundle {
  1183  		postqueryPlanCtx.saveDiagram = func(diagram execinfrapb.FlowDiagram) {
  1184  			planner.curPlan.distSQLDiagrams = append(planner.curPlan.distSQLDiagrams, diagram)
  1185  		}
  1186  	}
  1187  
  1188  	postqueryPhysPlan, err := dsp.createPhysPlan(postqueryPlanCtx, postqueryPlan)
  1189  	if err != nil {
  1190  		return err
  1191  	}
  1192  	dsp.FinalizePlan(postqueryPlanCtx, postqueryPhysPlan)
  1193  
  1194  	postqueryRecv := recv.clone()
  1195  	// TODO(yuzefovich): at the moment, errOnlyResultWriter is sufficient here,
  1196  	// but it may not be the case when we support cascades through the optimizer.
  1197  	postqueryRecv.resultWriter = &errOnlyResultWriter{}
  1198  	dsp.Run(postqueryPlanCtx, planner.txn, postqueryPhysPlan, postqueryRecv, evalCtx, nil /* finishedSetupFn */)()
  1199  	if postqueryRecv.commErr != nil {
  1200  		return postqueryRecv.commErr
  1201  	}
  1202  	return postqueryRecv.resultWriter.Err()
  1203  }