github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/distsql_running.go

// Copyright 2016 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package sql

import (
	"context"
	"fmt"
	"math"
	"sync"
	"sync/atomic"

	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/kv/kvclient/kvcoord"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/rpc"
	"github.com/cockroachdb/cockroach/pkg/rpc/nodedialer"
	"github.com/cockroachdb/cockroach/pkg/server/telemetry"
	"github.com/cockroachdb/cockroach/pkg/sql/colflow"
	"github.com/cockroachdb/cockroach/pkg/sql/distsql"
	"github.com/cockroachdb/cockroach/pkg/sql/execinfra"
	"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
	"github.com/cockroachdb/cockroach/pkg/sql/flowinfra"
	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
	"github.com/cockroachdb/cockroach/pkg/sql/physicalplan"
	"github.com/cockroachdb/cockroach/pkg/sql/rowcontainer"
	"github.com/cockroachdb/cockroach/pkg/sql/rowexec"
	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
	"github.com/cockroachdb/cockroach/pkg/sql/sessiondata"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
	"github.com/cockroachdb/cockroach/pkg/sql/sqltelemetry"
	"github.com/cockroachdb/cockroach/pkg/sql/types"
	"github.com/cockroachdb/cockroach/pkg/util/errorutil"
	"github.com/cockroachdb/cockroach/pkg/util/errorutil/unimplemented"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/mon"
	"github.com/cockroachdb/cockroach/pkg/util/tracing"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
	"github.com/cockroachdb/errors"
	opentracing "github.com/opentracing/opentracing-go"
)

// To allow queries to send out flow RPCs in parallel, we use a pool of workers
// that can issue the RPCs on behalf of the running code. The pool is shared by
// multiple queries.
const numRunners = 16

const clientRejectedMsg string = "client rejected when attempting to run DistSQL plan"

// runnerRequest is the request that is sent (via a channel) to a worker.
type runnerRequest struct {
	ctx        context.Context
	nodeDialer *nodedialer.Dialer
	flowReq    *execinfrapb.SetupFlowRequest
	nodeID     roachpb.NodeID
	resultChan chan<- runnerResult
}

// runnerResult is returned by a worker (via a channel) for each received
// request.
type runnerResult struct {
	nodeID roachpb.NodeID
	err    error
}

func (req runnerRequest) run() {
	res := runnerResult{nodeID: req.nodeID}

	conn, err := req.nodeDialer.Dial(req.ctx, req.nodeID, rpc.DefaultClass)
	if err != nil {
		res.err = err
	} else {
		client := execinfrapb.NewDistSQLClient(conn)
		// TODO(radu): do we want a timeout here?
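		// Note that a failure can come either from the RPC itself or from the
		// error field in the SetupFlow response; both end up in res.err and are
		// reported back on req.resultChan below.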
		resp, err := client.SetupFlow(req.ctx, req.flowReq)
		if err != nil {
			res.err = err
		} else {
			res.err = resp.Error.ErrorDetail(req.ctx)
		}
	}
	req.resultChan <- res
}

func (dsp *DistSQLPlanner) initRunners() {
	// This channel has to be unbuffered because we want to only be able to send
	// requests if a worker is actually there to receive them.
	dsp.runnerChan = make(chan runnerRequest)
	for i := 0; i < numRunners; i++ {
		dsp.stopper.RunWorker(context.TODO(), func(context.Context) {
			runnerChan := dsp.runnerChan
			stopChan := dsp.stopper.ShouldStop()
			for {
				select {
				case req := <-runnerChan:
					req.run()

				case <-stopChan:
					return
				}
			}
		})
	}
}

// setupFlows sets up all the flows specified in flows using the provided state.
// It will first attempt to set up all remote flows using the dsp workers if
// available or sequentially if not, and then finally set up the gateway flow,
// whose output is the DistSQLReceiver provided. This flow is then returned to
// be run.
func (dsp *DistSQLPlanner) setupFlows(
	ctx context.Context,
	evalCtx *extendedEvalContext,
	leafInputState *roachpb.LeafTxnInputState,
	flows map[roachpb.NodeID]*execinfrapb.FlowSpec,
	recv *DistSQLReceiver,
	localState distsql.LocalState,
	vectorizeThresholdMet bool,
) (context.Context, flowinfra.Flow, error) {
	thisNodeID := dsp.nodeDesc.NodeID
	_, ok := flows[thisNodeID]
	if !ok {
		return nil, nil, errors.AssertionFailedf("missing gateway flow")
	}
	if localState.IsLocal && len(flows) != 1 {
		return nil, nil, errors.AssertionFailedf("IsLocal set but there's multiple flows")
	}

	evalCtxProto := execinfrapb.MakeEvalContext(&evalCtx.EvalContext)
	setupReq := execinfrapb.SetupFlowRequest{
		LeafTxnInputState: leafInputState,
		Version:           execinfra.Version,
		EvalContext:       evalCtxProto,
		TraceKV:           evalCtx.Tracing.KVTracingEnabled(),
	}

	// Start all the flows except the flow on this node (there is always a flow on
	// this node).
	var resultChan chan runnerResult
	if len(flows) > 1 {
		resultChan = make(chan runnerResult, len(flows)-1)
	}

	if evalCtx.SessionData.VectorizeMode != sessiondata.VectorizeOff {
		if !vectorizeThresholdMet && (evalCtx.SessionData.VectorizeMode == sessiondata.Vectorize201Auto || evalCtx.SessionData.VectorizeMode == sessiondata.VectorizeOn) {
			// Vectorization is not justified for this flow because the expected
			// amount of data is too small and the overhead of pre-allocating data
			// structures needed for the vectorized engine is expected to dominate
			// the execution time.
			setupReq.EvalContext.Vectorize = int32(sessiondata.VectorizeOff)
		} else {
			fuseOpt := flowinfra.FuseNormally
			if localState.IsLocal {
				fuseOpt = flowinfra.FuseAggressively
			}
			// Now we check to see whether or not to even try vectorizing the flow.
			// The goal here is to determine up front whether all of the flows can be
			// vectorized. If any of them can't, turn off the setting.
			// TODO(yuzefovich): this is a safe but quite inefficient way of setting
			// up vectorized flows since the flows will effectively be planned twice.
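			// Dry-run the vectorized planning for every flow spec in the plan: if
			// any of them cannot be vectorized, fall back to row-based execution
			// for the whole request by overriding Vectorize in the setup request.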
			for _, spec := range flows {
				if _, err := colflow.SupportsVectorized(
					ctx, &execinfra.FlowCtx{
						EvalCtx: &evalCtx.EvalContext,
						Cfg: &execinfra.ServerConfig{
							DiskMonitor:    &mon.BytesMonitor{},
							Settings:       dsp.st,
							ClusterID:      &dsp.rpcCtx.ClusterID,
							VecFDSemaphore: dsp.distSQLSrv.VecFDSemaphore,
						},
						NodeID: evalCtx.NodeID,
					}, spec.Processors, fuseOpt, recv,
				); err != nil {
					// Vectorization attempt failed with an error.
					returnVectorizationSetupError := false
					if evalCtx.SessionData.VectorizeMode == sessiondata.VectorizeExperimentalAlways {
						returnVectorizationSetupError = true
						// If running with VectorizeExperimentalAlways, this check makes sure
						// that we can still run SET statements (mostly to set vectorize to
						// off) and the like.
						if len(spec.Processors) == 1 &&
							spec.Processors[0].Core.LocalPlanNode != nil {
							rsidx := spec.Processors[0].Core.LocalPlanNode.RowSourceIdx
							if rsidx != nil {
								lp := localState.LocalProcs[*rsidx]
								if z, ok := lp.(colflow.VectorizeAlwaysException); ok {
									if z.IsException() {
										returnVectorizationSetupError = false
									}
								}
							}
						}
					}
					log.VEventf(ctx, 1, "failed to vectorize: %s", err)
					if returnVectorizationSetupError {
						return nil, nil, err
					}
					// Vectorization is not supported for this flow, so we override the
					// setting.
					setupReq.EvalContext.Vectorize = int32(sessiondata.VectorizeOff)
					break
				}
			}
		}
	}
	for nodeID, flowSpec := range flows {
		if nodeID == thisNodeID {
			// Skip this node.
			continue
		}
		if !evalCtx.Codec.ForSystemTenant() {
			// A tenant server should never find itself distributing flows.
			// NB: we wouldn't hit this in practice but if we did the actual
			// error would be opaque.
			return nil, nil, errorutil.UnsupportedWithMultiTenancy(47900)
		}
		req := setupReq
		req.Flow = *flowSpec
		runReq := runnerRequest{
			ctx:        ctx,
			nodeDialer: dsp.nodeDialer,
			flowReq:    &req,
			nodeID:     nodeID,
			resultChan: resultChan,
		}
		defer physicalplan.ReleaseSetupFlowRequest(&req)

		// Send out a request to the workers; if no worker is available, run
		// directly.
		select {
		case dsp.runnerChan <- runReq:
		default:
			runReq.run()
		}
	}

	var firstErr error
	// Now wait for all the flows to be scheduled on remote nodes. Note that we
	// are not waiting for the flows themselves to complete.
	for i := 0; i < len(flows)-1; i++ {
		res := <-resultChan
		if firstErr == nil {
			firstErr = res.err
		}
		// TODO(radu): accumulate the flows that we failed to set up and move them
		// into the local flow.
	}
	if firstErr != nil {
		return nil, nil, firstErr
	}

	// Set up the flow on this node.
	localReq := setupReq
	localReq.Flow = *flows[thisNodeID]
	defer physicalplan.ReleaseSetupFlowRequest(&localReq)
	ctx, flow, err := dsp.distSQLSrv.SetupLocalSyncFlow(ctx, evalCtx.Mon, &localReq, recv, localState)
	if err != nil {
		return nil, nil, err
	}

	return ctx, flow, nil
}

// Run executes a physical plan. The plan should have been finalized using
// FinalizePlan.
//
// All errors encountered are reported to the DistSQLReceiver's resultWriter.
// Additionally, if the error is a "communication error" (an error encountered
// while using that resultWriter), the error is also stored in
// DistSQLReceiver.commErr. That can be tested to see if a client session needs
// to be closed.
//
// Args:
// - txn is the transaction in which the plan will run. If nil, the different
// processors are expected to manage their own internal transactions.
// - evalCtx is the evaluation context in which the plan will run. It might be
// mutated.
// - finishedSetupFn, if non-nil, is called synchronously after all the
// processors have successfully started up.
//
// It returns a non-nil (although it can be a noop when an error is
// encountered) cleanup function that must be called in order to release the
// resources.
func (dsp *DistSQLPlanner) Run(
	planCtx *PlanningCtx,
	txn *kv.Txn,
	plan *PhysicalPlan,
	recv *DistSQLReceiver,
	evalCtx *extendedEvalContext,
	finishedSetupFn func(),
) (cleanup func()) {
	ctx := planCtx.ctx

	var (
		localState     distsql.LocalState
		leafInputState *roachpb.LeafTxnInputState
	)
	// NB: putting part of evalCtx in localState means it might be mutated down
	// the line.
	localState.EvalContext = &evalCtx.EvalContext
	localState.Txn = txn
	if planCtx.isLocal {
		localState.IsLocal = true
		localState.LocalProcs = plan.LocalProcessors
	} else if txn != nil {
		// If the plan is not local, we will have to set up leaf txns using the
		// txnCoordMeta.
		tis, err := txn.GetLeafTxnInputStateOrRejectClient(ctx)
		if err != nil {
			log.Infof(ctx, "%s: %s", clientRejectedMsg, err)
			recv.SetError(err)
			return func() {}
		}
		leafInputState = &tis
	}

	flows := plan.GenerateFlowSpecs(dsp.nodeDesc.NodeID /* gateway */)
	if _, ok := flows[dsp.nodeDesc.NodeID]; !ok {
		recv.SetError(errors.Errorf("expected to find gateway flow"))
		return func() {}
	}

	if planCtx.saveDiagram != nil {
		// Local flows might not have the UUID field set. We need it to be set to
		// distinguish statistics for processors in subqueries vs the main query vs
		// postqueries.
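		// This only applies to purely local plans, i.e. a single flow that runs
		// on the gateway; the range loop below visits that one map entry.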
		if len(flows) == 1 {
			for _, f := range flows {
				if f.FlowID == (execinfrapb.FlowID{}) {
					f.FlowID.UUID = uuid.MakeV4()
				}
			}
		}
		log.VEvent(ctx, 1, "creating plan diagram")
		var stmtStr string
		if planCtx.planner != nil && planCtx.planner.stmt != nil {
			stmtStr = planCtx.planner.stmt.String()
		}
		diagram, err := execinfrapb.GeneratePlanDiagram(
			stmtStr, flows, planCtx.saveDiagramShowInputTypes,
		)
		if err != nil {
			recv.SetError(err)
			return func() {}
		}
		planCtx.saveDiagram(diagram)
	}

	if logPlanDiagram {
		log.VEvent(ctx, 1, "creating plan diagram for logging")
		var stmtStr string
		if planCtx.planner != nil && planCtx.planner.stmt != nil {
			stmtStr = planCtx.planner.stmt.String()
		}
		_, url, err := execinfrapb.GeneratePlanDiagramURL(stmtStr, flows, false /* showInputTypes */)
		if err != nil {
			log.Infof(ctx, "Error generating diagram: %s", err)
		} else {
			log.Infof(ctx, "Plan diagram URL:\n%s", url.String())
		}
	}

	log.VEvent(ctx, 1, "running DistSQL plan")

	dsp.distSQLSrv.ServerConfig.Metrics.QueryStart()
	defer dsp.distSQLSrv.ServerConfig.Metrics.QueryStop()

	recv.outputTypes = plan.ResultTypes
	recv.resultToStreamColMap = plan.PlanToStreamColMap

	vectorizedThresholdMet := plan.MaxEstimatedRowCount >= evalCtx.SessionData.VectorizeRowCountThreshold

	if len(flows) == 1 {
		// We ended up planning everything locally, regardless of whether we
		// intended to distribute or not.
		localState.IsLocal = true
	}

	ctx, flow, err := dsp.setupFlows(ctx, evalCtx, leafInputState, flows, recv, localState, vectorizedThresholdMet)
	if err != nil {
		recv.SetError(err)
		return func() {}
	}

	if finishedSetupFn != nil {
		finishedSetupFn()
	}

	// Check that flows that were forced to be planned locally also have no concurrency.
	// This is important, since these flows are forced to use the RootTxn (since
	// they might have mutations), and the RootTxn does not permit concurrency.
	// For such flows, we were supposed to have fused everything.
	if txn != nil && planCtx.isLocal && flow.ConcurrentExecution() {
		recv.SetError(errors.AssertionFailedf(
			"unexpected concurrency for a flow that was forced to be planned locally"))
		return func() {}
	}

	// TODO(radu): this should go through the flow scheduler.
	if err := flow.Run(ctx, func() {}); err != nil {
		log.Fatalf(ctx, "unexpected error from syncFlow.Start(): %s "+
			"The error should have gone to the consumer.", err)
	}

	// TODO(yuzefovich): it feels like this closing should happen after
	// PlanAndRun. We should refactor this and get rid of the ignoreClose field.
	if planCtx.planner != nil && !planCtx.ignoreClose {
		// planCtx can change before the cleanup function is executed, so we make
		// a copy of the planner and bind it to the function.
		curPlan := &planCtx.planner.curPlan
		return func() {
			// We need to close the planNode tree we translated into a DistSQL plan
			// before flow.Cleanup, which closes memory accounts that expect to be
			// emptied.
			curPlan.execErr = recv.resultWriter.Err()
			curPlan.close(ctx)
			flow.Cleanup(ctx)
		}
	}

	// ignoreClose is set to true, meaning that someone else will handle the
	// closing of the current plan, so we simply clean up the flow.
	return func() {
		flow.Cleanup(ctx)
	}
}

// DistSQLReceiver is a RowReceiver that writes results to a rowResultWriter.
// This is where the DistSQL execution meets the SQL Session - the RowContainer
// comes from a client Session.
//
// DistSQLReceiver also updates the RangeDescriptorCache and the LeaseHolderCache
// in response to DistSQL metadata about misplanned ranges.
type DistSQLReceiver struct {
	ctx context.Context

	// resultWriter is the interface which we send results to.
	resultWriter rowResultWriter

	stmtType tree.StatementType

	// outputTypes are the types of the result columns produced by the plan.
	outputTypes []*types.T

	// resultToStreamColMap maps result columns to columns in the rowexec results
	// stream.
	resultToStreamColMap []int

	// noColsRequired indicates that the caller is only interested in the
	// existence of a single row. Used by subqueries in EXISTS mode.
	noColsRequired bool

	// discardRows is set when we want to discard rows (for testing/benchmarks).
	// See EXECUTE .. DISCARD ROWS.
	discardRows bool

	// commErr keeps track of the error received from interacting with the
	// resultWriter. This represents a "communication error" and as such is unlike
	// query execution errors: when the DistSQLReceiver is used within a SQL
	// session, such errors mean that we have to bail on the session.
	// Query execution errors are reported to the resultWriter. For some clients'
	// convenience, communication errors are also reported to the resultWriter.
	//
	// Once set, no more rows are accepted.
	commErr error

	row    tree.Datums
	status execinfra.ConsumerStatus
	alloc  sqlbase.DatumAlloc
	closed bool

	rangeCache *kvcoord.RangeDescriptorCache
	leaseCache *kvcoord.LeaseHolderCache
	tracing    *SessionTracing
	cleanup    func()

	// The transaction in which the flow producing data for this
	// receiver runs. The DistSQLReceiver updates the transaction in
	// response to RetryableTxnError's and when distributed processors
	// pass back LeafTxnFinalState objects via ProducerMetas. Nil if no
	// transaction should be updated on errors (i.e. if the flow overall
	// doesn't run in a transaction).
	txn *kv.Txn

	// A handler for clock signals arriving from remote nodes. This should update
	// this node's clock.
	updateClock func(observedTs hlc.Timestamp)

	// bytesRead and rowsRead track the corresponding metrics while executing the
	// statement.
	bytesRead int64
	rowsRead  int64

	expectedRowsRead int64
	progressAtomic   *uint64
}

// rowResultWriter is a subset of CommandResult to be used with the
// DistSQLReceiver. It's implemented by RowResultWriter.
type rowResultWriter interface {
	// AddRow writes a result row.
	// Note that the caller owns the row slice and might reuse it.
	AddRow(ctx context.Context, row tree.Datums) error
	IncrementRowsAffected(n int)
	SetError(error)
	Err() error
}

type metadataResultWriter interface {
	AddMeta(ctx context.Context, meta *execinfrapb.ProducerMetadata)
}

type metadataCallbackWriter struct {
	rowResultWriter
	fn func(ctx context.Context, meta *execinfrapb.ProducerMetadata) error
}

func (w *metadataCallbackWriter) AddMeta(ctx context.Context, meta *execinfrapb.ProducerMetadata) {
	if err := w.fn(ctx, meta); err != nil {
		w.SetError(err)
	}
}

// errOnlyResultWriter is a rowResultWriter that only supports receiving an
// error. All other functions that deal with producing results panic.
type errOnlyResultWriter struct {
	err error
}

var _ rowResultWriter = &errOnlyResultWriter{}

func (w *errOnlyResultWriter) SetError(err error) {
	w.err = err
}
func (w *errOnlyResultWriter) Err() error {
	return w.err
}

func (w *errOnlyResultWriter) AddRow(ctx context.Context, row tree.Datums) error {
	panic("AddRow not supported by errOnlyResultWriter")
}
func (w *errOnlyResultWriter) IncrementRowsAffected(n int) {
	panic("IncrementRowsAffected not supported by errOnlyResultWriter")
}

var _ execinfra.RowReceiver = &DistSQLReceiver{}

var receiverSyncPool = sync.Pool{
	New: func() interface{} {
		return &DistSQLReceiver{}
	},
}

// MakeDistSQLReceiver creates a DistSQLReceiver.
//
// ctx is the Context that the receiver will use throughout its
// lifetime. resultWriter is the container where the results will be
// stored. If only the row count is needed, this can be nil.
//
// txn is the transaction in which the producer flow runs; it will be updated
// on errors. Nil if the flow overall doesn't run in a transaction.
func MakeDistSQLReceiver(
	ctx context.Context,
	resultWriter rowResultWriter,
	stmtType tree.StatementType,
	rangeCache *kvcoord.RangeDescriptorCache,
	leaseCache *kvcoord.LeaseHolderCache,
	txn *kv.Txn,
	updateClock func(observedTs hlc.Timestamp),
	tracing *SessionTracing,
) *DistSQLReceiver {
	consumeCtx, cleanup := tracing.TraceExecConsume(ctx)
	r := receiverSyncPool.Get().(*DistSQLReceiver)
	*r = DistSQLReceiver{
		ctx:          consumeCtx,
		cleanup:      cleanup,
		resultWriter: resultWriter,
		rangeCache:   rangeCache,
		leaseCache:   leaseCache,
		txn:          txn,
		updateClock:  updateClock,
		stmtType:     stmtType,
		tracing:      tracing,
	}
	return r
}

// Release releases this DistSQLReceiver back to the pool.
func (r *DistSQLReceiver) Release() {
	*r = DistSQLReceiver{}
	receiverSyncPool.Put(r)
}

// clone clones the receiver for running subqueries. Not all fields are cloned,
// only those required for running subqueries.
func (r *DistSQLReceiver) clone() *DistSQLReceiver {
	ret := receiverSyncPool.Get().(*DistSQLReceiver)
	*ret = DistSQLReceiver{
		ctx:         r.ctx,
		cleanup:     func() {},
		rangeCache:  r.rangeCache,
		leaseCache:  r.leaseCache,
		txn:         r.txn,
		updateClock: r.updateClock,
		stmtType:    tree.Rows,
		tracing:     r.tracing,
	}
	return ret
}

// SetError provides a convenient way for a client to pass in an error, thus
// pretending that a query execution error happened. The error is passed along
// to the resultWriter.
func (r *DistSQLReceiver) SetError(err error) {
	r.resultWriter.SetError(err)
}

// Push is part of the RowReceiver interface.
func (r *DistSQLReceiver) Push(
	row sqlbase.EncDatumRow, meta *execinfrapb.ProducerMetadata,
) execinfra.ConsumerStatus {
	if meta != nil {
		if meta.LeafTxnFinalState != nil {
			if r.txn != nil {
				if r.txn.ID() == meta.LeafTxnFinalState.Txn.ID {
					if err := r.txn.UpdateRootWithLeafFinalState(r.ctx, meta.LeafTxnFinalState); err != nil {
						r.resultWriter.SetError(err)
					}
				}
			} else {
				r.resultWriter.SetError(
					errors.Errorf("received a leaf final state (%s); but have no root", meta.LeafTxnFinalState))
			}
		}
		if meta.Err != nil {
			// Check if the error we just received should take precedence over a
			// previous error (if any).
			if roachpb.ErrPriority(meta.Err) > roachpb.ErrPriority(r.resultWriter.Err()) {
				if r.txn != nil {
					if retryErr := (*roachpb.UnhandledRetryableError)(nil); errors.As(meta.Err, &retryErr) {
						// Update the txn in response to remote errors. In the non-DistSQL
						// world, the TxnCoordSender handles "unhandled" retryable errors,
						// but this one is coming from a distributed SQL node, which has
						// left the handling up to the root transaction.
						meta.Err = r.txn.UpdateStateOnRemoteRetryableErr(r.ctx, &retryErr.PErr)
						// Update the clock with information from the error. On non-DistSQL
						// code paths, the DistSender does this.
						// TODO(andrei): We don't propagate clock signals on success cases
						// through DistSQL; we should. We also don't propagate them through
						// non-retryable errors; we also should.
						r.updateClock(retryErr.PErr.Now)
					}
				}
				r.resultWriter.SetError(meta.Err)
			}
		}
		if len(meta.Ranges) > 0 {
			r.updateCaches(r.ctx, meta.Ranges)
		}
		if len(meta.TraceData) > 0 {
			span := opentracing.SpanFromContext(r.ctx)
			if span == nil {
				r.resultWriter.SetError(
					errors.New("trying to ingest remote spans but there is no recording span set up"))
			} else if err := tracing.ImportRemoteSpans(span, meta.TraceData); err != nil {
				r.resultWriter.SetError(errors.Errorf("error ingesting remote spans: %s", err))
			}
		}
		if meta.Metrics != nil {
			r.bytesRead += meta.Metrics.BytesRead
			r.rowsRead += meta.Metrics.RowsRead
			if r.progressAtomic != nil && r.expectedRowsRead != 0 {
				progress := float64(r.rowsRead) / float64(r.expectedRowsRead)
				atomic.StoreUint64(r.progressAtomic, math.Float64bits(progress))
			}
			meta.Metrics.Release()
			meta.Release()
		}
		if metaWriter, ok := r.resultWriter.(metadataResultWriter); ok {
			metaWriter.AddMeta(r.ctx, meta)
		}
		return r.status
	}
	if r.resultWriter.Err() == nil && r.ctx.Err() != nil {
		r.resultWriter.SetError(r.ctx.Err())
	}
	if r.resultWriter.Err() != nil {
		// TODO(andrei): We should drain here if we weren't canceled.
		return execinfra.ConsumerClosed
	}
	if r.status != execinfra.NeedMoreRows {
		return r.status
	}

	if r.stmtType != tree.Rows {
		// We only need the row count. planNodeToRowSource is set up to handle
		// ensuring that the last stage in the pipeline will return a single-column
		// row with the row count in it, so just grab that and exit.
		r.resultWriter.IncrementRowsAffected(int(tree.MustBeDInt(row[0].Datum)))
		return r.status
	}

	if r.discardRows {
		// Discard rows.
		return r.status
	}

	// If no columns are needed by the output, the consumer is only looking for
	// whether a single row is pushed or not, so the contents do not matter, and
	// planNodeToRowSource is not set up to handle decoding the row.
	if r.noColsRequired {
		r.row = []tree.Datum{}
		r.status = execinfra.ConsumerClosed
	} else {
		if r.row == nil {
			r.row = make(tree.Datums, len(r.resultToStreamColMap))
		}
		for i, resIdx := range r.resultToStreamColMap {
			err := row[resIdx].EnsureDecoded(r.outputTypes[resIdx], &r.alloc)
			if err != nil {
				r.resultWriter.SetError(err)
				r.status = execinfra.ConsumerClosed
				return r.status
			}
			r.row[i] = row[resIdx].Datum
		}
	}
	r.tracing.TraceExecRowsResult(r.ctx, r.row)
	// Note that AddRow accounts for the memory used by the Datums.
	if commErr := r.resultWriter.AddRow(r.ctx, r.row); commErr != nil {
		// ErrLimitedResultClosed is not a real error; it is a
		// signal to stop distsql and return success to the client.
		if !errors.Is(commErr, ErrLimitedResultClosed) {
			// Set the error on the resultWriter too, for the convenience of some of the
			// clients. If clients don't care to differentiate between communication
			// errors and query execution errors, they can simply inspect
			// resultWriter.Err(). Also, this function itself doesn't care about the
			// distinction and just uses resultWriter.Err() to see if we're still
			// accepting results.
			r.resultWriter.SetError(commErr)

			// We don't need to shut down the connection
			// if there's a portal-related error. This is
			// definitely a layering violation, but is part
			// of some accepted technical debt (see comments on
			// sql/pgwire.limitedCommandResult.moreResultsNeeded).
			// Instead of changing the signature of AddRow, we have
			// a sentinel error that is handled specially here.
			if !errors.Is(commErr, ErrLimitedResultNotSupported) {
				r.commErr = commErr
			}
		}
		// TODO(andrei): We should drain here. Metadata from this query would be
		// useful, particularly as it was likely a large query (since AddRow()
		// above failed, presumably with an out-of-memory error).
		r.status = execinfra.ConsumerClosed
	}
	return r.status
}

var (
	// ErrLimitedResultNotSupported is an error produced by pgwire
	// indicating an unsupported feature of row count limits was attempted.
	ErrLimitedResultNotSupported = unimplemented.NewWithIssue(40195, "multiple active portals not supported")
	// ErrLimitedResultClosed is a sentinel error produced by pgwire
	// indicating the portal should be closed without error.
	ErrLimitedResultClosed = errors.New("row count limit closed")
)

// ProducerDone is part of the RowReceiver interface.
func (r *DistSQLReceiver) ProducerDone() {
	if r.closed {
		panic("double close")
	}
	r.closed = true
	r.cleanup()
}

// Types is part of the RowReceiver interface.
func (r *DistSQLReceiver) Types() []*types.T {
	return r.outputTypes
}

// updateCaches takes information about some ranges that were mis-planned and
// updates the range descriptor and lease-holder caches accordingly.
//
// TODO(andrei): updating these caches is not perfect: we can clobber newer
// information that someone else has populated because there's no timing info
// anywhere. We also may fail to remove stale info from the LeaseHolderCache if
// the ids of the ranges that we get are different than the ids in that cache.
func (r *DistSQLReceiver) updateCaches(ctx context.Context, ranges []roachpb.RangeInfo) {
	// Update the RangeDescriptorCache.
	rngDescs := make([]roachpb.RangeDescriptor, len(ranges))
	for i, ri := range ranges {
		rngDescs[i] = ri.Desc
	}
	r.rangeCache.InsertRangeDescriptors(ctx, rngDescs...)

	// Update the LeaseHolderCache.
	for _, ri := range ranges {
		r.leaseCache.Update(ctx, ri.Desc.RangeID, ri.Lease.Replica.StoreID)
	}
}

// PlanAndRunSubqueries returns false if an error was encountered and sets that
// error in the provided receiver.
func (dsp *DistSQLPlanner) PlanAndRunSubqueries(
	ctx context.Context,
	planner *planner,
	evalCtxFactory func() *extendedEvalContext,
	subqueryPlans []subquery,
	recv *DistSQLReceiver,
	maybeDistribute bool,
) bool {
	for planIdx, subqueryPlan := range subqueryPlans {
		if err := dsp.planAndRunSubquery(
			ctx,
			planIdx,
			subqueryPlan,
			planner,
			evalCtxFactory(),
			subqueryPlans,
			recv,
			maybeDistribute,
		); err != nil {
			recv.SetError(err)
			return false
		}
	}

	return true
}

func (dsp *DistSQLPlanner) planAndRunSubquery(
	ctx context.Context,
	planIdx int,
	subqueryPlan subquery,
	planner *planner,
	evalCtx *extendedEvalContext,
	subqueryPlans []subquery,
	recv *DistSQLReceiver,
	maybeDistribute bool,
) error {
	subqueryMonitor := mon.MakeMonitor(
		"subquery",
		mon.MemoryResource,
		dsp.distSQLSrv.Metrics.CurBytesCount,
		dsp.distSQLSrv.Metrics.MaxBytesHist,
		-1, /* use default block size */
		noteworthyMemoryUsageBytes,
		dsp.distSQLSrv.Settings,
	)
	subqueryMonitor.Start(ctx, evalCtx.Mon, mon.BoundAccount{})
	defer subqueryMonitor.Stop(ctx)

	subqueryMemAccount := subqueryMonitor.MakeBoundAccount()
	defer subqueryMemAccount.Close(ctx)

	var distributeSubquery bool
	if maybeDistribute {
		distributeSubquery = willDistributePlan(
			ctx, planner.execCfg.NodeID, planner.SessionData().DistSQLMode, subqueryPlan.plan,
		)
	}
	subqueryPlanCtx := dsp.NewPlanningCtx(ctx, evalCtx, planner.txn, distributeSubquery)
	subqueryPlanCtx.planner = planner
	subqueryPlanCtx.stmtType = tree.Rows
	if planner.collectBundle {
		subqueryPlanCtx.saveDiagram = func(diagram execinfrapb.FlowDiagram) {
			planner.curPlan.distSQLDiagrams = append(planner.curPlan.distSQLDiagrams, diagram)
		}
	}
	// Don't close the top-level plan from subqueries - someone else will handle
	// that.
	subqueryPlanCtx.ignoreClose = true
	subqueryPhysPlan, err := dsp.createPhysPlan(subqueryPlanCtx, subqueryPlan.plan)
	if err != nil {
		return err
	}
	dsp.FinalizePlan(subqueryPlanCtx, subqueryPhysPlan)

	// TODO(arjun): #28264: We set up a row container, wrap it in a row
	// receiver, and use it to serialize the results of the subquery. The type
	// of the results stored in the container depends on the type of the subquery.
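	// Clone the receiver so that the subquery gets its own result writer and
	// does not disturb the state of the main query's receiver; the clone's
	// resultWriter is pointed at the row container set up below.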
	subqueryRecv := recv.clone()
	var typ sqlbase.ColTypeInfo
	var rows *rowcontainer.RowContainer
	if subqueryPlan.execMode == rowexec.SubqueryExecModeExists {
		subqueryRecv.noColsRequired = true
		typ = sqlbase.ColTypeInfoFromColTypes([]*types.T{})
	} else {
		// Apply the PlanToStreamColMap projection to the ResultTypes to get the
		// final set of output types for the subquery. The reason this is necessary
		// is that the output schema of a query sometimes contains columns necessary
		// to merge the streams, but that aren't required by the final output of the
		// query. These get projected out, so we need to similarly adjust the
		// expected result types of the subquery here.
		colTypes := make([]*types.T, len(subqueryPhysPlan.PlanToStreamColMap))
		for i, resIdx := range subqueryPhysPlan.PlanToStreamColMap {
			colTypes[i] = subqueryPhysPlan.ResultTypes[resIdx]
		}
		typ = sqlbase.ColTypeInfoFromColTypes(colTypes)
	}
	rows = rowcontainer.NewRowContainer(subqueryMemAccount, typ, 0)
	defer rows.Close(ctx)

	subqueryRowReceiver := NewRowResultWriter(rows)
	subqueryRecv.resultWriter = subqueryRowReceiver
	subqueryPlans[planIdx].started = true
	dsp.Run(subqueryPlanCtx, planner.txn, subqueryPhysPlan, subqueryRecv, evalCtx, nil /* finishedSetupFn */)()
	if subqueryRecv.commErr != nil {
		return subqueryRecv.commErr
	}
	if err := subqueryRowReceiver.Err(); err != nil {
		return err
	}
	switch subqueryPlan.execMode {
	case rowexec.SubqueryExecModeExists:
		// For EXISTS expressions, all we want to know is whether there is at
		// least one row.
		hasRows := rows.Len() != 0
		subqueryPlans[planIdx].result = tree.MakeDBool(tree.DBool(hasRows))
	case rowexec.SubqueryExecModeAllRows, rowexec.SubqueryExecModeAllRowsNormalized:
		var result tree.DTuple
		for rows.Len() > 0 {
			row := rows.At(0)
			rows.PopFirst()
			if row.Len() == 1 {
				// This seems hokey, but if we don't do this then the subquery expands
				// to a tuple of tuples instead of a tuple of values and an expression
				// like "k IN (SELECT foo FROM bar)" will fail because we're comparing
				// a single value against a tuple.
				result.D = append(result.D, row[0])
			} else {
				result.D = append(result.D, &tree.DTuple{D: row})
			}
		}

		if subqueryPlan.execMode == rowexec.SubqueryExecModeAllRowsNormalized {
			result.Normalize(&evalCtx.EvalContext)
		}
		subqueryPlans[planIdx].result = &result
	case rowexec.SubqueryExecModeOneRow:
		switch rows.Len() {
		case 0:
			subqueryPlans[planIdx].result = tree.DNull
		case 1:
			row := rows.At(0)
			switch row.Len() {
			case 1:
				subqueryPlans[planIdx].result = row[0]
			default:
				subqueryPlans[planIdx].result = &tree.DTuple{D: rows.At(0)}
			}
		default:
			return pgerror.Newf(pgcode.CardinalityViolation,
				"more than one row returned by a subquery used as an expression")
		}
	default:
		return fmt.Errorf("unexpected subqueryExecMode: %d", subqueryPlan.execMode)
	}
	return nil
}

// PlanAndRun generates a physical plan from a planNode tree and executes it. It
// assumes that the tree is supported (see CheckSupport).
//
// All errors encountered are reported to the DistSQLReceiver's resultWriter.
// Additionally, if the error is a "communication error" (an error encountered
// while using that resultWriter), the error is also stored in
// DistSQLReceiver.commErr. That can be tested to see if a client session needs
// to be closed.
//
// It returns a non-nil (although it can be a noop when an error is
// encountered) cleanup function that must be called once the planTop AST is no
// longer needed and can be closed. Note that this function also cleans up the
// flow, which is unfortunate but is caused by the sharing of memory monitors
// between planning and execution: cleaning up the flow wants to close the
// monitor, but it cannot do so because the AST needs to live longer and still
// uses the same monitor. That's why, in order to clean up the flow, we need to
// close the AST first, and we can only do that after PlanAndRun returns.
func (dsp *DistSQLPlanner) PlanAndRun(
	ctx context.Context,
	evalCtx *extendedEvalContext,
	planCtx *PlanningCtx,
	txn *kv.Txn,
	plan planMaybePhysical,
	recv *DistSQLReceiver,
) (cleanup func()) {
	log.VEventf(ctx, 1, "creating DistSQL plan with isLocal=%v", planCtx.isLocal)

	physPlan, err := dsp.createPhysPlan(planCtx, plan)
	if err != nil {
		recv.SetError(err)
		return func() {}
	}
	dsp.FinalizePlan(planCtx, physPlan)
	recv.expectedRowsRead = int64(physPlan.TotalEstimatedScannedRows)
	return dsp.Run(planCtx, txn, physPlan, recv, evalCtx, nil /* finishedSetupFn */)
}

// PlanAndRunCascadesAndChecks runs any cascade and check queries.
//
// Because cascades can themselves generate more cascades or check queries, this
// method can append to plan.cascades and plan.checkPlans (and all these plans
// must be closed later).
//
// Returns false if an error was encountered and sets that error in the provided
// receiver.
func (dsp *DistSQLPlanner) PlanAndRunCascadesAndChecks(
	ctx context.Context,
	planner *planner,
	evalCtxFactory func() *extendedEvalContext,
	plan *planComponents,
	recv *DistSQLReceiver,
	maybeDistribute bool,
) bool {
	if len(plan.cascades) == 0 && len(plan.checkPlans) == 0 {
		return false
	}

	prevSteppingMode := planner.Txn().ConfigureStepping(ctx, kv.SteppingEnabled)
	defer func() { _ = planner.Txn().ConfigureStepping(ctx, prevSteppingMode) }()

	// We treat plan.cascades as a queue.
	for i := 0; i < len(plan.cascades); i++ {
		// The original bufferNode is stored in c.Buffer; we can refer to it
		// directly.
		// TODO(radu): this requires keeping all previous plans "alive" until the
		// very end. We may want to make copies of the buffer nodes and clean up
		// everything else.
		buf := plan.cascades[i].Buffer.(*bufferNode)
		if buf.bufferedRows.Len() == 0 {
			// No rows were actually modified.
			continue
		}

		log.VEventf(ctx, 1, "executing cascade for constraint %s", plan.cascades[i].FKName)

		// We place a sequence point before every cascade, so
		// that each subsequent cascade can observe the writes
		// by the previous step.
		// TODO(radu): the cascades themselves can have more cascades; if any of
		// those fall back to legacy cascades code, it will disable stepping. So we
		// have to reenable stepping each time.
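		// ConfigureStepping returns the previous stepping mode; we can ignore it
		// here because prevSteppingMode was already captured above and is
		// restored by the deferred call when this function returns.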
		_ = planner.Txn().ConfigureStepping(ctx, kv.SteppingEnabled)
		if err := planner.Txn().Step(ctx); err != nil {
			recv.SetError(err)
			return false
		}

		evalCtx := evalCtxFactory()
		execFactory := newExecFactory(planner)
		// The cascading query is allowed to autocommit only if it is the last
		// cascade and there are no check queries to run.
		if len(plan.checkPlans) > 0 || i < len(plan.cascades)-1 {
			execFactory.disableAutoCommit()
		}
		cascadePlan, err := plan.cascades[i].PlanFn(
			ctx, &planner.semaCtx, &evalCtx.EvalContext, execFactory, buf, buf.bufferedRows.Len(),
		)
		if err != nil {
			recv.SetError(err)
			return false
		}
		cp := cascadePlan.(*planTop)
		plan.cascades[i].plan = cp.main
		if len(cp.subqueryPlans) > 0 {
			recv.SetError(errors.AssertionFailedf("cascades should not have subqueries"))
			return false
		}

		// Queue any new cascades.
		if len(cp.cascades) > 0 {
			plan.cascades = append(plan.cascades, cp.cascades...)
		}

		// Collect any new checks.
		if len(cp.checkPlans) > 0 {
			plan.checkPlans = append(plan.checkPlans, cp.checkPlans...)
		}

		// In cyclical reference situations, the number of cascading operations can
		// be arbitrarily large. To avoid OOM, we enforce a limit. This is also a
		// safeguard in case we have a bug that results in an infinite cascade loop.
		if limit := evalCtx.SessionData.OptimizerFKCascadesLimit; len(plan.cascades) > limit {
			telemetry.Inc(sqltelemetry.CascadesLimitReached)
			err := pgerror.Newf(pgcode.TriggeredActionException, "cascades limit (%d) reached", limit)
			recv.SetError(err)
			return false
		}

		if err := dsp.planAndRunPostquery(
			ctx,
			cp.main,
			planner,
			evalCtx,
			recv,
			maybeDistribute,
		); err != nil {
			recv.SetError(err)
			return false
		}
	}

	if len(plan.checkPlans) == 0 {
		return true
	}

	// We place a sequence point before the checks, so that they observe the
	// writes of the main query and/or any cascades.
	// TODO(radu): the cascades themselves can have more cascades; if any of
	// those fall back to legacy cascades code, it will disable stepping. So we
	// have to reenable stepping each time.
	_ = planner.Txn().ConfigureStepping(ctx, kv.SteppingEnabled)
	if err := planner.Txn().Step(ctx); err != nil {
		recv.SetError(err)
		return false
	}

	for i := range plan.checkPlans {
		log.VEventf(ctx, 1, "executing check query %d out of %d", i+1, len(plan.checkPlans))
		if err := dsp.planAndRunPostquery(
			ctx,
			plan.checkPlans[i].plan,
			planner,
			evalCtxFactory(),
			recv,
			maybeDistribute,
		); err != nil {
			recv.SetError(err)
			return false
		}
	}

	return true
}

// planAndRunPostquery runs a cascade or check query.
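// Unlike the exported entry points above, it returns errors to its caller
// instead of setting them on recv; a communication error recorded on the
// cloned receiver takes precedence over any error from the result writer.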
func (dsp *DistSQLPlanner) planAndRunPostquery(
	ctx context.Context,
	postqueryPlan planMaybePhysical,
	planner *planner,
	evalCtx *extendedEvalContext,
	recv *DistSQLReceiver,
	maybeDistribute bool,
) error {
	postqueryMonitor := mon.MakeMonitor(
		"postquery",
		mon.MemoryResource,
		dsp.distSQLSrv.Metrics.CurBytesCount,
		dsp.distSQLSrv.Metrics.MaxBytesHist,
		-1, /* use default block size */
		noteworthyMemoryUsageBytes,
		dsp.distSQLSrv.Settings,
	)
	postqueryMonitor.Start(ctx, evalCtx.Mon, mon.BoundAccount{})
	defer postqueryMonitor.Stop(ctx)

	postqueryMemAccount := postqueryMonitor.MakeBoundAccount()
	defer postqueryMemAccount.Close(ctx)

	var distributePostquery bool
	if maybeDistribute {
		distributePostquery = willDistributePlan(
			ctx, planner.execCfg.NodeID, planner.SessionData().DistSQLMode, postqueryPlan,
		)
	}
	postqueryPlanCtx := dsp.NewPlanningCtx(ctx, evalCtx, planner.txn, distributePostquery)
	postqueryPlanCtx.planner = planner
	postqueryPlanCtx.stmtType = tree.Rows
	postqueryPlanCtx.ignoreClose = true
	if planner.collectBundle {
		postqueryPlanCtx.saveDiagram = func(diagram execinfrapb.FlowDiagram) {
			planner.curPlan.distSQLDiagrams = append(planner.curPlan.distSQLDiagrams, diagram)
		}
	}

	postqueryPhysPlan, err := dsp.createPhysPlan(postqueryPlanCtx, postqueryPlan)
	if err != nil {
		return err
	}
	dsp.FinalizePlan(postqueryPlanCtx, postqueryPhysPlan)

	postqueryRecv := recv.clone()
	// TODO(yuzefovich): at the moment, errOnlyResultWriter is sufficient here,
	// but it may not be the case when we support cascades through the optimizer.
	postqueryRecv.resultWriter = &errOnlyResultWriter{}
	dsp.Run(postqueryPlanCtx, planner.txn, postqueryPhysPlan, postqueryRecv, evalCtx, nil /* finishedSetupFn */)()
	if postqueryRecv.commErr != nil {
		return postqueryRecv.commErr
	}
	return postqueryRecv.resultWriter.Err()
}
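
// Usage sketch (illustrative only, not part of the original file): a caller in
// this package would typically build a DistSQLReceiver, run the finalized plan,
// and release the receiver roughly as follows. The execCfg/planner plumbing
// shown here is assumed rather than taken from this file.
//
//	recv := MakeDistSQLReceiver(
//		ctx, resultWriter, stmt.AST.StatementType(),
//		execCfg.RangeDescriptorCache, execCfg.LeaseHolderCache,
//		planner.txn,
//		func(ts hlc.Timestamp) { execCfg.Clock.Update(ts) },
//		planner.ExtendedEvalContext().Tracing,
//	)
//	defer recv.Release()
//	cleanup := dsp.PlanAndRun(ctx, evalCtx, planCtx, planner.txn, planner.curPlan.main, recv)
//	defer cleanup()
//	// PlanAndRun is synchronous, so any execution or communication error is
//	// now available via recv.resultWriter.Err() and recv.commErr.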