github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/physicalplan/physical_plan.go

// Copyright 2017 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

// This file defines structures and basic functionality that is useful when
// building distsql plans. It does not contain the actual physical planning
// code.

package physicalplan

import (
	"fmt"
	"math"

	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/sql/execinfra"
	"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
	"github.com/cockroachdb/cockroach/pkg/sql/types"
	"github.com/cockroachdb/cockroach/pkg/util"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
	"github.com/cockroachdb/errors"
)

// Processor contains the information associated with a processor in a plan.
type Processor struct {
	// Node where the processor must be instantiated.
	Node roachpb.NodeID

	// Spec for the processor; note that the StreamEndpointSpecs in the input
	// synchronizers and output routers are not set until the end of the
	// planning process.
	Spec execinfrapb.ProcessorSpec
}

// ProcessorIdx identifies a processor by its index in PhysicalPlan.Processors.
type ProcessorIdx int

// Stream connects the output router of one processor to an input synchronizer
// of another processor.
type Stream struct {
	// SourceProcessor index (within the same plan).
	SourceProcessor ProcessorIdx

	// SourceRouterSlot identifies the position of this stream among the streams
	// that originate from the same router. This is important when routing by
	// hash, where the order of the streams in the OutputRouterSpec matters.
	SourceRouterSlot int

	// DestProcessor index (within the same plan).
	DestProcessor ProcessorIdx

	// DestInput identifies the input of DestProcessor (some processors have
	// multiple inputs).
	DestInput int
}
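
// For example (hypothetical processor indexes), a hash-distributed stage with
// two buckets wires each slot of a source router to a distinct destination;
// slot k must end up at position k of the router's Streams list:
//
//	// stream for bucket 0 of processor 3's router, feeding input 0 of processor 5
//	s0 := Stream{SourceProcessor: 3, SourceRouterSlot: 0, DestProcessor: 5, DestInput: 0}
//	// stream for bucket 1 of the same router, feeding processor 6
//	s1 := Stream{SourceProcessor: 3, SourceRouterSlot: 1, DestProcessor: 6, DestInput: 0}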

// PhysicalPlan represents a network of processors and streams along with
// information about the results output by this network. The results come from
// unconnected output routers of a subset of processors; all these routers
// output the same kind of data (same schema).
type PhysicalPlan struct {
	// Processors in the plan.
	Processors []Processor

	// LocalProcessors contains all of the planNodeToRowSourceWrappers that were
	// installed in this physical plan to wrap any planNodes that couldn't be
	// properly translated into DistSQL processors. This will be empty if no
	// wrapping had to happen.
	LocalProcessors []execinfra.LocalProcessor

	// LocalProcessorIndexes contains pointers to all of the RowSourceIdx fields
	// of the LocalPlanNodeSpecs that were created. This list is in the same
	// order as LocalProcessors, and is kept up-to-date so that
	// LocalPlanNodeSpecs always have the correct index into the LocalProcessors
	// slice.
	LocalProcessorIndexes []*uint32

	// Streams accumulates the streams in the plan - both local (intra-node) and
	// remote (inter-node); when we have a final plan, the streams are used to
	// generate processor input and output specs (see PopulateEndpoints).
	Streams []Stream

	// ResultRouters identifies the output routers which output the results of
	// the plan. These are the routers to which we have to connect new streams
	// in order to extend the plan.
	//
	// The processors which have these routers are all part of the same "stage":
	// they have the same "schema" and PostProcessSpec.
	//
	// We assume all processors have a single output so we only need the
	// processor index.
	ResultRouters []ProcessorIdx

	// ResultTypes is the schema (column types) of the rows produced by the
	// ResultRouters.
	//
	// This is aliased with InputSyncSpec.ColumnTypes, so it must not be
	// modified in-place during planning.
	ResultTypes []*types.T

	// MergeOrdering is the ordering guarantee for the result streams that must
	// be maintained when the streams eventually merge. The column indexes refer
	// to columns for the rows produced by ResultRouters.
	//
	// Empty when there is a single result router. The reason is that
	// maintaining an ordering sometimes requires adding columns to streams for
	// the sole reason of correctly merging the streams later (see
	// AddProjection); we don't want to pay this cost if we don't have multiple
	// streams to merge.
	MergeOrdering execinfrapb.Ordering

	// Used internally for numbering stages.
	stageCounter int32

	// Used internally to avoid creating flow IDs for local flows. This boolean
	// specifies whether there is more than one node involved in a plan.
	remotePlan bool

	// MaxEstimatedRowCount tracks the maximum estimated row count that a table
	// reader in this plan will output. This information is used to decide
	// whether to use the vectorized execution engine.
	MaxEstimatedRowCount uint64
	// TotalEstimatedScannedRows is the sum of the row count estimate of all the
	// table readers in the plan.
	TotalEstimatedScannedRows uint64
}

// NewStageID creates a stage identifier that can be used in processor specs.
func (p *PhysicalPlan) NewStageID() int32 {
	p.stageCounter++
	return p.stageCounter
}

// AddProcessor adds a processor to a PhysicalPlan and returns the index that
// can be used to refer to that processor.
func (p *PhysicalPlan) AddProcessor(proc Processor) ProcessorIdx {
	idx := ProcessorIdx(len(p.Processors))
	p.Processors = append(p.Processors, proc)
	return idx
}

// SetMergeOrdering sets p.MergeOrdering.
func (p *PhysicalPlan) SetMergeOrdering(o execinfrapb.Ordering) {
	if len(p.ResultRouters) > 1 {
		p.MergeOrdering = o
	} else {
		p.MergeOrdering = execinfrapb.Ordering{}
	}
}
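
// For example (hypothetical plan; gatewayNodeID and valuesSpec are assumed to
// be in scope), a minimal single-processor plan registers one Values processor
// and makes it the initial result router, much like emptyPlan below:
//
//	pIdx := p.AddProcessor(Processor{
//		Node: gatewayNodeID,
//		Spec: execinfrapb.ProcessorSpec{
//			Core:   execinfrapb.ProcessorCoreUnion{Values: &valuesSpec},
//			Output: []execinfrapb.OutputRouterSpec{{Type: execinfrapb.OutputRouterSpec_PASS_THROUGH}},
//		},
//	})
//	p.ResultRouters = []ProcessorIdx{pIdx}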

// AddNoGroupingStage adds a processor for each result router, on the same node
// as the source of the stream; all processors have the same core. This is for
// stages that correspond to logical blocks that don't require any grouping
// (e.g. evaluator, sorting, etc).
func (p *PhysicalPlan) AddNoGroupingStage(
	core execinfrapb.ProcessorCoreUnion,
	post execinfrapb.PostProcessSpec,
	outputTypes []*types.T,
	newOrdering execinfrapb.Ordering,
) {
	p.AddNoGroupingStageWithCoreFunc(
		func(_ int, _ *Processor) execinfrapb.ProcessorCoreUnion { return core },
		post,
		outputTypes,
		newOrdering,
	)
}

// AddNoGroupingStageWithCoreFunc is like AddNoGroupingStage, but creates a
// core spec based on the input processor's spec.
func (p *PhysicalPlan) AddNoGroupingStageWithCoreFunc(
	coreFunc func(int, *Processor) execinfrapb.ProcessorCoreUnion,
	post execinfrapb.PostProcessSpec,
	outputTypes []*types.T,
	newOrdering execinfrapb.Ordering,
) {
	stageID := p.NewStageID()
	for i, resultProc := range p.ResultRouters {
		prevProc := &p.Processors[resultProc]

		proc := Processor{
			Node: prevProc.Node,
			Spec: execinfrapb.ProcessorSpec{
				Input: []execinfrapb.InputSyncSpec{{
					Type:        execinfrapb.InputSyncSpec_UNORDERED,
					ColumnTypes: p.ResultTypes,
				}},
				Core: coreFunc(int(resultProc), prevProc),
				Post: post,
				Output: []execinfrapb.OutputRouterSpec{{
					Type: execinfrapb.OutputRouterSpec_PASS_THROUGH,
				}},
				StageID: stageID,
			},
		}

		pIdx := p.AddProcessor(proc)

		p.Streams = append(p.Streams, Stream{
			SourceProcessor:  resultProc,
			DestProcessor:    pIdx,
			SourceRouterSlot: 0,
			DestInput:        0,
		})

		p.ResultRouters[i] = pIdx
	}
	p.ResultTypes = outputTypes
	p.SetMergeOrdering(newOrdering)
}
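
// For example, AddRendering and AddFilter below use a no-op core to attach a
// fresh PostProcessSpec on top of the last stage without moving any rows:
//
//	p.AddNoGroupingStage(
//		execinfrapb.ProcessorCoreUnion{Noop: &execinfrapb.NoopCoreSpec{}},
//		execinfrapb.PostProcessSpec{},
//		p.ResultTypes,
//		p.MergeOrdering,
//	)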

// MergeResultStreams connects a set of resultRouters to a synchronizer. The
// synchronizer is configured with the provided ordering.
func (p *PhysicalPlan) MergeResultStreams(
	resultRouters []ProcessorIdx,
	sourceRouterSlot int,
	ordering execinfrapb.Ordering,
	destProcessor ProcessorIdx,
	destInput int,
) {
	proc := &p.Processors[destProcessor]
	if len(ordering.Columns) == 0 || len(resultRouters) == 1 {
		proc.Spec.Input[destInput].Type = execinfrapb.InputSyncSpec_UNORDERED
	} else {
		proc.Spec.Input[destInput].Type = execinfrapb.InputSyncSpec_ORDERED
		proc.Spec.Input[destInput].Ordering = ordering
	}

	for _, resultProc := range resultRouters {
		p.Streams = append(p.Streams, Stream{
			SourceProcessor:  resultProc,
			SourceRouterSlot: sourceRouterSlot,
			DestProcessor:    destProcessor,
			DestInput:        destInput,
		})
	}
}

// AddSingleGroupStage adds a "single group" stage (one that cannot be
// parallelized) which consists of a single processor on the specified node.
// The result routers of the previous stage are all connected to this
// processor.
func (p *PhysicalPlan) AddSingleGroupStage(
	nodeID roachpb.NodeID,
	core execinfrapb.ProcessorCoreUnion,
	post execinfrapb.PostProcessSpec,
	outputTypes []*types.T,
) {
	proc := Processor{
		Node: nodeID,
		Spec: execinfrapb.ProcessorSpec{
			Input: []execinfrapb.InputSyncSpec{{
				// The other fields will be filled in by MergeResultStreams.
				ColumnTypes: p.ResultTypes,
			}},
			Core: core,
			Post: post,
			Output: []execinfrapb.OutputRouterSpec{{
				Type: execinfrapb.OutputRouterSpec_PASS_THROUGH,
			}},
			StageID: p.NewStageID(),
		},
	}

	pIdx := p.AddProcessor(proc)

	// Connect the result routers to the processor.
	p.MergeResultStreams(p.ResultRouters, 0, p.MergeOrdering, pIdx, 0)

	// We now have a single result stream.
	p.ResultRouters = p.ResultRouters[:1]
	p.ResultRouters[0] = pIdx

	p.ResultTypes = outputTypes
	p.MergeOrdering = execinfrapb.Ordering{}
}

// CheckLastStagePost checks that the processors of the last stage of the
// PhysicalPlan have identical post-processing, returning an error if not.
func (p *PhysicalPlan) CheckLastStagePost() error {
	post := p.Processors[p.ResultRouters[0]].Spec.Post

	// All processors of a stage should be identical in terms of
	// post-processing; verify this assumption.
	for i := 1; i < len(p.ResultRouters); i++ {
		pi := &p.Processors[p.ResultRouters[i]].Spec.Post
		if pi.Filter != post.Filter ||
			pi.Projection != post.Projection ||
			len(pi.OutputColumns) != len(post.OutputColumns) ||
			len(pi.RenderExprs) != len(post.RenderExprs) {
			return errors.Errorf("inconsistent post-processing: %v vs %v", post, pi)
		}
		for j, col := range pi.OutputColumns {
			if col != post.OutputColumns[j] {
				return errors.Errorf("inconsistent post-processing: %v vs %v", post, pi)
			}
		}
		for j, expr := range pi.RenderExprs {
			if expr != post.RenderExprs[j] {
				return errors.Errorf("inconsistent post-processing: %v vs %v", post, pi)
			}
		}
	}

	return nil
}

// GetLastStagePost returns the PostProcessSpec for the processors in the last
// stage (ResultRouters).
func (p *PhysicalPlan) GetLastStagePost() execinfrapb.PostProcessSpec {
	if err := p.CheckLastStagePost(); err != nil {
		panic(err)
	}
	return p.Processors[p.ResultRouters[0]].Spec.Post
}

// SetLastStagePost changes the PostProcessSpec of the processors in the last
// stage (ResultRouters).
// The caller must update the ordering via SetMergeOrdering.
func (p *PhysicalPlan) SetLastStagePost(post execinfrapb.PostProcessSpec, outputTypes []*types.T) {
	for _, pIdx := range p.ResultRouters {
		p.Processors[pIdx].Spec.Post = post
	}
	p.ResultTypes = outputTypes
}

func isIdentityProjection(columns []uint32, numExistingCols int) bool {
	if len(columns) != numExistingCols {
		return false
	}
	for i, c := range columns {
		if c != uint32(i) {
			return false
		}
	}
	return true
}
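
// For example, isIdentityProjection([]uint32{0, 1, 2}, 3) is true, while
// isIdentityProjection([]uint32{0, 2}, 3) and
// isIdentityProjection([]uint32{1, 0, 2}, 3) are both false.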

// AddProjection applies a projection to a plan. The new plan outputs the
// columns of the old plan as listed in the slice. The Ordering is updated;
// columns in the ordering are added to the projection as needed.
//
// The PostProcessSpec may not be updated if the resulting projection keeps all
// the columns in their original order.
//
// Note: the columns slice is relinquished to this function, which can modify
// it or use it directly in specs.
func (p *PhysicalPlan) AddProjection(columns []uint32) {
	// If the projection we are trying to apply projects every column, don't
	// update the spec.
	if isIdentityProjection(columns, len(p.ResultTypes)) {
		return
	}

	// Update the ordering.
	if len(p.MergeOrdering.Columns) > 0 {
		newOrdering := make([]execinfrapb.Ordering_Column, len(p.MergeOrdering.Columns))
		for i, c := range p.MergeOrdering.Columns {
			// Look for the column in the new projection.
			found := -1
			for j, projCol := range columns {
				if projCol == c.ColIdx {
					found = j
				}
			}
			if found == -1 {
				// We have a column that is not in the projection but will be
				// necessary later when the streams are merged; add it.
				found = len(columns)
				columns = append(columns, c.ColIdx)
			}
			newOrdering[i].ColIdx = uint32(found)
			newOrdering[i].Direction = c.Direction
		}
		p.MergeOrdering.Columns = newOrdering
	}

	newResultTypes := make([]*types.T, len(columns))
	for i, c := range columns {
		newResultTypes[i] = p.ResultTypes[c]
	}

	post := p.GetLastStagePost()

	if post.RenderExprs != nil {
		// Apply the projection to the existing rendering; in other words, keep
		// only the renders needed by the new output columns, and reorder them
		// accordingly.
		oldRenders := post.RenderExprs
		post.RenderExprs = make([]execinfrapb.Expression, len(columns))
		for i, c := range columns {
			post.RenderExprs[i] = oldRenders[c]
		}
	} else {
		// There is no existing rendering; we can use OutputColumns to set the
		// projection.
		if post.Projection {
			// We already had a projection: compose it with the new one.
			for i, c := range columns {
				columns[i] = post.OutputColumns[c]
			}
		}
		post.OutputColumns = columns
		post.Projection = true
	}

	p.SetLastStagePost(post, newResultTypes)
}
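
// For example (hypothetical column indexes), with ResultTypes (A, B, C) and a
// MergeOrdering on column 1 (B):
//
//	p.AddProjection([]uint32{2, 0})
//
// keeps B alive for the later merge by growing the projection to [2, 0, 1],
// so the output schema is (C, A, B) and the ordering is remapped to column 2.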

// exprColumn returns the column that is referenced by the expression, if the
// expression is just an IndexedVar.
//
// See MakeExpression for a description of indexVarMap.
func exprColumn(expr tree.TypedExpr, indexVarMap []int) (int, bool) {
	v, ok := expr.(*tree.IndexedVar)
	if !ok {
		return -1, false
	}
	return indexVarMap[v.Idx], true
}

// AddRendering adds a rendering (expression evaluation) to the output of a
// plan. The rendering is achieved either through an adjustment on the last
// stage post-process spec, or via a new stage.
//
// The Ordering is updated; columns in the ordering are added to the render
// expressions as necessary.
//
// See MakeExpression for a description of indexVarMap.
func (p *PhysicalPlan) AddRendering(
	exprs []tree.TypedExpr, exprCtx ExprContext, indexVarMap []int, outTypes []*types.T,
) error {
	// First check whether we need an Evaluator, or whether we are just
	// shuffling values. We also check if the rendering is a no-op ("identity").
	needRendering := false
	identity := (len(exprs) == len(p.ResultTypes))

	for exprIdx, e := range exprs {
		varIdx, ok := exprColumn(e, indexVarMap)
		if !ok {
			needRendering = true
			break
		}
		identity = identity && (varIdx == exprIdx)
	}

	if !needRendering {
		if identity {
			// Nothing to do.
			return nil
		}
		// We don't need to do any rendering: the expressions effectively
		// describe just a projection.
		cols := make([]uint32, len(exprs))
		for i, e := range exprs {
			streamCol, _ := exprColumn(e, indexVarMap)
			if streamCol == -1 {
				panic(fmt.Sprintf("render %d refers to column not in source: %s", i, e))
			}
			cols[i] = uint32(streamCol)
		}
		p.AddProjection(cols)
		return nil
	}

	post := p.GetLastStagePost()
	if len(post.RenderExprs) > 0 {
		post = execinfrapb.PostProcessSpec{}
		// The last stage contains render expressions. The new renders refer to
		// the output of these, so we need to add another "no-op" stage to which
		// to attach the new rendering.
		p.AddNoGroupingStage(
			execinfrapb.ProcessorCoreUnion{Noop: &execinfrapb.NoopCoreSpec{}},
			post,
			p.ResultTypes,
			p.MergeOrdering,
		)
	}

	compositeMap := indexVarMap
	if post.Projection {
		compositeMap = reverseProjection(post.OutputColumns, indexVarMap)
	}
	post.RenderExprs = make([]execinfrapb.Expression, len(exprs))
	for i, e := range exprs {
		var err error
		post.RenderExprs[i], err = MakeExpression(e, exprCtx, compositeMap)
		if err != nil {
			return err
		}
	}

	if len(p.MergeOrdering.Columns) > 0 {
		outTypes = outTypes[:len(outTypes):len(outTypes)]
		newOrdering := make([]execinfrapb.Ordering_Column, len(p.MergeOrdering.Columns))
		for i, c := range p.MergeOrdering.Columns {
			found := -1
			// Look for the column in the new projection.
			for exprIdx, e := range exprs {
				if varIdx, ok := exprColumn(e, indexVarMap); ok && varIdx == int(c.ColIdx) {
					found = exprIdx
					break
				}
			}
			if found == -1 {
				// We have a column that is not being rendered but will be necessary
				// later when the streams are merged; add it.

				// The new expression refers to column post.OutputColumns[c.ColIdx].
				internalColIdx := c.ColIdx
				if post.Projection {
					internalColIdx = post.OutputColumns[internalColIdx]
				}
				newExpr, err := MakeExpression(tree.NewTypedOrdinalReference(
					int(internalColIdx),
					p.ResultTypes[c.ColIdx]),
					exprCtx, nil /* indexVarMap */)
				if err != nil {
					return err
				}

				found = len(post.RenderExprs)
				post.RenderExprs = append(post.RenderExprs, newExpr)
				outTypes = append(outTypes, p.ResultTypes[c.ColIdx])
			}
			newOrdering[i].ColIdx = uint32(found)
			newOrdering[i].Direction = c.Direction
		}
		p.MergeOrdering.Columns = newOrdering
	}

	post.Projection = false
	post.OutputColumns = nil
	p.SetLastStagePost(post, outTypes)
	return nil
}
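
// For example (hypothetical expressions), if every render is a bare column
// reference, AddRendering reduces to a projection and adds no new stage: with
// indexVarMap [0 1 2], the renders (IndexedVar(2), IndexedVar(0)) simply
// become p.AddProjection([]uint32{2, 0}).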

// reverseProjection remaps expression variable indices to refer to internal
// columns (i.e. before post-processing) of a processor instead of output
// columns (i.e. after post-processing).
//
// Inputs:
//   indexVarMap is a mapping from columns that appear in an expression
//               (planNode columns) to columns in the output stream of a
//               processor.
//   outputColumns is the list of output columns in the processor's
//                 PostProcessSpec; it is effectively a mapping from the output
//                 schema to the internal schema of a processor.
//
// Result: a "composite map" that maps the planNode columns to the internal
// columns of the processor.
//
// For efficiency, the indexVarMap and the resulting map are represented as
// slices, with missing elements having values -1.
//
// Used when adding expressions (filtering, rendering) to a processor's
// PostProcessSpec. For example:
//
//   TableReader // table columns A,B,C,D
//   Internal schema (before post-processing): A, B, C, D
//   OutputColumns:  [1 3]
//   Output schema (after post-processing): B, D
//
//   Expression "B < D" might be represented as:
//     IndexedVar(4) < IndexedVar(1)
//   with associated indexVarMap:
//     [-1 1 -1 -1 0]  // 1->1, 4->0
//   This is effectively equivalent to "IndexedVar(0) < IndexedVar(1)"; 0 means
//   the first output column (B), 1 means the second output column (D).
//
//   To get an index var map that refers to the internal schema:
//     reverseProjection(
//       [1 3],           // OutputColumns
//       [-1 1 -1 -1 0],
//     ) =
//       [-1 3 -1 -1 1]   // 1->3, 4->1
//   This is effectively equivalent to "IndexedVar(1) < IndexedVar(3)"; 1
//   means the second internal column (B), 3 means the fourth internal
//   column (D).
func reverseProjection(outputColumns []uint32, indexVarMap []int) []int {
	if indexVarMap == nil {
		panic("no indexVarMap")
	}
	compositeMap := make([]int, len(indexVarMap))
	for i, col := range indexVarMap {
		if col == -1 {
			compositeMap[i] = -1
		} else {
			compositeMap[i] = int(outputColumns[col])
		}
	}
	return compositeMap
}

// AddFilter adds a filter on the output of a plan. The filter is added either
// as a post-processing step to the last stage or to a new "no-op" stage, as
// necessary.
//
// See MakeExpression for a description of indexVarMap.
func (p *PhysicalPlan) AddFilter(
	expr tree.TypedExpr, exprCtx ExprContext, indexVarMap []int,
) error {
	if expr == nil {
		return errors.Errorf("nil filter")
	}
	post := p.GetLastStagePost()
	if len(post.RenderExprs) > 0 || post.Offset != 0 || post.Limit != 0 {
		// The last stage contains render expressions or a limit. The filter
		// refers to the output as described by the existing spec, so we need to
		// add another "no-op" stage to which to attach the filter.
		//
		// In general, we might be able to push the filter "through" the
		// rendering; but the higher level planning code should figure this out
		// when propagating filters.
		post = execinfrapb.PostProcessSpec{}
		p.AddNoGroupingStage(
			execinfrapb.ProcessorCoreUnion{Noop: &execinfrapb.NoopCoreSpec{}},
			post,
			p.ResultTypes,
			p.MergeOrdering,
		)
	}

	compositeMap := indexVarMap
	if post.Projection {
		compositeMap = reverseProjection(post.OutputColumns, indexVarMap)
	}
	filter, err := MakeExpression(expr, exprCtx, compositeMap)
	if err != nil {
		return err
	}
	if !post.Filter.Empty() {
		// Either Expr or LocalExpr will be set (not both).
		if filter.Expr != "" {
			filter.Expr = fmt.Sprintf("(%s) AND (%s)", post.Filter.Expr, filter.Expr)
		} else if filter.LocalExpr != nil {
			filter.LocalExpr = tree.NewTypedAndExpr(
				post.Filter.LocalExpr,
				filter.LocalExpr,
			)
		}
	}
	for _, pIdx := range p.ResultRouters {
		p.Processors[pIdx].Spec.Post.Filter = filter
	}
	return nil
}
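
// For example (hypothetical serialized expressions), if the last stage already
// carries the filter @1 > 5 and AddFilter is called with a filter that
// serializes to @2 = 3, the stage's combined filter becomes
// (@1 > 5) AND (@2 = 3).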

// emptyPlan creates a plan with a single processor that generates no rows; the
// output stream has the given types.
func emptyPlan(types []*types.T, node roachpb.NodeID) PhysicalPlan {
	s := execinfrapb.ValuesCoreSpec{
		Columns: make([]execinfrapb.DatumInfo, len(types)),
	}
	for i, t := range types {
		s.Columns[i].Encoding = sqlbase.DatumEncoding_VALUE
		s.Columns[i].Type = t
	}

	return PhysicalPlan{
		Processors: []Processor{{
			Node: node,
			Spec: execinfrapb.ProcessorSpec{
				Core:   execinfrapb.ProcessorCoreUnion{Values: &s},
				Output: make([]execinfrapb.OutputRouterSpec, 1),
			},
		}},
		ResultRouters: []ProcessorIdx{0},
		ResultTypes:   types,
	}
}

// AddLimit adds a limit and/or offset to the results of the current plan. If
// there are multiple result streams, they are joined into a single processor
// that is placed on the given node.
//
// For no limit, count should be MaxInt64.
func (p *PhysicalPlan) AddLimit(
	count int64, offset int64, exprCtx ExprContext, node roachpb.NodeID,
) error {
	if count < 0 {
		return errors.Errorf("negative limit")
	}
	if offset < 0 {
		return errors.Errorf("negative offset")
	}
	// limitZero is set to true if the limit is a legitimate LIMIT 0 requested
	// by the user. This needs to be tracked as a separate condition because
	// DistSQL uses count=0 to mean no limit, not a limit of 0. Normally,
	// DistSQL will short circuit 0-limit plans, but wrapped local planNodes
	// sometimes need to be fully executed despite having a 0 limit, so if we do
	// in fact have a limit-0 case when there are local planNodes around, we add
	// an empty plan instead of completely eliding the 0-limit plan.
	limitZero := false
	if count == 0 {
		if len(p.LocalProcessors) == 0 {
			*p = emptyPlan(p.ResultTypes, node)
			return nil
		}
		count = 1
		limitZero = true
	}

	if len(p.ResultRouters) == 1 {
		// We only have one processor producing results. Just update its
		// PostProcessSpec.
		// SELECT FROM (SELECT OFFSET 10 LIMIT 1000) OFFSET 5 LIMIT 20 becomes
		// SELECT OFFSET 10+5 LIMIT min(1000-5, 20).
		post := p.GetLastStagePost()
		if offset != 0 {
			if post.Limit > 0 && post.Limit <= uint64(offset) {
				// The previous limit is not enough to reach the offset; we know
				// there will be no results. For example:
				//   SELECT * FROM (SELECT * FROM .. LIMIT 5) OFFSET 10
				// TODO(radu): perform this optimization while propagating filters
				// instead of having to detect it here.
				if len(p.LocalProcessors) == 0 {
					// Even though we know there will be no results, we don't elide
					// the plan if there are local processors. See the comment above
					// limitZero for why.
					*p = emptyPlan(p.ResultTypes, node)
					return nil
				}
				count = 1
				limitZero = true
			}
			// If we're collapsing an offset into a stage that already has a
			// limit, we have to be careful, since offsets are always applied
			// first, before limits. So, if the last stage already has a limit, we
			// subtract the offset from that limit to preserve correctness.
			//
			// As an example, consider the requirement of applying an offset of 3
			// on top of a limit of 10. In this case, we need to emit 7 result
			// rows. But just propagating the offset blindly would produce 10
			// result rows, an incorrect result.
			post.Offset += uint64(offset)
			if post.Limit > 0 {
				// Note that this can't fall below 0 - we would have already caught
				// this case above and returned an empty plan.
				post.Limit -= uint64(offset)
			}
		}
		if count != math.MaxInt64 && (post.Limit == 0 || post.Limit > uint64(count)) {
			post.Limit = uint64(count)
		}
		p.SetLastStagePost(post, p.ResultTypes)
		if limitZero {
			if err := p.AddFilter(tree.DBoolFalse, exprCtx, nil); err != nil {
				return err
			}
		}
		return nil
	}

	// We have multiple processors producing results. We will add a single
	// processor stage that limits. As an optimization, we also set a
	// "local" limit on each processor producing results.
	if count != math.MaxInt64 {
		post := p.GetLastStagePost()
		// If we have OFFSET 10 LIMIT 5, we may need as many as 15 rows from any
		// processor.
		localLimit := uint64(count + offset)
		if post.Limit == 0 || post.Limit > localLimit {
			post.Limit = localLimit
			p.SetLastStagePost(post, p.ResultTypes)
		}
	}

	post := execinfrapb.PostProcessSpec{
		Offset: uint64(offset),
	}
	if count != math.MaxInt64 {
		post.Limit = uint64(count)
	}
	p.AddSingleGroupStage(
		node,
		execinfrapb.ProcessorCoreUnion{Noop: &execinfrapb.NoopCoreSpec{}},
		post,
		p.ResultTypes,
	)
	if limitZero {
		if err := p.AddFilter(tree.DBoolFalse, exprCtx, nil); err != nil {
			return err
		}
	}
	return nil
}
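
// For example (hypothetical plan), p.AddLimit(5, 10, exprCtx, node) over three
// result routers sets a local Limit of 15 on each producer (offset rows might
// all come from one stream) and then adds a single no-op stage on node with
// PostProcessSpec{Offset: 10, Limit: 5}.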

// PopulateEndpoints processes p.Streams and adds the corresponding
// StreamEndpointSpecs to the processors' input and output specs. This should
// be used when the plan is completed and ready to be executed.
func (p *PhysicalPlan) PopulateEndpoints() {
	// Note: instead of using p.Streams, we could fill in the input/output specs
	// directly throughout the planning code, but this makes the rest of the
	// code a bit simpler.
	for sIdx, s := range p.Streams {
		p1 := &p.Processors[s.SourceProcessor]
		p2 := &p.Processors[s.DestProcessor]
		endpoint := execinfrapb.StreamEndpointSpec{StreamID: execinfrapb.StreamID(sIdx)}
		if p1.Node == p2.Node {
			endpoint.Type = execinfrapb.StreamEndpointSpec_LOCAL
		} else {
			endpoint.Type = execinfrapb.StreamEndpointSpec_REMOTE
		}
		p2.Spec.Input[s.DestInput].Streams = append(p2.Spec.Input[s.DestInput].Streams, endpoint)
		if endpoint.Type == execinfrapb.StreamEndpointSpec_REMOTE {
			if !p.remotePlan {
				p.remotePlan = true
			}
			endpoint.TargetNodeID = p2.Node
		}

		router := &p1.Spec.Output[0]
		// We are about to put this stream at position len(router.Streams) in the
		// router; verify that this matches the sourceRouterSlot. We expect it to
		// because the streams should be in order; if that assumption changes we
		// can reorder them here according to sourceRouterSlot.
		if len(router.Streams) != s.SourceRouterSlot {
			panic(fmt.Sprintf(
				"sourceRouterSlot mismatch: %d, expected %d", len(router.Streams), s.SourceRouterSlot,
			))
		}
		router.Streams = append(router.Streams, endpoint)
	}
}

// GenerateFlowSpecs takes a plan (with populated endpoints) and generates the
// set of FlowSpecs (one per node involved in the plan).
//
// gateway is the current node's NodeID.
func (p *PhysicalPlan) GenerateFlowSpecs(
	gateway roachpb.NodeID,
) map[roachpb.NodeID]*execinfrapb.FlowSpec {
	// Only generate a flow ID for a remote plan because it will need to be
	// referenced by remote nodes when connecting streams. This ID generation is
	// skipped for performance reasons on local flows.
	flowID := execinfrapb.FlowID{}
	if p.remotePlan {
		flowID.UUID = uuid.MakeV4()
	}
	flows := make(map[roachpb.NodeID]*execinfrapb.FlowSpec, 1)

	for _, proc := range p.Processors {
		flowSpec, ok := flows[proc.Node]
		if !ok {
			flowSpec = NewFlowSpec(flowID, gateway)
			flows[proc.Node] = flowSpec
		}
		flowSpec.Processors = append(flowSpec.Processors, proc.Spec)
	}
	return flows
}
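
// For example (hypothetical placement), a plan with processors on nodes 1, 1,
// and 2 produces two FlowSpecs: the flow for node 1 carries two
// ProcessorSpecs, the flow for node 2 carries one, and, assuming
// PopulateEndpoints marked the plan remote, both share the same freshly
// generated FlowID.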

// MergePlans merges the processors and streams of two plans into a new plan.
// The result routers for each side are also returned (they point at processors
// in the merged plan).
func MergePlans(
	left, right *PhysicalPlan,
) (mergedPlan PhysicalPlan, leftRouters []ProcessorIdx, rightRouters []ProcessorIdx) {
	mergedPlan.Processors = append(left.Processors, right.Processors...)
	rightProcStart := ProcessorIdx(len(left.Processors))

	mergedPlan.Streams = append(left.Streams, right.Streams...)

	// Update the processor indices in the right streams.
	for i := len(left.Streams); i < len(mergedPlan.Streams); i++ {
		mergedPlan.Streams[i].SourceProcessor += rightProcStart
		mergedPlan.Streams[i].DestProcessor += rightProcStart
	}

	// Renumber the stages from the right plan.
	for i := rightProcStart; int(i) < len(mergedPlan.Processors); i++ {
		s := &mergedPlan.Processors[i].Spec
		if s.StageID != 0 {
			s.StageID += left.stageCounter
		}
	}
	mergedPlan.stageCounter = left.stageCounter + right.stageCounter

	mergedPlan.LocalProcessors = append(left.LocalProcessors, right.LocalProcessors...)
	mergedPlan.LocalProcessorIndexes = append(left.LocalProcessorIndexes, right.LocalProcessorIndexes...)
	// Update the local processor indices in the right streams.
	for i := len(left.LocalProcessorIndexes); i < len(mergedPlan.LocalProcessorIndexes); i++ {
		*mergedPlan.LocalProcessorIndexes[i] += uint32(len(left.LocalProcessorIndexes))
	}

	leftRouters = left.ResultRouters
	rightRouters = append([]ProcessorIdx(nil), right.ResultRouters...)
	// Update the processor indices in the right routers.
	for i := range rightRouters {
		rightRouters[i] += rightProcStart
	}

	mergedPlan.TotalEstimatedScannedRows = left.TotalEstimatedScannedRows + right.TotalEstimatedScannedRows
	// NB(dt): AFAIK no one looks at the MaxEstimatedRowCount of the overall
	// plan but it is maintained here too just for completeness.
	mergedPlan.MaxEstimatedRowCount = left.MaxEstimatedRowCount
	if right.MaxEstimatedRowCount > mergedPlan.MaxEstimatedRowCount {
		mergedPlan.MaxEstimatedRowCount = right.MaxEstimatedRowCount
	}

	return mergedPlan, leftRouters, rightRouters
}

// MergeResultTypes reconciles the ResultTypes between two plans. It enforces
// that each pair of ColumnTypes must either match or be null, in which case
// the non-null type is used. This logic is necessary for cases like
// SELECT NULL UNION SELECT 1.
func MergeResultTypes(left, right []*types.T) ([]*types.T, error) {
	if len(left) != len(right) {
		return nil, errors.Errorf("ResultTypes length mismatch: %d and %d", len(left), len(right))
	}
	merged := make([]*types.T, len(left))
	for i := range left {
		leftType, rightType := left[i], right[i]
		if rightType.Family() == types.UnknownFamily {
			merged[i] = leftType
		} else if leftType.Family() == types.UnknownFamily {
			merged[i] = rightType
		} else if equivalentTypes(leftType, rightType) {
			merged[i] = leftType
		} else {
			return nil, errors.Errorf(
				"conflicting ColumnTypes: %s and %s", leftType.DebugString(), rightType.DebugString())
		}
	}
	return merged, nil
}

// equivalentTypes checks whether a column type is equivalent to another for
// the purpose of UNION. Precision, Width, Oid, etc. do not affect the merging
// of values.
func equivalentTypes(c, other *types.T) bool {
	return c.Equivalent(other)
}
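
// For example, for SELECT NULL UNION SELECT 1 the left ResultTypes are
// [Unknown] and the right are [Int]; MergeResultTypes returns [Int]. A
// mismatched non-null pair such as [Int] and [String] returns an error.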

// AddJoinStage adds join processors at each of the specified nodes, and wires
// the left and right-side outputs to these processors.
func (p *PhysicalPlan) AddJoinStage(
	nodes []roachpb.NodeID,
	core execinfrapb.ProcessorCoreUnion,
	post execinfrapb.PostProcessSpec,
	leftEqCols, rightEqCols []uint32,
	leftTypes, rightTypes []*types.T,
	leftMergeOrd, rightMergeOrd execinfrapb.Ordering,
	leftRouters, rightRouters []ProcessorIdx,
) {
	pIdxStart := ProcessorIdx(len(p.Processors))
	stageID := p.NewStageID()

	for _, n := range nodes {
		inputs := make([]execinfrapb.InputSyncSpec, 0, 2)
		inputs = append(inputs, execinfrapb.InputSyncSpec{ColumnTypes: leftTypes})
		inputs = append(inputs, execinfrapb.InputSyncSpec{ColumnTypes: rightTypes})

		proc := Processor{
			Node: n,
			Spec: execinfrapb.ProcessorSpec{
				Input:   inputs,
				Core:    core,
				Post:    post,
				Output:  []execinfrapb.OutputRouterSpec{{Type: execinfrapb.OutputRouterSpec_PASS_THROUGH}},
				StageID: stageID,
			},
		}
		p.Processors = append(p.Processors, proc)
	}

	if len(nodes) > 1 {
		// Parallel hash or merge join: we distribute rows (by hash of
		// equality columns) to len(nodes) join processors.

		// Set up the left routers.
		for _, resultProc := range leftRouters {
			p.Processors[resultProc].Spec.Output[0] = execinfrapb.OutputRouterSpec{
				Type:        execinfrapb.OutputRouterSpec_BY_HASH,
				HashColumns: leftEqCols,
			}
		}
		// Set up the right routers.
		for _, resultProc := range rightRouters {
			p.Processors[resultProc].Spec.Output[0] = execinfrapb.OutputRouterSpec{
				Type:        execinfrapb.OutputRouterSpec_BY_HASH,
				HashColumns: rightEqCols,
			}
		}
	}
	p.ResultRouters = p.ResultRouters[:0]

	// Connect the left and right routers to the output joiners. Each joiner
	// corresponds to a hash bucket.
	for bucket := 0; bucket < len(nodes); bucket++ {
		pIdx := pIdxStart + ProcessorIdx(bucket)

		// Connect left routers to the processor's first input. Currently the
		// join node doesn't care about the orderings of the left and right
		// results.
		p.MergeResultStreams(leftRouters, bucket, leftMergeOrd, pIdx, 0)
		// Connect right routers to the processor's second input if it has one.
		p.MergeResultStreams(rightRouters, bucket, rightMergeOrd, pIdx, 1)

		p.ResultRouters = append(p.ResultRouters, pIdx)
	}
}
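
// For example (hypothetical two-node hash join), with nodes [1 2] each side's
// result routers are switched to BY_HASH on the equality columns; the joiner
// on node 1 reads router slot 0 of every source and the joiner on node 2
// reads slot 1, so rows with equal keys always meet at the same joiner.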

// AddDistinctSetOpStage creates a distinct stage and a join stage to implement
// INTERSECT and EXCEPT plans.
//
// TODO(abhimadan): If there's a strong key on the left or right side, we
// can elide the distinct stage on that side.
func (p *PhysicalPlan) AddDistinctSetOpStage(
	nodes []roachpb.NodeID,
	joinCore execinfrapb.ProcessorCoreUnion,
	distinctCores []execinfrapb.ProcessorCoreUnion,
	post execinfrapb.PostProcessSpec,
	eqCols []uint32,
	leftTypes, rightTypes []*types.T,
	leftMergeOrd, rightMergeOrd execinfrapb.Ordering,
	leftRouters, rightRouters []ProcessorIdx,
) {
	const numSides = 2
	inputResultTypes := [numSides][]*types.T{leftTypes, rightTypes}
	inputMergeOrderings := [numSides]execinfrapb.Ordering{leftMergeOrd, rightMergeOrd}
	inputResultRouters := [numSides][]ProcessorIdx{leftRouters, rightRouters}

	// Create distinct stages for the left and right sides, where left and right
	// sources are sent by hash to the node which will contain the join
	// processor. The distinct stage must come before the join stage for EXCEPT
	// queries to produce correct results (e.g., (VALUES (1),(1),(2)) EXCEPT
	// (VALUES (1)) would return (1),(2) instead of (2) if there were no
	// distinct processor before the EXCEPT ALL join).
	distinctIdxStart := len(p.Processors)
	distinctProcs := make(map[roachpb.NodeID][]ProcessorIdx)

	for side, types := range inputResultTypes {
		distinctStageID := p.NewStageID()
		for _, n := range nodes {
			proc := Processor{
				Node: n,
				Spec: execinfrapb.ProcessorSpec{
					Input: []execinfrapb.InputSyncSpec{
						{ColumnTypes: types},
					},
					Core:    distinctCores[side],
					Post:    execinfrapb.PostProcessSpec{},
					Output:  []execinfrapb.OutputRouterSpec{{Type: execinfrapb.OutputRouterSpec_PASS_THROUGH}},
					StageID: distinctStageID,
				},
			}
			pIdx := p.AddProcessor(proc)
			distinctProcs[n] = append(distinctProcs[n], pIdx)
		}
	}

	if len(nodes) > 1 {
		// Set up the left routers.
		for _, resultProc := range leftRouters {
			p.Processors[resultProc].Spec.Output[0] = execinfrapb.OutputRouterSpec{
				Type:        execinfrapb.OutputRouterSpec_BY_HASH,
				HashColumns: eqCols,
			}
		}
		// Set up the right routers.
		for _, resultProc := range rightRouters {
			p.Processors[resultProc].Spec.Output[0] = execinfrapb.OutputRouterSpec{
				Type:        execinfrapb.OutputRouterSpec_BY_HASH,
				HashColumns: eqCols,
			}
		}
	}

	// Connect the left and right streams to the distinct processors.
	for side, routers := range inputResultRouters {
		// Get the processor index offset for the current side.
		sideOffset := side * len(nodes)
		for bucket := 0; bucket < len(nodes); bucket++ {
			pIdx := ProcessorIdx(distinctIdxStart + sideOffset + bucket)
			p.MergeResultStreams(routers, bucket, inputMergeOrderings[side], pIdx, 0)
		}
	}

	// Create a join stage, where the distinct processors on the same node are
	// connected to a join processor.
	joinStageID := p.NewStageID()
	p.ResultRouters = p.ResultRouters[:0]

	for _, n := range nodes {
		proc := Processor{
			Node: n,
			Spec: execinfrapb.ProcessorSpec{
				Input: []execinfrapb.InputSyncSpec{
					{ColumnTypes: leftTypes},
					{ColumnTypes: rightTypes},
				},
				Core:    joinCore,
				Post:    post,
				Output:  []execinfrapb.OutputRouterSpec{{Type: execinfrapb.OutputRouterSpec_PASS_THROUGH}},
				StageID: joinStageID,
			},
		}
		pIdx := p.AddProcessor(proc)

		for side, distinctProc := range distinctProcs[n] {
			p.Streams = append(p.Streams, Stream{
				SourceProcessor:  distinctProc,
				SourceRouterSlot: 0,
				DestProcessor:    pIdx,
				DestInput:        side,
			})
		}

		p.ResultRouters = append(p.ResultRouters, pIdx)
	}
}
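
// For example (hypothetical two-node INTERSECT), AddDistinctSetOpStage with
// nodes [1 2] adds four distinct processors: the left-side pair on nodes 1
// and 2, then the right-side pair. distinctProcs[n] therefore holds
// [leftDistinct, rightDistinct] for each node n, and the side index doubles
// as the joiner's DestInput, wiring left to input 0 and right to input 1.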

// EnsureSingleStreamPerNode goes over the ResultRouters and merges any group
// of routers that are on the same node, using a no-op processor.
//
// TODO(radu): a no-op processor is not ideal if the next processor is on the
// same node. A fix for that is much more complicated, requiring remembering
// extra state in the PhysicalPlan.
func (p *PhysicalPlan) EnsureSingleStreamPerNode() {
	// Fast path - check if we need to do anything.
	var nodes util.FastIntSet
	var foundDuplicates bool
	for _, pIdx := range p.ResultRouters {
		proc := &p.Processors[pIdx]
		if nodes.Contains(int(proc.Node)) {
			foundDuplicates = true
			break
		}
		nodes.Add(int(proc.Node))
	}
	if !foundDuplicates {
		return
	}
	streams := make([]ProcessorIdx, 0, 2)

	for i := 0; i < len(p.ResultRouters); i++ {
		pIdx := p.ResultRouters[i]
		node := p.Processors[p.ResultRouters[i]].Node
		streams = append(streams[:0], pIdx)
		// Find all streams on the same node.
		for j := i + 1; j < len(p.ResultRouters); {
			if p.Processors[p.ResultRouters[j]].Node == node {
				streams = append(streams, p.ResultRouters[j])
				// Remove the stream.
				copy(p.ResultRouters[j:], p.ResultRouters[j+1:])
				p.ResultRouters = p.ResultRouters[:len(p.ResultRouters)-1]
			} else {
				j++
			}
		}
		if len(streams) == 1 {
			// Nothing to do for this node.
			continue
		}

		// Merge the streams into a no-op processor.
		proc := Processor{
			Node: node,
			Spec: execinfrapb.ProcessorSpec{
				Input: []execinfrapb.InputSyncSpec{{
					// The other fields will be filled in by MergeResultStreams.
					ColumnTypes: p.ResultTypes,
				}},
				Core:   execinfrapb.ProcessorCoreUnion{Noop: &execinfrapb.NoopCoreSpec{}},
				Output: []execinfrapb.OutputRouterSpec{{Type: execinfrapb.OutputRouterSpec_PASS_THROUGH}},
			},
		}
		mergedProcIdx := p.AddProcessor(proc)
		p.MergeResultStreams(streams, 0 /* sourceRouterSlot */, p.MergeOrdering, mergedProcIdx, 0 /* destInput */)
		p.ResultRouters[i] = mergedProcIdx
	}
}
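
// For example (hypothetical routers), if ResultRouters point at processors on
// nodes [1 2 1], the two node-1 streams are merged into a new no-op processor
// on node 1: the third router is removed, the first is replaced by the no-op's
// index, and the node-2 router is left untouched.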