github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/colexec/execplan.go

// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package colexec

import (
	"context"
	"fmt"
	"math"
	"reflect"

	"github.com/cockroachdb/cockroach/pkg/col/coldata"
	"github.com/cockroachdb/cockroach/pkg/col/coldataext"
	"github.com/cockroachdb/cockroach/pkg/col/typeconv"
	"github.com/cockroachdb/cockroach/pkg/sql/colcontainer"
	"github.com/cockroachdb/cockroach/pkg/sql/colexecbase"
	"github.com/cockroachdb/cockroach/pkg/sql/colexecbase/colexecerror"
	"github.com/cockroachdb/cockroach/pkg/sql/colmem"
	"github.com/cockroachdb/cockroach/pkg/sql/execinfra"
	"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
	"github.com/cockroachdb/cockroach/pkg/sql/sessiondata"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
	"github.com/cockroachdb/cockroach/pkg/sql/types"
	"github.com/cockroachdb/cockroach/pkg/util"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/mon"
	"github.com/cockroachdb/errors"
	"github.com/marusama/semaphore"
)

func checkNumIn(inputs []colexecbase.Operator, numIn int) error {
	if len(inputs) != numIn {
		return errors.Errorf("expected %d input(s), got %d", numIn, len(inputs))
	}
	return nil
}

// wrapRowSources, given input Operators, integrates toWrap into a columnar
// execution flow and returns toWrap's output as an Operator.
func wrapRowSources(
	ctx context.Context,
	flowCtx *execinfra.FlowCtx,
	inputs []colexecbase.Operator,
	inputTypes [][]*types.T,
	acc *mon.BoundAccount,
	processorID int32,
	newToWrap func([]execinfra.RowSource) (execinfra.RowSource, error),
	factory coldata.ColumnFactory,
) (*Columnarizer, error) {
	var toWrapInputs []execinfra.RowSource
	for i, input := range inputs {
		// Optimization: if the input is a Columnarizer, its input is necessarily
		// an execinfra.RowSource, so remove the unnecessary conversion.
		if c, ok := input.(*Columnarizer); ok {
			// TODO(asubiotto): We might need to do some extra work to remove
			// references to this operator (e.g. streamIDToOp).
			toWrapInputs = append(toWrapInputs, c.input)
		} else {
			toWrapInput, err := NewMaterializer(
				flowCtx,
				processorID,
				input,
				inputTypes[i],
				nil, /* output */
				nil, /* metadataSourcesQueue */
				nil, /* toClose */
				nil, /* outputStatsToTrace */
				nil, /* cancelFlow */
			)
			if err != nil {
				return nil, err
			}
			toWrapInputs = append(toWrapInputs, toWrapInput)
		}
	}

	toWrap, err := newToWrap(toWrapInputs)
	if err != nil {
		return nil, err
	}

	return NewColumnarizer(ctx, colmem.NewAllocator(ctx, acc, factory), flowCtx, processorID, toWrap)
}

// NewColOperatorArgs is a helper struct that encompasses all of the input
// arguments to the NewColOperator call.
type NewColOperatorArgs struct {
	Spec                 *execinfrapb.ProcessorSpec
	Inputs               []colexecbase.Operator
	StreamingMemAccount  *mon.BoundAccount
	ProcessorConstructor execinfra.ProcessorConstructor
	DiskQueueCfg         colcontainer.DiskQueueCfg
	FDSemaphore          semaphore.Semaphore
	TestingKnobs         struct {
		// UseStreamingMemAccountForBuffering specifies whether to use
		// StreamingMemAccount when creating buffering operators and should only be
		// set to 'true' in tests. The idea behind this flag is to reduce the
		// number of memory accounts and monitors we need to close, so we plumbed
		// it into the planning code so that it doesn't create extra memory
		// monitoring infrastructure (and so that we could use testMemAccount
		// defined in main_test.go).
		UseStreamingMemAccountForBuffering bool
		// SpillingCallbackFn will be called when the spilling from an in-memory
		// to a disk-backed operator occurs. It should only be set in tests.
		SpillingCallbackFn func()
		// DiskSpillingDisabled specifies whether only in-memory operators should
		// be created.
		DiskSpillingDisabled bool
		// NumForcedRepartitions specifies a number of "repartitions" that a
		// disk-backed operator should be forced to perform. "Repartition" can
		// mean different things depending on the operator (for example, for the
		// hash joiner it is dividing the original partition into multiple new
		// partitions; for the sorter it is merging already created partitions
		// into a new one before proceeding to the next partition from the input).
		NumForcedRepartitions int
		// DelegateFDAcquisitions should be observed by users of a
		// PartitionedDiskQueue. During normal operation, users acquire the
		// maximum number of file descriptors they will use from FDSemaphore up
		// front. Setting this testing knob to true disables that behavior and
		// lets the PartitionedDiskQueue interact with the semaphore as partitions
		// are opened/closed, which ensures that the number of open files never
		// exceeds what is expected.
		DelegateFDAcquisitions bool
	}
}

// NewColOperatorResult is a helper struct that encompasses all of the return
// values of the NewColOperator call.
type NewColOperatorResult struct {
	Op               colexecbase.Operator
	ColumnTypes      []*types.T
	InternalMemUsage int
	MetadataSources  []execinfrapb.MetadataSource
	// ToClose is a slice of components that need to be Closed. Close should be
	// idempotent.
	ToClose     []IdempotentCloser
	IsStreaming bool
	OpMonitors  []*mon.BytesMonitor
	OpAccounts  []*mon.BoundAccount
}

// resetToState resets r to the state specified in arg. arg may be a shallow
// copy made at a given point in time.
func (r *NewColOperatorResult) resetToState(ctx context.Context, arg NewColOperatorResult) {
	// MetadataSources are left untouched since there is no need to do any
	// cleaning there.

	// Close BoundAccounts that are not present in arg.OpAccounts.
	accs := make(map[*mon.BoundAccount]struct{})
	for _, a := range arg.OpAccounts {
		accs[a] = struct{}{}
	}
	for _, a := range r.OpAccounts {
		if _, ok := accs[a]; !ok {
			a.Close(ctx)
		}
	}
	// Stop BytesMonitors that are not present in arg.OpMonitors.
	mons := make(map[*mon.BytesMonitor]struct{})
	for _, m := range arg.OpMonitors {
		mons[m] = struct{}{}
	}

	for _, m := range r.OpMonitors {
		if _, ok := mons[m]; !ok {
			m.Stop(ctx)
		}
	}

	// Shallow copy over the rest.
	*r = arg
}
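
// A minimal sketch of the backtracking pattern that resetToState enables
// (hypothetical caller, illustrative only; planSomething is not a real
// function in this package): take a shallow snapshot of the result, attempt
// planning, and on failure close only the accounts and monitors that were
// opened after the snapshot was taken.
//
//	snapshot := result // shallow copy
//	if err := planSomething(&result); err != nil {
//		result.resetToState(ctx, snapshot)
//	}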

const noFilterIdx = -1

// isSupported checks whether we have a columnar operator equivalent to a
// processor described by spec. Note that it doesn't perform any other checks
// (like validity of the number of inputs).
func isSupported(
	allocator *colmem.Allocator, mode sessiondata.VectorizeExecMode, spec *execinfrapb.ProcessorSpec,
) (bool, error) {
	core := spec.Core
	isFullVectorization := mode == sessiondata.VectorizeOn ||
		mode == sessiondata.VectorizeExperimentalAlways

	switch {
	case core.Noop != nil:
		return true, nil

	case core.TableReader != nil:
		if core.TableReader.IsCheck {
			return false, errors.Newf("scrub table reader is unsupported in vectorized")
		}
		return true, nil

	case core.Aggregator != nil:
		aggSpec := core.Aggregator
		for _, agg := range aggSpec.Aggregations {
			if agg.Distinct {
				return false, errors.Newf("distinct aggregation not supported")
			}
			if agg.FilterColIdx != nil {
				return false, errors.Newf("filtering aggregation not supported")
			}
			if len(agg.Arguments) > 0 {
				return false, errors.Newf("aggregates with arguments not supported")
			}
			inputTypes := make([]*types.T, len(agg.ColIdx))
			for pos, colIdx := range agg.ColIdx {
				inputTypes[pos] = spec.Input[0].ColumnTypes[colIdx]
			}
			if supported, err := isAggregateSupported(allocator, agg.Func, inputTypes); !supported {
				return false, err
			}
		}
		return true, nil

	case core.Distinct != nil:
		if core.Distinct.NullsAreDistinct {
			return false, errors.Newf("distinct with unique nulls not supported")
		}
		if core.Distinct.ErrorOnDup != "" {
			return false, errors.Newf("distinct with error on duplicates not supported")
		}
		if !isFullVectorization {
			if len(core.Distinct.OrderedColumns) < len(core.Distinct.DistinctColumns) {
				return false, errors.Newf("unordered distinct can only run in vectorize 'on' mode")
			}
		}
		return true, nil

	case core.Ordinality != nil:
		return true, nil

	case core.HashJoiner != nil:
		if !core.HashJoiner.OnExpr.Empty() && core.HashJoiner.Type != sqlbase.InnerJoin {
			return false, errors.Newf("can't plan vectorized non-inner hash joins with ON expressions")
		}
		leftInput, rightInput := spec.Input[0], spec.Input[1]
		if len(leftInput.ColumnTypes) == 0 || len(rightInput.ColumnTypes) == 0 {
			// We have a cross join of two inputs, and at least one of them has
			// a zero-length schema. However, the hash join operators (both
			// external and in-memory) have a built-in assumption of non-empty
			// inputs, so we will fall back to row execution in such cases.
			// TODO(yuzefovich): implement specialized cross join operator.
			return false, errors.Newf("can't plan vectorized hash joins with an empty input schema")
		}
		return true, nil

	case core.MergeJoiner != nil:
		if !core.MergeJoiner.OnExpr.Empty() &&
			core.MergeJoiner.Type != sqlbase.InnerJoin {
			return false, errors.Errorf("can't plan non-inner merge join with ON expressions")
		}
		return true, nil

	case core.Sorter != nil:
		return true, nil

	case core.Windower != nil:
		for _, wf := range core.Windower.WindowFns {
			if wf.Frame != nil {
				frame, err := wf.Frame.ConvertToAST()
				if err != nil {
					return false, err
				}
				if !frame.IsDefaultFrame() {
					return false, errors.Newf("window functions with non-default window frames are not supported")
				}
			}
			if wf.FilterColIdx != noFilterIdx {
				return false, errors.Newf("window functions with FILTER clause are not supported")
			}
			if wf.Func.AggregateFunc != nil {
				return false, errors.Newf("aggregate functions used as window functions are not supported")
			}

			if _, supported := SupportedWindowFns[*wf.Func.WindowFunc]; !supported {
				return false, errors.Newf("window function %s is not supported", wf.String())
			}
			if !isFullVectorization {
				switch *wf.Func.WindowFunc {
				case execinfrapb.WindowerSpec_PERCENT_RANK, execinfrapb.WindowerSpec_CUME_DIST:
					return false, errors.Newf("window function %s can only run in vectorize 'on' mode", wf.String())
				}
			}
		}
		return true, nil

	default:
		return false, errors.Newf("unsupported processor core %q", core)
	}
}

// createDiskBackedSort creates a new disk-backed operator that sorts the input
// according to ordering.
// - matchLen specifies the length of the prefix of ordering columns the input
// is already ordered on.
// - maxNumberPartitions (when non-zero) overrides the semi-dynamically
// computed maximum number of partitions that the external sorter will have
// at once.
// - processorID is the ProcessorID of the processor core that requested
// creation of this operator. It is used only to distinguish memory monitors.
// - post describes the post-processing spec of the processor. It will be used
// to determine whether a top K sort can be planned. If you want the general
// sort operator, pass in an empty struct.
func (r *NewColOperatorResult) createDiskBackedSort(
	ctx context.Context,
	flowCtx *execinfra.FlowCtx,
	args NewColOperatorArgs,
	input colexecbase.Operator,
	inputTypes []*types.T,
	ordering execinfrapb.Ordering,
	matchLen uint32,
	maxNumberPartitions int,
	processorID int32,
	post *execinfrapb.PostProcessSpec,
	memMonitorNamePrefix string,
	factory coldata.ColumnFactory,
) (colexecbase.Operator, error) {
	streamingMemAccount := args.StreamingMemAccount
	useStreamingMemAccountForBuffering := args.TestingKnobs.UseStreamingMemAccountForBuffering
	var (
		sorterMemMonitorName string
		inMemorySorter       colexecbase.Operator
		err                  error
	)
	if len(ordering.Columns) == int(matchLen) {
		// The input is already fully ordered, so there is nothing to sort.
		return input, nil
	}
	if matchLen > 0 {
		// The input is already partially ordered. Use a chunks sorter to avoid
		// loading all the rows into memory.
		sorterMemMonitorName = fmt.Sprintf("%ssort-chunks-%d", memMonitorNamePrefix, processorID)
		var sortChunksMemAccount *mon.BoundAccount
		if useStreamingMemAccountForBuffering {
			sortChunksMemAccount = streamingMemAccount
		} else {
			sortChunksMemAccount = r.createMemAccountForSpillStrategy(
				ctx, flowCtx, sorterMemMonitorName,
			)
		}
		inMemorySorter, err = NewSortChunks(
			colmem.NewAllocator(ctx, sortChunksMemAccount, factory), input, inputTypes,
			ordering.Columns, int(matchLen),
		)
	} else if post.Limit != 0 && post.Filter.Empty() && int(post.Limit+post.Offset) > 0 {
		// There is a limit specified with no post-process filter, so we know
		// exactly how many rows the sorter should output. The last part of the
		// condition is making sure there is no overflow when converting from
		// the sum of two uint64s to int.
		//
		// Choose a top K sorter, which uses a heap to avoid storing more rows
		// than necessary.
		sorterMemMonitorName = fmt.Sprintf("%stopk-sort-%d", memMonitorNamePrefix, processorID)
		var topKSorterMemAccount *mon.BoundAccount
		if useStreamingMemAccountForBuffering {
			topKSorterMemAccount = streamingMemAccount
		} else {
			topKSorterMemAccount = r.createMemAccountForSpillStrategy(
				ctx, flowCtx, sorterMemMonitorName,
			)
		}
		k := int(post.Limit + post.Offset)
		inMemorySorter = NewTopKSorter(
			colmem.NewAllocator(ctx, topKSorterMemAccount, factory), input, inputTypes,
			ordering.Columns, k,
		)
	} else {
		// No optimizations possible. Default to the standard sort operator.
		sorterMemMonitorName = fmt.Sprintf("%ssort-all-%d", memMonitorNamePrefix, processorID)
		var sorterMemAccount *mon.BoundAccount
		if useStreamingMemAccountForBuffering {
			sorterMemAccount = streamingMemAccount
		} else {
			sorterMemAccount = r.createMemAccountForSpillStrategy(
				ctx, flowCtx, sorterMemMonitorName,
			)
		}
		inMemorySorter, err = NewSorter(
			colmem.NewAllocator(ctx, sorterMemAccount, factory), input, inputTypes, ordering.Columns,
		)
	}
	if err != nil {
		return nil, err
	}
	if inMemorySorter == nil {
		return nil, errors.AssertionFailedf("unexpectedly inMemorySorter is nil")
	}
	// NOTE: when spilling to disk, we're using the same general external
	// sorter regardless of which sorter variant we have instantiated (i.e.
	// we don't take advantage of the limits and of partial ordering). We
	// could improve this.
	return newOneInputDiskSpiller(
		input, inMemorySorter.(bufferingInMemoryOperator),
		sorterMemMonitorName,
		func(input colexecbase.Operator) colexecbase.Operator {
			monitorNamePrefix := fmt.Sprintf("%sexternal-sorter", memMonitorNamePrefix)
			// We are using an unlimited memory monitor here because external
			// sort itself is responsible for making sure that we stay within
			// the memory limit.
			unlimitedAllocator := colmem.NewAllocator(
				ctx, r.createBufferingUnlimitedMemAccount(
					ctx, flowCtx, monitorNamePrefix,
				), factory)
			standaloneMemAccount := r.createStandaloneMemAccount(
				ctx, flowCtx, monitorNamePrefix,
			)
			diskAccount := r.createDiskAccount(ctx, flowCtx, monitorNamePrefix)
			// Make a copy of the DiskQueueCfg and set defaults for the sorter.
			// The cache mode is chosen to reuse the cache to have a smaller
			// cache per partition without affecting performance.
			diskQueueCfg := args.DiskQueueCfg
			diskQueueCfg.CacheMode = colcontainer.DiskQueueCacheModeReuseCache
			diskQueueCfg.SetDefaultBufferSizeBytesForCacheMode()
			if args.TestingKnobs.NumForcedRepartitions != 0 {
				maxNumberPartitions = args.TestingKnobs.NumForcedRepartitions
			}
			es := newExternalSorter(
				ctx,
				unlimitedAllocator,
				standaloneMemAccount,
				input, inputTypes, ordering,
				execinfra.GetWorkMemLimit(flowCtx.Cfg),
				maxNumberPartitions,
				args.TestingKnobs.DelegateFDAcquisitions,
				diskQueueCfg,
				args.FDSemaphore,
				diskAccount,
			)
			r.ToClose = append(r.ToClose, es.(IdempotentCloser))
			return es
		},
		args.TestingKnobs.SpillingCallbackFn,
	), nil
}
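
// The top-K branch in createDiskBackedSort guards the uint64-to-int
// conversion with int(post.Limit+post.Offset) > 0. A minimal standalone
// sketch of why that guard matters (hypothetical values, illustrative only;
// assumes a 64-bit platform where int is 64 bits wide):
func exampleTopKOverflowGuard() {
	limit, offset := uint64(1)<<63, uint64(0)
	// The sum does not fit in an int: the conversion yields math.MinInt64,
	// so the guard fails and the planner falls back to the general sorter.
	if k := int(limit + offset); k > 0 {
		fmt.Println("top-K sort with k =", k)
	} else {
		fmt.Println("overflow detected; use the general sort operator")
	}
}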

// createAndWrapRowSource takes a processor spec, creating the row source and
// wrapping it using wrapRowSources. Note that the post process spec is included
// in the processor creation, so make sure to clear it if it will be inspected
// again. NewColOperatorResult is updated with the new OutputTypes and the
// resulting Columnarizer if there is no error. The result is also annotated as
// streaming because the resulting operator is not a buffering operator (even if
// it is a buffering processor). This is not a problem for memory accounting
// because each processor does that on its own, so the used memory will be
// accounted for.
func (r *NewColOperatorResult) createAndWrapRowSource(
	ctx context.Context,
	flowCtx *execinfra.FlowCtx,
	inputs []colexecbase.Operator,
	inputTypes [][]*types.T,
	streamingMemAccount *mon.BoundAccount,
	spec *execinfrapb.ProcessorSpec,
	processorConstructor execinfra.ProcessorConstructor,
	factory coldata.ColumnFactory,
) error {
	if flowCtx.EvalCtx.SessionData.VectorizeMode == sessiondata.Vectorize201Auto &&
		spec.Core.JoinReader == nil {
		return errors.New("rowexec processor wrapping for non-JoinReader core unsupported in vectorize=201auto mode")
	}
	c, err := wrapRowSources(
		ctx,
		flowCtx,
		inputs,
		inputTypes,
		streamingMemAccount,
		spec.ProcessorID,
		func(inputs []execinfra.RowSource) (execinfra.RowSource, error) {
			// We provide a slice with a single nil as the 'outputs' parameter
			// because all processors expect a single output. Passing nil is ok
			// here because when wrapping the processor, the materializer will be
			// its output, and it will be set up in wrapRowSources.
			proc, err := processorConstructor(
				ctx, flowCtx, spec.ProcessorID, &spec.Core, &spec.Post, inputs,
				[]execinfra.RowReceiver{nil}, /* outputs */
				nil, /* localProcessors */
			)
			if err != nil {
				return nil, err
			}
			var (
				rs execinfra.RowSource
				ok bool
			)
			if rs, ok = proc.(execinfra.RowSource); !ok {
				return nil, errors.Newf(
					"processor %s is not an execinfra.RowSource", spec.Core.String(),
				)
			}
			r.ColumnTypes = rs.OutputTypes()
			return rs, nil
		},
		factory,
	)
	if err != nil {
		return err
	}
	// We say that the wrapped processor is "streaming" because it is not a
	// buffering operator (even if it is a buffering processor). This is not a
	// problem for memory accounting because each processor does that on its
	// own, so the used memory will be accounted for.
	r.Op, r.IsStreaming = c, true
	r.MetadataSources = append(r.MetadataSources, c)
	return nil
}

// NOTE: throughout this file we do not append an output type of a projecting
// operator to the passed-in type schema - we, instead, always allocate a new
// type slice, copy over the old schema, and set the output column of the
// projecting operator in the next slot. We attempt to enforce this with a
// linter rule; such behavior prevents the type schema corruption scenario
// described below.
//
// Without explicit new allocations, it is possible that planSelectionOperators
// (and other planning functions) reuse the same array for filterColumnTypes as
// result.ColumnTypes is using because there was enough capacity to do so.
// As an example, consider the following scenario in the context of the
// planFilterExpr method:
// 1. r.ColumnTypes={types.Bool} with len=1 and cap=4
// 2. planSelectionOperators adds another types.Int column, so
//    filterColumnTypes={types.Bool, types.Int} with len=2 and cap=4.
//    Crucially, it uses the exact same underlying array as r.ColumnTypes.
// 3. we project out the second column, so r.ColumnTypes={types.Bool}
// 4. later, we add another types.Float column, so
//    r.ColumnTypes={types.Bool, types.Float}, but there is enough
//    capacity in the array, so we simply overwrite the second slot
//    with the new type, which corrupts filterColumnTypes to become
//    {types.Bool, types.Float}, and we can get into a runtime type
//    mismatch situation.
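
// A minimal standalone sketch of the aliasing hazard described in the NOTE
// above (hypothetical type names, stdlib only): two slices that share an
// underlying array observe each other's writes once one of them grows back
// into the shared capacity.
func exampleTypeSchemaAliasing() {
	columnTypes := make([]string, 1, 4)
	columnTypes[0] = "Bool"
	// Appending within capacity reuses the same underlying array.
	filterColumnTypes := append(columnTypes, "Int")
	// "Project out" the second column, then append a different type.
	columnTypes = columnTypes[:1]
	columnTypes = append(columnTypes, "Float") // overwrites the shared slot
	fmt.Println(filterColumnTypes[1])          // prints "Float", not "Int"
}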

// NewColOperator creates a new columnar operator according to the given spec.
func NewColOperator(
	ctx context.Context, flowCtx *execinfra.FlowCtx, args NewColOperatorArgs,
) (result NewColOperatorResult, err error) {
	// Make sure that we clean up memory monitoring infrastructure in case of an
	// error or a panic.
	defer func() {
		returnedErr := err
		panicErr := recover()
		if returnedErr != nil || panicErr != nil {
			for _, acc := range result.OpAccounts {
				acc.Close(ctx)
			}
			result.OpAccounts = result.OpAccounts[:0]
			for _, mon := range result.OpMonitors {
				mon.Stop(ctx)
			}
			result.OpMonitors = result.OpMonitors[:0]
		}
		if panicErr != nil {
			colexecerror.InternalError(panicErr)
		}
	}()
	spec := args.Spec
	inputs := args.Inputs
	factory := coldataext.NewExtendedColumnFactory(flowCtx.NewEvalCtx())
	streamingMemAccount := args.StreamingMemAccount
	streamingAllocator := colmem.NewAllocator(ctx, streamingMemAccount, factory)
	useStreamingMemAccountForBuffering := args.TestingKnobs.UseStreamingMemAccountForBuffering
	processorConstructor := args.ProcessorConstructor

	log.VEventf(ctx, 2, "planning col operator for spec %q", spec)

	core := &spec.Core
	post := &spec.Post

	// By default, we safely assume that an operator is not streaming. Note that
	// projections, renders, filters, limits, and offsets, as well as all
	// internal operators (like stats collectors and cancel checkers), are
	// streaming, so in order to determine whether the resulting chain of
	// operators is streaming, it is sufficient to look only at the "core"
	// operator.
	result.IsStreaming = false

	// resultPreSpecPlanningStateShallowCopy is a shallow copy of the result
	// before any specs are planned. Used if there is a need to backtrack.
	resultPreSpecPlanningStateShallowCopy := result

	supported, err := isSupported(streamingAllocator, flowCtx.EvalCtx.SessionData.VectorizeMode, spec)
	if !supported {
		// We refuse to wrap the LocalPlanNode processor (which is a DistSQL
		// wrapper around a planNode) because it creates complications, and a
		// flow with such a processor probably will not benefit from the
		// vectorization.
		if core.LocalPlanNode != nil {
			return result, errors.Newf("core.LocalPlanNode is not supported")
		}
		// We also do not wrap MetadataTest{Sender,Receiver} because of the way
		// metadata is propagated through the vectorized flow - it is drained at
		// flow shutdown, which is not what these test processors expect.
		if core.MetadataTestSender != nil {
			return result, errors.Newf("core.MetadataTestSender is not supported")
		}
		if core.MetadataTestReceiver != nil {
			return result, errors.Newf("core.MetadataTestReceiver is not supported")
		}

		log.VEventf(ctx, 1, "planning a wrapped processor because %s", err.Error())

		inputTypes := make([][]*types.T, len(spec.Input))
		for inputIdx, input := range spec.Input {
			inputTypes[inputIdx] = make([]*types.T, len(input.ColumnTypes))
			copy(inputTypes[inputIdx], input.ColumnTypes)
		}

		err = result.createAndWrapRowSource(ctx, flowCtx, inputs, inputTypes,
			streamingMemAccount, spec, processorConstructor, factory)
		// The wrapped processors need to be passed the post-process specs,
		// since they inspect them to figure out information about needed
		// columns. This means that we'll let those processors do any renders
		// or filters, which isn't ideal. We could improve this.
		post = &execinfrapb.PostProcessSpec{}

	} else {
		switch {
		case core.Noop != nil:
			if err := checkNumIn(inputs, 1); err != nil {
				return result, err
			}
			result.Op, result.IsStreaming = NewNoop(inputs[0]), true
			result.ColumnTypes = make([]*types.T, len(spec.Input[0].ColumnTypes))
			copy(result.ColumnTypes, spec.Input[0].ColumnTypes)
		case core.TableReader != nil:
			if err := checkNumIn(inputs, 0); err != nil {
				return result, err
			}
			var scanOp *colBatchScan
			scanOp, err = newColBatchScan(streamingAllocator, flowCtx, core.TableReader, post)
			if err != nil {
				return result, err
			}
			result.Op, result.IsStreaming = scanOp, true
			result.MetadataSources = append(result.MetadataSources, scanOp)
			// colBatchScan is wrapped with a cancel checker below, so we need to
			// log its creation separately.
			log.VEventf(ctx, 1, "made op %T\n", result.Op)

			// We want to check for cancellation once per input batch, and wrapping
			// only colBatchScan with a CancelChecker allows us to do just that.
			// It's sufficient for most of the operators since they are extremely
			// fast. However, some of the long-running operators (for example, the
			// sorter) are still responsible for doing the cancellation check on
			// their own while performing long operations.
			result.Op = NewCancelChecker(result.Op)
			returnMutations := core.TableReader.Visibility == execinfra.ScanVisibilityPublicAndNotPublic
			result.ColumnTypes = core.TableReader.Table.ColumnTypesWithMutations(returnMutations)
		case core.Aggregator != nil:
			if err := checkNumIn(inputs, 1); err != nil {
				return result, err
			}
			aggSpec := core.Aggregator
			if len(aggSpec.Aggregations) == 0 {
				// We can get an aggregator when no aggregate functions are present
				// if a HAVING clause is present, for example, with a query as
				// follows: SELECT 1 FROM t HAVING true. In this case, we plan a
				// special operator that outputs a batch of length 1 without actual
				// columns once and then zero-length batches. The actual "data" will
				// be added by projections below.
				// TODO(solon): The distsql plan for this case includes a TableReader,
				// so we end up creating an orphaned colBatchScan. We should avoid
				// that. Ideally the optimizer would not plan a scan in this unusual
				// case.
				result.Op, result.IsStreaming, err = NewSingleTupleNoInputOp(streamingAllocator), true, nil
				// We make ColumnTypes non-nil so that sanity check doesn't panic.
				result.ColumnTypes = []*types.T{}
				break
			}
			if aggSpec.IsRowCount() {
				result.Op, result.IsStreaming, err = NewCountOp(streamingAllocator, inputs[0]), true, nil
				result.ColumnTypes = []*types.T{types.Int}
				break
			}

			var groupCols, orderedCols util.FastIntSet

			for _, col := range aggSpec.OrderedGroupCols {
				orderedCols.Add(int(col))
			}

			needHash := false
			for _, col := range aggSpec.GroupCols {
				if !orderedCols.Contains(int(col)) {
					needHash = true
				}
				groupCols.Add(int(col))
			}
			if !orderedCols.SubsetOf(groupCols) {
				return result, errors.AssertionFailedf("ordered cols must be a subset of grouping cols")
			}
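			// Illustration (hypothetical column sets): with GroupCols={0,1} and
			// OrderedGroupCols={0}, column 1 is not ordered at the input, so
			// needHash is true and the hash aggregator is planned below; with
			// OrderedGroupCols={0,1} the ordered aggregator suffices.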

			aggTyps := make([][]*types.T, len(aggSpec.Aggregations))
			aggCols := make([][]uint32, len(aggSpec.Aggregations))
			aggFns := make([]execinfrapb.AggregatorSpec_Func, len(aggSpec.Aggregations))
			result.ColumnTypes = make([]*types.T, len(aggSpec.Aggregations))
			for i, agg := range aggSpec.Aggregations {
				aggTyps[i] = make([]*types.T, len(agg.ColIdx))
				for j, colIdx := range agg.ColIdx {
					aggTyps[i][j] = spec.Input[0].ColumnTypes[colIdx]
				}
				aggCols[i] = agg.ColIdx
				aggFns[i] = agg.Func
				_, retType, err := execinfrapb.GetAggregateInfo(agg.Func, aggTyps[i]...)
				if err != nil {
					return result, err
				}
				result.ColumnTypes[i] = retType
			}
			typs := make([]*types.T, len(spec.Input[0].ColumnTypes))
			copy(typs, spec.Input[0].ColumnTypes)
			if needHash {
				hashAggregatorMemAccount := streamingMemAccount
				if !useStreamingMemAccountForBuffering {
					// Create an unlimited mem account explicitly even though there is
					// no disk spilling because the memory usage of an aggregator is
					// proportional to the number of groups, not the number of inputs.
					// The row execution engine also gives an unlimited (that still
					// needs to be approved by the upstream monitor, so not really
					// "unlimited") amount of memory to the aggregator.
					hashAggregatorMemAccount = result.createBufferingUnlimitedMemAccount(ctx, flowCtx, "hash-aggregator")
				}
				result.Op, err = NewHashAggregator(
					colmem.NewAllocator(ctx, hashAggregatorMemAccount, factory), inputs[0], typs, aggFns,
					aggSpec.GroupCols, aggCols,
				)
			} else {
				result.Op, err = NewOrderedAggregator(
					streamingAllocator, inputs[0], typs, aggFns,
					aggSpec.GroupCols, aggCols, aggSpec.IsScalar(),
				)
				result.IsStreaming = true
			}

		case core.Distinct != nil:
			if err := checkNumIn(inputs, 1); err != nil {
				return result, err
			}
			result.ColumnTypes = make([]*types.T, len(spec.Input[0].ColumnTypes))
			copy(result.ColumnTypes, spec.Input[0].ColumnTypes)
			if len(core.Distinct.OrderedColumns) == len(core.Distinct.DistinctColumns) {
				result.Op, err = NewOrderedDistinct(inputs[0], core.Distinct.OrderedColumns, result.ColumnTypes)
				result.IsStreaming = true
			} else {
				distinctMemAccount := streamingMemAccount
				if !useStreamingMemAccountForBuffering {
					// Create an unlimited mem account explicitly even though there is
					// no disk spilling because the memory usage of an unordered
					// distinct operator is proportional to the number of distinct
					// tuples, not the number of input tuples.
					// The row execution engine also gives an unlimited (that still
					// needs to be approved by the upstream monitor, so not really
					// "unlimited") amount of memory to the unordered distinct operator.
					distinctMemAccount = result.createBufferingUnlimitedMemAccount(ctx, flowCtx, "distinct")
				}
				// TODO(yuzefovich): we have an implementation of partially ordered
				// distinct, and we should plan it when we have non-empty ordered
				// columns and we think that the probability of distinct tuples in the
				// input is about 0.01 or less.
				result.Op = NewUnorderedDistinct(
					colmem.NewAllocator(ctx, distinctMemAccount, factory), inputs[0],
					core.Distinct.DistinctColumns, result.ColumnTypes, hashTableNumBuckets,
				)
			}

		case core.Ordinality != nil:
			if err := checkNumIn(inputs, 1); err != nil {
				return result, err
			}
			outputIdx := len(spec.Input[0].ColumnTypes)
			result.Op = NewOrdinalityOp(streamingAllocator, inputs[0], outputIdx)
			result.IsStreaming = true
			result.ColumnTypes = appendOneType(spec.Input[0].ColumnTypes, types.Int)

		case core.HashJoiner != nil:
			if err := checkNumIn(inputs, 2); err != nil {
				return result, err
			}
			leftTypes := make([]*types.T, len(spec.Input[0].ColumnTypes))
			copy(leftTypes, spec.Input[0].ColumnTypes)
			rightTypes := make([]*types.T, len(spec.Input[1].ColumnTypes))
			copy(rightTypes, spec.Input[1].ColumnTypes)

			hashJoinerMemMonitorName := fmt.Sprintf("hash-joiner-%d", spec.ProcessorID)
			var hashJoinerMemAccount *mon.BoundAccount
			if useStreamingMemAccountForBuffering {
				hashJoinerMemAccount = streamingMemAccount
			} else {
				hashJoinerMemAccount = result.createMemAccountForSpillStrategy(
					ctx, flowCtx, hashJoinerMemMonitorName,
				)
			}
			// It is valid for an empty set of equality columns to be considered a
			// "key" (for example, when the input has at most 1 row). However, the
			// hash joiner, in order to handle NULL values correctly, needs to
			// think that an empty set of equality columns doesn't form a key.
			rightEqColsAreKey := core.HashJoiner.RightEqColumnsAreKey && len(core.HashJoiner.RightEqColumns) > 0
			hjSpec, err := makeHashJoinerSpec(
				core.HashJoiner.Type,
				core.HashJoiner.LeftEqColumns,
				core.HashJoiner.RightEqColumns,
				leftTypes,
				rightTypes,
				rightEqColsAreKey,
			)
			if err != nil {
				return result, err
			}
			inMemoryHashJoiner := newHashJoiner(
				colmem.NewAllocator(ctx, hashJoinerMemAccount, factory), hjSpec, inputs[0], inputs[1],
			)
			if args.TestingKnobs.DiskSpillingDisabled {
				// We will not be creating a disk-backed hash joiner because we're
				// running a test that explicitly asked for only the in-memory hash
				// joiner.
				result.Op = inMemoryHashJoiner
			} else {
				diskAccount := result.createDiskAccount(ctx, flowCtx, hashJoinerMemMonitorName)
				result.Op = newTwoInputDiskSpiller(
					inputs[0], inputs[1], inMemoryHashJoiner.(bufferingInMemoryOperator),
					hashJoinerMemMonitorName,
					func(inputOne, inputTwo colexecbase.Operator) colexecbase.Operator {
						monitorNamePrefix := "external-hash-joiner"
						unlimitedAllocator := colmem.NewAllocator(
							ctx, result.createBufferingUnlimitedMemAccount(
								ctx, flowCtx, monitorNamePrefix,
							), factory)
						// Make a copy of the DiskQueueCfg and set defaults for the hash
						// joiner. The cache mode is chosen to automatically close the
						// cache belonging to partitions at a parent level when
						// repartitioning.
						diskQueueCfg := args.DiskQueueCfg
						diskQueueCfg.CacheMode = colcontainer.DiskQueueCacheModeClearAndReuseCache
						diskQueueCfg.SetDefaultBufferSizeBytesForCacheMode()
						ehj := newExternalHashJoiner(
							unlimitedAllocator, hjSpec,
							inputOne, inputTwo,
							execinfra.GetWorkMemLimit(flowCtx.Cfg),
							diskQueueCfg,
							args.FDSemaphore,
							func(input colexecbase.Operator, inputTypes []*types.T, orderingCols []execinfrapb.Ordering_Column, maxNumberPartitions int) (colexecbase.Operator, error) {
								sortArgs := args
								if !args.TestingKnobs.DelegateFDAcquisitions {
									// Set the FDSemaphore to nil. This indicates that no FDs
									// should be acquired. The external hash joiner will do
									// this up front.
									sortArgs.FDSemaphore = nil
								}
								return result.createDiskBackedSort(
									ctx, flowCtx, sortArgs, input, inputTypes,
									execinfrapb.Ordering{Columns: orderingCols},
									0 /* matchLen */, maxNumberPartitions, spec.ProcessorID,
									&execinfrapb.PostProcessSpec{}, monitorNamePrefix+"-", factory)
							},
							args.TestingKnobs.NumForcedRepartitions,
							args.TestingKnobs.DelegateFDAcquisitions,
							diskAccount,
						)
						result.ToClose = append(result.ToClose, ehj.(IdempotentCloser))
						return ehj
					},
					args.TestingKnobs.SpillingCallbackFn,
				)
			}
			result.ColumnTypes = make([]*types.T, len(leftTypes)+len(rightTypes))
			copy(result.ColumnTypes, leftTypes)
			if !core.HashJoiner.Type.ShouldIncludeRightColsInOutput() {
				result.ColumnTypes = result.ColumnTypes[:len(leftTypes):len(leftTypes)]
			} else {
				copy(result.ColumnTypes[len(leftTypes):], rightTypes)
			}

			if !core.HashJoiner.OnExpr.Empty() && core.HashJoiner.Type == sqlbase.InnerJoin {
				if err =
					result.planAndMaybeWrapOnExprAsFilter(ctx, flowCtx, core.HashJoiner.OnExpr,
						streamingMemAccount, processorConstructor, factory); err != nil {
					return result, err
				}
			}

		case core.MergeJoiner != nil:
			if err := checkNumIn(inputs, 2); err != nil {
				return result, err
			}
			// The merge joiner is a streaming operator when the equality columns
			// form a key for both of the inputs.
			result.IsStreaming = core.MergeJoiner.LeftEqColumnsAreKey && core.MergeJoiner.RightEqColumnsAreKey

			leftTypes := make([]*types.T, len(spec.Input[0].ColumnTypes))
			copy(leftTypes, spec.Input[0].ColumnTypes)
			rightTypes := make([]*types.T, len(spec.Input[1].ColumnTypes))
			copy(rightTypes, spec.Input[1].ColumnTypes)

			joinType := core.MergeJoiner.Type
			var onExpr *execinfrapb.Expression
			if !core.MergeJoiner.OnExpr.Empty() {
				if joinType != sqlbase.InnerJoin {
					return result, errors.AssertionFailedf(
						"ON expression (%s) was unexpectedly planned for merge joiner with join type %s",
						core.MergeJoiner.OnExpr.String(), core.MergeJoiner.Type.String(),
					)
				}
				onExpr = &core.MergeJoiner.OnExpr
			}

			monitorName := "merge-joiner"
			// We are using an unlimited memory monitor here because the merge
			// joiner itself is responsible for making sure that we stay within
			// the memory limit, and it will fall back to disk if necessary.
			unlimitedAllocator := colmem.NewAllocator(
				ctx, result.createBufferingUnlimitedMemAccount(
					ctx, flowCtx, monitorName,
				), factory)
			diskAccount := result.createDiskAccount(ctx, flowCtx, monitorName)
			mj, err := newMergeJoinOp(
				unlimitedAllocator, execinfra.GetWorkMemLimit(flowCtx.Cfg),
				args.DiskQueueCfg, args.FDSemaphore,
				joinType, inputs[0], inputs[1], leftTypes, rightTypes,
				core.MergeJoiner.LeftOrdering.Columns, core.MergeJoiner.RightOrdering.Columns,
				diskAccount,
			)
			if err != nil {
				return result, err
			}

			result.Op = mj
			result.ToClose = append(result.ToClose, mj.(IdempotentCloser))
			result.ColumnTypes = make([]*types.T, len(leftTypes)+len(rightTypes))
			copy(result.ColumnTypes, leftTypes)
			if !core.MergeJoiner.Type.ShouldIncludeRightColsInOutput() {
				result.ColumnTypes = result.ColumnTypes[:len(leftTypes):len(leftTypes)]
			} else {
				copy(result.ColumnTypes[len(leftTypes):], rightTypes)
			}

			if onExpr != nil {
				if err = result.planAndMaybeWrapOnExprAsFilter(ctx, flowCtx, *onExpr,
					streamingMemAccount, processorConstructor, factory); err != nil {
					return result, err
				}
			}

		case core.Sorter != nil:
			if err := checkNumIn(inputs, 1); err != nil {
				return result, err
			}
			input := inputs[0]
			result.ColumnTypes = make([]*types.T, len(spec.Input[0].ColumnTypes))
			copy(result.ColumnTypes, spec.Input[0].ColumnTypes)
			ordering := core.Sorter.OutputOrdering
			matchLen := core.Sorter.OrderingMatchLen
			result.Op, err = result.createDiskBackedSort(
				ctx, flowCtx, args, input, result.ColumnTypes, ordering, matchLen, 0, /* maxNumberPartitions */
				spec.ProcessorID, post, "" /* memMonitorNamePrefix */, factory,
			)

		case core.Windower != nil:
			if err := checkNumIn(inputs, 1); err != nil {
				return result, err
			}
			memMonitorsPrefix := "window-"
			input := inputs[0]
			result.ColumnTypes = make([]*types.T, len(spec.Input[0].ColumnTypes))
			copy(result.ColumnTypes, spec.Input[0].ColumnTypes)
			for _, wf := range core.Windower.WindowFns {
				// We allocate the capacity for two extra types because of the
				// temporary columns that can be appended below.
				typs := make([]*types.T, len(result.ColumnTypes), len(result.ColumnTypes)+2)
				copy(typs, result.ColumnTypes)
				tempColOffset, partitionColIdx := uint32(0), columnOmitted
				peersColIdx := columnOmitted
				windowFn := *wf.Func.WindowFunc
				if len(core.Windower.PartitionBy) > 0 {
					// TODO(yuzefovich): add support for hashing partitioner (probably by
					// leveraging hash routers once we can distribute). The decision about
					// which kind of partitioner to use should come from the optimizer.
					partitionColIdx = int(wf.OutputColIdx)
					input, err = NewWindowSortingPartitioner(
						streamingAllocator, input, typs,
						core.Windower.PartitionBy, wf.Ordering.Columns, int(wf.OutputColIdx),
						func(input colexecbase.Operator, inputTypes []*types.T, orderingCols []execinfrapb.Ordering_Column) (colexecbase.Operator, error) {
							return result.createDiskBackedSort(
								ctx, flowCtx, args, input, inputTypes,
								execinfrapb.Ordering{Columns: orderingCols}, 0, /* matchLen */
								0 /* maxNumberPartitions */, spec.ProcessorID,
								&execinfrapb.PostProcessSpec{}, memMonitorsPrefix, factory)
						},
					)
					// Window partitioner will append a boolean column.
					tempColOffset++
					typs = typs[:len(typs)+1]
					typs[len(typs)-1] = types.Bool
				} else {
					if len(wf.Ordering.Columns) > 0 {
						input, err = result.createDiskBackedSort(
							ctx, flowCtx, args, input, typs,
							wf.Ordering, 0 /* matchLen */, 0, /* maxNumberPartitions */
							spec.ProcessorID, &execinfrapb.PostProcessSpec{}, memMonitorsPrefix, factory,
						)
					}
				}
				if err != nil {
					return result, err
				}
				if windowFnNeedsPeersInfo(*wf.Func.WindowFunc) {
					peersColIdx = int(wf.OutputColIdx + tempColOffset)
					input, err = NewWindowPeerGrouper(
						streamingAllocator, input, typs, wf.Ordering.Columns,
						partitionColIdx, peersColIdx,
					)
					// Window peer grouper will append a boolean column.
					tempColOffset++
					typs = typs[:len(typs)+1]
					typs[len(typs)-1] = types.Bool
				}

				outputIdx := int(wf.OutputColIdx + tempColOffset)
				switch windowFn {
				case execinfrapb.WindowerSpec_ROW_NUMBER:
					result.Op = NewRowNumberOperator(streamingAllocator, input, outputIdx, partitionColIdx)
				case execinfrapb.WindowerSpec_RANK, execinfrapb.WindowerSpec_DENSE_RANK:
					result.Op, err = NewRankOperator(
						streamingAllocator, input, windowFn, wf.Ordering.Columns,
						outputIdx, partitionColIdx, peersColIdx,
					)
				case execinfrapb.WindowerSpec_PERCENT_RANK, execinfrapb.WindowerSpec_CUME_DIST:
					// We are using an unlimited memory monitor here because
					// relative rank operators themselves are responsible for
					// making sure that we stay within the memory limit, and
					// they will fall back to disk if necessary.
					memAccName := memMonitorsPrefix + "relative-rank"
					unlimitedAllocator := colmem.NewAllocator(
						ctx, result.createBufferingUnlimitedMemAccount(ctx, flowCtx, memAccName), factory,
					)
					diskAcc := result.createDiskAccount(ctx, flowCtx, memAccName)
					result.Op, err = NewRelativeRankOperator(
						unlimitedAllocator, execinfra.GetWorkMemLimit(flowCtx.Cfg), args.DiskQueueCfg,
						args.FDSemaphore, input, typs, windowFn, wf.Ordering.Columns,
						outputIdx, partitionColIdx, peersColIdx, diskAcc,
					)
					// NewRelativeRankOperator sometimes returns a constOp when there
					// are no ordering columns, so we check that the returned operator
					// is an IdempotentCloser.
					if c, ok := result.Op.(IdempotentCloser); ok {
						result.ToClose = append(result.ToClose, c)
					}
				default:
					return result, errors.AssertionFailedf("window function %s is not supported", wf.String())
				}

				if tempColOffset > 0 {
					// We want to project out the temporary columns (which have indices
					// in the range [wf.OutputColIdx, wf.OutputColIdx+tempColOffset)).
					projection := make([]uint32, 0, wf.OutputColIdx+tempColOffset)
					for i := uint32(0); i < wf.OutputColIdx; i++ {
						projection = append(projection, i)
					}
					projection = append(projection, wf.OutputColIdx+tempColOffset)
					result.Op = NewSimpleProjectOp(result.Op, int(wf.OutputColIdx+tempColOffset), projection)
				}

				_, returnType, err := execinfrapb.GetWindowFunctionInfo(wf.Func, []*types.T{}...)
				if err != nil {
					return result, err
				}
				result.ColumnTypes = appendOneType(result.ColumnTypes, returnType)
				input = result.Op
			}

		default:
			return result, errors.Newf("unsupported processor core %q", core)
		}
	}

	if err != nil {
		return result, err
	}

	// After constructing the base operator, calculate its internal memory usage.
	if sMem, ok := result.Op.(InternalMemoryOperator); ok {
		result.InternalMemUsage += sMem.InternalMemoryUsage()
	}
	log.VEventf(ctx, 1, "made op %T\n", result.Op)

	// Note: at this point, it is legal for ColumnTypes to be empty (it is
	// legal for empty rows to be passed between processors).

	ppr := postProcessResult{
		Op:          result.Op,
		ColumnTypes: result.ColumnTypes,
	}
	err = ppr.planPostProcessSpec(ctx, flowCtx, post, streamingMemAccount, factory)
	// TODO(yuzefovich): update unit tests to remove panic-catcher when fallback
	// to rowexec is not allowed.
	if err != nil && processorConstructor == nil {
		// Do not attempt to wrap as a row source if there is no
		// processorConstructor because it would fail.
		return result, err
	}

	if err != nil {
		log.VEventf(
			ctx, 2,
			"vectorized post process planning failed with error %v post spec is %s, attempting to wrap as a row source",
			err, post,
		)
		if core.TableReader != nil {
			// We cannot naively wrap a TableReader's post-processing spec since it
			// might project out unneeded columns that are of unsupported types.
			// These columns are still returned, either as coltypes.Unhandled if the
			// type is unsupported, or as an empty column of a supported type. If we
			// were to wrap an unsupported post-processing spec, a Materializer
			// would naively decode these columns, which would return errors (e.g.
			// UUIDs require 16 bytes, coltypes.Unhandled may not be decoded).
			inputTypes := make([][]*types.T, len(spec.Input))
			for inputIdx, input := range spec.Input {
				inputTypes[inputIdx] = make([]*types.T, len(input.ColumnTypes))
				copy(inputTypes[inputIdx], input.ColumnTypes)
			}
			result.resetToState(ctx, resultPreSpecPlanningStateShallowCopy)
			err = result.createAndWrapRowSource(
				ctx, flowCtx, inputs, inputTypes, streamingMemAccount, spec, processorConstructor, factory,
			)
			if err != nil {
				// There was an error wrapping the TableReader.
				return result, err
			}
		} else {
			err = result.wrapPostProcessSpec(ctx, flowCtx, post, streamingMemAccount, processorConstructor, factory)
		}
	} else {
		// The result can be updated with the post process result.
		result.updateWithPostProcessResult(ppr)
	}
	return result, err
}
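
// Illustrative sketch of how a caller might drive NewColOperator (the field
// values and the rowexec.NewProcessor constructor are assumptions here; the
// real call site is the vectorized flow setup, which is not part of this
// file):
//
//	args := NewColOperatorArgs{
//		Spec:                 processorSpec,
//		Inputs:               inputOperators,
//		StreamingMemAccount:  &streamingAcc,
//		ProcessorConstructor: rowexec.NewProcessor,
//		DiskQueueCfg:         diskQueueCfg,
//		FDSemaphore:          fdSemaphore,
//	}
//	result, err := NewColOperator(ctx, flowCtx, args)
//	if err != nil {
//		// Accounts and monitors opened during planning have already been
//		// cleaned up by NewColOperator's deferred handler.
//	}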

// planAndMaybeWrapOnExprAsFilter plans a joiner ON expression as a filter. If
// the filter is unsupported, it is planned as a wrapped noop processor with
// the filter as a post-processing stage.
func (r *NewColOperatorResult) planAndMaybeWrapOnExprAsFilter(
	ctx context.Context,
	flowCtx *execinfra.FlowCtx,
	onExpr execinfrapb.Expression,
	streamingMemAccount *mon.BoundAccount,
	processorConstructor execinfra.ProcessorConstructor,
	factory coldata.ColumnFactory,
) error {
	// We will plan other Operators on top of r.Op, so we need to account for the
	// internal memory explicitly.
	if internalMemOp, ok := r.Op.(InternalMemoryOperator); ok {
		r.InternalMemUsage += internalMemOp.InternalMemoryUsage()
	}
	ppr := postProcessResult{
		Op:          r.Op,
		ColumnTypes: r.ColumnTypes,
	}
	if err := ppr.planFilterExpr(
		ctx, flowCtx.NewEvalCtx(), onExpr, streamingMemAccount, factory,
	); err != nil {
		// ON expression planning failed. Fall back to planning the filter
		// using row execution.
		log.VEventf(
			ctx, 2,
			"vectorized join ON expr planning failed with error %v ON expr is %s, attempting to wrap as a row source",
			err, onExpr.String(),
		)

		onExprAsFilter := &execinfrapb.PostProcessSpec{Filter: onExpr}
		return r.wrapPostProcessSpec(ctx, flowCtx, onExprAsFilter, streamingMemAccount, processorConstructor, factory)
	}
	r.updateWithPostProcessResult(ppr)
	return nil
}

// wrapPostProcessSpec plans the given post process spec by wrapping a noop
// processor with that output spec. This is used to fall back to row execution
// when encountering unsupported post processing specs. An error is returned
// if the wrapping failed. A reason for this could be an unsupported type, in
// which case the row execution engine is used fully.
func (r *NewColOperatorResult) wrapPostProcessSpec(
	ctx context.Context,
	flowCtx *execinfra.FlowCtx,
	post *execinfrapb.PostProcessSpec,
	streamingMemAccount *mon.BoundAccount,
	processorConstructor execinfra.ProcessorConstructor,
	factory coldata.ColumnFactory,
) error {
	noopSpec := &execinfrapb.ProcessorSpec{
		Core: execinfrapb.ProcessorCoreUnion{
			Noop: &execinfrapb.NoopCoreSpec{},
		},
		Post: *post,
	}
	return r.createAndWrapRowSource(
		ctx, flowCtx, []colexecbase.Operator{r.Op}, [][]*types.T{r.ColumnTypes},
		streamingMemAccount, noopSpec, processorConstructor, factory,
	)
}

// planPostProcessSpec plans the post processing stage specified in post on top
// of r.Op.
func (r *postProcessResult) planPostProcessSpec(
	ctx context.Context,
	flowCtx *execinfra.FlowCtx,
	post *execinfrapb.PostProcessSpec,
	streamingMemAccount *mon.BoundAccount,
	factory coldata.ColumnFactory,
) error {
	if !post.Filter.Empty() {
		if err := r.planFilterExpr(
			ctx, flowCtx.NewEvalCtx(), post.Filter, streamingMemAccount, factory,
		); err != nil {
			return err
		}
	}

	if post.Projection {
		r.addProjection(post.OutputColumns)
	} else if post.RenderExprs != nil {
		log.VEventf(ctx, 2, "planning render expressions %+v", post.RenderExprs)
		var renderedCols []uint32
		for _, expr := range post.RenderExprs {
			var (
				helper            execinfra.ExprHelper
				renderInternalMem int
			)
			err := helper.Init(expr, r.ColumnTypes, flowCtx.EvalCtx)
			if err != nil {
				return err
			}
			var outputIdx int
			r.Op, outputIdx, r.ColumnTypes, renderInternalMem, err = planProjectionOperators(
				ctx, flowCtx.NewEvalCtx(), helper.Expr, r.ColumnTypes, r.Op, streamingMemAccount, factory,
			)
			if err != nil {
				return errors.Wrapf(err, "unable to columnarize render expression %q", expr)
			}
			if outputIdx < 0 {
				return errors.AssertionFailedf("missing outputIdx")
			}
			r.InternalMemUsage += renderInternalMem
			renderedCols = append(renderedCols, uint32(outputIdx))
		}
		r.Op = NewSimpleProjectOp(r.Op, len(r.ColumnTypes), renderedCols)
		newTypes := make([]*types.T, len(renderedCols))
		for i, j := range renderedCols {
			newTypes[i] = r.ColumnTypes[j]
		}
		r.ColumnTypes = newTypes
	}
	if post.Offset != 0 {
		r.Op = NewOffsetOp(r.Op, int(post.Offset))
	}
	if post.Limit != 0 {
		r.Op = NewLimitOp(r.Op, int(post.Limit))
	}
	return nil
}

// createBufferingUnlimitedMemMonitor instantiates an unlimited memory monitor.
// These should only be used when spilling to disk and an operator is made aware
// of a memory usage limit separately.
// The receiver is updated to have a reference to the unlimited memory monitor.
func (r *NewColOperatorResult) createBufferingUnlimitedMemMonitor(
	ctx context.Context, flowCtx *execinfra.FlowCtx, name string,
) *mon.BytesMonitor {
	bufferingOpUnlimitedMemMonitor := execinfra.NewMonitor(
		ctx, flowCtx.EvalCtx.Mon, name+"-unlimited",
	)
	r.OpMonitors = append(r.OpMonitors, bufferingOpUnlimitedMemMonitor)
	return bufferingOpUnlimitedMemMonitor
}

// createMemAccountForSpillStrategy instantiates a memory monitor and a memory
// account to be used with a buffering Operator that can fall back to disk.
// The default memory limit is used, but if flowCtx.Cfg.ForceDiskSpill is set,
// the limit will be 1. The receiver is updated to have references to both
// objects.
func (r *NewColOperatorResult) createMemAccountForSpillStrategy(
	ctx context.Context, flowCtx *execinfra.FlowCtx, name string,
) *mon.BoundAccount {
	bufferingOpMemMonitor := execinfra.NewLimitedMonitor(
		ctx, flowCtx.EvalCtx.Mon, flowCtx.Cfg, name+"-limited",
	)
	r.OpMonitors = append(r.OpMonitors, bufferingOpMemMonitor)
	bufferingMemAccount := bufferingOpMemMonitor.MakeBoundAccount()
	r.OpAccounts = append(r.OpAccounts, &bufferingMemAccount)
	return &bufferingMemAccount
}

// createBufferingUnlimitedMemAccount instantiates an unlimited memory monitor
// and a memory account to be used with a buffering disk-backed Operator. The
// receiver is updated to have references to both objects. Note that the
// returned account is only "unlimited" in that it does not have a hard limit
// that it enforces, but a limit might be enforced by a root monitor.
func (r *NewColOperatorResult) createBufferingUnlimitedMemAccount(
	ctx context.Context, flowCtx *execinfra.FlowCtx, name string,
) *mon.BoundAccount {
	bufferingOpUnlimitedMemMonitor := r.createBufferingUnlimitedMemMonitor(ctx, flowCtx, name)
	bufferingMemAccount := bufferingOpUnlimitedMemMonitor.MakeBoundAccount()
	r.OpAccounts = append(r.OpAccounts, &bufferingMemAccount)
	return &bufferingMemAccount
}
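
// Summary of the four accounting helpers in this file:
//   - createMemAccountForSpillStrategy: limited monitor; hitting the limit is
//     the signal for a buffering operator to spill to disk.
//   - createBufferingUnlimitedMemAccount: no hard local limit; the operator
//     enforces the memory limit itself, and a root monitor still applies.
//   - createStandaloneMemAccount (below): standalone budget that is not
//     reported to the root monitor at all.
//   - createDiskAccount (below): tracks disk, not memory, for spilling
//     operators.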

// createStandaloneMemAccount instantiates an unlimited memory monitor and a
// memory account that have a standalone budget. This means that the memory
// registered with these objects is *not* reported to the root monitor (i.e.
// it will not count towards max-sql-memory). Use it only when the memory in
// use is accounted for with a different memory monitor. The receiver is
// updated to have references to both objects.
func (r *NewColOperatorResult) createStandaloneMemAccount(
	ctx context.Context, flowCtx *execinfra.FlowCtx, name string,
) *mon.BoundAccount {
	standaloneMemMonitor := mon.MakeMonitor(
		name+"-standalone",
		mon.MemoryResource,
		nil,           /* curCount */
		nil,           /* maxHist */
		-1,            /* increment: use default increment */
		math.MaxInt64, /* noteworthy */
		flowCtx.Cfg.Settings,
	)
	r.OpMonitors = append(r.OpMonitors, &standaloneMemMonitor)
	standaloneMemMonitor.Start(ctx, nil, mon.MakeStandaloneBudget(math.MaxInt64))
	standaloneMemAccount := standaloneMemMonitor.MakeBoundAccount()
	r.OpAccounts = append(r.OpAccounts, &standaloneMemAccount)
	return &standaloneMemAccount
}

// createDiskAccount instantiates an unlimited disk monitor and a disk account
// to be used for the disk spilling infrastructure in the vectorized engine.
// TODO(azhng): consolidate all allocation monitor/account management into one
// place after the branch cut for 20.1.
func (r *NewColOperatorResult) createDiskAccount(
	ctx context.Context, flowCtx *execinfra.FlowCtx, name string,
) *mon.BoundAccount {
	opDiskMonitor := execinfra.NewMonitor(ctx, flowCtx.Cfg.DiskMonitor, name)
	r.OpMonitors = append(r.OpMonitors, opDiskMonitor)
	opDiskAccount := opDiskMonitor.MakeBoundAccount()
	r.OpAccounts = append(r.OpAccounts, &opDiskAccount)
	return &opDiskAccount
}

type postProcessResult struct {
	Op               colexecbase.Operator
	ColumnTypes      []*types.T
	InternalMemUsage int
}

func (r *NewColOperatorResult) updateWithPostProcessResult(ppr postProcessResult) {
	r.Op = ppr.Op
	r.ColumnTypes = make([]*types.T, len(ppr.ColumnTypes))
	copy(r.ColumnTypes, ppr.ColumnTypes)
	r.InternalMemUsage += ppr.InternalMemUsage
}

func (r *postProcessResult) planFilterExpr(
	ctx context.Context,
	evalCtx *tree.EvalContext,
	filter execinfrapb.Expression,
	acc *mon.BoundAccount,
	factory coldata.ColumnFactory,
) error {
	var (
		helper               execinfra.ExprHelper
		selectionInternalMem int
	)
	err := helper.Init(filter, r.ColumnTypes, evalCtx)
	if err != nil {
		return err
	}
	if helper.Expr == tree.DNull {
		// The filter expression is tree.DNull meaning that it is always false, so
		// we put a zero operator.
		r.Op = NewZeroOp(r.Op)
		return nil
	}
	var filterColumnTypes []*types.T
	r.Op, _, filterColumnTypes, selectionInternalMem, err = planSelectionOperators(
		ctx, evalCtx, helper.Expr, r.ColumnTypes, r.Op, acc, factory,
	)
	if err != nil {
		return errors.Wrapf(err, "unable to columnarize filter expression %q", filter.Expr)
	}
	r.InternalMemUsage += selectionInternalMem
	if len(filterColumnTypes) > len(r.ColumnTypes) {
		// Additional columns were appended to store projections while evaluating
		// the filter. Project them away.
		var outputColumns []uint32
		for i := range r.ColumnTypes {
			outputColumns = append(outputColumns, uint32(i))
		}
		r.Op = NewSimpleProjectOp(r.Op, len(filterColumnTypes), outputColumns)
	}
	return nil
}
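
// Worked example for the projection helpers here (hypothetical schema): with
// ColumnTypes = {Bool, Int, Float}, addProjection([]uint32{2, 0}) keeps only
// columns 2 and 0, in that order, so ColumnTypes becomes {Float, Bool}.
// Likewise, planFilterExpr above projects away any columns that selection
// planning appended beyond the original schema, which avoids the aliasing
// hazard described in the NOTE near the top of this file.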
1407 newTypes := make([]*types.T, len(projection)) 1408 for i, j := range projection { 1409 newTypes[i] = r.ColumnTypes[j] 1410 } 1411 r.ColumnTypes = newTypes 1412 } 1413 1414 func planSelectionOperators( 1415 ctx context.Context, 1416 evalCtx *tree.EvalContext, 1417 expr tree.TypedExpr, 1418 columnTypes []*types.T, 1419 input colexecbase.Operator, 1420 acc *mon.BoundAccount, 1421 factory coldata.ColumnFactory, 1422 ) (op colexecbase.Operator, resultIdx int, typs []*types.T, internalMemUsed int, err error) { 1423 switch t := expr.(type) { 1424 case *tree.IndexedVar: 1425 op, err = boolOrUnknownToSelOp(input, columnTypes, t.Idx) 1426 return op, -1, columnTypes, internalMemUsed, err 1427 case *tree.AndExpr: 1428 // AND expressions are handled by an implicit AND'ing of selection vectors. 1429 // First we select out the tuples that are true on the left side, and then, 1430 // only among the matched tuples, we select out the tuples that are true on 1431 // the right side. 1432 var leftOp, rightOp colexecbase.Operator 1433 var internalMemUsedLeft, internalMemUsedRight int 1434 leftOp, _, typs, internalMemUsedLeft, err = planSelectionOperators( 1435 ctx, evalCtx, t.TypedLeft(), columnTypes, input, acc, factory, 1436 ) 1437 if err != nil { 1438 return nil, resultIdx, typs, internalMemUsed, err 1439 } 1440 rightOp, resultIdx, typs, internalMemUsedRight, err = planSelectionOperators( 1441 ctx, evalCtx, t.TypedRight(), typs, leftOp, acc, factory, 1442 ) 1443 return rightOp, resultIdx, typs, internalMemUsedLeft + internalMemUsedRight, err 1444 case *tree.OrExpr: 1445 // OR expressions are handled by converting them to an equivalent CASE 1446 // statement. Since CASE statements don't have a selection form, plan a 1447 // projection and then convert the resulting boolean to a selection vector. 1448 // 1449 // Rewrite the OR expression as an equivalent CASE expression. 1450 // "a OR b" becomes "CASE WHEN a THEN true WHEN b THEN true ELSE false END". 1451 // This way we can take advantage of the short-circuiting logic built into 1452 // the CASE operator. (b should not be evaluated if a is true.) 
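// For example, the filter `a = 1 OR b = 2` is rewritten below into
//   CASE WHEN a = 1 THEN true WHEN b = 2 THEN true ELSE false END
// which is planned as a projection and then converted into a selection
// vector by boolOrUnknownToSelOp.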
1453 caseExpr, err := tree.NewTypedCaseExpr( 1454 nil, /* expr */ 1455 []*tree.When{ 1456 {Cond: t.Left, Val: tree.DBoolTrue}, 1457 {Cond: t.Right, Val: tree.DBoolTrue}, 1458 }, 1459 tree.DBoolFalse, 1460 types.Bool) 1461 if err != nil { 1462 return nil, resultIdx, typs, internalMemUsed, err 1463 } 1464 op, resultIdx, typs, internalMemUsed, err = planProjectionOperators( 1465 ctx, evalCtx, caseExpr, columnTypes, input, acc, factory, 1466 ) 1467 if err != nil { 1468 return nil, resultIdx, typs, internalMemUsed, err 1469 } 1470 op, err = boolOrUnknownToSelOp(op, typs, resultIdx) 1471 return op, resultIdx, typs, internalMemUsed, err 1472 case *tree.CaseExpr: 1473 op, resultIdx, typs, internalMemUsed, err = planProjectionOperators( 1474 ctx, evalCtx, expr, columnTypes, input, acc, factory, 1475 ) 1476 if err != nil { 1477 return op, resultIdx, typs, internalMemUsed, err 1478 } 1479 op, err = boolOrUnknownToSelOp(op, typs, resultIdx) 1480 return op, resultIdx, typs, internalMemUsed, err 1481 case *tree.IsNullExpr: 1482 op, resultIdx, typs, internalMemUsed, err = planProjectionOperators( 1483 ctx, evalCtx, t.TypedInnerExpr(), columnTypes, input, acc, factory, 1484 ) 1485 op = newIsNullSelOp(op, resultIdx, false) 1486 return op, resultIdx, typs, internalMemUsed, err 1487 case *tree.IsNotNullExpr: 1488 op, resultIdx, typs, internalMemUsed, err = planProjectionOperators( 1489 ctx, evalCtx, t.TypedInnerExpr(), columnTypes, input, acc, factory, 1490 ) 1491 op = newIsNullSelOp(op, resultIdx, true) 1492 return op, resultIdx, typs, internalMemUsed, err 1493 case *tree.ComparisonExpr: 1494 cmpOp := t.Operator 1495 leftOp, leftIdx, ct, internalMemUsedLeft, err := planProjectionOperators( 1496 ctx, evalCtx, t.TypedLeft(), columnTypes, input, acc, factory, 1497 ) 1498 if err != nil { 1499 return nil, resultIdx, ct, internalMemUsed, err 1500 } 1501 lTyp := ct[leftIdx] 1502 if constArg, ok := t.Right.(tree.Datum); ok { 1503 if t.Operator == tree.Like || t.Operator == tree.NotLike { 1504 negate := t.Operator == tree.NotLike 1505 op, err = GetLikeOperator( 1506 evalCtx, leftOp, leftIdx, string(tree.MustBeDString(constArg)), negate) 1507 return op, resultIdx, ct, internalMemUsedLeft, err 1508 } 1509 if t.Operator == tree.In || t.Operator == tree.NotIn { 1510 negate := t.Operator == tree.NotIn 1511 datumTuple, ok := tree.AsDTuple(constArg) 1512 if !ok { 1513 err = errors.Errorf("IN is only supported for constant expressions") 1514 return nil, resultIdx, ct, internalMemUsed, err 1515 } 1516 op, err = GetInOperator(lTyp, leftOp, leftIdx, datumTuple, negate) 1517 return op, resultIdx, ct, internalMemUsedLeft, err 1518 } 1519 if t.Operator == tree.IsDistinctFrom || t.Operator == tree.IsNotDistinctFrom { 1520 if t.Right != tree.DNull { 1521 err = errors.Errorf("IS DISTINCT FROM and IS NOT DISTINCT FROM are supported only with NULL argument") 1522 return nil, resultIdx, ct, internalMemUsed, err 1523 } 1524 // IS NOT DISTINCT FROM NULL is synonymous with IS NULL and IS 1525 // DISTINCT FROM NULL is synonymous with IS NOT NULL (except for 1526 // tuples). Therefore, negate when the operator is IS DISTINCT 1527 // FROM NULL. 
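// For example, `x IS DISTINCT FROM NULL` selects exactly the tuples in
// which x is not NULL, hence negate=true below.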
1528 negate := t.Operator == tree.IsDistinctFrom
1529 op = newIsNullSelOp(leftOp, leftIdx, negate)
1530 return op, resultIdx, ct, internalMemUsedLeft, err
1531 }
1532 op, err := GetSelectionConstOperator(
1533 lTyp, t.TypedRight().ResolvedType(), cmpOp, leftOp, leftIdx,
1534 constArg, overloadHelper{},
1535 )
1536 return op, resultIdx, ct, internalMemUsedLeft, err
1537 }
1538 rightOp, rightIdx, ct, internalMemUsedRight, err := planProjectionOperators(
1539 ctx, evalCtx, t.TypedRight(), ct, leftOp, acc, factory,
1540 )
1541 if err != nil {
1542 return nil, resultIdx, ct, internalMemUsed, err
1543 }
1544 op, err := GetSelectionOperator(
1545 lTyp, ct[rightIdx], cmpOp, rightOp, leftIdx, rightIdx,
1546 overloadHelper{},
1547 )
1548 return op, resultIdx, ct, internalMemUsedLeft + internalMemUsedRight, err
1549 default:
1550 return nil, resultIdx, nil, internalMemUsed, errors.Errorf("unhandled selection expression type: %s", reflect.TypeOf(t))
1551 }
1552 }
1553 
1554 func checkCastSupported(fromType, toType *types.T) error {
1555 switch toType.Family() {
1556 case types.DecimalFamily:
1557 // If we're casting to a decimal, we only allow casting from a decimal
1558 // of the same precision because we lose the precision information
1559 // once we start operating on coltypes.T. For such casts we fall back
1560 // to the row-by-row engine.
1561 // TODO(yuzefovich): the coltypes.T type system has been removed;
1562 // reevaluate the situation.
1563 if !fromType.Identical(toType) {
1564 return errors.New("decimal casts with rounding unsupported")
1565 }
1566 }
1567 return nil
1568 }
1569 
1570 // planCastOperator plans a CAST operator that casts the column at index
1571 // 'inputIdx' (of type 'fromType') coming from the input into a column of
1572 // type 'toType' that will be output at index 'resultIdx'.
1573 func planCastOperator(
1574 ctx context.Context,
1575 acc *mon.BoundAccount,
1576 columnTypes []*types.T,
1577 input colexecbase.Operator,
1578 inputIdx int,
1579 fromType *types.T,
1580 toType *types.T,
1581 factory coldata.ColumnFactory,
1582 ) (op colexecbase.Operator, resultIdx int, typs []*types.T, err error) {
1583 if err := checkCastSupported(fromType, toType); err != nil {
1584 return op, resultIdx, typs, err
1585 }
1586 outputIdx := len(columnTypes)
1587 op, err = GetCastOperator(colmem.NewAllocator(ctx, acc, factory), input, inputIdx, outputIdx, fromType, toType)
1588 typs = appendOneType(columnTypes, toType)
1589 return op, outputIdx, typs, err
1590 }
1591 
1592 // planProjectionOperators plans a chain of operators to execute the provided
1593 // expression. It returns the tail of the chain, as well as the column index
1594 // of the expression's result (if any, otherwise -1) and the column types of the
1595 // resulting batches.
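// For example, planning `a + 1` against a batch with schema [INT8] (where a
// is column 0) appends a new output column for the sum, so the function
// would return resultIdx = 1 and typs = [INT8, INT8].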
1596 func planProjectionOperators(
1597 ctx context.Context,
1598 evalCtx *tree.EvalContext,
1599 expr tree.TypedExpr,
1600 columnTypes []*types.T,
1601 input colexecbase.Operator,
1602 acc *mon.BoundAccount,
1603 factory coldata.ColumnFactory,
1604 ) (op colexecbase.Operator, resultIdx int, typs []*types.T, internalMemUsed int, err error) {
1605 resultIdx = -1
1606 switch t := expr.(type) {
1607 case *tree.IndexedVar:
1608 return input, t.Idx, columnTypes, internalMemUsed, nil
1609 case *tree.ComparisonExpr:
1610 return planProjectionExpr(
1611 ctx, evalCtx, t.Operator, t.ResolvedType(), t.TypedLeft(), t.TypedRight(),
1612 columnTypes, input, acc, factory, overloadHelper{},
1613 )
1614 case *tree.BinaryExpr:
1615 if err = checkSupportedBinaryExpr(t.TypedLeft(), t.TypedRight(), t.ResolvedType()); err != nil {
1616 return op, resultIdx, typs, internalMemUsed, err
1617 }
1618 return planProjectionExpr(
1619 ctx, evalCtx, t.Operator, t.ResolvedType(), t.TypedLeft(), t.TypedRight(),
1620 columnTypes, input, acc, factory, overloadHelper{binFn: t.Fn},
1621 )
1622 case *tree.IsNullExpr:
1624 return planIsNullProjectionOp(ctx, evalCtx, t.ResolvedType(), t.TypedInnerExpr(), columnTypes, input, acc, false /* negate */, factory)
1625 case *tree.IsNotNullExpr:
1626 return planIsNullProjectionOp(ctx, evalCtx, t.ResolvedType(), t.TypedInnerExpr(), columnTypes, input, acc, true /* negate */, factory)
1627 case *tree.CastExpr:
1628 expr := t.Expr.(tree.TypedExpr)
1629 op, resultIdx, typs, internalMemUsed, err = planProjectionOperators(
1630 ctx, evalCtx, expr, columnTypes, input, acc, factory,
1631 )
1632 if err != nil {
1633 return nil, 0, nil, internalMemUsed, err
1634 }
1635 op, resultIdx, typs, err = planCastOperator(ctx, acc, typs, op, resultIdx, expr.ResolvedType(), t.ResolvedType(), factory)
1636 return op, resultIdx, typs, internalMemUsed, err
1637 case *tree.FuncExpr:
1638 var (
1639 inputCols []int
1640 projectionInternalMem int
1641 )
1642 typs = make([]*types.T, len(columnTypes))
1643 copy(typs, columnTypes)
1644 op = input
1645 for _, e := range t.Exprs {
1646 var err error
1647 // TODO(rohany): This could be done better, especially in the case of
1648 // constant arguments, because the vectorized engine right now
1649 // creates a new column full of the constant value.
1650 op, resultIdx, typs, projectionInternalMem, err = planProjectionOperators(
1651 ctx, evalCtx, e.(tree.TypedExpr), typs, op, acc, factory,
1652 )
1653 if err != nil {
1654 return nil, resultIdx, nil, internalMemUsed, err
1655 }
1656 inputCols = append(inputCols, resultIdx)
1657 internalMemUsed += projectionInternalMem
1658 }
1659 resultIdx = len(typs)
1660 op, err = NewBuiltinFunctionOperator(
1661 colmem.NewAllocator(ctx, acc, factory), evalCtx, t, typs, inputCols, resultIdx, op,
1662 )
1663 typs = appendOneType(typs, t.ResolvedType())
1664 return op, resultIdx, typs, internalMemUsed, err
1665 case tree.Datum:
1666 datumType := t.ResolvedType()
1667 resultIdx = len(columnTypes)
1668 typs = appendOneType(columnTypes, datumType)
1669 if datumType.Family() == types.UnknownFamily {
1670 // We handle the Unknown type by planning a special constant null
1671 // operator.
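// For example, a bare NULL in a projection list is typed as Unknown and
// takes this path: the appended column at resultIdx contains only NULLs.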
1672 op = NewConstNullOp(colmem.NewAllocator(ctx, acc, factory), input, resultIdx)
1673 return op, resultIdx, typs, internalMemUsed, nil
1674 }
1675 constVal, err := getDatumToPhysicalFn(datumType)(t)
1676 if err != nil {
1677 return nil, resultIdx, typs, internalMemUsed, err
1678 }
1679 op, err := NewConstOp(colmem.NewAllocator(ctx, acc, factory), input, datumType, constVal, resultIdx)
1680 if err != nil {
1681 return nil, resultIdx, typs, internalMemUsed, err
1682 }
1683 return op, resultIdx, typs, internalMemUsed, nil
1684 case *tree.CaseExpr:
1685 if t.Expr != nil {
1686 return nil, resultIdx, typs, internalMemUsed, errors.New("CASE <expr> WHEN expressions unsupported")
1687 }
1688 
1689 allocator := colmem.NewAllocator(ctx, acc, factory)
1690 caseOutputType := t.ResolvedType()
1691 if typeconv.TypeFamilyToCanonicalTypeFamily(caseOutputType.Family()) == types.BytesFamily {
1692 // Currently, there is a contradiction between the way the CASE operator
1693 // works (it populates its output in arbitrary order) and the flat bytes
1694 // implementation of the Bytes type (it prohibits sets in arbitrary order),
1695 // so we reject such a scenario and fall back to the row-by-row engine.
1696 return nil, resultIdx, typs, internalMemUsed, errors.Newf(
1697 "unsupported type %s in CASE operator", caseOutputType)
1698 }
1699 caseOutputIdx := len(columnTypes)
1700 // We don't know the schema yet and will update it below, right before
1701 // instantiating caseOp. The same goes for subsetEndIdx.
1702 schemaEnforcer := newBatchSchemaSubsetEnforcer(
1703 allocator, input, nil /* typs */, caseOutputIdx, -1, /* subsetEndIdx */
1704 )
1705 buffer := NewBufferOp(schemaEnforcer)
1706 caseOps := make([]colexecbase.Operator, len(t.Whens))
1707 typs = appendOneType(columnTypes, caseOutputType)
1708 thenIdxs := make([]int, len(t.Whens)+1)
1709 for i, when := range t.Whens {
1710 // The case operator is assembled from n WHEN arms, n THEN arms, and an
1711 // ELSE arm. Each WHEN arm is a boolean projection. Each THEN arm (and the
1712 // ELSE arm) is a projection of the type of the CASE expression. We set up
1713 // each WHEN arm to write its output to a fresh column, and likewise for
1714 // the THEN arms and the ELSE arm. Each WHEN arm individually acts on the
1715 // single input batch from the CaseExpr's input and is then transformed
1716 // into a selection vector, after which the THEN arm runs to create the
1717 // output just for the tuples that matched the WHEN arm. Each subsequent
1718 // WHEN arm will use the inverse of the selection vector to avoid running
1719 // the WHEN projection on tuples that have already been matched by a
1720 // previous WHEN arm. Finally, after each WHEN arm runs, we copy the
1721 // results of the WHEN into a single output vector, assembling the final
1722 // result of the case projection.
1723 whenTyped := when.Cond.(tree.TypedExpr)
1724 var whenInternalMemUsed, thenInternalMemUsed int
1725 caseOps[i], resultIdx, typs, whenInternalMemUsed, err = planProjectionOperators(
1726 ctx, evalCtx, whenTyped, typs, buffer, acc, factory,
1727 )
1728 if err != nil {
1729 return nil, resultIdx, typs, internalMemUsed, err
1730 }
1731 caseOps[i], err = boolOrUnknownToSelOp(caseOps[i], typs, resultIdx)
1732 if err != nil {
1733 return nil, resultIdx, typs, internalMemUsed, err
1734 }
1735 
1736 // Run the "then" clause on those tuples that were selected.
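// For example, in CASE WHEN a > 0 THEN a + 1 END, the WHEN arm above has
// produced a selection vector picking out the tuples with a > 0, and the
// call below plans `a + 1` over just those selected tuples.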
1737 caseOps[i], thenIdxs[i], typs, thenInternalMemUsed, err = planProjectionOperators(
1738 ctx, evalCtx, when.Val.(tree.TypedExpr), typs, caseOps[i], acc, factory,
1739 )
1740 if err != nil {
1741 return nil, resultIdx, typs, internalMemUsed, err
1742 }
1743 internalMemUsed += whenInternalMemUsed + thenInternalMemUsed
1744 if !typs[thenIdxs[i]].Identical(typs[caseOutputIdx]) {
1745 // It is possible that the projection of this THEN arm has a different
1746 // column type (for example, we expect INT2, but INT8 is given). In
1747 // such a case, we need to plan a cast.
1748 fromType, toType := typs[thenIdxs[i]], typs[caseOutputIdx]
1749 caseOps[i], thenIdxs[i], typs, err = planCastOperator(
1750 ctx, acc, typs, caseOps[i], thenIdxs[i], fromType, toType, factory,
1751 )
1752 if err != nil {
1753 return nil, resultIdx, typs, internalMemUsed, err
1754 }
1755 }
1756 }
1757 var elseInternalMemUsed int
1758 var elseOp colexecbase.Operator
1759 elseExpr := t.Else
1760 if elseExpr == nil {
1761 // If there's no ELSE arm, we write NULLs.
1762 elseExpr = tree.DNull
1763 }
1764 elseOp, thenIdxs[len(t.Whens)], typs, elseInternalMemUsed, err = planProjectionOperators(
1765 ctx, evalCtx, elseExpr.(tree.TypedExpr), typs, buffer, acc, factory,
1766 )
1767 if err != nil {
1768 return nil, resultIdx, typs, internalMemUsed, err
1769 }
1770 internalMemUsed += elseInternalMemUsed
1771 if !typs[thenIdxs[len(t.Whens)]].Identical(typs[caseOutputIdx]) {
1772 // It is possible that the projection of the ELSE arm has a different
1773 // column type (for example, we expect INT2, but INT8 is given). In
1774 // such a case, we need to plan a cast.
1775 elseIdx := thenIdxs[len(t.Whens)]
1776 fromType, toType := typs[elseIdx], typs[caseOutputIdx]
1777 elseOp, thenIdxs[len(t.Whens)], typs, err = planCastOperator(
1778 ctx, acc, typs, elseOp, elseIdx, fromType, toType, factory,
1779 )
1780 if err != nil {
1781 return nil, resultIdx, typs, internalMemUsed, err
1782 }
1783 }
1784 
1785 schemaEnforcer.typs = typs
1786 schemaEnforcer.subsetEndIdx = len(typs)
1787 op := NewCaseOp(allocator, buffer, caseOps, elseOp, thenIdxs, caseOutputIdx, caseOutputType)
1788 internalMemUsed += op.(InternalMemoryOperator).InternalMemoryUsage()
1789 return op, caseOutputIdx, typs, internalMemUsed, err
1790 case *tree.AndExpr, *tree.OrExpr:
1791 return planLogicalProjectionOp(ctx, evalCtx, expr, columnTypes, input, acc, factory)
1792 default:
1793 return nil, resultIdx, nil, internalMemUsed, errors.Errorf("unhandled projection expression type: %s", reflect.TypeOf(t))
1794 }
1795 }
1796 
1797 func checkSupportedProjectionExpr(left, right tree.TypedExpr) error {
1798 leftTyp := left.ResolvedType()
1799 rightTyp := right.ResolvedType()
1800 if leftTyp.Equivalent(rightTyp) {
1801 return nil
1802 }
1803 
1804 // The types are not equivalent. Check if either is a type we'd like to avoid.
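// For example, `d + i` with d of type DATE and i of type INT8 has
// non-equivalent operand types, and the DATE operand makes the loop below
// reject the expression so that it falls back to the row-by-row engine.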
1805 for _, t := range []*types.T{leftTyp, rightTyp} {
1806 switch t.Family() {
1807 case types.DateFamily, types.TimestampFamily, types.TimestampTZFamily:
1808 return errors.New("dates and timestamp(tz) not supported in mixed-type expressions in the vectorized engine")
1809 }
1810 }
1811 return nil
1812 }
1813 
1814 func checkSupportedBinaryExpr(left, right tree.TypedExpr, outputType *types.T) error {
1815 leftDatumBacked := typeconv.TypeFamilyToCanonicalTypeFamily(left.ResolvedType().Family()) == typeconv.DatumVecCanonicalTypeFamily
1816 rightDatumBacked := typeconv.TypeFamilyToCanonicalTypeFamily(right.ResolvedType().Family()) == typeconv.DatumVecCanonicalTypeFamily
1817 outputDatumBacked := typeconv.TypeFamilyToCanonicalTypeFamily(outputType.Family()) == typeconv.DatumVecCanonicalTypeFamily
1818 if (leftDatumBacked || rightDatumBacked) && !outputDatumBacked {
1819 return errors.New("a binary expression with datum-backed arguments but " +
1820 "a non-datum-backed output is currently not supported")
1821 }
1822 return nil
1823 }
1824 
1825 func planProjectionExpr(
1826 ctx context.Context,
1827 evalCtx *tree.EvalContext,
1828 projOp tree.Operator,
1829 outputType *types.T,
1830 left, right tree.TypedExpr,
1831 columnTypes []*types.T,
1832 input colexecbase.Operator,
1833 acc *mon.BoundAccount,
1834 factory coldata.ColumnFactory,
1835 overloadHelper overloadHelper,
1836 ) (op colexecbase.Operator, resultIdx int, typs []*types.T, internalMemUsed int, err error) {
1837 if err := checkSupportedProjectionExpr(left, right); err != nil {
1838 return nil, resultIdx, typs, internalMemUsed, err
1839 }
1840 resultIdx = -1
1841 // actualOutputType tracks the logical type of the output column of the
1842 // projection operator. See the comment below for more details.
1843 actualOutputType := outputType
1844 if outputType.Identical(types.Int) {
1845 // Currently, the SQL type system does not respect the width of integers
1846 // when figuring out the type of the output of a projection expression
1847 // (for example, INT2 + INT2 will be typed as INT8); however,
1848 // vectorized operators do respect the width when both operands have
1849 // the same width. To work around this limitation, we explicitly
1850 // check whether the output type is INT8, and if so, we override the
1851 // output physical types to be what the vectorized projection operators
1852 // will actually output.
1853 //
1854 // Note that in mixed-width scenarios (i.e. INT2 + INT4) the vectorized
1855 // engine will output INT8, so no overriding is needed.
1856 //
1857 // We do, however, need to plan a cast to the expected logical type and
1858 // we will do that below.
1859 leftType := left.ResolvedType()
1860 rightType := right.ResolvedType()
1861 if leftType.Identical(types.Int2) && rightType.Identical(types.Int2) {
1862 actualOutputType = types.Int2
1863 } else if leftType.Identical(types.Int4) && rightType.Identical(types.Int4) {
1864 actualOutputType = types.Int4
1865 }
1866 }
1867 // There are 3 cases: either the left is constant, the right is constant,
1868 // or neither is constant.
1869 if lConstArg, lConst := left.(tree.Datum); lConst {
1870 // Case 1: the left is constant.
1871 // Normally, the optimizer normalizes binary exprs so that the constant
1872 // argument is on the right side. This doesn't happen for non-commutative
1873 // operators such as - and /, though, so we still need this case.
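// For example, in `1 - a` the constant stays on the left because `-` is
// not commutative, so the non-constant right operand is planned below and
// the projection uses GetProjectionLConstOperator.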
1874 var rightIdx int
1875 input, rightIdx, typs, internalMemUsed, err = planProjectionOperators(
1876 ctx, evalCtx, right, columnTypes, input, acc, factory,
1877 )
1878 if err != nil {
1879 return nil, resultIdx, typs, internalMemUsed, err
1880 }
1881 resultIdx = len(typs)
1882 // The projection result will be output to a new column that is appended
1883 // to the input batch.
1884 op, err = GetProjectionLConstOperator(
1885 colmem.NewAllocator(ctx, acc, factory), left.ResolvedType(), typs[rightIdx], actualOutputType,
1886 projOp, input, rightIdx, lConstArg, resultIdx, overloadHelper,
1887 )
1888 } else {
1889 var (
1890 leftIdx int
1891 internalMemUsedLeft int
1892 )
1893 input, leftIdx, typs, internalMemUsedLeft, err = planProjectionOperators(
1894 ctx, evalCtx, left, columnTypes, input, acc, factory,
1895 )
1896 if err != nil {
1897 return nil, resultIdx, typs, internalMemUsed, err
1898 }
1899 internalMemUsed += internalMemUsedLeft
1900 if rConstArg, rConst := right.(tree.Datum); rConst {
1901 // Case 2: the right is constant.
1902 // The projection result will be output to a new column that is appended
1903 // to the input batch.
1904 resultIdx = len(typs)
1905 if projOp == tree.Like || projOp == tree.NotLike {
1906 negate := projOp == tree.NotLike
1907 op, err = GetLikeProjectionOperator(
1908 colmem.NewAllocator(ctx, acc, factory), evalCtx, input, leftIdx, resultIdx,
1909 string(tree.MustBeDString(rConstArg)), negate,
1910 )
1911 } else if projOp == tree.In || projOp == tree.NotIn {
1912 negate := projOp == tree.NotIn
1913 datumTuple, ok := tree.AsDTuple(rConstArg)
1914 if !ok {
1915 err = errors.Errorf("IN is only supported for constant expressions")
1916 return nil, resultIdx, typs, internalMemUsed, err
1917 }
1918 op, err = GetInProjectionOperator(
1919 colmem.NewAllocator(ctx, acc, factory), typs[leftIdx], input, leftIdx,
1920 resultIdx, datumTuple, negate,
1921 )
1922 } else if projOp == tree.IsDistinctFrom || projOp == tree.IsNotDistinctFrom {
1923 if right != tree.DNull {
1924 err = errors.Errorf("IS DISTINCT FROM and IS NOT DISTINCT FROM are supported only with NULL argument")
1925 return nil, resultIdx, typs, internalMemUsed, err
1926 }
1927 // IS NULL is replaced with IS NOT DISTINCT FROM NULL, so we want to
1928 // negate when IS DISTINCT FROM is used.
1929 negate := projOp == tree.IsDistinctFrom
1930 op = newIsNullProjOp(colmem.NewAllocator(ctx, acc, factory), input, leftIdx, resultIdx, negate)
1931 } else {
1932 op, err = GetProjectionRConstOperator(
1933 colmem.NewAllocator(ctx, acc, factory), typs[leftIdx], right.ResolvedType(), actualOutputType,
1934 projOp, input, leftIdx, rConstArg, resultIdx, overloadHelper,
1935 )
1936 }
1937 } else {
1938 // Case 3: neither is constant.
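// For example, `a * b`: the left operand was planned above (leftIdx), the
// right operand is planned below (rightIdx), and the projection writes
// into a fresh column at resultIdx.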
1939 var ( 1940 rightIdx int 1941 internalMemUsedRight int 1942 ) 1943 input, rightIdx, typs, internalMemUsedRight, err = planProjectionOperators( 1944 ctx, evalCtx, right, typs, input, acc, factory, 1945 ) 1946 if err != nil { 1947 return nil, resultIdx, nil, internalMemUsed, err 1948 } 1949 internalMemUsed += internalMemUsedRight 1950 resultIdx = len(typs) 1951 op, err = GetProjectionOperator( 1952 colmem.NewAllocator(ctx, acc, factory), typs[leftIdx], typs[rightIdx], actualOutputType, 1953 projOp, input, leftIdx, rightIdx, resultIdx, overloadHelper, 1954 ) 1955 } 1956 } 1957 if err != nil { 1958 return op, resultIdx, typs, internalMemUsed, err 1959 } 1960 if sMem, ok := op.(InternalMemoryOperator); ok { 1961 internalMemUsed += sMem.InternalMemoryUsage() 1962 } 1963 typs = appendOneType(typs, actualOutputType) 1964 if !outputType.Identical(actualOutputType) { 1965 // The projection operator outputs a column of a different type than 1966 // the expected logical type. In order to "synchronize" the reality and 1967 // the expectations, we plan a cast. 1968 // 1969 // For example, INT2 + INT2 will be typed as INT8 by the SQL type 1970 // system, but we will plan a projection operator that outputs INT2, so 1971 // in such scenario we will have 1972 // actualOutputType = types.Int2 1973 // outputType = types.Int8 1974 // and will plan the corresponding cast. 1975 // 1976 // NOTE: this is *only* needed for integer types and should be removed 1977 // once #46940 is resolved. 1978 op, resultIdx, typs, err = 1979 planCastOperator(ctx, acc, typs, op, resultIdx, actualOutputType, outputType, factory) 1980 } 1981 return op, resultIdx, typs, internalMemUsed, err 1982 } 1983 1984 // planLogicalProjectionOp plans all the needed operators for a projection of 1985 // a logical operation (either AND or OR). 1986 func planLogicalProjectionOp( 1987 ctx context.Context, 1988 evalCtx *tree.EvalContext, 1989 expr tree.TypedExpr, 1990 columnTypes []*types.T, 1991 input colexecbase.Operator, 1992 acc *mon.BoundAccount, 1993 factory coldata.ColumnFactory, 1994 ) (op colexecbase.Operator, resultIdx int, typs []*types.T, internalMemUsed int, err error) { 1995 // Add a new boolean column that will store the result of the projection. 
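// For example, for `a AND b` the two argument chains are planned against
// feedOperators rather than against input directly: the AND projection
// operator constructed below pushes each input batch into those chains,
// which allows the right side to be evaluated only on the tuples that the
// left side has not already resolved.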
1996 resultIdx = len(columnTypes)
1997 typs = appendOneType(columnTypes, types.Bool)
1998 var (
1999 typedLeft, typedRight tree.TypedExpr
2000 leftProjOpChain, rightProjOpChain, outputOp colexecbase.Operator
2001 leftIdx, rightIdx int
2002 internalMemUsedLeft, internalMemUsedRight int
2003 leftFeedOp, rightFeedOp feedOperator
2004 )
2005 switch t := expr.(type) {
2006 case *tree.AndExpr:
2007 typedLeft = t.TypedLeft()
2008 typedRight = t.TypedRight()
2009 case *tree.OrExpr:
2010 typedLeft = t.TypedLeft()
2011 typedRight = t.TypedRight()
2012 default:
2013 colexecerror.InternalError(fmt.Sprintf("unexpected logical expression type %s", t.String()))
2014 }
2015 leftProjOpChain, leftIdx, typs, internalMemUsedLeft, err = planProjectionOperators(
2016 ctx, evalCtx, typedLeft, typs, &leftFeedOp, acc, factory,
2017 )
2018 if err != nil {
2019 return nil, resultIdx, typs, internalMemUsed, err
2020 }
2021 rightProjOpChain, rightIdx, typs, internalMemUsedRight, err = planProjectionOperators(
2022 ctx, evalCtx, typedRight, typs, &rightFeedOp, acc, factory,
2023 )
2024 if err != nil {
2025 return nil, resultIdx, typs, internalMemUsed, err
2026 }
2027 allocator := colmem.NewAllocator(ctx, acc, factory)
2028 input = newBatchSchemaSubsetEnforcer(allocator, input, typs, resultIdx, len(typs))
2029 switch expr.(type) {
2030 case *tree.AndExpr:
2031 outputOp = NewAndProjOp(
2032 allocator,
2033 input, leftProjOpChain, rightProjOpChain,
2034 &leftFeedOp, &rightFeedOp,
2035 leftIdx, rightIdx, resultIdx,
2036 )
2037 case *tree.OrExpr:
2038 outputOp = NewOrProjOp(
2039 allocator,
2040 input, leftProjOpChain, rightProjOpChain,
2041 &leftFeedOp, &rightFeedOp,
2042 leftIdx, rightIdx, resultIdx,
2043 )
2044 }
2045 return outputOp, resultIdx, typs, internalMemUsedLeft + internalMemUsedRight, nil
2046 }
2047 
2048 // planIsNullProjectionOp plans the operator for IS NULL and IS NOT NULL
2049 // expressions (tree.IsNullExpr and tree.IsNotNullExpr, respectively).
2050 func planIsNullProjectionOp(
2051 ctx context.Context,
2052 evalCtx *tree.EvalContext,
2053 outputType *types.T,
2054 expr tree.TypedExpr,
2055 columnTypes []*types.T,
2056 input colexecbase.Operator,
2057 acc *mon.BoundAccount,
2058 negate bool,
2059 factory coldata.ColumnFactory,
2060 ) (op colexecbase.Operator, resultIdx int, typs []*types.T, internalMemUsed int, err error) {
2061 op, resultIdx, typs, internalMemUsed, err = planProjectionOperators(
2062 ctx, evalCtx, expr, columnTypes, input, acc, factory,
2063 )
2064 outputIdx := len(typs)
2065 op = newIsNullProjOp(colmem.NewAllocator(ctx, acc, factory), op, resultIdx, outputIdx, negate)
2066 typs = appendOneType(typs, outputType)
2067 return op, outputIdx, typs, internalMemUsed, err
2068 }
2069 
2070 // appendOneType appends a *types.T to the end of a []*types.T. The size of the
2071 // underlying array of the resulting slice is 1 greater than that of the input
2072 // slice. This differs from the built-in append function, which can double the
2073 // capacity of the slice if its length is less than 1024, or increase it by 25% otherwise.
2074 func appendOneType(typs []*types.T, t *types.T) []*types.T {
2075 newTyps := make([]*types.T, len(typs)+1)
2076 copy(newTyps, typs)
2077 newTyps[len(newTyps)-1] = t
2078 return newTyps
2079 }
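// A usage sketch for appendOneType (hypothetical; the variable names are
// illustrative only, not part of this file). Because the result always gets
// a fresh backing array of exactly len+1 elements, two planning branches can
// extend the same base schema without clobbering each other, which the
// built-in append would not guarantee once the base slice has spare capacity:
//
//   base := []*types.T{types.Int}                // schema [INT8]
//   whenTyps := appendOneType(base, types.Bool)  // [INT8, BOOL]
//   elseTyps := appendOneType(base, types.Float) // [INT8, FLOAT8]
//   // whenTyps[1] is still types.Bool: the two results do not alias.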