github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/rowexec/aggregator.go

// Copyright 2016 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package rowexec

import (
    "context"
    "fmt"
    "unsafe"

    "github.com/cockroachdb/cockroach/pkg/sql/execinfra"
    "github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
    "github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
    "github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    "github.com/cockroachdb/cockroach/pkg/sql/types"
    "github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
    "github.com/cockroachdb/cockroach/pkg/util/log"
    "github.com/cockroachdb/cockroach/pkg/util/mon"
    "github.com/cockroachdb/cockroach/pkg/util/stringarena"
    "github.com/cockroachdb/cockroach/pkg/util/tracing"
    "github.com/cockroachdb/errors"
    "github.com/opentracing/opentracing-go"
)

type aggregateFuncs []tree.AggregateFunc

func (af aggregateFuncs) close(ctx context.Context) {
    for _, f := range af {
        f.Close(ctx)
    }
}

// aggregatorBase is the foundation of the processor core type that does
// "aggregation" in the SQL sense. It groups rows and computes an aggregate for
// each group. The group is configured using the group key and the aggregator
// can be configured with one or more aggregation functions, as defined in the
// AggregatorSpec_Func enum.
//
// aggregatorBase's output schema comprises what is specified by the
// accompanying SELECT expressions.
type aggregatorBase struct {
    execinfra.ProcessorBase

    // runningState represents the state of the aggregator. This is in addition to
    // ProcessorBase.State - the runningState is only relevant when
    // ProcessorBase.State == StateRunning.
    runningState aggregatorState
    input        execinfra.RowSource
    inputDone    bool
    inputTypes   []*types.T
    funcs        []*aggregateFuncHolder
    outputTypes  []*types.T
    datumAlloc   sqlbase.DatumAlloc
    rowAlloc     sqlbase.EncDatumRowAlloc

    bucketsAcc  mon.BoundAccount
    aggFuncsAcc mon.BoundAccount

    // isScalar can only be set if there are no groupCols, and it means that we
    // will generate a result row even if there are no input rows. Used for
    // queries like SELECT MAX(n) FROM t.
    isScalar         bool
    groupCols        []uint32
    orderedGroupCols []uint32
    aggregations     []execinfrapb.AggregatorSpec_Aggregation

    lastOrdGroupCols sqlbase.EncDatumRow
    arena            stringarena.Arena
    row              sqlbase.EncDatumRow
    scratch          []byte

    cancelChecker *sqlbase.CancelChecker
}

// init initializes the aggregatorBase.
//
// trailingMetaCallback is passed as part of ProcStateOpts; the inputs to drain
// are in aggregatorBase.
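//
// As a sketch of intended usage (mirroring newOrderedAggregator below), a
// concrete aggregator passes itself as `self` and typically supplies a
// trailingMetaCallback that simply closes the processor:
//
//	ag := &orderedAggregator{}
//	err := ag.init(ag, flowCtx, processorID, spec, input, post, output,
//		func(context.Context) []execinfrapb.ProducerMetadata {
//			ag.close()
//			return nil
//		},
//	)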
func (ag *aggregatorBase) init(
    self execinfra.RowSource,
    flowCtx *execinfra.FlowCtx,
    processorID int32,
    spec *execinfrapb.AggregatorSpec,
    input execinfra.RowSource,
    post *execinfrapb.PostProcessSpec,
    output execinfra.RowReceiver,
    trailingMetaCallback func(context.Context) []execinfrapb.ProducerMetadata,
) error {
    ctx := flowCtx.EvalCtx.Ctx()
    memMonitor := execinfra.NewMonitor(ctx, flowCtx.EvalCtx.Mon, "aggregator-mem")
    if sp := opentracing.SpanFromContext(ctx); sp != nil && tracing.IsRecording(sp) {
        input = newInputStatCollector(input)
        ag.FinishTrace = ag.outputStatsToTrace
    }
    ag.input = input
    ag.isScalar = spec.IsScalar()
    ag.groupCols = spec.GroupCols
    ag.orderedGroupCols = spec.OrderedGroupCols
    ag.aggregations = spec.Aggregations
    ag.funcs = make([]*aggregateFuncHolder, len(spec.Aggregations))
    ag.outputTypes = make([]*types.T, len(spec.Aggregations))
    ag.row = make(sqlbase.EncDatumRow, len(spec.Aggregations))
    ag.bucketsAcc = memMonitor.MakeBoundAccount()
    ag.arena = stringarena.Make(&ag.bucketsAcc)
    ag.aggFuncsAcc = memMonitor.MakeBoundAccount()

    // Loop over the select expressions and extract any aggregate functions --
    // non-aggregation functions are replaced with parser.NewIdentAggregate
    // (which just returns the last value added to them for a bucket) to provide
    // grouped-by values for each bucket. ag.funcs is updated to contain all
    // the functions which need to be fed values.
    ag.inputTypes = input.OutputTypes()
    for i, aggInfo := range spec.Aggregations {
        if aggInfo.FilterColIdx != nil {
            col := *aggInfo.FilterColIdx
            if col >= uint32(len(ag.inputTypes)) {
                return errors.Errorf("FilterColIdx out of range (%d)", col)
            }
            t := ag.inputTypes[col].Family()
            if t != types.BoolFamily && t != types.UnknownFamily {
                return errors.Errorf(
                    "filter column %d must be of boolean type, not %s", *aggInfo.FilterColIdx, t,
                )
            }
        }
        argTypes := make([]*types.T, len(aggInfo.ColIdx)+len(aggInfo.Arguments))
        for j, c := range aggInfo.ColIdx {
            if c >= uint32(len(ag.inputTypes)) {
                return errors.Errorf("ColIdx out of range (%d)", aggInfo.ColIdx)
            }
            argTypes[j] = ag.inputTypes[c]
        }

        arguments := make(tree.Datums, len(aggInfo.Arguments))
        for j, argument := range aggInfo.Arguments {
            h := execinfra.ExprHelper{}
            // Pass nil types and row - there are no variables in these expressions.
            if err := h.Init(argument, nil /* types */, flowCtx.EvalCtx); err != nil {
                return errors.Wrapf(err, "%s", argument)
            }
            d, err := h.Eval(nil /* row */)
            if err != nil {
                return errors.Wrapf(err, "%s", argument)
            }
            argTypes[len(aggInfo.ColIdx)+j] = d.ResolvedType()
            arguments[j] = d
        }

        aggConstructor, retType, err := execinfrapb.GetAggregateInfo(aggInfo.Func, argTypes...)
        if err != nil {
            return err
        }

        ag.funcs[i] = ag.newAggregateFuncHolder(aggConstructor, arguments)
        if aggInfo.Distinct {
            ag.funcs[i].seen = make(map[string]struct{})
        }

        ag.outputTypes[i] = retType
    }

    return ag.ProcessorBase.Init(
        self, post, ag.outputTypes, flowCtx, processorID, output, memMonitor,
        execinfra.ProcStateOpts{
            InputsToDrain:        []execinfra.RowSource{ag.input},
            TrailingMetaCallback: trailingMetaCallback,
        },
    )
}

var _ execinfrapb.DistSQLSpanStats = &AggregatorStats{}

const aggregatorTagPrefix = "aggregator."
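
// The stats methods below combine the input statistics with the aggregator's
// maximum memory usage. As an illustrative (hypothetical) example, a stats
// object with MaxAllocatedMem = 10 << 20 surfaces in Stats() under the key
// aggregatorTagPrefix+MaxMemoryTagSuffix with a humanized value of "10 MiB"
// produced by humanizeutil.IBytes.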

// Stats implements the SpanStats interface.
func (as *AggregatorStats) Stats() map[string]string {
    inputStatsMap := as.InputStats.Stats(aggregatorTagPrefix)
    inputStatsMap[aggregatorTagPrefix+MaxMemoryTagSuffix] = humanizeutil.IBytes(as.MaxAllocatedMem)
    return inputStatsMap
}

// StatsForQueryPlan implements the DistSQLSpanStats interface.
func (as *AggregatorStats) StatsForQueryPlan() []string {
    stats := as.InputStats.StatsForQueryPlan("" /* prefix */)

    if as.MaxAllocatedMem != 0 {
        stats = append(stats,
            fmt.Sprintf("%s: %s", MaxMemoryQueryPlanSuffix, humanizeutil.IBytes(as.MaxAllocatedMem)))
    }

    return stats
}

func (ag *aggregatorBase) outputStatsToTrace() {
    is, ok := getInputStats(ag.FlowCtx, ag.input)
    if !ok {
        return
    }
    if sp := opentracing.SpanFromContext(ag.Ctx); sp != nil {
        tracing.SetSpanStats(
            sp,
            &AggregatorStats{
                InputStats:      is,
                MaxAllocatedMem: ag.MemMonitor.MaximumBytes(),
            },
        )
    }
}

// ChildCount is part of the execinfra.OpNode interface.
func (ag *aggregatorBase) ChildCount(verbose bool) int {
    if _, ok := ag.input.(execinfra.OpNode); ok {
        return 1
    }
    return 0
}

// Child is part of the execinfra.OpNode interface.
func (ag *aggregatorBase) Child(nth int, verbose bool) execinfra.OpNode {
    if nth == 0 {
        if n, ok := ag.input.(execinfra.OpNode); ok {
            return n
        }
        panic("input to aggregatorBase is not an execinfra.OpNode")
    }
    panic(fmt.Sprintf("invalid index %d", nth))
}

const (
    // hashAggregatorBucketsInitialLen is a guess on how many "items" the
    // 'buckets' map of hashAggregator has the capacity for initially.
    hashAggregatorBucketsInitialLen = 8
    // hashAggregatorSizeOfBucketsItem is a guess on how much space (in bytes)
    // each item added to the 'buckets' map of hashAggregator takes up in the map
    // (i.e. it is memory internal to the map, orthogonal to the "key-value" pair
    // that we're adding to the map).
    hashAggregatorSizeOfBucketsItem = 64
)

// hashAggregator is a specialization of aggregatorBase that must keep track of
// multiple grouping buckets at a time.
type hashAggregator struct {
    aggregatorBase

    // buckets is used during the accumulation phase to track the bucket keys
    // that have been seen. After accumulation, the keys are extracted into
    // bucketsIter for iteration.
    buckets     map[string]aggregateFuncs
    bucketsIter []string
    // bucketsLenGrowThreshold is the threshold which, when reached by the
    // number of items in 'buckets', triggers an update to the memory
    // accounting. It starts out at hashAggregatorBucketsInitialLen and then
    // doubles each time it is reached.
    bucketsLenGrowThreshold int
    // alreadyAccountedFor tracks the number of items in 'buckets' whose memory
    // we have already accounted for.
    alreadyAccountedFor int
}

// orderedAggregator is a specialization of aggregatorBase that only needs to
// keep track of a single grouping bucket at a time.
type orderedAggregator struct {
    aggregatorBase

    // bucket is used during the accumulation phase to aggregate results.
    bucket aggregateFuncs
}

var _ execinfra.Processor = &hashAggregator{}
var _ execinfra.RowSource = &hashAggregator{}
var _ execinfra.OpNode = &hashAggregator{}

const hashAggregatorProcName = "hash aggregator"

var _ execinfra.Processor = &orderedAggregator{}
var _ execinfra.RowSource = &orderedAggregator{}
var _ execinfra.OpNode = &orderedAggregator{}

const orderedAggregatorProcName = "ordered aggregator"

// aggregatorState represents the state of the processor.
type aggregatorState int

const (
    aggStateUnknown aggregatorState = iota
    // aggAccumulating means that rows are being read from the input and used to
    // compute intermediary aggregation results.
    aggAccumulating
    // aggEmittingRows means that accumulation has finished and rows are being
    // sent to the output.
    aggEmittingRows
)

func newAggregator(
    flowCtx *execinfra.FlowCtx,
    processorID int32,
    spec *execinfrapb.AggregatorSpec,
    input execinfra.RowSource,
    post *execinfrapb.PostProcessSpec,
    output execinfra.RowReceiver,
) (execinfra.Processor, error) {
    if spec.IsRowCount() {
        return newCountAggregator(flowCtx, processorID, input, post, output)
    }
    if len(spec.OrderedGroupCols) == len(spec.GroupCols) {
        return newOrderedAggregator(flowCtx, processorID, spec, input, post, output)
    }

    ag := &hashAggregator{
        buckets:                 make(map[string]aggregateFuncs),
        bucketsLenGrowThreshold: hashAggregatorBucketsInitialLen,
    }

    if err := ag.init(
        ag,
        flowCtx,
        processorID,
        spec,
        input,
        post,
        output,
        func(context.Context) []execinfrapb.ProducerMetadata {
            ag.close()
            return nil
        },
    ); err != nil {
        return nil, err
    }

    // A new tree.EvalCtx was created while initializing the aggregatorBase above
    // and will be used only by this aggregator, so it is ok to update EvalCtx
    // directly.
    ag.EvalCtx.SingleDatumAggMemAccount = &ag.aggFuncsAcc
    return ag, nil
}

func newOrderedAggregator(
    flowCtx *execinfra.FlowCtx,
    processorID int32,
    spec *execinfrapb.AggregatorSpec,
    input execinfra.RowSource,
    post *execinfrapb.PostProcessSpec,
    output execinfra.RowReceiver,
) (*orderedAggregator, error) {
    ag := &orderedAggregator{}

    if err := ag.init(
        ag,
        flowCtx,
        processorID,
        spec,
        input,
        post,
        output,
        func(context.Context) []execinfrapb.ProducerMetadata {
            ag.close()
            return nil
        },
    ); err != nil {
        return nil, err
    }

    // A new tree.EvalCtx was created while initializing the aggregatorBase above
    // and will be used only by this aggregator, so it is ok to update EvalCtx
    // directly.
    ag.EvalCtx.SingleDatumAggMemAccount = &ag.aggFuncsAcc
    return ag, nil
}

// Start is part of the RowSource interface.
func (ag *hashAggregator) Start(ctx context.Context) context.Context {
    return ag.start(ctx, hashAggregatorProcName)
}

// Start is part of the RowSource interface.
func (ag *orderedAggregator) Start(ctx context.Context) context.Context {
    return ag.start(ctx, orderedAggregatorProcName)
}

func (ag *aggregatorBase) start(ctx context.Context, procName string) context.Context {
    ag.input.Start(ctx)
    ctx = ag.StartInternal(ctx, procName)
    ag.cancelChecker = sqlbase.NewCancelChecker(ctx)
    ag.runningState = aggAccumulating
    return ctx
}

func (ag *hashAggregator) close() {
    if ag.InternalClose() {
        log.VEventf(ag.Ctx, 2, "exiting aggregator")
        // If we have started emitting rows, bucketsIter will represent which
        // buckets are still open, since buckets are closed once their results are
        // emitted.
        if ag.bucketsIter == nil {
            for _, bucket := range ag.buckets {
                bucket.close(ag.Ctx)
            }
        } else {
            for _, bucket := range ag.bucketsIter {
                ag.buckets[bucket].close(ag.Ctx)
            }
        }
        // Make sure to release any remaining memory under 'buckets'.
        ag.buckets = nil
        // Note that we should be closing accounts only after closing all the
        // buckets since the latter might be releasing some precisely tracked
        // memory, and if we were to close the accounts first, there would be
        // no memory to release for the buckets.
        ag.bucketsAcc.Close(ag.Ctx)
        ag.aggFuncsAcc.Close(ag.Ctx)
        ag.MemMonitor.Stop(ag.Ctx)
    }
}

func (ag *orderedAggregator) close() {
    if ag.InternalClose() {
        log.VEventf(ag.Ctx, 2, "exiting aggregator")
        if ag.bucket != nil {
            ag.bucket.close(ag.Ctx)
        }
        // Note that we should be closing accounts only after closing the
        // bucket since the latter might be releasing some precisely tracked
        // memory, and if we were to close the accounts first, there would be
        // no memory to release for the bucket.
        ag.bucketsAcc.Close(ag.Ctx)
        ag.aggFuncsAcc.Close(ag.Ctx)
        ag.MemMonitor.Stop(ag.Ctx)
    }
}

// matchLastOrdGroupCols takes a row and matches it with the row stored by
// lastOrdGroupCols. It returns true if the two rows are equal on the grouping
// columns, and false otherwise.
func (ag *aggregatorBase) matchLastOrdGroupCols(row sqlbase.EncDatumRow) (bool, error) {
    for _, colIdx := range ag.orderedGroupCols {
        res, err := ag.lastOrdGroupCols[colIdx].Compare(
            ag.inputTypes[colIdx], &ag.datumAlloc, ag.EvalCtx, &row[colIdx],
        )
        if res != 0 || err != nil {
            return false, err
        }
    }
    return true, nil
}

// accumulateRows continually reads rows from the input and accumulates them
// into intermediary aggregate results. If it encounters metadata, the metadata
// is immediately returned. Subsequent calls of this function will resume row
// accumulation.
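//
// The returned aggregatorState drives the state machine in Next(): metadata
// without an error keeps the state at aggAccumulating, errors move the
// processor to draining (returning aggStateUnknown), and hitting either the
// end of the input or a row that starts a new ordered group transitions to
// aggEmittingRows.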
func (ag *hashAggregator) accumulateRows() (
    aggregatorState,
    sqlbase.EncDatumRow,
    *execinfrapb.ProducerMetadata,
) {
    for {
        row, meta := ag.input.Next()
        if meta != nil {
            if meta.Err != nil {
                ag.MoveToDraining(nil /* err */)
                return aggStateUnknown, nil, meta
            }
            return aggAccumulating, nil, meta
        }
        if row == nil {
            log.VEvent(ag.Ctx, 1, "accumulation complete")
            ag.inputDone = true
            break
        }

        if ag.lastOrdGroupCols == nil {
            ag.lastOrdGroupCols = ag.rowAlloc.CopyRow(row)
        } else {
            matched, err := ag.matchLastOrdGroupCols(row)
            if err != nil {
                ag.MoveToDraining(err)
                return aggStateUnknown, nil, nil
            }
            if !matched {
                copy(ag.lastOrdGroupCols, row)
                break
            }
        }
        if err := ag.accumulateRow(row); err != nil {
            ag.MoveToDraining(err)
            return aggStateUnknown, nil, nil
        }
    }

    // Queries like `SELECT MAX(n) FROM t` expect a row of NULLs if nothing was
    // aggregated.
    if len(ag.buckets) < 1 && len(ag.groupCols) == 0 {
        bucket, err := ag.createAggregateFuncs()
        if err != nil {
            ag.MoveToDraining(err)
            return aggStateUnknown, nil, nil
        }
        ag.buckets[""] = bucket
    }

    // Note that, for simplicity, we're ignoring the overhead of the slice of
    // strings.
    if err := ag.bucketsAcc.Grow(ag.Ctx, int64(len(ag.buckets))*sizeOfString); err != nil {
        ag.MoveToDraining(err)
        return aggStateUnknown, nil, nil
    }
    ag.bucketsIter = make([]string, 0, len(ag.buckets))
    for bucket := range ag.buckets {
        ag.bucketsIter = append(ag.bucketsIter, bucket)
    }

    // Transition to aggEmittingRows, and let it generate the next row/meta.
    return aggEmittingRows, nil, nil
}

// accumulateRows continually reads rows from the input and accumulates them
// into intermediary aggregate results. If it encounters metadata, the metadata
// is immediately returned. Subsequent calls of this function will resume row
// accumulation.
func (ag *orderedAggregator) accumulateRows() (
    aggregatorState,
    sqlbase.EncDatumRow,
    *execinfrapb.ProducerMetadata,
) {
    for {
        row, meta := ag.input.Next()
        if meta != nil {
            if meta.Err != nil {
                ag.MoveToDraining(nil /* err */)
                return aggStateUnknown, nil, meta
            }
            return aggAccumulating, nil, meta
        }
        if row == nil {
            log.VEvent(ag.Ctx, 1, "accumulation complete")
            ag.inputDone = true
            break
        }

        if ag.lastOrdGroupCols == nil {
            ag.lastOrdGroupCols = ag.rowAlloc.CopyRow(row)
        } else {
            matched, err := ag.matchLastOrdGroupCols(row)
            if err != nil {
                ag.MoveToDraining(err)
                return aggStateUnknown, nil, nil
            }
            if !matched {
                copy(ag.lastOrdGroupCols, row)
                break
            }
        }
        if err := ag.accumulateRow(row); err != nil {
            ag.MoveToDraining(err)
            return aggStateUnknown, nil, nil
        }
    }

    // Queries like `SELECT MAX(n) FROM t` expect a row of NULLs if nothing was
    // aggregated.
    if ag.bucket == nil && ag.isScalar {
        var err error
        ag.bucket, err = ag.createAggregateFuncs()
        if err != nil {
            ag.MoveToDraining(err)
            return aggStateUnknown, nil, nil
        }
    }

    // Transition to aggEmittingRows, and let it generate the next row/meta.
    return aggEmittingRows, nil, nil
}

func (ag *aggregatorBase) getAggResults(
    bucket aggregateFuncs,
) (aggregatorState, sqlbase.EncDatumRow, *execinfrapb.ProducerMetadata) {
    for i, b := range bucket {
        result, err := b.Result()
        if err != nil {
            ag.MoveToDraining(err)
            return aggStateUnknown, nil, nil
        }
        if result == nil {
            // We can't encode nil into an EncDatum, so we represent it with DNull.
            result = tree.DNull
        }
        ag.row[i] = sqlbase.DatumToEncDatum(ag.outputTypes[i], result)
    }
    bucket.close(ag.Ctx)

    if outRow := ag.ProcessRowHelper(ag.row); outRow != nil {
        return aggEmittingRows, outRow, nil
    }
    // We might have switched to draining, or we might not have. In case we
    // haven't, aggEmittingRows is accurate. If we have, it will be ignored by
    // the caller.
    return aggEmittingRows, nil, nil
}

// emitRow constructs an output row from an accumulated bucket and returns it.
//
// emitRow() might move to stateDraining. It might also not return a row if the
// ProcOutputHelper filtered the current row out.
func (ag *hashAggregator) emitRow() (
    aggregatorState,
    sqlbase.EncDatumRow,
    *execinfrapb.ProducerMetadata,
) {
    if len(ag.bucketsIter) == 0 {
        // We've exhausted all of the aggregation buckets.
        if ag.inputDone {
            // The input has been fully consumed. Transition to draining so that we
            // emit any metadata that we've produced.
            ag.MoveToDraining(nil /* err */)
            return aggStateUnknown, nil, nil
        }

        // We've only consumed part of the input where the rows are equal over
        // the columns specified by ag.orderedGroupCols, so we need to continue
        // accumulating the remaining rows.

        if err := ag.arena.UnsafeReset(ag.Ctx); err != nil {
            ag.MoveToDraining(err)
            return aggStateUnknown, nil, nil
        }
        // Before we create a new 'buckets' map below, we need to "release" the
        // already accounted for memory of the current map.
        ag.bucketsAcc.Shrink(ag.Ctx, int64(ag.alreadyAccountedFor)*hashAggregatorSizeOfBucketsItem)
        // Note that, for simplicity, we're ignoring the overhead of the slice of
        // strings.
        ag.bucketsAcc.Shrink(ag.Ctx, int64(len(ag.buckets))*sizeOfString)
        ag.bucketsIter = nil
        ag.buckets = make(map[string]aggregateFuncs)
        ag.bucketsLenGrowThreshold = hashAggregatorBucketsInitialLen
        ag.alreadyAccountedFor = 0
        for _, f := range ag.funcs {
            if f.seen != nil {
                f.seen = make(map[string]struct{})
            }
        }

        if err := ag.accumulateRow(ag.lastOrdGroupCols); err != nil {
            ag.MoveToDraining(err)
            return aggStateUnknown, nil, nil
        }

        return aggAccumulating, nil, nil
    }

    bucket := ag.bucketsIter[0]
    ag.bucketsIter = ag.bucketsIter[1:]

    // Once we get the results from the bucket, we can delete it from the map.
    // This will allow us to return the memory to the system before the hash
    // aggregator is fully done (which matters when we have many buckets).
    // NOTE: accounting for the memory under aggregate builtins in the bucket
    // is updated in getAggResults (the bucket will be closed), however, we
    // choose to not reduce our estimate of the map's internal footprint
    // because it is error-prone to estimate the new footprint (we don't know
    // whether and when the Go runtime will release some of the underlying memory).
    // This behavior is ok, though, since actual usage of buckets will be lower
    // than what we accounted for - in the worst case, the query might hit a
    // memory budget limit and error out when it might actually be within the
    // limit. However, we might be under-accounting memory usage in other
    // places, so having some over-accounting here might actually be beneficial
    // as a defensive mechanism against OOM crashes.
    state, row, meta := ag.getAggResults(ag.buckets[bucket])
    delete(ag.buckets, bucket)
    return state, row, meta
}

// emitRow constructs an output row from an accumulated bucket and returns it.
//
// emitRow() might move to stateDraining. It might also not return a row if the
// ProcOutputHelper filtered the current row out.
func (ag *orderedAggregator) emitRow() (
    aggregatorState,
    sqlbase.EncDatumRow,
    *execinfrapb.ProducerMetadata,
) {
    if ag.bucket == nil {
        // We've exhausted all of the aggregation buckets.
        if ag.inputDone {
            // The input has been fully consumed. Transition to draining so that we
            // emit any metadata that we've produced.
            ag.MoveToDraining(nil /* err */)
            return aggStateUnknown, nil, nil
        }

        // We've only consumed part of the input where the rows are equal over
        // the columns specified by ag.orderedGroupCols, so we need to continue
        // accumulating the remaining rows.

        if err := ag.arena.UnsafeReset(ag.Ctx); err != nil {
            ag.MoveToDraining(err)
            return aggStateUnknown, nil, nil
        }
        for _, f := range ag.funcs {
            if f.seen != nil {
                f.seen = make(map[string]struct{})
            }
        }

        if err := ag.accumulateRow(ag.lastOrdGroupCols); err != nil {
            ag.MoveToDraining(err)
            return aggStateUnknown, nil, nil
        }

        return aggAccumulating, nil, nil
    }

    bucket := ag.bucket
    ag.bucket = nil
    return ag.getAggResults(bucket)
}

// Next is part of the RowSource interface.
func (ag *hashAggregator) Next() (sqlbase.EncDatumRow, *execinfrapb.ProducerMetadata) {
    for ag.State == execinfra.StateRunning {
        var row sqlbase.EncDatumRow
        var meta *execinfrapb.ProducerMetadata
        switch ag.runningState {
        case aggAccumulating:
            ag.runningState, row, meta = ag.accumulateRows()
        case aggEmittingRows:
            ag.runningState, row, meta = ag.emitRow()
        default:
            log.Fatalf(ag.Ctx, "unsupported state: %d", ag.runningState)
        }

        if row == nil && meta == nil {
            continue
        }
        return row, meta
    }
    return nil, ag.DrainHelper()
}

// Next is part of the RowSource interface.
func (ag *orderedAggregator) Next() (sqlbase.EncDatumRow, *execinfrapb.ProducerMetadata) {
    for ag.State == execinfra.StateRunning {
        var row sqlbase.EncDatumRow
        var meta *execinfrapb.ProducerMetadata
        switch ag.runningState {
        case aggAccumulating:
            ag.runningState, row, meta = ag.accumulateRows()
        case aggEmittingRows:
            ag.runningState, row, meta = ag.emitRow()
        default:
            log.Fatalf(ag.Ctx, "unsupported state: %d", ag.runningState)
        }

        if row == nil && meta == nil {
            continue
        }
        return row, meta
    }
    return nil, ag.DrainHelper()
}

// ConsumerClosed is part of the RowSource interface.
func (ag *hashAggregator) ConsumerClosed() {
    // The consumer is done, Next() will not be called again.
    ag.close()
}

// ConsumerClosed is part of the RowSource interface.
func (ag *orderedAggregator) ConsumerClosed() {
    // The consumer is done, Next() will not be called again.
    ag.close()
}

func (ag *aggregatorBase) accumulateRowIntoBucket(
    row sqlbase.EncDatumRow, groupKey []byte, bucket aggregateFuncs,
) error {
    var err error
    // Feed the func holders for this bucket the non-grouping datums.
    for i, a := range ag.aggregations {
        if a.FilterColIdx != nil {
            col := *a.FilterColIdx
            if err := row[col].EnsureDecoded(ag.inputTypes[col], &ag.datumAlloc); err != nil {
                return err
            }
            if row[*a.FilterColIdx].Datum != tree.DBoolTrue {
                // This row doesn't contribute to this aggregation.
                continue
            }
        }
        // Extract the corresponding arguments from the row to feed into the
        // aggregate function.
        // Most functions require at most one argument, so we handle the first
        // argument separately and only allocate a slice for any additional
        // (variadic) arguments.
        var firstArg tree.Datum
        var otherArgs tree.Datums
        if len(a.ColIdx) > 1 {
            otherArgs = make(tree.Datums, len(a.ColIdx)-1)
        }
        isFirstArg := true
        for j, c := range a.ColIdx {
            if err := row[c].EnsureDecoded(ag.inputTypes[c], &ag.datumAlloc); err != nil {
                return err
            }
            if isFirstArg {
                firstArg = row[c].Datum
                isFirstArg = false
                continue
            }
            otherArgs[j-1] = row[c].Datum
        }

        canAdd := true
        if a.Distinct {
            canAdd, err = ag.funcs[i].isDistinct(
                ag.Ctx,
                &ag.datumAlloc,
                groupKey,
                firstArg,
                otherArgs,
            )
            if err != nil {
                return err
            }
        }
        if !canAdd {
            continue
        }
        if err := bucket[i].Add(ag.Ctx, firstArg, otherArgs...); err != nil {
            return err
        }
    }
    return nil
}

// accumulateRow accumulates a single row, returning an error if accumulation
// failed for any reason.
func (ag *hashAggregator) accumulateRow(row sqlbase.EncDatumRow) error {
    if err := ag.cancelChecker.Check(); err != nil {
        return err
    }

    // The encoding computed here determines which bucket the non-grouping
    // datums are accumulated to.
    encoded, err := ag.encode(ag.scratch, row)
    if err != nil {
        return err
    }
    ag.scratch = encoded[:0]

    bucket, ok := ag.buckets[string(encoded)]
    if !ok {
        s, err := ag.arena.AllocBytes(ag.Ctx, encoded)
        if err != nil {
            return err
        }
        bucket, err = ag.createAggregateFuncs()
        if err != nil {
            return err
        }
        ag.buckets[s] = bucket
        if len(ag.buckets) == ag.bucketsLenGrowThreshold {
            toAccountFor := ag.bucketsLenGrowThreshold - ag.alreadyAccountedFor
            if err := ag.bucketsAcc.Grow(ag.Ctx, int64(toAccountFor)*hashAggregatorSizeOfBucketsItem); err != nil {
                return err
            }
            ag.alreadyAccountedFor = ag.bucketsLenGrowThreshold
            ag.bucketsLenGrowThreshold *= 2
        }
    }

    return ag.accumulateRowIntoBucket(row, encoded, bucket)
}

// accumulateRow accumulates a single row, returning an error if accumulation
// failed for any reason.
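//
// Unlike the hash variant above, the ordered aggregator maintains a single
// bucket, lazily created on the first row of each group, so no group-key
// encoding is needed (accumulateRowIntoBucket is called with a nil groupKey).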
func (ag *orderedAggregator) accumulateRow(row sqlbase.EncDatumRow) error {
    if err := ag.cancelChecker.Check(); err != nil {
        return err
    }

    if ag.bucket == nil {
        var err error
        ag.bucket, err = ag.createAggregateFuncs()
        if err != nil {
            return err
        }
    }

    return ag.accumulateRowIntoBucket(row, nil /* groupKey */, ag.bucket)
}

type aggregateFuncHolder struct {
    create func(*tree.EvalContext, tree.Datums) tree.AggregateFunc

    // arguments is the list of constant (non-aggregated) arguments to the
    // aggregate, for instance, the separator in string_agg.
    arguments tree.Datums

    group *aggregatorBase
    seen  map[string]struct{}
    arena *stringarena.Arena
}

const (
    sizeOfString         = int64(unsafe.Sizeof(""))
    sizeOfAggregateFuncs = int64(unsafe.Sizeof(aggregateFuncs{}))
    sizeOfAggregateFunc  = int64(unsafe.Sizeof(tree.AggregateFunc(nil)))
)

func (ag *aggregatorBase) newAggregateFuncHolder(
    create func(*tree.EvalContext, tree.Datums) tree.AggregateFunc, arguments tree.Datums,
) *aggregateFuncHolder {
    return &aggregateFuncHolder{
        create:    create,
        group:     ag,
        arena:     &ag.arena,
        arguments: arguments,
    }
}

// isDistinct returns whether this aggregateFuncHolder has not already seen the
// encoding of the grouping columns and argument columns. It should be used *only*
// when we have DISTINCT aggregation so that we can aggregate only the "first"
// row in the group.
func (a *aggregateFuncHolder) isDistinct(
    ctx context.Context,
    alloc *sqlbase.DatumAlloc,
    prefix []byte,
    firstArg tree.Datum,
    otherArgs tree.Datums,
) (bool, error) {
    // Allocate one EncDatum that will be reused when encoding every argument.
    ed := sqlbase.EncDatum{Datum: firstArg}
    encoded, err := ed.Fingerprint(firstArg.ResolvedType(), alloc, prefix)
    if err != nil {
        return false, err
    }
    if otherArgs != nil {
        for _, arg := range otherArgs {
            ed.Datum = arg
            encoded, err = ed.Fingerprint(arg.ResolvedType(), alloc, encoded)
            if err != nil {
                return false, err
            }
        }
    }

    if _, ok := a.seen[string(encoded)]; ok {
        // We have already seen a row with such a combination of grouping and
        // argument columns.
        return false, nil
    }
    s, err := a.arena.AllocBytes(ctx, encoded)
    if err != nil {
        return false, err
    }
    a.seen[s] = struct{}{}
    return true, nil
}

// encode returns the encoding for the grouping columns; this is then used as
// our group key to determine which bucket to add to.
func (ag *aggregatorBase) encode(
    appendTo []byte, row sqlbase.EncDatumRow,
) (encoding []byte, err error) {
    for _, colIdx := range ag.groupCols {
        appendTo, err = row[colIdx].Fingerprint(
            ag.inputTypes[colIdx], &ag.datumAlloc, appendTo)
        if err != nil {
            return appendTo, err
        }
    }
    return appendTo, nil
}

func (ag *aggregatorBase) createAggregateFuncs() (aggregateFuncs, error) {
    if err := ag.bucketsAcc.Grow(ag.Ctx, sizeOfAggregateFuncs+sizeOfAggregateFunc*int64(len(ag.funcs))); err != nil {
        return nil, err
    }
    bucket := make(aggregateFuncs, len(ag.funcs))
    for i, f := range ag.funcs {
        agg := f.create(ag.EvalCtx, f.arguments)
        if err := ag.bucketsAcc.Grow(ag.Ctx, agg.Size()); err != nil {
            return nil, err
        }
        bucket[i] = agg
    }
    return bucket, nil
}
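
// A worked example of the accounting in createAggregateFuncs: with three
// aggregations, bucketsAcc first grows by
// sizeOfAggregateFuncs + 3*sizeOfAggregateFunc (the slice header plus three
// interface slots), and then by each constructed aggregate's own Size(), so
// the account reflects both the bucket slice and the builtins it holds.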