github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/rowexec/aggregator.go

     1  // Copyright 2016 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package rowexec
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"unsafe"
    17  
    18  	"github.com/cockroachdb/cockroach/pkg/sql/execinfra"
    19  	"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
    20  	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
    21  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    22  	"github.com/cockroachdb/cockroach/pkg/sql/types"
    23  	"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
    24  	"github.com/cockroachdb/cockroach/pkg/util/log"
    25  	"github.com/cockroachdb/cockroach/pkg/util/mon"
    26  	"github.com/cockroachdb/cockroach/pkg/util/stringarena"
    27  	"github.com/cockroachdb/cockroach/pkg/util/tracing"
    28  	"github.com/cockroachdb/errors"
    29  	"github.com/opentracing/opentracing-go"
    30  )
    31  
    32  type aggregateFuncs []tree.AggregateFunc
    33  
    34  func (af aggregateFuncs) close(ctx context.Context) {
    35  	for _, f := range af {
    36  		f.Close(ctx)
    37  	}
    38  }
    39  
    40  // aggregatorBase is the foundation of the processor core type that does
    41  // "aggregation" in the SQL sense. It groups rows and computes an aggregate for
    42  // each group. The group is configured using the group key and the aggregator
    43  // can be configured with one or more aggregation functions, as defined in the
    44  // AggregatorSpec_Func enum.
    45  //
     46  // aggregatorBase's output schema consists of what is specified by the
     47  // accompanying SELECT expressions.
    48  type aggregatorBase struct {
    49  	execinfra.ProcessorBase
    50  
    51  	// runningState represents the state of the aggregator. This is in addition to
    52  	// ProcessorBase.State - the runningState is only relevant when
    53  	// ProcessorBase.State == StateRunning.
    54  	runningState aggregatorState
    55  	input        execinfra.RowSource
    56  	inputDone    bool
    57  	inputTypes   []*types.T
    58  	funcs        []*aggregateFuncHolder
    59  	outputTypes  []*types.T
    60  	datumAlloc   sqlbase.DatumAlloc
    61  	rowAlloc     sqlbase.EncDatumRowAlloc
    62  
    63  	bucketsAcc  mon.BoundAccount
    64  	aggFuncsAcc mon.BoundAccount
    65  
    66  	// isScalar can only be set if there are no groupCols, and it means that we
    67  	// will generate a result row even if there are no input rows. Used for
    68  	// queries like SELECT MAX(n) FROM t.
    69  	isScalar         bool
    70  	groupCols        []uint32
    71  	orderedGroupCols []uint32
    72  	aggregations     []execinfrapb.AggregatorSpec_Aggregation
    73  
    74  	lastOrdGroupCols sqlbase.EncDatumRow
    75  	arena            stringarena.Arena
    76  	row              sqlbase.EncDatumRow
    77  	scratch          []byte
    78  
    79  	cancelChecker *sqlbase.CancelChecker
    80  }
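// The function below is an illustrative sketch, not part of the original
// file: it shows how a query such as `SELECT v, sum(w) FROM t GROUP BY v`
// could map onto the AggregatorSpec fields that aggregatorBase.init consumes.
// The column indices and the exact enum value names are assumptions made for
// illustration only.
func exampleAggregatorSpec() execinfrapb.AggregatorSpec {
	return execinfrapb.AggregatorSpec{
		// Group by the first input column (v).
		GroupCols: []uint32{0},
		Aggregations: []execinfrapb.AggregatorSpec_Aggregation{
			// Pass the grouping column through to the output.
			{Func: execinfrapb.AggregatorSpec_ANY_NOT_NULL, ColIdx: []uint32{0}},
			// Sum the second input column (w).
			{Func: execinfrapb.AggregatorSpec_SUM, ColIdx: []uint32{1}},
		},
	}
}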
    81  
    82  // init initializes the aggregatorBase.
    83  //
    84  // trailingMetaCallback is passed as part of ProcStateOpts; the inputs to drain
    85  // are in aggregatorBase.
    86  func (ag *aggregatorBase) init(
    87  	self execinfra.RowSource,
    88  	flowCtx *execinfra.FlowCtx,
    89  	processorID int32,
    90  	spec *execinfrapb.AggregatorSpec,
    91  	input execinfra.RowSource,
    92  	post *execinfrapb.PostProcessSpec,
    93  	output execinfra.RowReceiver,
    94  	trailingMetaCallback func(context.Context) []execinfrapb.ProducerMetadata,
    95  ) error {
    96  	ctx := flowCtx.EvalCtx.Ctx()
    97  	memMonitor := execinfra.NewMonitor(ctx, flowCtx.EvalCtx.Mon, "aggregator-mem")
    98  	if sp := opentracing.SpanFromContext(ctx); sp != nil && tracing.IsRecording(sp) {
    99  		input = newInputStatCollector(input)
   100  		ag.FinishTrace = ag.outputStatsToTrace
   101  	}
   102  	ag.input = input
   103  	ag.isScalar = spec.IsScalar()
   104  	ag.groupCols = spec.GroupCols
   105  	ag.orderedGroupCols = spec.OrderedGroupCols
   106  	ag.aggregations = spec.Aggregations
   107  	ag.funcs = make([]*aggregateFuncHolder, len(spec.Aggregations))
   108  	ag.outputTypes = make([]*types.T, len(spec.Aggregations))
   109  	ag.row = make(sqlbase.EncDatumRow, len(spec.Aggregations))
   110  	ag.bucketsAcc = memMonitor.MakeBoundAccount()
   111  	ag.arena = stringarena.Make(&ag.bucketsAcc)
   112  	ag.aggFuncsAcc = memMonitor.MakeBoundAccount()
   113  
    114  	// Loop over the select expressions and extract any aggregate functions --
    115  	// non-aggregation functions are replaced with parser.NewIdentAggregate
    116  	// (which just returns the last value added to them for a bucket) to provide
    117  	// grouped-by values for each bucket. ag.funcs is updated to contain all
    118  	// the functions which need to be fed values.
   119  	ag.inputTypes = input.OutputTypes()
   120  	for i, aggInfo := range spec.Aggregations {
   121  		if aggInfo.FilterColIdx != nil {
   122  			col := *aggInfo.FilterColIdx
   123  			if col >= uint32(len(ag.inputTypes)) {
   124  				return errors.Errorf("FilterColIdx out of range (%d)", col)
   125  			}
   126  			t := ag.inputTypes[col].Family()
   127  			if t != types.BoolFamily && t != types.UnknownFamily {
   128  				return errors.Errorf(
   129  					"filter column %d must be of boolean type, not %s", *aggInfo.FilterColIdx, t,
   130  				)
   131  			}
   132  		}
   133  		argTypes := make([]*types.T, len(aggInfo.ColIdx)+len(aggInfo.Arguments))
   134  		for j, c := range aggInfo.ColIdx {
   135  			if c >= uint32(len(ag.inputTypes)) {
   136  				return errors.Errorf("ColIdx out of range (%d)", aggInfo.ColIdx)
   137  			}
   138  			argTypes[j] = ag.inputTypes[c]
   139  		}
   140  
   141  		arguments := make(tree.Datums, len(aggInfo.Arguments))
   142  		for j, argument := range aggInfo.Arguments {
   143  			h := execinfra.ExprHelper{}
   144  			// Pass nil types and row - there are no variables in these expressions.
   145  			if err := h.Init(argument, nil /* types */, flowCtx.EvalCtx); err != nil {
   146  				return errors.Wrapf(err, "%s", argument)
   147  			}
   148  			d, err := h.Eval(nil /* row */)
   149  			if err != nil {
   150  				return errors.Wrapf(err, "%s", argument)
   151  			}
   152  			argTypes[len(aggInfo.ColIdx)+j] = d.ResolvedType()
   153  			arguments[j] = d
   154  		}
   155  
   156  		aggConstructor, retType, err := execinfrapb.GetAggregateInfo(aggInfo.Func, argTypes...)
   157  		if err != nil {
   158  			return err
   159  		}
   160  
   161  		ag.funcs[i] = ag.newAggregateFuncHolder(aggConstructor, arguments)
   162  		if aggInfo.Distinct {
   163  			ag.funcs[i].seen = make(map[string]struct{})
   164  		}
   165  
   166  		ag.outputTypes[i] = retType
   167  	}
   168  
   169  	return ag.ProcessorBase.Init(
   170  		self, post, ag.outputTypes, flowCtx, processorID, output, memMonitor,
   171  		execinfra.ProcStateOpts{
   172  			InputsToDrain:        []execinfra.RowSource{ag.input},
   173  			TrailingMetaCallback: trailingMetaCallback,
   174  		},
   175  	)
   176  }
   177  
   178  var _ execinfrapb.DistSQLSpanStats = &AggregatorStats{}
   179  
   180  const aggregatorTagPrefix = "aggregator."
   181  
   182  // Stats implements the SpanStats interface.
   183  func (as *AggregatorStats) Stats() map[string]string {
   184  	inputStatsMap := as.InputStats.Stats(aggregatorTagPrefix)
   185  	inputStatsMap[aggregatorTagPrefix+MaxMemoryTagSuffix] = humanizeutil.IBytes(as.MaxAllocatedMem)
   186  	return inputStatsMap
   187  }
   188  
   189  // StatsForQueryPlan implements the DistSQLSpanStats interface.
   190  func (as *AggregatorStats) StatsForQueryPlan() []string {
   191  	stats := as.InputStats.StatsForQueryPlan("" /* prefix */)
   192  
   193  	if as.MaxAllocatedMem != 0 {
   194  		stats = append(stats,
   195  			fmt.Sprintf("%s: %s", MaxMemoryQueryPlanSuffix, humanizeutil.IBytes(as.MaxAllocatedMem)))
   196  	}
   197  
   198  	return stats
   199  }
   200  
   201  func (ag *aggregatorBase) outputStatsToTrace() {
   202  	is, ok := getInputStats(ag.FlowCtx, ag.input)
   203  	if !ok {
   204  		return
   205  	}
   206  	if sp := opentracing.SpanFromContext(ag.Ctx); sp != nil {
   207  		tracing.SetSpanStats(
   208  			sp,
   209  			&AggregatorStats{
   210  				InputStats:      is,
   211  				MaxAllocatedMem: ag.MemMonitor.MaximumBytes(),
   212  			},
   213  		)
   214  	}
   215  }
   216  
   217  // ChildCount is part of the execinfra.OpNode interface.
   218  func (ag *aggregatorBase) ChildCount(verbose bool) int {
   219  	if _, ok := ag.input.(execinfra.OpNode); ok {
   220  		return 1
   221  	}
   222  	return 0
   223  }
   224  
   225  // Child is part of the execinfra.OpNode interface.
   226  func (ag *aggregatorBase) Child(nth int, verbose bool) execinfra.OpNode {
   227  	if nth == 0 {
   228  		if n, ok := ag.input.(execinfra.OpNode); ok {
   229  			return n
   230  		}
   231  		panic("input to aggregatorBase is not an execinfra.OpNode")
   232  	}
   233  	panic(fmt.Sprintf("invalid index %d", nth))
   234  }
   235  
   236  const (
    237  	// hashAggregatorBucketsInitialLen is a guess at how many "items" the
    238  	// 'buckets' map of hashAggregator initially has capacity for.
    239  	hashAggregatorBucketsInitialLen = 8
    240  	// hashAggregatorSizeOfBucketsItem is a guess at how much space (in bytes)
    241  	// each item added to the 'buckets' map of hashAggregator takes up in the
    242  	// map (i.e. it is memory internal to the map, orthogonal to the key-value
    243  	// pair that we're adding to the map).
    244  	hashAggregatorSizeOfBucketsItem = 64
   245  )
   246  
   247  // hashAggregator is a specialization of aggregatorBase that must keep track of
   248  // multiple grouping buckets at a time.
   249  type hashAggregator struct {
   250  	aggregatorBase
   251  
   252  	// buckets is used during the accumulation phase to track the bucket keys
   253  	// that have been seen. After accumulation, the keys are extracted into
   254  	// bucketsIter for iteration.
   255  	buckets     map[string]aggregateFuncs
   256  	bucketsIter []string
    257  	// bucketsLenGrowThreshold is the threshold which, when reached by the
    258  	// number of items in 'buckets', triggers an update to the memory
    259  	// accounting. It starts out at hashAggregatorBucketsInitialLen and then
    260  	// doubles each time it is reached.
    261  	bucketsLenGrowThreshold int
    262  	// alreadyAccountedFor tracks the number of items in 'buckets' whose
    263  	// memory we have already accounted for.
    264  	alreadyAccountedFor int
   265  }
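// The function below is an illustrative, self-contained sketch, not part of
// the original file: it demonstrates the doubling-threshold scheme described
// above, under which only the delta since the last accounting update is grown
// whenever the 'buckets' map reaches the current threshold. The grow callback
// stands in for ag.bucketsAcc.Grow.
func exampleBucketsAccounting(numBuckets int, grow func(delta int64) error) error {
	threshold := hashAggregatorBucketsInitialLen
	accountedFor := 0
	for n := 1; n <= numBuckets; n++ {
		// A new bucket has just been added to the map.
		if n == threshold {
			delta := int64(threshold-accountedFor) * hashAggregatorSizeOfBucketsItem
			if err := grow(delta); err != nil {
				return err
			}
			accountedFor = threshold
			threshold *= 2
		}
	}
	return nil
}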
   266  
   267  // orderedAggregator is a specialization of aggregatorBase that only needs to
   268  // keep track of a single grouping bucket at a time.
   269  type orderedAggregator struct {
   270  	aggregatorBase
   271  
   272  	// bucket is used during the accumulation phase to aggregate results.
   273  	bucket aggregateFuncs
   274  }
   275  
   276  var _ execinfra.Processor = &hashAggregator{}
   277  var _ execinfra.RowSource = &hashAggregator{}
   278  var _ execinfra.OpNode = &hashAggregator{}
   279  
   280  const hashAggregatorProcName = "hash aggregator"
   281  
   282  var _ execinfra.Processor = &orderedAggregator{}
   283  var _ execinfra.RowSource = &orderedAggregator{}
   284  var _ execinfra.OpNode = &orderedAggregator{}
   285  
   286  const orderedAggregatorProcName = "ordered aggregator"
   287  
   288  // aggregatorState represents the state of the processor.
   289  type aggregatorState int
   290  
   291  const (
   292  	aggStateUnknown aggregatorState = iota
   293  	// aggAccumulating means that rows are being read from the input and used to
   294  	// compute intermediary aggregation results.
   295  	aggAccumulating
   296  	// aggEmittingRows means that accumulation has finished and rows are being
   297  	// sent to the output.
   298  	aggEmittingRows
   299  )
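// The runningState cycles between aggAccumulating and aggEmittingRows:
// accumulateRows transitions to aggEmittingRows once it reaches the end of the
// input or a row that starts a new ordered group, and emitRow transitions back
// to aggAccumulating when all buckets have been emitted but the input has not
// been fully consumed (moving the processor to draining once it has been).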
   300  
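// newAggregator returns a countAggregator when the spec is a pure row count,
// an orderedAggregator when all grouping columns are ordered, and a
// hashAggregator otherwise.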
   301  func newAggregator(
   302  	flowCtx *execinfra.FlowCtx,
   303  	processorID int32,
   304  	spec *execinfrapb.AggregatorSpec,
   305  	input execinfra.RowSource,
   306  	post *execinfrapb.PostProcessSpec,
   307  	output execinfra.RowReceiver,
   308  ) (execinfra.Processor, error) {
   309  	if spec.IsRowCount() {
   310  		return newCountAggregator(flowCtx, processorID, input, post, output)
   311  	}
   312  	if len(spec.OrderedGroupCols) == len(spec.GroupCols) {
   313  		return newOrderedAggregator(flowCtx, processorID, spec, input, post, output)
   314  	}
   315  
   316  	ag := &hashAggregator{
   317  		buckets:                 make(map[string]aggregateFuncs),
   318  		bucketsLenGrowThreshold: hashAggregatorBucketsInitialLen,
   319  	}
   320  
   321  	if err := ag.init(
   322  		ag,
   323  		flowCtx,
   324  		processorID,
   325  		spec,
   326  		input,
   327  		post,
   328  		output,
   329  		func(context.Context) []execinfrapb.ProducerMetadata {
   330  			ag.close()
   331  			return nil
   332  		},
   333  	); err != nil {
   334  		return nil, err
   335  	}
   336  
    337  	// A new tree.EvalCtx was created when initializing aggregatorBase above
    338  	// and will be used only by this aggregator, so it is ok to update EvalCtx
    339  	// directly.
   340  	ag.EvalCtx.SingleDatumAggMemAccount = &ag.aggFuncsAcc
   341  	return ag, nil
   342  }
   343  
   344  func newOrderedAggregator(
   345  	flowCtx *execinfra.FlowCtx,
   346  	processorID int32,
   347  	spec *execinfrapb.AggregatorSpec,
   348  	input execinfra.RowSource,
   349  	post *execinfrapb.PostProcessSpec,
   350  	output execinfra.RowReceiver,
   351  ) (*orderedAggregator, error) {
   352  	ag := &orderedAggregator{}
   353  
   354  	if err := ag.init(
   355  		ag,
   356  		flowCtx,
   357  		processorID,
   358  		spec,
   359  		input,
   360  		post,
   361  		output,
   362  		func(context.Context) []execinfrapb.ProducerMetadata {
   363  			ag.close()
   364  			return nil
   365  		},
   366  	); err != nil {
   367  		return nil, err
   368  	}
   369  
    370  	// A new tree.EvalCtx was created when initializing aggregatorBase above
    371  	// and will be used only by this aggregator, so it is ok to update EvalCtx
    372  	// directly.
   373  	ag.EvalCtx.SingleDatumAggMemAccount = &ag.aggFuncsAcc
   374  	return ag, nil
   375  }
   376  
   377  // Start is part of the RowSource interface.
   378  func (ag *hashAggregator) Start(ctx context.Context) context.Context {
   379  	return ag.start(ctx, hashAggregatorProcName)
   380  }
   381  
   382  // Start is part of the RowSource interface.
   383  func (ag *orderedAggregator) Start(ctx context.Context) context.Context {
   384  	return ag.start(ctx, orderedAggregatorProcName)
   385  }
   386  
   387  func (ag *aggregatorBase) start(ctx context.Context, procName string) context.Context {
   388  	ag.input.Start(ctx)
   389  	ctx = ag.StartInternal(ctx, procName)
   390  	ag.cancelChecker = sqlbase.NewCancelChecker(ctx)
   391  	ag.runningState = aggAccumulating
   392  	return ctx
   393  }
   394  
   395  func (ag *hashAggregator) close() {
   396  	if ag.InternalClose() {
   397  		log.VEventf(ag.Ctx, 2, "exiting aggregator")
   398  		// If we have started emitting rows, bucketsIter will represent which
   399  		// buckets are still open, since buckets are closed once their results are
   400  		// emitted.
   401  		if ag.bucketsIter == nil {
   402  			for _, bucket := range ag.buckets {
   403  				bucket.close(ag.Ctx)
   404  			}
   405  		} else {
   406  			for _, bucket := range ag.bucketsIter {
   407  				ag.buckets[bucket].close(ag.Ctx)
   408  			}
   409  		}
   410  		// Make sure to release any remaining memory under 'buckets'.
   411  		ag.buckets = nil
   412  		// Note that we should be closing accounts only after closing all the
   413  		// buckets since the latter might be releasing some precisely tracked
   414  		// memory, and if we were to close the accounts first, there would be
   415  		// no memory to release for the buckets.
   416  		ag.bucketsAcc.Close(ag.Ctx)
   417  		ag.aggFuncsAcc.Close(ag.Ctx)
   418  		ag.MemMonitor.Stop(ag.Ctx)
   419  	}
   420  }
   421  
   422  func (ag *orderedAggregator) close() {
   423  	if ag.InternalClose() {
   424  		log.VEventf(ag.Ctx, 2, "exiting aggregator")
   425  		if ag.bucket != nil {
   426  			ag.bucket.close(ag.Ctx)
   427  		}
   428  		// Note that we should be closing accounts only after closing the
   429  		// bucket since the latter might be releasing some precisely tracked
   430  		// memory, and if we were to close the accounts first, there would be
   431  		// no memory to release for the bucket.
   432  		ag.bucketsAcc.Close(ag.Ctx)
   433  		ag.aggFuncsAcc.Close(ag.Ctx)
   434  		ag.MemMonitor.Stop(ag.Ctx)
   435  	}
   436  }
   437  
   438  // matchLastOrdGroupCols takes a row and matches it with the row stored by
   439  // lastOrdGroupCols. It returns true if the two rows are equal on the grouping
   440  // columns, and false otherwise.
   441  func (ag *aggregatorBase) matchLastOrdGroupCols(row sqlbase.EncDatumRow) (bool, error) {
   442  	for _, colIdx := range ag.orderedGroupCols {
   443  		res, err := ag.lastOrdGroupCols[colIdx].Compare(
   444  			ag.inputTypes[colIdx], &ag.datumAlloc, ag.EvalCtx, &row[colIdx],
   445  		)
   446  		if res != 0 || err != nil {
   447  			return false, err
   448  		}
   449  	}
   450  	return true, nil
   451  }
   452  
   453  // accumulateRows continually reads rows from the input and accumulates them
   454  // into intermediary aggregate results. If it encounters metadata, the metadata
   455  // is immediately returned. Subsequent calls of this function will resume row
   456  // accumulation.
   457  func (ag *hashAggregator) accumulateRows() (
   458  	aggregatorState,
   459  	sqlbase.EncDatumRow,
   460  	*execinfrapb.ProducerMetadata,
   461  ) {
   462  	for {
   463  		row, meta := ag.input.Next()
   464  		if meta != nil {
   465  			if meta.Err != nil {
   466  				ag.MoveToDraining(nil /* err */)
   467  				return aggStateUnknown, nil, meta
   468  			}
   469  			return aggAccumulating, nil, meta
   470  		}
   471  		if row == nil {
   472  			log.VEvent(ag.Ctx, 1, "accumulation complete")
   473  			ag.inputDone = true
   474  			break
   475  		}
   476  
   477  		if ag.lastOrdGroupCols == nil {
   478  			ag.lastOrdGroupCols = ag.rowAlloc.CopyRow(row)
   479  		} else {
   480  			matched, err := ag.matchLastOrdGroupCols(row)
   481  			if err != nil {
   482  				ag.MoveToDraining(err)
   483  				return aggStateUnknown, nil, nil
   484  			}
   485  			if !matched {
   486  				copy(ag.lastOrdGroupCols, row)
   487  				break
   488  			}
   489  		}
   490  		if err := ag.accumulateRow(row); err != nil {
   491  			ag.MoveToDraining(err)
   492  			return aggStateUnknown, nil, nil
   493  		}
   494  	}
   495  
   496  	// Queries like `SELECT MAX(n) FROM t` expect a row of NULLs if nothing was
   497  	// aggregated.
   498  	if len(ag.buckets) < 1 && len(ag.groupCols) == 0 {
   499  		bucket, err := ag.createAggregateFuncs()
   500  		if err != nil {
   501  			ag.MoveToDraining(err)
   502  			return aggStateUnknown, nil, nil
   503  		}
   504  		ag.buckets[""] = bucket
   505  	}
   506  
   507  	// Note that, for simplicity, we're ignoring the overhead of the slice of
   508  	// strings.
   509  	if err := ag.bucketsAcc.Grow(ag.Ctx, int64(len(ag.buckets))*sizeOfString); err != nil {
   510  		ag.MoveToDraining(err)
   511  		return aggStateUnknown, nil, nil
   512  	}
   513  	ag.bucketsIter = make([]string, 0, len(ag.buckets))
   514  	for bucket := range ag.buckets {
   515  		ag.bucketsIter = append(ag.bucketsIter, bucket)
   516  	}
   517  
   518  	// Transition to aggEmittingRows, and let it generate the next row/meta.
   519  	return aggEmittingRows, nil, nil
   520  }
   521  
   522  // accumulateRows continually reads rows from the input and accumulates them
   523  // into intermediary aggregate results. If it encounters metadata, the metadata
   524  // is immediately returned. Subsequent calls of this function will resume row
   525  // accumulation.
   526  func (ag *orderedAggregator) accumulateRows() (
   527  	aggregatorState,
   528  	sqlbase.EncDatumRow,
   529  	*execinfrapb.ProducerMetadata,
   530  ) {
   531  	for {
   532  		row, meta := ag.input.Next()
   533  		if meta != nil {
   534  			if meta.Err != nil {
   535  				ag.MoveToDraining(nil /* err */)
   536  				return aggStateUnknown, nil, meta
   537  			}
   538  			return aggAccumulating, nil, meta
   539  		}
   540  		if row == nil {
   541  			log.VEvent(ag.Ctx, 1, "accumulation complete")
   542  			ag.inputDone = true
   543  			break
   544  		}
   545  
   546  		if ag.lastOrdGroupCols == nil {
   547  			ag.lastOrdGroupCols = ag.rowAlloc.CopyRow(row)
   548  		} else {
   549  			matched, err := ag.matchLastOrdGroupCols(row)
   550  			if err != nil {
   551  				ag.MoveToDraining(err)
   552  				return aggStateUnknown, nil, nil
   553  			}
   554  			if !matched {
   555  				copy(ag.lastOrdGroupCols, row)
   556  				break
   557  			}
   558  		}
   559  		if err := ag.accumulateRow(row); err != nil {
   560  			ag.MoveToDraining(err)
   561  			return aggStateUnknown, nil, nil
   562  		}
   563  	}
   564  
   565  	// Queries like `SELECT MAX(n) FROM t` expect a row of NULLs if nothing was
   566  	// aggregated.
   567  	if ag.bucket == nil && ag.isScalar {
   568  		var err error
   569  		ag.bucket, err = ag.createAggregateFuncs()
   570  		if err != nil {
   571  			ag.MoveToDraining(err)
   572  			return aggStateUnknown, nil, nil
   573  		}
   574  	}
   575  
   576  	// Transition to aggEmittingRows, and let it generate the next row/meta.
   577  	return aggEmittingRows, nil, nil
   578  }
   579  
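// getAggResults materializes the result of each aggregate function in bucket
// into ag.row, closes the bucket, and passes the row through the
// ProcOutputHelper. A nil row is returned if the helper filtered the row out
// or moved the processor to draining.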
   580  func (ag *aggregatorBase) getAggResults(
   581  	bucket aggregateFuncs,
   582  ) (aggregatorState, sqlbase.EncDatumRow, *execinfrapb.ProducerMetadata) {
   583  	for i, b := range bucket {
   584  		result, err := b.Result()
   585  		if err != nil {
   586  			ag.MoveToDraining(err)
   587  			return aggStateUnknown, nil, nil
   588  		}
   589  		if result == nil {
   590  			// We can't encode nil into an EncDatum, so we represent it with DNull.
   591  			result = tree.DNull
   592  		}
   593  		ag.row[i] = sqlbase.DatumToEncDatum(ag.outputTypes[i], result)
   594  	}
   595  	bucket.close(ag.Ctx)
   596  
   597  	if outRow := ag.ProcessRowHelper(ag.row); outRow != nil {
   598  		return aggEmittingRows, outRow, nil
   599  	}
    600  	// We might have switched to draining; we might not have. If we haven't,
    601  	// aggEmittingRows is accurate. If we have, it will be ignored by the
    602  	// caller.
   603  	return aggEmittingRows, nil, nil
   604  }
   605  
   606  // emitRow constructs an output row from an accumulated bucket and returns it.
   607  //
   608  // emitRow() might move to stateDraining. It might also not return a row if the
   609  // ProcOutputHelper filtered the current row out.
   610  func (ag *hashAggregator) emitRow() (
   611  	aggregatorState,
   612  	sqlbase.EncDatumRow,
   613  	*execinfrapb.ProducerMetadata,
   614  ) {
   615  	if len(ag.bucketsIter) == 0 {
   616  		// We've exhausted all of the aggregation buckets.
   617  		if ag.inputDone {
   618  			// The input has been fully consumed. Transition to draining so that we
   619  			// emit any metadata that we've produced.
   620  			ag.MoveToDraining(nil /* err */)
   621  			return aggStateUnknown, nil, nil
   622  		}
   623  
   624  		// We've only consumed part of the input where the rows are equal over
   625  		// the columns specified by ag.orderedGroupCols, so we need to continue
   626  		// accumulating the remaining rows.
   627  
   628  		if err := ag.arena.UnsafeReset(ag.Ctx); err != nil {
   629  			ag.MoveToDraining(err)
   630  			return aggStateUnknown, nil, nil
   631  		}
    632  		// Before we create a new 'buckets' map below, we need to "release" the
    633  		// memory of the current map that has already been accounted for.
   634  		ag.bucketsAcc.Shrink(ag.Ctx, int64(ag.alreadyAccountedFor)*hashAggregatorSizeOfBucketsItem)
   635  		// Note that, for simplicity, we're ignoring the overhead of the slice of
   636  		// strings.
   637  		ag.bucketsAcc.Shrink(ag.Ctx, int64(len(ag.buckets))*sizeOfString)
   638  		ag.bucketsIter = nil
   639  		ag.buckets = make(map[string]aggregateFuncs)
   640  		ag.bucketsLenGrowThreshold = hashAggregatorBucketsInitialLen
   641  		ag.alreadyAccountedFor = 0
   642  		for _, f := range ag.funcs {
   643  			if f.seen != nil {
   644  				f.seen = make(map[string]struct{})
   645  			}
   646  		}
   647  
   648  		if err := ag.accumulateRow(ag.lastOrdGroupCols); err != nil {
   649  			ag.MoveToDraining(err)
   650  			return aggStateUnknown, nil, nil
   651  		}
   652  
   653  		return aggAccumulating, nil, nil
   654  	}
   655  
   656  	bucket := ag.bucketsIter[0]
   657  	ag.bucketsIter = ag.bucketsIter[1:]
   658  
    659  	// Once we get the results from the bucket, we can delete it from the map.
    660  	// This will allow us to return the memory to the system before the hash
    661  	// aggregator is fully done (which matters when we have many buckets).
    662  	// NOTE: accounting for the memory under aggregate builtins in the bucket
    663  	// is updated in getAggResults (the bucket will be closed); however, we
    664  	// choose not to reduce our estimate of the map's internal footprint
    665  	// because it is error-prone to estimate the new footprint (we don't know
    666  	// whether and when the Go runtime will release some of the underlying
    667  	// memory). This behavior is ok, though, since actual usage of buckets will
    668  	// be lower than what we accounted for - in the worst case, the query might
    669  	// hit a memory budget limit and error out when it is actually within the
    670  	// limit. However, we might be under-accounting memory usage in other
    671  	// places, so some over-accounting here might actually be beneficial as a
    672  	// defensive mechanism against OOM crashes.
   673  	state, row, meta := ag.getAggResults(ag.buckets[bucket])
   674  	delete(ag.buckets, bucket)
   675  	return state, row, meta
   676  }
   677  
   678  // emitRow constructs an output row from an accumulated bucket and returns it.
   679  //
   680  // emitRow() might move to stateDraining. It might also not return a row if the
    681  // ProcOutputHelper filtered the current row out.
   682  func (ag *orderedAggregator) emitRow() (
   683  	aggregatorState,
   684  	sqlbase.EncDatumRow,
   685  	*execinfrapb.ProducerMetadata,
   686  ) {
   687  	if ag.bucket == nil {
   688  		// We've exhausted all of the aggregation buckets.
   689  		if ag.inputDone {
   690  			// The input has been fully consumed. Transition to draining so that we
   691  			// emit any metadata that we've produced.
   692  			ag.MoveToDraining(nil /* err */)
   693  			return aggStateUnknown, nil, nil
   694  		}
   695  
   696  		// We've only consumed part of the input where the rows are equal over
   697  		// the columns specified by ag.orderedGroupCols, so we need to continue
   698  		// accumulating the remaining rows.
   699  
   700  		if err := ag.arena.UnsafeReset(ag.Ctx); err != nil {
   701  			ag.MoveToDraining(err)
   702  			return aggStateUnknown, nil, nil
   703  		}
   704  		for _, f := range ag.funcs {
   705  			if f.seen != nil {
   706  				f.seen = make(map[string]struct{})
   707  			}
   708  		}
   709  
   710  		if err := ag.accumulateRow(ag.lastOrdGroupCols); err != nil {
   711  			ag.MoveToDraining(err)
   712  			return aggStateUnknown, nil, nil
   713  		}
   714  
   715  		return aggAccumulating, nil, nil
   716  	}
   717  
   718  	bucket := ag.bucket
   719  	ag.bucket = nil
   720  	return ag.getAggResults(bucket)
   721  }
   722  
   723  // Next is part of the RowSource interface.
   724  func (ag *hashAggregator) Next() (sqlbase.EncDatumRow, *execinfrapb.ProducerMetadata) {
   725  	for ag.State == execinfra.StateRunning {
   726  		var row sqlbase.EncDatumRow
   727  		var meta *execinfrapb.ProducerMetadata
   728  		switch ag.runningState {
   729  		case aggAccumulating:
   730  			ag.runningState, row, meta = ag.accumulateRows()
   731  		case aggEmittingRows:
   732  			ag.runningState, row, meta = ag.emitRow()
   733  		default:
   734  			log.Fatalf(ag.Ctx, "unsupported state: %d", ag.runningState)
   735  		}
   736  
   737  		if row == nil && meta == nil {
   738  			continue
   739  		}
   740  		return row, meta
   741  	}
   742  	return nil, ag.DrainHelper()
   743  }
   744  
   745  // Next is part of the RowSource interface.
   746  func (ag *orderedAggregator) Next() (sqlbase.EncDatumRow, *execinfrapb.ProducerMetadata) {
   747  	for ag.State == execinfra.StateRunning {
   748  		var row sqlbase.EncDatumRow
   749  		var meta *execinfrapb.ProducerMetadata
   750  		switch ag.runningState {
   751  		case aggAccumulating:
   752  			ag.runningState, row, meta = ag.accumulateRows()
   753  		case aggEmittingRows:
   754  			ag.runningState, row, meta = ag.emitRow()
   755  		default:
   756  			log.Fatalf(ag.Ctx, "unsupported state: %d", ag.runningState)
   757  		}
   758  
   759  		if row == nil && meta == nil {
   760  			continue
   761  		}
   762  		return row, meta
   763  	}
   764  	return nil, ag.DrainHelper()
   765  }
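// The function below is an illustrative sketch, not part of the original
// file: it shows how a consumer could drive either aggregator through the
// execinfra.RowSource interface, calling Next until both the row and the
// metadata are nil.
func exampleDrainAggregator(ctx context.Context, src execinfra.RowSource) error {
	src.Start(ctx)
	for {
		row, meta := src.Next()
		if meta != nil {
			if meta.Err != nil {
				return meta.Err
			}
			// Other metadata would be forwarded to the consumer here.
			continue
		}
		if row == nil {
			// Both row and metadata are nil: the source is exhausted.
			return nil
		}
		// Process the row here.
	}
}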
   766  
   767  // ConsumerClosed is part of the RowSource interface.
   768  func (ag *hashAggregator) ConsumerClosed() {
   769  	// The consumer is done, Next() will not be called again.
   770  	ag.close()
   771  }
   772  
   773  // ConsumerClosed is part of the RowSource interface.
   774  func (ag *orderedAggregator) ConsumerClosed() {
   775  	// The consumer is done, Next() will not be called again.
   776  	ag.close()
   777  }
   778  
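// accumulateRowIntoBucket feeds the non-grouping datums of row to each
// aggregate function in bucket, skipping aggregations whose FILTER column is
// not true and, for DISTINCT aggregations, argument values that have already
// been seen for groupKey.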
   779  func (ag *aggregatorBase) accumulateRowIntoBucket(
   780  	row sqlbase.EncDatumRow, groupKey []byte, bucket aggregateFuncs,
   781  ) error {
   782  	var err error
   783  	// Feed the func holders for this bucket the non-grouping datums.
   784  	for i, a := range ag.aggregations {
   785  		if a.FilterColIdx != nil {
   786  			col := *a.FilterColIdx
   787  			if err := row[col].EnsureDecoded(ag.inputTypes[col], &ag.datumAlloc); err != nil {
   788  				return err
   789  			}
   790  			if row[*a.FilterColIdx].Datum != tree.DBoolTrue {
   791  				// This row doesn't contribute to this aggregation.
   792  				continue
   793  			}
   794  		}
   795  		// Extract the corresponding arguments from the row to feed into the
   796  		// aggregate function.
    797  		// Most functions require at most one argument, so we handle the
    798  		// first argument separately and only allocate a (variadic) collection
    799  		// for any additional arguments.
   800  		var firstArg tree.Datum
   801  		var otherArgs tree.Datums
   802  		if len(a.ColIdx) > 1 {
   803  			otherArgs = make(tree.Datums, len(a.ColIdx)-1)
   804  		}
   805  		isFirstArg := true
   806  		for j, c := range a.ColIdx {
   807  			if err := row[c].EnsureDecoded(ag.inputTypes[c], &ag.datumAlloc); err != nil {
   808  				return err
   809  			}
   810  			if isFirstArg {
   811  				firstArg = row[c].Datum
   812  				isFirstArg = false
   813  				continue
   814  			}
   815  			otherArgs[j-1] = row[c].Datum
   816  		}
   817  
   818  		canAdd := true
   819  		if a.Distinct {
   820  			canAdd, err = ag.funcs[i].isDistinct(
   821  				ag.Ctx,
   822  				&ag.datumAlloc,
   823  				groupKey,
   824  				firstArg,
   825  				otherArgs,
   826  			)
   827  			if err != nil {
   828  				return err
   829  			}
   830  		}
   831  		if !canAdd {
   832  			continue
   833  		}
   834  		if err := bucket[i].Add(ag.Ctx, firstArg, otherArgs...); err != nil {
   835  			return err
   836  		}
   837  	}
   838  	return nil
   839  }
   840  
   841  // accumulateRow accumulates a single row, returning an error if accumulation
   842  // failed for any reason.
   843  func (ag *hashAggregator) accumulateRow(row sqlbase.EncDatumRow) error {
   844  	if err := ag.cancelChecker.Check(); err != nil {
   845  		return err
   846  	}
   847  
   848  	// The encoding computed here determines which bucket the non-grouping
   849  	// datums are accumulated to.
   850  	encoded, err := ag.encode(ag.scratch, row)
   851  	if err != nil {
   852  		return err
   853  	}
   854  	ag.scratch = encoded[:0]
   855  
   856  	bucket, ok := ag.buckets[string(encoded)]
   857  	if !ok {
   858  		s, err := ag.arena.AllocBytes(ag.Ctx, encoded)
   859  		if err != nil {
   860  			return err
   861  		}
   862  		bucket, err = ag.createAggregateFuncs()
   863  		if err != nil {
   864  			return err
   865  		}
   866  		ag.buckets[s] = bucket
   867  		if len(ag.buckets) == ag.bucketsLenGrowThreshold {
   868  			toAccountFor := ag.bucketsLenGrowThreshold - ag.alreadyAccountedFor
   869  			if err := ag.bucketsAcc.Grow(ag.Ctx, int64(toAccountFor)*hashAggregatorSizeOfBucketsItem); err != nil {
   870  				return err
   871  			}
   872  			ag.alreadyAccountedFor = ag.bucketsLenGrowThreshold
   873  			ag.bucketsLenGrowThreshold *= 2
   874  		}
   875  	}
   876  
   877  	return ag.accumulateRowIntoBucket(row, encoded, bucket)
   878  }
   879  
   880  // accumulateRow accumulates a single row, returning an error if accumulation
   881  // failed for any reason.
   882  func (ag *orderedAggregator) accumulateRow(row sqlbase.EncDatumRow) error {
   883  	if err := ag.cancelChecker.Check(); err != nil {
   884  		return err
   885  	}
   886  
   887  	if ag.bucket == nil {
   888  		var err error
   889  		ag.bucket, err = ag.createAggregateFuncs()
   890  		if err != nil {
   891  			return err
   892  		}
   893  	}
   894  
   895  	return ag.accumulateRowIntoBucket(row, nil /* groupKey */, ag.bucket)
   896  }
   897  
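// aggregateFuncHolder bundles the constructor for a single aggregate function
// with its constant arguments and, for DISTINCT aggregations, the set of
// argument encodings that have already been seen.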
   898  type aggregateFuncHolder struct {
   899  	create func(*tree.EvalContext, tree.Datums) tree.AggregateFunc
   900  
   901  	// arguments is the list of constant (non-aggregated) arguments to the
   902  	// aggregate, for instance, the separator in string_agg.
   903  	arguments tree.Datums
   904  
   905  	group *aggregatorBase
   906  	seen  map[string]struct{}
   907  	arena *stringarena.Arena
   908  }
   909  
   910  const (
   911  	sizeOfString         = int64(unsafe.Sizeof(""))
   912  	sizeOfAggregateFuncs = int64(unsafe.Sizeof(aggregateFuncs{}))
   913  	sizeOfAggregateFunc  = int64(unsafe.Sizeof(tree.AggregateFunc(nil)))
   914  )
   915  
   916  func (ag *aggregatorBase) newAggregateFuncHolder(
   917  	create func(*tree.EvalContext, tree.Datums) tree.AggregateFunc, arguments tree.Datums,
   918  ) *aggregateFuncHolder {
   919  	return &aggregateFuncHolder{
   920  		create:    create,
   921  		group:     ag,
   922  		arena:     &ag.arena,
   923  		arguments: arguments,
   924  	}
   925  }
   926  
   927  // isDistinct returns whether this aggregateFuncHolder has not already seen the
   928  // encoding of grouping columns and argument columns. It should be used *only*
   929  // when we have DISTINCT aggregation so that we can aggregate only the "first"
   930  // row in the group.
   931  func (a *aggregateFuncHolder) isDistinct(
   932  	ctx context.Context,
   933  	alloc *sqlbase.DatumAlloc,
   934  	prefix []byte,
   935  	firstArg tree.Datum,
   936  	otherArgs tree.Datums,
   937  ) (bool, error) {
   938  	// Allocate one EncDatum that will be reused when encoding every argument.
   939  	ed := sqlbase.EncDatum{Datum: firstArg}
   940  	encoded, err := ed.Fingerprint(firstArg.ResolvedType(), alloc, prefix)
   941  	if err != nil {
   942  		return false, err
   943  	}
   944  	if otherArgs != nil {
   945  		for _, arg := range otherArgs {
   946  			ed.Datum = arg
   947  			encoded, err = ed.Fingerprint(arg.ResolvedType(), alloc, encoded)
   948  			if err != nil {
   949  				return false, err
   950  			}
   951  		}
   952  	}
   953  
   954  	if _, ok := a.seen[string(encoded)]; ok {
   955  		// We have already seen a row with such combination of grouping and
   956  		// argument columns.
   957  		return false, nil
   958  	}
   959  	s, err := a.arena.AllocBytes(ctx, encoded)
   960  	if err != nil {
   961  		return false, err
   962  	}
   963  	a.seen[s] = struct{}{}
   964  	return true, nil
   965  }
   966  
    967  // encode returns the encoding for the grouping columns; this is then used as
    968  // our group key to determine which bucket to add to.
   969  func (ag *aggregatorBase) encode(
   970  	appendTo []byte, row sqlbase.EncDatumRow,
   971  ) (encoding []byte, err error) {
   972  	for _, colIdx := range ag.groupCols {
   973  		appendTo, err = row[colIdx].Fingerprint(
   974  			ag.inputTypes[colIdx], &ag.datumAlloc, appendTo)
   975  		if err != nil {
   976  			return appendTo, err
   977  		}
   978  	}
   979  	return appendTo, nil
   980  }
   981  
   982  func (ag *aggregatorBase) createAggregateFuncs() (aggregateFuncs, error) {
   983  	if err := ag.bucketsAcc.Grow(ag.Ctx, sizeOfAggregateFuncs+sizeOfAggregateFunc*int64(len(ag.funcs))); err != nil {
   984  		return nil, err
   985  	}
   986  	bucket := make(aggregateFuncs, len(ag.funcs))
   987  	for i, f := range ag.funcs {
   988  		agg := f.create(ag.EvalCtx, f.arguments)
   989  		if err := ag.bucketsAcc.Grow(ag.Ctx, agg.Size()); err != nil {
   990  			return nil, err
   991  		}
   992  		bucket[i] = agg
   993  	}
   994  	return bucket, nil
   995  }