github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/rowexec/windower.go (about)

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package rowexec
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"unsafe"
    17  
    18  	"github.com/cockroachdb/cockroach/pkg/sql/execinfra"
    19  	"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
    20  	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
    21  	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
    22  	"github.com/cockroachdb/cockroach/pkg/sql/rowcontainer"
    23  	"github.com/cockroachdb/cockroach/pkg/sql/sem/builtins"
    24  	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
    25  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    26  	"github.com/cockroachdb/cockroach/pkg/sql/types"
    27  	"github.com/cockroachdb/cockroach/pkg/util/encoding"
    28  	"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
    29  	"github.com/cockroachdb/cockroach/pkg/util/log"
    30  	"github.com/cockroachdb/cockroach/pkg/util/mon"
    31  	"github.com/cockroachdb/cockroach/pkg/util/tracing"
    32  	"github.com/cockroachdb/errors"
    33  	"github.com/opentracing/opentracing-go"
    34  )
    35  
// windowerState represents the state of the processor.
type windowerState int

const (
	// windowerStateUnknown is the zero value. The windower's state functions
	// return it whenever they move the processor to draining; Next treats it
	// as an unsupported state if it is seen while the processor is running.
	windowerStateUnknown windowerState = iota
	// windowerAccumulating means that rows are being read from the input
	// and accumulated in allRowsPartitioned.
	windowerAccumulating
	// windowerEmittingRows means that all rows have been read and
	// output rows are being emitted.
	windowerEmittingRows
)
    48  
// memRequiredByWindower indicates the minimum amount of RAM (in bytes) that
// the windower needs. newWindower refuses to run when the workmem setting is
// below this value, and raises a smaller testing-knob limit up to it.
const memRequiredByWindower = 100 * 1024
    52  
// windower is the processor that performs computation of window functions
// that have the same PARTITION BY clause. It passes through all of its input
// columns and puts the output of a window function windowFn at
// windowFn.outputColIdx.
type windower struct {
	execinfra.ProcessorBase

	// runningState represents the state of the windower. This is in addition to
	// ProcessorBase.State - the runningState is only relevant when
	// ProcessorBase.State == StateRunning.
	runningState windowerState
	// input is the source the rows are accumulated from.
	input execinfra.RowSource
	// inputDone is set once the input has returned a nil row with no metadata.
	inputDone bool
	// inputTypes are the output types of input.
	inputTypes []*types.T
	// outputTypes holds the input types plus the output type of every window
	// function, stored at that function's output column index.
	outputTypes []*types.T
	datumAlloc  sqlbase.DatumAlloc
	// acc accounts for the bookkeeping slices and window function results the
	// windower keeps itself; it is also shared with single-datum aggregates via
	// evalCtx.SingleDatumAggMemAccount.
	acc mon.BoundAccount
	// diskMonitor is handed to the disk-backed row containers.
	diskMonitor *mon.BytesMonitor

	// scratch is a reusable buffer for the fingerprint of a row's partitionBy
	// columns.
	scratch       []byte
	cancelChecker *sqlbase.CancelChecker

	// partitionBy holds the indices of the PARTITION BY columns.
	partitionBy []uint32
	// allRowsPartitioned stores every accumulated input row.
	allRowsPartitioned *rowcontainer.HashDiskBackedRowContainer
	// partition stores the rows of the partition currently being processed and
	// is reset and reused from one partition to the next.
	partition *rowcontainer.DiskBackedIndexedRowContainer
	// orderOfWindowFnsProcessing lists indices into windowFns such that
	// functions with equal orderings are adjacent.
	orderOfWindowFnsProcessing []int
	// windowFns and builtins are parallel slices: builtins[i] computes
	// windowFns[i].
	windowFns []*windowFunc
	builtins  []tree.WindowFunc

	// populated is set once all window functions have been computed.
	populated bool
	// partitionIdx is the index of the partition whose rows are being emitted.
	partitionIdx int
	// rowsInBucketEmitted is the number of rows already emitted from the
	// partitionIdx'th partition.
	rowsInBucketEmitted int
	// partitionSizes[p] is the number of rows in the p'th partition.
	partitionSizes []int
	// windowValues is indexed as [partition][window function][row].
	windowValues [][][]tree.Datum
	// allRowsIterator iterates over allRowsPartitioned while emitting rows; it
	// is created lazily by populateNextOutputRow.
	allRowsIterator rowcontainer.RowIterator
	// outputRow is the reusable row returned to the consumer.
	outputRow sqlbase.EncDatumRow
}
    90  
// Compile-time checks that *windower satisfies the interfaces it is used as.
var _ execinfra.Processor = &windower{}
var _ execinfra.RowSource = &windower{}
var _ execinfra.OpNode = &windower{}

// windowerProcName is the name passed to StartInternal when the windower
// starts.
const windowerProcName = "windower"
    96  
    97  func newWindower(
    98  	flowCtx *execinfra.FlowCtx,
    99  	processorID int32,
   100  	spec *execinfrapb.WindowerSpec,
   101  	input execinfra.RowSource,
   102  	post *execinfrapb.PostProcessSpec,
   103  	output execinfra.RowReceiver,
   104  ) (*windower, error) {
   105  	w := &windower{
   106  		input: input,
   107  	}
   108  	evalCtx := flowCtx.NewEvalCtx()
   109  	w.inputTypes = input.OutputTypes()
   110  	ctx := evalCtx.Ctx()
   111  
   112  	w.partitionBy = spec.PartitionBy
   113  	windowFns := spec.WindowFns
   114  	w.windowFns = make([]*windowFunc, 0, len(windowFns))
   115  	w.builtins = make([]tree.WindowFunc, 0, len(windowFns))
   116  	// windower passes through all of its input columns and appends an output
   117  	// column for each of window functions it is computing.
   118  	w.outputTypes = make([]*types.T, len(w.inputTypes)+len(windowFns))
   119  	copy(w.outputTypes, w.inputTypes)
   120  	for _, windowFn := range windowFns {
   121  		// Check for out of bounds arguments has been done during planning step.
   122  		argTypes := make([]*types.T, len(windowFn.ArgsIdxs))
   123  		for i, argIdx := range windowFn.ArgsIdxs {
   124  			argTypes[i] = w.inputTypes[argIdx]
   125  		}
   126  		windowConstructor, outputType, err := execinfrapb.GetWindowFunctionInfo(windowFn.Func, argTypes...)
   127  		if err != nil {
   128  			return nil, err
   129  		}
   130  		w.outputTypes[windowFn.OutputColIdx] = outputType
   131  
   132  		w.builtins = append(w.builtins, windowConstructor(evalCtx))
   133  		wf := &windowFunc{
   134  			ordering:     windowFn.Ordering,
   135  			argsIdxs:     windowFn.ArgsIdxs,
   136  			frame:        windowFn.Frame,
   137  			filterColIdx: int(windowFn.FilterColIdx),
   138  			outputColIdx: int(windowFn.OutputColIdx),
   139  		}
   140  
   141  		w.windowFns = append(w.windowFns, wf)
   142  	}
   143  	w.outputRow = make(sqlbase.EncDatumRow, len(w.outputTypes))
   144  
   145  	st := flowCtx.Cfg.Settings
   146  	// Limit the memory use by creating a child monitor with a hard limit.
   147  	// windower will overflow to disk if this limit is not enough.
   148  	limit := flowCtx.Cfg.TestingKnobs.MemoryLimitBytes
   149  	if limit <= 0 {
   150  		limit = execinfra.SettingWorkMemBytes.Get(&st.SV)
   151  		if limit < memRequiredByWindower {
   152  			return nil, errors.Errorf(
   153  				"window functions require %d bytes of RAM but only %d are in the budget. "+
   154  					"Consider increasing sql.distsql.temp_storage.workmem setting",
   155  				memRequiredByWindower, limit)
   156  		}
   157  	} else {
   158  		if flowCtx.Cfg.TestingKnobs.ForceDiskSpill || limit < memRequiredByWindower {
   159  			// The limit is set very low by the tests, but the windower requires
   160  			// some amount of RAM, so we override the limit.
   161  			limit = memRequiredByWindower
   162  		}
   163  	}
   164  	limitedMon := mon.MakeMonitorInheritWithLimit("windower-limited", limit, evalCtx.Mon)
   165  	limitedMon.Start(ctx, evalCtx.Mon, mon.BoundAccount{})
   166  
   167  	if err := w.InitWithEvalCtx(
   168  		w,
   169  		post,
   170  		w.outputTypes,
   171  		flowCtx,
   172  		evalCtx,
   173  		processorID,
   174  		output,
   175  		&limitedMon,
   176  		execinfra.ProcStateOpts{InputsToDrain: []execinfra.RowSource{w.input},
   177  			TrailingMetaCallback: func(context.Context) []execinfrapb.ProducerMetadata {
   178  				w.close()
   179  				return nil
   180  			}},
   181  	); err != nil {
   182  		return nil, err
   183  	}
   184  
   185  	w.diskMonitor = execinfra.NewMonitor(ctx, flowCtx.Cfg.DiskMonitor, "windower-disk")
   186  	w.allRowsPartitioned = rowcontainer.NewHashDiskBackedRowContainer(
   187  		nil, /* memRowContainer */
   188  		evalCtx,
   189  		w.MemMonitor,
   190  		w.diskMonitor,
   191  		flowCtx.Cfg.TempStorage,
   192  	)
   193  	if err := w.allRowsPartitioned.Init(
   194  		ctx,
   195  		false, /* shouldMark */
   196  		w.inputTypes,
   197  		w.partitionBy,
   198  		true, /* encodeNull */
   199  	); err != nil {
   200  		return nil, err
   201  	}
   202  
   203  	w.acc = w.MemMonitor.MakeBoundAccount()
   204  	// If we have aggregate builtins that aggregate a single datum, we want
   205  	// them to reuse the same shared memory account with the windower.
   206  	evalCtx.SingleDatumAggMemAccount = &w.acc
   207  
   208  	if sp := opentracing.SpanFromContext(ctx); sp != nil && tracing.IsRecording(sp) {
   209  		w.input = newInputStatCollector(w.input)
   210  		w.FinishTrace = w.outputStatsToTrace
   211  	}
   212  
   213  	return w, nil
   214  }
   215  
   216  // Start is part of the RowSource interface.
   217  func (w *windower) Start(ctx context.Context) context.Context {
   218  	w.input.Start(ctx)
   219  	ctx = w.StartInternal(ctx, windowerProcName)
   220  	w.cancelChecker = sqlbase.NewCancelChecker(ctx)
   221  	w.runningState = windowerAccumulating
   222  	return ctx
   223  }
   224  
   225  // Next is part of the RowSource interface.
   226  func (w *windower) Next() (sqlbase.EncDatumRow, *execinfrapb.ProducerMetadata) {
   227  	for w.State == execinfra.StateRunning {
   228  		var row sqlbase.EncDatumRow
   229  		var meta *execinfrapb.ProducerMetadata
   230  		switch w.runningState {
   231  		case windowerAccumulating:
   232  			w.runningState, row, meta = w.accumulateRows()
   233  		case windowerEmittingRows:
   234  			w.runningState, row, meta = w.emitRow()
   235  		default:
   236  			log.Fatalf(w.Ctx, "unsupported state: %d", w.runningState)
   237  		}
   238  
   239  		if row == nil && meta == nil {
   240  			continue
   241  		}
   242  		return row, meta
   243  	}
   244  	return nil, w.DrainHelper()
   245  }
   246  
// ConsumerClosed is part of the RowSource interface. It releases the
// windower's resources by delegating to close.
func (w *windower) ConsumerClosed() {
	// The consumer is done, Next() will not be called again.
	w.close()
}
   252  
   253  func (w *windower) close() {
   254  	if w.InternalClose() {
   255  		if w.allRowsIterator != nil {
   256  			w.allRowsIterator.Close()
   257  		}
   258  		w.allRowsPartitioned.Close(w.Ctx)
   259  		if w.partition != nil {
   260  			w.partition.Close(w.Ctx)
   261  		}
   262  		for _, builtin := range w.builtins {
   263  			builtin.Close(w.Ctx, w.EvalCtx)
   264  		}
   265  		w.acc.Close(w.Ctx)
   266  		w.MemMonitor.Stop(w.Ctx)
   267  		w.diskMonitor.Stop(w.Ctx)
   268  	}
   269  }
   270  
   271  // accumulateRows continually reads rows from the input and accumulates them
   272  // in allRowsPartitioned. If it encounters metadata, the metadata is returned
   273  // immediately. Subsequent calls of this function will resume row accumulation.
   274  func (w *windower) accumulateRows() (
   275  	windowerState,
   276  	sqlbase.EncDatumRow,
   277  	*execinfrapb.ProducerMetadata,
   278  ) {
   279  	for {
   280  		row, meta := w.input.Next()
   281  		if meta != nil {
   282  			if meta.Err != nil {
   283  				// We want to send the whole meta (below) rather than just the err,
   284  				// so we pass nil as an argument.
   285  				w.MoveToDraining(nil /* err */)
   286  				return windowerStateUnknown, nil, meta
   287  			}
   288  			return windowerAccumulating, nil, meta
   289  		}
   290  		if row == nil {
   291  			log.VEvent(w.Ctx, 1, "accumulation complete")
   292  			w.inputDone = true
   293  			// We need to sort all the rows based on partitionBy columns so that all
   294  			// rows belonging to the same hash bucket are contiguous.
   295  			w.allRowsPartitioned.Sort(w.Ctx)
   296  			break
   297  		}
   298  
   299  		// The underlying row container will decode all datums as necessary, so we
   300  		// don't need to worry about that.
   301  		if err := w.allRowsPartitioned.AddRow(w.Ctx, row); err != nil {
   302  			w.MoveToDraining(err)
   303  			return windowerStateUnknown, nil, w.DrainHelper()
   304  		}
   305  	}
   306  
   307  	return windowerEmittingRows, nil, nil
   308  }
   309  
   310  // emitRow emits the next row if output rows have already been populated;
   311  // if they haven't, it first computes all window functions over all partitions
   312  // (i.e. populates w.windowValues), and then emits the first row.
   313  //
   314  // emitRow() might move to stateDraining. It might also not return a row if the
   315  // ProcOutputHelper filtered the current row out.
   316  func (w *windower) emitRow() (windowerState, sqlbase.EncDatumRow, *execinfrapb.ProducerMetadata) {
   317  	if w.inputDone {
   318  		for !w.populated {
   319  			if err := w.cancelChecker.Check(); err != nil {
   320  				w.MoveToDraining(err)
   321  				return windowerStateUnknown, nil, w.DrainHelper()
   322  			}
   323  
   324  			if err := w.computeWindowFunctions(w.Ctx, w.EvalCtx); err != nil {
   325  				w.MoveToDraining(err)
   326  				return windowerStateUnknown, nil, w.DrainHelper()
   327  			}
   328  			w.populated = true
   329  		}
   330  
   331  		if rowOutputted, err := w.populateNextOutputRow(); err != nil {
   332  			w.MoveToDraining(err)
   333  			return windowerStateUnknown, nil, nil
   334  		} else if rowOutputted {
   335  			return windowerEmittingRows, w.ProcessRowHelper(w.outputRow), nil
   336  		}
   337  
   338  		w.MoveToDraining(nil /* err */)
   339  		return windowerStateUnknown, nil, nil
   340  	}
   341  
   342  	w.MoveToDraining(errors.Errorf("unexpected: emitRow() is called on a windower before all input rows are accumulated"))
   343  	return windowerStateUnknown, nil, w.DrainHelper()
   344  }
   345  
   346  // spillAllRowsToDisk attempts to first spill w.allRowsPartitioned to disk if
   347  // it's using memory. We choose to not to force w.partition to spill right away
   348  // since it might be resorted multiple times with different orderings, so it's
   349  // better to keep it in memory (if it hasn't spilled on its own). If
   350  // w.allRowsPartitioned is already using disk, we attempt to spill w.partition.
   351  func (w *windower) spillAllRowsToDisk() error {
   352  	if w.allRowsPartitioned != nil {
   353  		if !w.allRowsPartitioned.UsingDisk() {
   354  			if err := w.allRowsPartitioned.SpillToDisk(w.Ctx); err != nil {
   355  				return err
   356  			}
   357  		} else {
   358  			// w.allRowsPartitioned has already been spilled, so we have to spill
   359  			// w.partition if possible.
   360  			if w.partition != nil {
   361  				if !w.partition.UsingDisk() {
   362  					if err := w.partition.SpillToDisk(w.Ctx); err != nil {
   363  						return err
   364  					}
   365  				}
   366  			}
   367  		}
   368  	}
   369  	return nil
   370  }
   371  
   372  // growMemAccount attempts to grow acc by usage, and if it encounters OOM
   373  // error, it forces all rows to spill and attempts to grow acc by usage
   374  // one more time.
   375  func (w *windower) growMemAccount(acc *mon.BoundAccount, usage int64) error {
   376  	if err := acc.Grow(w.Ctx, usage); err != nil {
   377  		if sqlbase.IsOutOfMemoryError(err) {
   378  			if err := w.spillAllRowsToDisk(); err != nil {
   379  				return err
   380  			}
   381  			if err := acc.Grow(w.Ctx, usage); err != nil {
   382  				return err
   383  			}
   384  		} else {
   385  			return err
   386  		}
   387  	}
   388  	return nil
   389  }
   390  
   391  // findOrderOfWindowFnsToProcessIn finds an ordering of window functions such
   392  // that all window functions that have the same ORDER BY clause are computed
   393  // one after another. The order is stored in w.orderOfWindowFnsProcessing.
   394  // This allows for using the same row container without having to resort it
   395  // multiple times.
   396  func (w *windower) findOrderOfWindowFnsToProcessIn() {
   397  	w.orderOfWindowFnsProcessing = make([]int, 0, len(w.windowFns))
   398  	windowFnAdded := make([]bool, len(w.windowFns))
   399  	for i, windowFn := range w.windowFns {
   400  		if !windowFnAdded[i] {
   401  			w.orderOfWindowFnsProcessing = append(w.orderOfWindowFnsProcessing, i)
   402  			windowFnAdded[i] = true
   403  		}
   404  		for j := i + 1; j < len(w.windowFns); j++ {
   405  			if windowFnAdded[j] {
   406  				// j'th windowFn has been already added to orderOfWindowFnsProcessing.
   407  				continue
   408  			}
   409  			if windowFn.ordering.Equal(w.windowFns[j].ordering) {
   410  				w.orderOfWindowFnsProcessing = append(w.orderOfWindowFnsProcessing, j)
   411  				windowFnAdded[j] = true
   412  			}
   413  		}
   414  	}
   415  }
   416  
   417  // processPartition computes all window functions over the given partition and
   418  // puts the result of computations in w.windowValues[partitionIdx]. It computes
   419  // window functions in the order specified in w.orderOfWindowFnsProcessing.
   420  // The same ReorderableRowContainer for partition is reused with changing the
   421  // ordering and being resorted as necessary.
   422  //
   423  // Note: partition must have the ordering as needed by the first window
   424  // function to be processed.
   425  func (w *windower) processPartition(
   426  	ctx context.Context,
   427  	evalCtx *tree.EvalContext,
   428  	partition *rowcontainer.DiskBackedIndexedRowContainer,
   429  	partitionIdx int,
   430  ) error {
   431  	peerGrouper := &partitionPeerGrouper{
   432  		ctx:     ctx,
   433  		evalCtx: evalCtx,
   434  		rowCopy: make(sqlbase.EncDatumRow, len(w.inputTypes)),
   435  	}
   436  	usage := sizeOfSliceOfRows + rowSliceOverhead + sizeOfRow*int64(len(w.windowFns))
   437  	if err := w.growMemAccount(&w.acc, usage); err != nil {
   438  		return err
   439  	}
   440  	w.windowValues = append(w.windowValues, make([][]tree.Datum, len(w.windowFns)))
   441  
   442  	// Partition has ordering as first window function to be processed needs, but
   443  	// we need to sort the partition for the ordering to take effect.
   444  	partition.Sort(ctx)
   445  
   446  	var prevWindowFn *windowFunc
   447  	for _, windowFnIdx := range w.orderOfWindowFnsProcessing {
   448  		windowFn := w.windowFns[windowFnIdx]
   449  
   450  		frameRun := &tree.WindowFrameRun{
   451  			ArgsIdxs:     windowFn.argsIdxs,
   452  			FilterColIdx: windowFn.filterColIdx,
   453  		}
   454  
   455  		if windowFn.frame != nil {
   456  			var err error
   457  			if frameRun.Frame, err = windowFn.frame.ConvertToAST(); err != nil {
   458  				return err
   459  			}
   460  			startBound, endBound := windowFn.frame.Bounds.Start, windowFn.frame.Bounds.End
   461  			if startBound.BoundType == execinfrapb.WindowerSpec_Frame_OFFSET_PRECEDING ||
   462  				startBound.BoundType == execinfrapb.WindowerSpec_Frame_OFFSET_FOLLOWING {
   463  				switch windowFn.frame.Mode {
   464  				case execinfrapb.WindowerSpec_Frame_ROWS:
   465  					frameRun.StartBoundOffset = tree.NewDInt(tree.DInt(int(startBound.IntOffset)))
   466  				case execinfrapb.WindowerSpec_Frame_RANGE:
   467  					datum, rem, err := sqlbase.DecodeTableValue(&w.datumAlloc, startBound.OffsetType.Type, startBound.TypedOffset)
   468  					if err != nil {
   469  						return errors.NewAssertionErrorWithWrappedErrf(err,
   470  							"error decoding %d bytes", errors.Safe(len(startBound.TypedOffset)))
   471  					}
   472  					if len(rem) != 0 {
   473  						return errors.AssertionFailedf(
   474  							"%d trailing bytes in encoded value", errors.Safe(len(rem)))
   475  					}
   476  					frameRun.StartBoundOffset = datum
   477  				case execinfrapb.WindowerSpec_Frame_GROUPS:
   478  					frameRun.StartBoundOffset = tree.NewDInt(tree.DInt(int(startBound.IntOffset)))
   479  				default:
   480  					return errors.AssertionFailedf(
   481  						"unexpected WindowFrameMode: %d", errors.Safe(windowFn.frame.Mode))
   482  				}
   483  			}
   484  			if endBound != nil {
   485  				if endBound.BoundType == execinfrapb.WindowerSpec_Frame_OFFSET_PRECEDING ||
   486  					endBound.BoundType == execinfrapb.WindowerSpec_Frame_OFFSET_FOLLOWING {
   487  					switch windowFn.frame.Mode {
   488  					case execinfrapb.WindowerSpec_Frame_ROWS:
   489  						frameRun.EndBoundOffset = tree.NewDInt(tree.DInt(int(endBound.IntOffset)))
   490  					case execinfrapb.WindowerSpec_Frame_RANGE:
   491  						datum, rem, err := sqlbase.DecodeTableValue(&w.datumAlloc, endBound.OffsetType.Type, endBound.TypedOffset)
   492  						if err != nil {
   493  							return errors.NewAssertionErrorWithWrappedErrf(err,
   494  								"error decoding %d bytes", errors.Safe(len(endBound.TypedOffset)))
   495  						}
   496  						if len(rem) != 0 {
   497  							return errors.AssertionFailedf(
   498  								"%d trailing bytes in encoded value", errors.Safe(len(rem)))
   499  						}
   500  						frameRun.EndBoundOffset = datum
   501  					case execinfrapb.WindowerSpec_Frame_GROUPS:
   502  						frameRun.EndBoundOffset = tree.NewDInt(tree.DInt(int(endBound.IntOffset)))
   503  					default:
   504  						return errors.AssertionFailedf("unexpected WindowFrameMode: %d",
   505  							errors.Safe(windowFn.frame.Mode))
   506  					}
   507  				}
   508  			}
   509  			if frameRun.RangeModeWithOffsets() {
   510  				ordCol := windowFn.ordering.Columns[0]
   511  				frameRun.OrdColIdx = int(ordCol.ColIdx)
   512  				// We need this +1 because encoding.Direction has extra value "_"
   513  				// as zeroth "entry" which its proto equivalent doesn't have.
   514  				frameRun.OrdDirection = encoding.Direction(ordCol.Direction + 1)
   515  
   516  				colTyp := w.inputTypes[ordCol.ColIdx]
   517  				// Type of offset depends on the ordering column's type.
   518  				offsetTyp := colTyp
   519  				if types.IsDateTimeType(colTyp) {
   520  					// For datetime related ordering columns, offset must be an Interval.
   521  					offsetTyp = types.Interval
   522  				}
   523  				plusOp, minusOp, found := tree.WindowFrameRangeOps{}.LookupImpl(colTyp, offsetTyp)
   524  				if !found {
   525  					return pgerror.Newf(pgcode.Windowing,
   526  						"given logical offset cannot be combined with ordering column")
   527  				}
   528  				frameRun.PlusOp, frameRun.MinusOp = plusOp, minusOp
   529  			}
   530  		}
   531  
   532  		builtin := w.builtins[windowFnIdx]
   533  		builtin.Reset(ctx)
   534  
   535  		usage = datumSliceOverhead + sizeOfDatum*int64(partition.Len())
   536  		if err := w.growMemAccount(&w.acc, usage); err != nil {
   537  			return err
   538  		}
   539  		w.windowValues[partitionIdx][windowFnIdx] = make([]tree.Datum, partition.Len())
   540  
   541  		if len(windowFn.ordering.Columns) > 0 {
   542  			// If an ORDER BY clause is provided, we check whether the partition is
   543  			// already sorted as we need (i.e. prevWindowFn has the same ordering),
   544  			// and if it is not, we change the ordering to the needed and resort the
   545  			// container.
   546  			if prevWindowFn != nil && !windowFn.ordering.Equal(prevWindowFn.ordering) {
   547  				if err := partition.Reorder(ctx, execinfrapb.ConvertToColumnOrdering(windowFn.ordering)); err != nil {
   548  					return err
   549  				}
   550  				partition.Sort(ctx)
   551  			}
   552  		}
   553  		peerGrouper.ordering = windowFn.ordering
   554  		peerGrouper.partition = partition
   555  
   556  		frameRun.Rows = partition
   557  		frameRun.RowIdx = 0
   558  
   559  		if !frameRun.Frame.IsDefaultFrame() {
   560  			// We have a custom frame not equivalent to default one, so if we have
   561  			// an aggregate function, we want to reset it for each row. Not resetting
   562  			// is an optimization since we're not computing the result over the whole
   563  			// frame but only as a result of the current row and previous results of
   564  			// aggregation.
   565  			builtins.ShouldReset(builtin)
   566  		}
   567  
   568  		if err := frameRun.PeerHelper.Init(frameRun, peerGrouper); err != nil {
   569  			return err
   570  		}
   571  		frameRun.CurRowPeerGroupNum = 0
   572  
   573  		var prevRes tree.Datum
   574  		for frameRun.RowIdx < partition.Len() {
   575  			// Perform calculations on each row in the current peer group.
   576  			peerGroupEndIdx := frameRun.PeerHelper.GetFirstPeerIdx(frameRun.CurRowPeerGroupNum) + frameRun.PeerHelper.GetRowCount(frameRun.CurRowPeerGroupNum)
   577  			for ; frameRun.RowIdx < peerGroupEndIdx; frameRun.RowIdx++ {
   578  				if err := w.cancelChecker.Check(); err != nil {
   579  					return err
   580  				}
   581  				res, err := builtin.Compute(ctx, evalCtx, frameRun)
   582  				if err != nil {
   583  					return err
   584  				}
   585  				row, err := frameRun.Rows.GetRow(ctx, frameRun.RowIdx)
   586  				if err != nil {
   587  					return err
   588  				}
   589  				if prevRes == nil || prevRes != res {
   590  					// We don't want to double count the same memory, and since the same
   591  					// memory can only be reused contiguously as res, comparing against
   592  					// result of the previous row is sufficient.
   593  					// We have already accounted for the size of a nil datum prior to
   594  					// allocating the slice for window values, so we need to keep that in
   595  					// mind.
   596  					if err := w.growMemAccount(&w.acc, int64(res.Size())-sizeOfDatum); err != nil {
   597  						return err
   598  					}
   599  				}
   600  				w.windowValues[partitionIdx][windowFnIdx][row.GetIdx()] = res
   601  				prevRes = res
   602  			}
   603  			if err := frameRun.PeerHelper.Update(frameRun); err != nil {
   604  				return err
   605  			}
   606  			frameRun.CurRowPeerGroupNum++
   607  		}
   608  
   609  		prevWindowFn = windowFn
   610  	}
   611  
   612  	if err := w.growMemAccount(&w.acc, sizeOfInt); err != nil {
   613  		return err
   614  	}
   615  	w.partitionSizes = append(w.partitionSizes, w.partition.Len())
   616  	return nil
   617  }
   618  
   619  // computeWindowFunctions computes all window functions over all partitions.
   620  // Partitions are processed one at a time with the underlying row container
   621  // reused (and reordered if needed).
   622  func (w *windower) computeWindowFunctions(ctx context.Context, evalCtx *tree.EvalContext) error {
   623  	w.findOrderOfWindowFnsToProcessIn()
   624  
   625  	// We don't know how many partitions there are, so we'll be accounting for
   626  	// this memory right before every append to these slices.
   627  	usage := sliceOfIntsOverhead + sliceOfRowsSliceOverhead
   628  	if err := w.growMemAccount(&w.acc, usage); err != nil {
   629  		return err
   630  	}
   631  	w.partitionSizes = make([]int, 0, 8)
   632  	w.windowValues = make([][][]tree.Datum, 0, 8)
   633  	bucket := ""
   634  
   635  	// w.partition will have ordering as needed by the first window function to
   636  	// be processed.
   637  	ordering := execinfrapb.ConvertToColumnOrdering(w.windowFns[w.orderOfWindowFnsProcessing[0]].ordering)
   638  	w.partition = rowcontainer.NewDiskBackedIndexedRowContainer(
   639  		ordering,
   640  		w.inputTypes,
   641  		w.EvalCtx,
   642  		w.FlowCtx.Cfg.TempStorage,
   643  		w.MemMonitor,
   644  		w.diskMonitor,
   645  		0, /* rowCapacity */
   646  	)
   647  	i, err := w.allRowsPartitioned.NewAllRowsIterator(ctx)
   648  	if err != nil {
   649  		return err
   650  	}
   651  	defer i.Close()
   652  
   653  	// We iterate over all the rows and add them to w.partition one by one. When
   654  	// a row from a different partition is encountered, we process the partition
   655  	// and reset w.partition for reusing.
   656  	for i.Rewind(); ; i.Next() {
   657  		if ok, err := i.Valid(); err != nil {
   658  			return err
   659  		} else if !ok {
   660  			break
   661  		}
   662  		row, err := i.Row()
   663  		if err != nil {
   664  			return err
   665  		}
   666  		if err := w.cancelChecker.Check(); err != nil {
   667  			return err
   668  		}
   669  		if len(w.partitionBy) > 0 {
   670  			// We need to hash the row according to partitionBy
   671  			// to figure out which partition the row belongs to.
   672  			w.scratch = w.scratch[:0]
   673  			for _, col := range w.partitionBy {
   674  				if int(col) >= len(row) {
   675  					return errors.AssertionFailedf(
   676  						"hash column %d, row with only %d columns", errors.Safe(col), errors.Safe(len(row)))
   677  				}
   678  				var err error
   679  				w.scratch, err = row[int(col)].Fingerprint(w.inputTypes[int(col)], &w.datumAlloc, w.scratch)
   680  				if err != nil {
   681  					return err
   682  				}
   683  			}
   684  			if string(w.scratch) != bucket {
   685  				// Current row is from the new bucket, so we "finalize" the previous
   686  				// bucket (if current row is not the first row among all rows in
   687  				// allRowsPartitioned). We then process this partition, reset the
   688  				// container for reuse by the next partition.
   689  				if bucket != "" {
   690  					if err := w.processPartition(ctx, evalCtx, w.partition, len(w.partitionSizes)); err != nil {
   691  						return err
   692  					}
   693  				}
   694  				bucket = string(w.scratch)
   695  				if err := w.partition.UnsafeReset(ctx); err != nil {
   696  					return err
   697  				}
   698  				if !w.windowFns[w.orderOfWindowFnsProcessing[0]].ordering.Equal(w.windowFns[w.orderOfWindowFnsProcessing[len(w.windowFns)-1]].ordering) {
   699  					// The container no longer has the ordering as needed by the first
   700  					// window function to be processed, so we need to change it.
   701  					if err = w.partition.Reorder(ctx, ordering); err != nil {
   702  						return err
   703  					}
   704  				}
   705  			}
   706  		}
   707  		if err := w.partition.AddRow(w.Ctx, row); err != nil {
   708  			return err
   709  		}
   710  	}
   711  	return w.processPartition(ctx, evalCtx, w.partition, len(w.partitionSizes))
   712  }
   713  
   714  // populateNextOutputRow populates next output row to be returned. All input
   715  // columns are passed through, and the results of window functions'
   716  // computations are put in the desired columns (i.e. in outputColIdx of each
   717  // window function).
   718  func (w *windower) populateNextOutputRow() (bool, error) {
   719  	if w.partitionIdx < len(w.partitionSizes) {
   720  		if w.allRowsIterator == nil {
   721  			w.allRowsIterator = w.allRowsPartitioned.NewUnmarkedIterator(w.Ctx)
   722  			w.allRowsIterator.Rewind()
   723  		}
   724  		// rowIdx is the index of the next row to be emitted from the
   725  		// partitionIdx'th partition.
   726  		rowIdx := w.rowsInBucketEmitted
   727  		if ok, err := w.allRowsIterator.Valid(); err != nil {
   728  			return false, err
   729  		} else if !ok {
   730  			return false, nil
   731  		}
   732  		inputRow, err := w.allRowsIterator.Row()
   733  		w.allRowsIterator.Next()
   734  		if err != nil {
   735  			return false, err
   736  		}
   737  		copy(w.outputRow, inputRow[:len(w.inputTypes)])
   738  		for windowFnIdx, windowFn := range w.windowFns {
   739  			windowFnRes := w.windowValues[w.partitionIdx][windowFnIdx][rowIdx]
   740  			encWindowFnRes := sqlbase.DatumToEncDatum(w.outputTypes[windowFn.outputColIdx], windowFnRes)
   741  			w.outputRow[windowFn.outputColIdx] = encWindowFnRes
   742  		}
   743  		w.rowsInBucketEmitted++
   744  		if w.rowsInBucketEmitted == w.partitionSizes[w.partitionIdx] {
   745  			// We have emitted all rows from the current bucket, so we advance the
   746  			// iterator.
   747  			w.partitionIdx++
   748  			w.rowsInBucketEmitted = 0
   749  		}
   750  		return true, nil
   751  
   752  	}
   753  	return false, nil
   754  }
   755  
// windowFunc holds the per-function settings the windower copies out of the
// window function specs it is given.
type windowFunc struct {
	// ordering is the ORDER BY of the window function.
	ordering execinfrapb.Ordering
	// argsIdxs are the indices of the input columns used as arguments.
	argsIdxs []uint32
	// frame is the window frame spec; nil when the function has none.
	frame *execinfrapb.WindowerSpec_Frame
	// filterColIdx is passed to the frame run as its FilterColIdx.
	filterColIdx int
	// outputColIdx is the index of the output column receiving the result.
	outputColIdx int
}
   763  
// partitionPeerGrouper decides whether two rows of a partition are peers,
// i.e. compare equal on all ordering columns (see InSameGroup).
type partitionPeerGrouper struct {
	// ctx is passed to partition.GetRow when rows are fetched.
	ctx       context.Context
	// evalCtx is passed to Datum.Compare when ordering columns are compared.
	evalCtx   *tree.EvalContext
	// partition is the container from which the rows being compared are read.
	partition *rowcontainer.DiskBackedIndexedRowContainer
	// ordering lists the columns on which rows are compared. If it has no
	// columns, all rows are considered peers.
	ordering  execinfrapb.Ordering
	// rowCopy is scratch space into which the first row of a comparison is
	// copied before the second row is fetched.
	rowCopy   sqlbase.EncDatumRow
	// err remembers the first error hit by InSameGroup; once set, later calls
	// that reach the error check return it without doing further work.
	err       error
}
   772  
   773  func (n *partitionPeerGrouper) InSameGroup(i, j int) (bool, error) {
   774  	if len(n.ordering.Columns) == 0 {
   775  		// ORDER BY clause is omitted, so all rows are peers.
   776  		return true, nil
   777  	}
   778  	if n.err != nil {
   779  		return false, n.err
   780  	}
   781  	indexedRow, err := n.partition.GetRow(n.ctx, i)
   782  	if err != nil {
   783  		n.err = err
   784  		return false, err
   785  	}
   786  	row := indexedRow.(rowcontainer.IndexedRow)
   787  	// We need to copy the row explicitly since n.partition might be reusing
   788  	// the underlying memory when GetRow() is called.
   789  	copy(n.rowCopy, row.Row)
   790  	rb, err := n.partition.GetRow(n.ctx, j)
   791  	if err != nil {
   792  		n.err = err
   793  		return false, n.err
   794  	}
   795  	for _, o := range n.ordering.Columns {
   796  		da := n.rowCopy[o.ColIdx].Datum
   797  		db, err := rb.GetDatum(int(o.ColIdx))
   798  		if err != nil {
   799  			n.err = err
   800  			return false, n.err
   801  		}
   802  		if c := da.Compare(n.evalCtx, db); c != 0 {
   803  			if o.Direction != execinfrapb.Ordering_Column_ASC {
   804  				return false, nil
   805  			}
   806  			return false, nil
   807  		}
   808  	}
   809  	return true, nil
   810  }
   811  
   812  const sizeOfInt = int64(unsafe.Sizeof(int(0)))
   813  const sliceOfIntsOverhead = int64(unsafe.Sizeof([]int{}))
   814  const sizeOfSliceOfRows = int64(unsafe.Sizeof([][]tree.Datum{}))
   815  const sliceOfRowsSliceOverhead = int64(unsafe.Sizeof([][][]tree.Datum{}))
   816  const sizeOfRow = int64(unsafe.Sizeof([]tree.Datum{}))
   817  const rowSliceOverhead = int64(unsafe.Sizeof([][]tree.Datum{}))
   818  const sizeOfDatum = int64(unsafe.Sizeof(tree.Datum(nil)))
   819  const datumSliceOverhead = int64(unsafe.Sizeof([]tree.Datum(nil)))
   820  
   821  // CreateWindowerSpecFunc creates a WindowerSpec_Func based on the function
   822  // name or returns an error if unknown function name is provided.
   823  func CreateWindowerSpecFunc(funcStr string) (execinfrapb.WindowerSpec_Func, error) {
   824  	if aggBuiltin, ok := execinfrapb.AggregatorSpec_Func_value[funcStr]; ok {
   825  		aggSpec := execinfrapb.AggregatorSpec_Func(aggBuiltin)
   826  		return execinfrapb.WindowerSpec_Func{AggregateFunc: &aggSpec}, nil
   827  	} else if winBuiltin, ok := execinfrapb.WindowerSpec_WindowFunc_value[funcStr]; ok {
   828  		winSpec := execinfrapb.WindowerSpec_WindowFunc(winBuiltin)
   829  		return execinfrapb.WindowerSpec_Func{WindowFunc: &winSpec}, nil
   830  	} else {
   831  		return execinfrapb.WindowerSpec_Func{}, errors.Errorf("unknown aggregate/window function %s", funcStr)
   832  	}
   833  }
   834  
// Compile-time assertion that *WindowerStats implements
// execinfrapb.DistSQLSpanStats.
var _ execinfrapb.DistSQLSpanStats = &WindowerStats{}

// windowerTagPrefix is the prefix used for the stat tag names produced by
// WindowerStats.Stats.
const windowerTagPrefix = "windower."
   838  
   839  // Stats implements the SpanStats interface.
   840  func (ws *WindowerStats) Stats() map[string]string {
   841  	inputStatsMap := ws.InputStats.Stats(windowerTagPrefix)
   842  	inputStatsMap[windowerTagPrefix+MaxMemoryTagSuffix] = humanizeutil.IBytes(ws.MaxAllocatedMem)
   843  	inputStatsMap[windowerTagPrefix+MaxDiskTagSuffix] = humanizeutil.IBytes(ws.MaxAllocatedDisk)
   844  	return inputStatsMap
   845  }
   846  
   847  // StatsForQueryPlan implements the DistSQLSpanStats interface.
   848  func (ws *WindowerStats) StatsForQueryPlan() []string {
   849  	stats := ws.InputStats.StatsForQueryPlan("" /* prefix */)
   850  
   851  	if ws.MaxAllocatedMem != 0 {
   852  		stats = append(stats,
   853  			fmt.Sprintf("%s: %s", MaxMemoryQueryPlanSuffix, humanizeutil.IBytes(ws.MaxAllocatedMem)))
   854  	}
   855  
   856  	if ws.MaxAllocatedDisk != 0 {
   857  		stats = append(stats,
   858  			fmt.Sprintf("%s: %s", MaxDiskQueryPlanSuffix, humanizeutil.IBytes(ws.MaxAllocatedDisk)))
   859  	}
   860  
   861  	return stats
   862  }
   863  
   864  func (w *windower) outputStatsToTrace() {
   865  	is, ok := getInputStats(w.FlowCtx, w.input)
   866  	if !ok {
   867  		return
   868  	}
   869  	if sp := opentracing.SpanFromContext(w.Ctx); sp != nil {
   870  		tracing.SetSpanStats(
   871  			sp,
   872  			&WindowerStats{
   873  				InputStats:       is,
   874  				MaxAllocatedMem:  w.MemMonitor.MaximumBytes(),
   875  				MaxAllocatedDisk: w.diskMonitor.MaximumBytes(),
   876  			},
   877  		)
   878  	}
   879  }
   880  
   881  // ChildCount is part of the execinfra.OpNode interface.
   882  func (w *windower) ChildCount(verbose bool) int {
   883  	if _, ok := w.input.(execinfra.OpNode); ok {
   884  		return 1
   885  	}
   886  	return 0
   887  }
   888  
   889  // Child is part of the execinfra.OpNode interface.
   890  func (w *windower) Child(nth int, verbose bool) execinfra.OpNode {
   891  	if nth == 0 {
   892  		if n, ok := w.input.(execinfra.OpNode); ok {
   893  			return n
   894  		}
   895  		panic("input to windower is not an execinfra.OpNode")
   896  	}
   897  	panic(fmt.Sprintf("invalid index %d", nth))
   898  }