// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package rowexec

import (
	"context"
	"fmt"
	"unsafe"

	"github.com/cockroachdb/cockroach/pkg/sql/execinfra"
	"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
	"github.com/cockroachdb/cockroach/pkg/sql/rowcontainer"
	"github.com/cockroachdb/cockroach/pkg/sql/sem/builtins"
	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
	"github.com/cockroachdb/cockroach/pkg/sql/types"
	"github.com/cockroachdb/cockroach/pkg/util/encoding"
	"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/mon"
	"github.com/cockroachdb/cockroach/pkg/util/tracing"
	"github.com/cockroachdb/errors"
	"github.com/opentracing/opentracing-go"
)

// windowerState represents the state of the processor.
type windowerState int

const (
	// windowerStateUnknown is the zero value. The state helpers return it
	// whenever they call MoveToDraining.
	windowerStateUnknown windowerState = iota
	// windowerAccumulating means that rows are being read from the input
	// and accumulated in allRowsPartitioned.
	windowerAccumulating
	// windowerEmittingRows means that all rows have been read and
	// output rows are being emitted.
	windowerEmittingRows
)

// memRequiredByWindower indicates the minimum amount of RAM (in bytes) that
// the windower needs.
51 const memRequiredByWindower = 100 * 1024 52 53 // windower is the processor that performs computation of window functions 54 // that have the same PARTITION BY clause. It passes through all of its input 55 // columns and puts the output of a window function windowFn at 56 // windowFn.outputColIdx. 57 type windower struct { 58 execinfra.ProcessorBase 59 60 // runningState represents the state of the windower. This is in addition to 61 // ProcessorBase.State - the runningState is only relevant when 62 // ProcessorBase.State == StateRunning. 63 runningState windowerState 64 input execinfra.RowSource 65 inputDone bool 66 inputTypes []*types.T 67 outputTypes []*types.T 68 datumAlloc sqlbase.DatumAlloc 69 acc mon.BoundAccount 70 diskMonitor *mon.BytesMonitor 71 72 scratch []byte 73 cancelChecker *sqlbase.CancelChecker 74 75 partitionBy []uint32 76 allRowsPartitioned *rowcontainer.HashDiskBackedRowContainer 77 partition *rowcontainer.DiskBackedIndexedRowContainer 78 orderOfWindowFnsProcessing []int 79 windowFns []*windowFunc 80 builtins []tree.WindowFunc 81 82 populated bool 83 partitionIdx int 84 rowsInBucketEmitted int 85 partitionSizes []int 86 windowValues [][][]tree.Datum 87 allRowsIterator rowcontainer.RowIterator 88 outputRow sqlbase.EncDatumRow 89 } 90 91 var _ execinfra.Processor = &windower{} 92 var _ execinfra.RowSource = &windower{} 93 var _ execinfra.OpNode = &windower{} 94 95 const windowerProcName = "windower" 96 97 func newWindower( 98 flowCtx *execinfra.FlowCtx, 99 processorID int32, 100 spec *execinfrapb.WindowerSpec, 101 input execinfra.RowSource, 102 post *execinfrapb.PostProcessSpec, 103 output execinfra.RowReceiver, 104 ) (*windower, error) { 105 w := &windower{ 106 input: input, 107 } 108 evalCtx := flowCtx.NewEvalCtx() 109 w.inputTypes = input.OutputTypes() 110 ctx := evalCtx.Ctx() 111 112 w.partitionBy = spec.PartitionBy 113 windowFns := spec.WindowFns 114 w.windowFns = make([]*windowFunc, 0, len(windowFns)) 115 w.builtins = 
make([]tree.WindowFunc, 0, len(windowFns)) 116 // windower passes through all of its input columns and appends an output 117 // column for each of window functions it is computing. 118 w.outputTypes = make([]*types.T, len(w.inputTypes)+len(windowFns)) 119 copy(w.outputTypes, w.inputTypes) 120 for _, windowFn := range windowFns { 121 // Check for out of bounds arguments has been done during planning step. 122 argTypes := make([]*types.T, len(windowFn.ArgsIdxs)) 123 for i, argIdx := range windowFn.ArgsIdxs { 124 argTypes[i] = w.inputTypes[argIdx] 125 } 126 windowConstructor, outputType, err := execinfrapb.GetWindowFunctionInfo(windowFn.Func, argTypes...) 127 if err != nil { 128 return nil, err 129 } 130 w.outputTypes[windowFn.OutputColIdx] = outputType 131 132 w.builtins = append(w.builtins, windowConstructor(evalCtx)) 133 wf := &windowFunc{ 134 ordering: windowFn.Ordering, 135 argsIdxs: windowFn.ArgsIdxs, 136 frame: windowFn.Frame, 137 filterColIdx: int(windowFn.FilterColIdx), 138 outputColIdx: int(windowFn.OutputColIdx), 139 } 140 141 w.windowFns = append(w.windowFns, wf) 142 } 143 w.outputRow = make(sqlbase.EncDatumRow, len(w.outputTypes)) 144 145 st := flowCtx.Cfg.Settings 146 // Limit the memory use by creating a child monitor with a hard limit. 147 // windower will overflow to disk if this limit is not enough. 148 limit := flowCtx.Cfg.TestingKnobs.MemoryLimitBytes 149 if limit <= 0 { 150 limit = execinfra.SettingWorkMemBytes.Get(&st.SV) 151 if limit < memRequiredByWindower { 152 return nil, errors.Errorf( 153 "window functions require %d bytes of RAM but only %d are in the budget. "+ 154 "Consider increasing sql.distsql.temp_storage.workmem setting", 155 memRequiredByWindower, limit) 156 } 157 } else { 158 if flowCtx.Cfg.TestingKnobs.ForceDiskSpill || limit < memRequiredByWindower { 159 // The limit is set very low by the tests, but the windower requires 160 // some amount of RAM, so we override the limit. 
161 limit = memRequiredByWindower 162 } 163 } 164 limitedMon := mon.MakeMonitorInheritWithLimit("windower-limited", limit, evalCtx.Mon) 165 limitedMon.Start(ctx, evalCtx.Mon, mon.BoundAccount{}) 166 167 if err := w.InitWithEvalCtx( 168 w, 169 post, 170 w.outputTypes, 171 flowCtx, 172 evalCtx, 173 processorID, 174 output, 175 &limitedMon, 176 execinfra.ProcStateOpts{InputsToDrain: []execinfra.RowSource{w.input}, 177 TrailingMetaCallback: func(context.Context) []execinfrapb.ProducerMetadata { 178 w.close() 179 return nil 180 }}, 181 ); err != nil { 182 return nil, err 183 } 184 185 w.diskMonitor = execinfra.NewMonitor(ctx, flowCtx.Cfg.DiskMonitor, "windower-disk") 186 w.allRowsPartitioned = rowcontainer.NewHashDiskBackedRowContainer( 187 nil, /* memRowContainer */ 188 evalCtx, 189 w.MemMonitor, 190 w.diskMonitor, 191 flowCtx.Cfg.TempStorage, 192 ) 193 if err := w.allRowsPartitioned.Init( 194 ctx, 195 false, /* shouldMark */ 196 w.inputTypes, 197 w.partitionBy, 198 true, /* encodeNull */ 199 ); err != nil { 200 return nil, err 201 } 202 203 w.acc = w.MemMonitor.MakeBoundAccount() 204 // If we have aggregate builtins that aggregate a single datum, we want 205 // them to reuse the same shared memory account with the windower. 206 evalCtx.SingleDatumAggMemAccount = &w.acc 207 208 if sp := opentracing.SpanFromContext(ctx); sp != nil && tracing.IsRecording(sp) { 209 w.input = newInputStatCollector(w.input) 210 w.FinishTrace = w.outputStatsToTrace 211 } 212 213 return w, nil 214 } 215 216 // Start is part of the RowSource interface. 217 func (w *windower) Start(ctx context.Context) context.Context { 218 w.input.Start(ctx) 219 ctx = w.StartInternal(ctx, windowerProcName) 220 w.cancelChecker = sqlbase.NewCancelChecker(ctx) 221 w.runningState = windowerAccumulating 222 return ctx 223 } 224 225 // Next is part of the RowSource interface. 
226 func (w *windower) Next() (sqlbase.EncDatumRow, *execinfrapb.ProducerMetadata) { 227 for w.State == execinfra.StateRunning { 228 var row sqlbase.EncDatumRow 229 var meta *execinfrapb.ProducerMetadata 230 switch w.runningState { 231 case windowerAccumulating: 232 w.runningState, row, meta = w.accumulateRows() 233 case windowerEmittingRows: 234 w.runningState, row, meta = w.emitRow() 235 default: 236 log.Fatalf(w.Ctx, "unsupported state: %d", w.runningState) 237 } 238 239 if row == nil && meta == nil { 240 continue 241 } 242 return row, meta 243 } 244 return nil, w.DrainHelper() 245 } 246 247 // ConsumerClosed is part of the RowSource interface. 248 func (w *windower) ConsumerClosed() { 249 // The consumer is done, Next() will not be called again. 250 w.close() 251 } 252 253 func (w *windower) close() { 254 if w.InternalClose() { 255 if w.allRowsIterator != nil { 256 w.allRowsIterator.Close() 257 } 258 w.allRowsPartitioned.Close(w.Ctx) 259 if w.partition != nil { 260 w.partition.Close(w.Ctx) 261 } 262 for _, builtin := range w.builtins { 263 builtin.Close(w.Ctx, w.EvalCtx) 264 } 265 w.acc.Close(w.Ctx) 266 w.MemMonitor.Stop(w.Ctx) 267 w.diskMonitor.Stop(w.Ctx) 268 } 269 } 270 271 // accumulateRows continually reads rows from the input and accumulates them 272 // in allRowsPartitioned. If it encounters metadata, the metadata is returned 273 // immediately. Subsequent calls of this function will resume row accumulation. 274 func (w *windower) accumulateRows() ( 275 windowerState, 276 sqlbase.EncDatumRow, 277 *execinfrapb.ProducerMetadata, 278 ) { 279 for { 280 row, meta := w.input.Next() 281 if meta != nil { 282 if meta.Err != nil { 283 // We want to send the whole meta (below) rather than just the err, 284 // so we pass nil as an argument. 
285 w.MoveToDraining(nil /* err */) 286 return windowerStateUnknown, nil, meta 287 } 288 return windowerAccumulating, nil, meta 289 } 290 if row == nil { 291 log.VEvent(w.Ctx, 1, "accumulation complete") 292 w.inputDone = true 293 // We need to sort all the rows based on partitionBy columns so that all 294 // rows belonging to the same hash bucket are contiguous. 295 w.allRowsPartitioned.Sort(w.Ctx) 296 break 297 } 298 299 // The underlying row container will decode all datums as necessary, so we 300 // don't need to worry about that. 301 if err := w.allRowsPartitioned.AddRow(w.Ctx, row); err != nil { 302 w.MoveToDraining(err) 303 return windowerStateUnknown, nil, w.DrainHelper() 304 } 305 } 306 307 return windowerEmittingRows, nil, nil 308 } 309 310 // emitRow emits the next row if output rows have already been populated; 311 // if they haven't, it first computes all window functions over all partitions 312 // (i.e. populates w.windowValues), and then emits the first row. 313 // 314 // emitRow() might move to stateDraining. It might also not return a row if the 315 // ProcOutputHelper filtered the current row out. 
316 func (w *windower) emitRow() (windowerState, sqlbase.EncDatumRow, *execinfrapb.ProducerMetadata) { 317 if w.inputDone { 318 for !w.populated { 319 if err := w.cancelChecker.Check(); err != nil { 320 w.MoveToDraining(err) 321 return windowerStateUnknown, nil, w.DrainHelper() 322 } 323 324 if err := w.computeWindowFunctions(w.Ctx, w.EvalCtx); err != nil { 325 w.MoveToDraining(err) 326 return windowerStateUnknown, nil, w.DrainHelper() 327 } 328 w.populated = true 329 } 330 331 if rowOutputted, err := w.populateNextOutputRow(); err != nil { 332 w.MoveToDraining(err) 333 return windowerStateUnknown, nil, nil 334 } else if rowOutputted { 335 return windowerEmittingRows, w.ProcessRowHelper(w.outputRow), nil 336 } 337 338 w.MoveToDraining(nil /* err */) 339 return windowerStateUnknown, nil, nil 340 } 341 342 w.MoveToDraining(errors.Errorf("unexpected: emitRow() is called on a windower before all input rows are accumulated")) 343 return windowerStateUnknown, nil, w.DrainHelper() 344 } 345 346 // spillAllRowsToDisk attempts to first spill w.allRowsPartitioned to disk if 347 // it's using memory. We choose to not to force w.partition to spill right away 348 // since it might be resorted multiple times with different orderings, so it's 349 // better to keep it in memory (if it hasn't spilled on its own). If 350 // w.allRowsPartitioned is already using disk, we attempt to spill w.partition. 351 func (w *windower) spillAllRowsToDisk() error { 352 if w.allRowsPartitioned != nil { 353 if !w.allRowsPartitioned.UsingDisk() { 354 if err := w.allRowsPartitioned.SpillToDisk(w.Ctx); err != nil { 355 return err 356 } 357 } else { 358 // w.allRowsPartitioned has already been spilled, so we have to spill 359 // w.partition if possible. 
360 if w.partition != nil { 361 if !w.partition.UsingDisk() { 362 if err := w.partition.SpillToDisk(w.Ctx); err != nil { 363 return err 364 } 365 } 366 } 367 } 368 } 369 return nil 370 } 371 372 // growMemAccount attempts to grow acc by usage, and if it encounters OOM 373 // error, it forces all rows to spill and attempts to grow acc by usage 374 // one more time. 375 func (w *windower) growMemAccount(acc *mon.BoundAccount, usage int64) error { 376 if err := acc.Grow(w.Ctx, usage); err != nil { 377 if sqlbase.IsOutOfMemoryError(err) { 378 if err := w.spillAllRowsToDisk(); err != nil { 379 return err 380 } 381 if err := acc.Grow(w.Ctx, usage); err != nil { 382 return err 383 } 384 } else { 385 return err 386 } 387 } 388 return nil 389 } 390 391 // findOrderOfWindowFnsToProcessIn finds an ordering of window functions such 392 // that all window functions that have the same ORDER BY clause are computed 393 // one after another. The order is stored in w.orderOfWindowFnsProcessing. 394 // This allows for using the same row container without having to resort it 395 // multiple times. 396 func (w *windower) findOrderOfWindowFnsToProcessIn() { 397 w.orderOfWindowFnsProcessing = make([]int, 0, len(w.windowFns)) 398 windowFnAdded := make([]bool, len(w.windowFns)) 399 for i, windowFn := range w.windowFns { 400 if !windowFnAdded[i] { 401 w.orderOfWindowFnsProcessing = append(w.orderOfWindowFnsProcessing, i) 402 windowFnAdded[i] = true 403 } 404 for j := i + 1; j < len(w.windowFns); j++ { 405 if windowFnAdded[j] { 406 // j'th windowFn has been already added to orderOfWindowFnsProcessing. 407 continue 408 } 409 if windowFn.ordering.Equal(w.windowFns[j].ordering) { 410 w.orderOfWindowFnsProcessing = append(w.orderOfWindowFnsProcessing, j) 411 windowFnAdded[j] = true 412 } 413 } 414 } 415 } 416 417 // processPartition computes all window functions over the given partition and 418 // puts the result of computations in w.windowValues[partitionIdx]. 
It computes 419 // window functions in the order specified in w.orderOfWindowFnsProcessing. 420 // The same ReorderableRowContainer for partition is reused with changing the 421 // ordering and being resorted as necessary. 422 // 423 // Note: partition must have the ordering as needed by the first window 424 // function to be processed. 425 func (w *windower) processPartition( 426 ctx context.Context, 427 evalCtx *tree.EvalContext, 428 partition *rowcontainer.DiskBackedIndexedRowContainer, 429 partitionIdx int, 430 ) error { 431 peerGrouper := &partitionPeerGrouper{ 432 ctx: ctx, 433 evalCtx: evalCtx, 434 rowCopy: make(sqlbase.EncDatumRow, len(w.inputTypes)), 435 } 436 usage := sizeOfSliceOfRows + rowSliceOverhead + sizeOfRow*int64(len(w.windowFns)) 437 if err := w.growMemAccount(&w.acc, usage); err != nil { 438 return err 439 } 440 w.windowValues = append(w.windowValues, make([][]tree.Datum, len(w.windowFns))) 441 442 // Partition has ordering as first window function to be processed needs, but 443 // we need to sort the partition for the ordering to take effect. 
444 partition.Sort(ctx) 445 446 var prevWindowFn *windowFunc 447 for _, windowFnIdx := range w.orderOfWindowFnsProcessing { 448 windowFn := w.windowFns[windowFnIdx] 449 450 frameRun := &tree.WindowFrameRun{ 451 ArgsIdxs: windowFn.argsIdxs, 452 FilterColIdx: windowFn.filterColIdx, 453 } 454 455 if windowFn.frame != nil { 456 var err error 457 if frameRun.Frame, err = windowFn.frame.ConvertToAST(); err != nil { 458 return err 459 } 460 startBound, endBound := windowFn.frame.Bounds.Start, windowFn.frame.Bounds.End 461 if startBound.BoundType == execinfrapb.WindowerSpec_Frame_OFFSET_PRECEDING || 462 startBound.BoundType == execinfrapb.WindowerSpec_Frame_OFFSET_FOLLOWING { 463 switch windowFn.frame.Mode { 464 case execinfrapb.WindowerSpec_Frame_ROWS: 465 frameRun.StartBoundOffset = tree.NewDInt(tree.DInt(int(startBound.IntOffset))) 466 case execinfrapb.WindowerSpec_Frame_RANGE: 467 datum, rem, err := sqlbase.DecodeTableValue(&w.datumAlloc, startBound.OffsetType.Type, startBound.TypedOffset) 468 if err != nil { 469 return errors.NewAssertionErrorWithWrappedErrf(err, 470 "error decoding %d bytes", errors.Safe(len(startBound.TypedOffset))) 471 } 472 if len(rem) != 0 { 473 return errors.AssertionFailedf( 474 "%d trailing bytes in encoded value", errors.Safe(len(rem))) 475 } 476 frameRun.StartBoundOffset = datum 477 case execinfrapb.WindowerSpec_Frame_GROUPS: 478 frameRun.StartBoundOffset = tree.NewDInt(tree.DInt(int(startBound.IntOffset))) 479 default: 480 return errors.AssertionFailedf( 481 "unexpected WindowFrameMode: %d", errors.Safe(windowFn.frame.Mode)) 482 } 483 } 484 if endBound != nil { 485 if endBound.BoundType == execinfrapb.WindowerSpec_Frame_OFFSET_PRECEDING || 486 endBound.BoundType == execinfrapb.WindowerSpec_Frame_OFFSET_FOLLOWING { 487 switch windowFn.frame.Mode { 488 case execinfrapb.WindowerSpec_Frame_ROWS: 489 frameRun.EndBoundOffset = tree.NewDInt(tree.DInt(int(endBound.IntOffset))) 490 case execinfrapb.WindowerSpec_Frame_RANGE: 491 datum, rem, err := 
sqlbase.DecodeTableValue(&w.datumAlloc, endBound.OffsetType.Type, endBound.TypedOffset) 492 if err != nil { 493 return errors.NewAssertionErrorWithWrappedErrf(err, 494 "error decoding %d bytes", errors.Safe(len(endBound.TypedOffset))) 495 } 496 if len(rem) != 0 { 497 return errors.AssertionFailedf( 498 "%d trailing bytes in encoded value", errors.Safe(len(rem))) 499 } 500 frameRun.EndBoundOffset = datum 501 case execinfrapb.WindowerSpec_Frame_GROUPS: 502 frameRun.EndBoundOffset = tree.NewDInt(tree.DInt(int(endBound.IntOffset))) 503 default: 504 return errors.AssertionFailedf("unexpected WindowFrameMode: %d", 505 errors.Safe(windowFn.frame.Mode)) 506 } 507 } 508 } 509 if frameRun.RangeModeWithOffsets() { 510 ordCol := windowFn.ordering.Columns[0] 511 frameRun.OrdColIdx = int(ordCol.ColIdx) 512 // We need this +1 because encoding.Direction has extra value "_" 513 // as zeroth "entry" which its proto equivalent doesn't have. 514 frameRun.OrdDirection = encoding.Direction(ordCol.Direction + 1) 515 516 colTyp := w.inputTypes[ordCol.ColIdx] 517 // Type of offset depends on the ordering column's type. 518 offsetTyp := colTyp 519 if types.IsDateTimeType(colTyp) { 520 // For datetime related ordering columns, offset must be an Interval. 
521 offsetTyp = types.Interval 522 } 523 plusOp, minusOp, found := tree.WindowFrameRangeOps{}.LookupImpl(colTyp, offsetTyp) 524 if !found { 525 return pgerror.Newf(pgcode.Windowing, 526 "given logical offset cannot be combined with ordering column") 527 } 528 frameRun.PlusOp, frameRun.MinusOp = plusOp, minusOp 529 } 530 } 531 532 builtin := w.builtins[windowFnIdx] 533 builtin.Reset(ctx) 534 535 usage = datumSliceOverhead + sizeOfDatum*int64(partition.Len()) 536 if err := w.growMemAccount(&w.acc, usage); err != nil { 537 return err 538 } 539 w.windowValues[partitionIdx][windowFnIdx] = make([]tree.Datum, partition.Len()) 540 541 if len(windowFn.ordering.Columns) > 0 { 542 // If an ORDER BY clause is provided, we check whether the partition is 543 // already sorted as we need (i.e. prevWindowFn has the same ordering), 544 // and if it is not, we change the ordering to the needed and resort the 545 // container. 546 if prevWindowFn != nil && !windowFn.ordering.Equal(prevWindowFn.ordering) { 547 if err := partition.Reorder(ctx, execinfrapb.ConvertToColumnOrdering(windowFn.ordering)); err != nil { 548 return err 549 } 550 partition.Sort(ctx) 551 } 552 } 553 peerGrouper.ordering = windowFn.ordering 554 peerGrouper.partition = partition 555 556 frameRun.Rows = partition 557 frameRun.RowIdx = 0 558 559 if !frameRun.Frame.IsDefaultFrame() { 560 // We have a custom frame not equivalent to default one, so if we have 561 // an aggregate function, we want to reset it for each row. Not resetting 562 // is an optimization since we're not computing the result over the whole 563 // frame but only as a result of the current row and previous results of 564 // aggregation. 565 builtins.ShouldReset(builtin) 566 } 567 568 if err := frameRun.PeerHelper.Init(frameRun, peerGrouper); err != nil { 569 return err 570 } 571 frameRun.CurRowPeerGroupNum = 0 572 573 var prevRes tree.Datum 574 for frameRun.RowIdx < partition.Len() { 575 // Perform calculations on each row in the current peer group. 
576 peerGroupEndIdx := frameRun.PeerHelper.GetFirstPeerIdx(frameRun.CurRowPeerGroupNum) + frameRun.PeerHelper.GetRowCount(frameRun.CurRowPeerGroupNum) 577 for ; frameRun.RowIdx < peerGroupEndIdx; frameRun.RowIdx++ { 578 if err := w.cancelChecker.Check(); err != nil { 579 return err 580 } 581 res, err := builtin.Compute(ctx, evalCtx, frameRun) 582 if err != nil { 583 return err 584 } 585 row, err := frameRun.Rows.GetRow(ctx, frameRun.RowIdx) 586 if err != nil { 587 return err 588 } 589 if prevRes == nil || prevRes != res { 590 // We don't want to double count the same memory, and since the same 591 // memory can only be reused contiguously as res, comparing against 592 // result of the previous row is sufficient. 593 // We have already accounted for the size of a nil datum prior to 594 // allocating the slice for window values, so we need to keep that in 595 // mind. 596 if err := w.growMemAccount(&w.acc, int64(res.Size())-sizeOfDatum); err != nil { 597 return err 598 } 599 } 600 w.windowValues[partitionIdx][windowFnIdx][row.GetIdx()] = res 601 prevRes = res 602 } 603 if err := frameRun.PeerHelper.Update(frameRun); err != nil { 604 return err 605 } 606 frameRun.CurRowPeerGroupNum++ 607 } 608 609 prevWindowFn = windowFn 610 } 611 612 if err := w.growMemAccount(&w.acc, sizeOfInt); err != nil { 613 return err 614 } 615 w.partitionSizes = append(w.partitionSizes, w.partition.Len()) 616 return nil 617 } 618 619 // computeWindowFunctions computes all window functions over all partitions. 620 // Partitions are processed one at a time with the underlying row container 621 // reused (and reordered if needed). 622 func (w *windower) computeWindowFunctions(ctx context.Context, evalCtx *tree.EvalContext) error { 623 w.findOrderOfWindowFnsToProcessIn() 624 625 // We don't know how many partitions there are, so we'll be accounting for 626 // this memory right before every append to these slices. 
627 usage := sliceOfIntsOverhead + sliceOfRowsSliceOverhead 628 if err := w.growMemAccount(&w.acc, usage); err != nil { 629 return err 630 } 631 w.partitionSizes = make([]int, 0, 8) 632 w.windowValues = make([][][]tree.Datum, 0, 8) 633 bucket := "" 634 635 // w.partition will have ordering as needed by the first window function to 636 // be processed. 637 ordering := execinfrapb.ConvertToColumnOrdering(w.windowFns[w.orderOfWindowFnsProcessing[0]].ordering) 638 w.partition = rowcontainer.NewDiskBackedIndexedRowContainer( 639 ordering, 640 w.inputTypes, 641 w.EvalCtx, 642 w.FlowCtx.Cfg.TempStorage, 643 w.MemMonitor, 644 w.diskMonitor, 645 0, /* rowCapacity */ 646 ) 647 i, err := w.allRowsPartitioned.NewAllRowsIterator(ctx) 648 if err != nil { 649 return err 650 } 651 defer i.Close() 652 653 // We iterate over all the rows and add them to w.partition one by one. When 654 // a row from a different partition is encountered, we process the partition 655 // and reset w.partition for reusing. 656 for i.Rewind(); ; i.Next() { 657 if ok, err := i.Valid(); err != nil { 658 return err 659 } else if !ok { 660 break 661 } 662 row, err := i.Row() 663 if err != nil { 664 return err 665 } 666 if err := w.cancelChecker.Check(); err != nil { 667 return err 668 } 669 if len(w.partitionBy) > 0 { 670 // We need to hash the row according to partitionBy 671 // to figure out which partition the row belongs to. 
672 w.scratch = w.scratch[:0] 673 for _, col := range w.partitionBy { 674 if int(col) >= len(row) { 675 return errors.AssertionFailedf( 676 "hash column %d, row with only %d columns", errors.Safe(col), errors.Safe(len(row))) 677 } 678 var err error 679 w.scratch, err = row[int(col)].Fingerprint(w.inputTypes[int(col)], &w.datumAlloc, w.scratch) 680 if err != nil { 681 return err 682 } 683 } 684 if string(w.scratch) != bucket { 685 // Current row is from the new bucket, so we "finalize" the previous 686 // bucket (if current row is not the first row among all rows in 687 // allRowsPartitioned). We then process this partition, reset the 688 // container for reuse by the next partition. 689 if bucket != "" { 690 if err := w.processPartition(ctx, evalCtx, w.partition, len(w.partitionSizes)); err != nil { 691 return err 692 } 693 } 694 bucket = string(w.scratch) 695 if err := w.partition.UnsafeReset(ctx); err != nil { 696 return err 697 } 698 if !w.windowFns[w.orderOfWindowFnsProcessing[0]].ordering.Equal(w.windowFns[w.orderOfWindowFnsProcessing[len(w.windowFns)-1]].ordering) { 699 // The container no longer has the ordering as needed by the first 700 // window function to be processed, so we need to change it. 701 if err = w.partition.Reorder(ctx, ordering); err != nil { 702 return err 703 } 704 } 705 } 706 } 707 if err := w.partition.AddRow(w.Ctx, row); err != nil { 708 return err 709 } 710 } 711 return w.processPartition(ctx, evalCtx, w.partition, len(w.partitionSizes)) 712 } 713 714 // populateNextOutputRow populates next output row to be returned. All input 715 // columns are passed through, and the results of window functions' 716 // computations are put in the desired columns (i.e. in outputColIdx of each 717 // window function). 
718 func (w *windower) populateNextOutputRow() (bool, error) { 719 if w.partitionIdx < len(w.partitionSizes) { 720 if w.allRowsIterator == nil { 721 w.allRowsIterator = w.allRowsPartitioned.NewUnmarkedIterator(w.Ctx) 722 w.allRowsIterator.Rewind() 723 } 724 // rowIdx is the index of the next row to be emitted from the 725 // partitionIdx'th partition. 726 rowIdx := w.rowsInBucketEmitted 727 if ok, err := w.allRowsIterator.Valid(); err != nil { 728 return false, err 729 } else if !ok { 730 return false, nil 731 } 732 inputRow, err := w.allRowsIterator.Row() 733 w.allRowsIterator.Next() 734 if err != nil { 735 return false, err 736 } 737 copy(w.outputRow, inputRow[:len(w.inputTypes)]) 738 for windowFnIdx, windowFn := range w.windowFns { 739 windowFnRes := w.windowValues[w.partitionIdx][windowFnIdx][rowIdx] 740 encWindowFnRes := sqlbase.DatumToEncDatum(w.outputTypes[windowFn.outputColIdx], windowFnRes) 741 w.outputRow[windowFn.outputColIdx] = encWindowFnRes 742 } 743 w.rowsInBucketEmitted++ 744 if w.rowsInBucketEmitted == w.partitionSizes[w.partitionIdx] { 745 // We have emitted all rows from the current bucket, so we advance the 746 // iterator. 747 w.partitionIdx++ 748 w.rowsInBucketEmitted = 0 749 } 750 return true, nil 751 752 } 753 return false, nil 754 } 755 756 type windowFunc struct { 757 ordering execinfrapb.Ordering 758 argsIdxs []uint32 759 frame *execinfrapb.WindowerSpec_Frame 760 filterColIdx int 761 outputColIdx int 762 } 763 764 type partitionPeerGrouper struct { 765 ctx context.Context 766 evalCtx *tree.EvalContext 767 partition *rowcontainer.DiskBackedIndexedRowContainer 768 ordering execinfrapb.Ordering 769 rowCopy sqlbase.EncDatumRow 770 err error 771 } 772 773 func (n *partitionPeerGrouper) InSameGroup(i, j int) (bool, error) { 774 if len(n.ordering.Columns) == 0 { 775 // ORDER BY clause is omitted, so all rows are peers. 
776 return true, nil 777 } 778 if n.err != nil { 779 return false, n.err 780 } 781 indexedRow, err := n.partition.GetRow(n.ctx, i) 782 if err != nil { 783 n.err = err 784 return false, err 785 } 786 row := indexedRow.(rowcontainer.IndexedRow) 787 // We need to copy the row explicitly since n.partition might be reusing 788 // the underlying memory when GetRow() is called. 789 copy(n.rowCopy, row.Row) 790 rb, err := n.partition.GetRow(n.ctx, j) 791 if err != nil { 792 n.err = err 793 return false, n.err 794 } 795 for _, o := range n.ordering.Columns { 796 da := n.rowCopy[o.ColIdx].Datum 797 db, err := rb.GetDatum(int(o.ColIdx)) 798 if err != nil { 799 n.err = err 800 return false, n.err 801 } 802 if c := da.Compare(n.evalCtx, db); c != 0 { 803 if o.Direction != execinfrapb.Ordering_Column_ASC { 804 return false, nil 805 } 806 return false, nil 807 } 808 } 809 return true, nil 810 } 811 812 const sizeOfInt = int64(unsafe.Sizeof(int(0))) 813 const sliceOfIntsOverhead = int64(unsafe.Sizeof([]int{})) 814 const sizeOfSliceOfRows = int64(unsafe.Sizeof([][]tree.Datum{})) 815 const sliceOfRowsSliceOverhead = int64(unsafe.Sizeof([][][]tree.Datum{})) 816 const sizeOfRow = int64(unsafe.Sizeof([]tree.Datum{})) 817 const rowSliceOverhead = int64(unsafe.Sizeof([][]tree.Datum{})) 818 const sizeOfDatum = int64(unsafe.Sizeof(tree.Datum(nil))) 819 const datumSliceOverhead = int64(unsafe.Sizeof([]tree.Datum(nil))) 820 821 // CreateWindowerSpecFunc creates a WindowerSpec_Func based on the function 822 // name or returns an error if unknown function name is provided. 
823 func CreateWindowerSpecFunc(funcStr string) (execinfrapb.WindowerSpec_Func, error) { 824 if aggBuiltin, ok := execinfrapb.AggregatorSpec_Func_value[funcStr]; ok { 825 aggSpec := execinfrapb.AggregatorSpec_Func(aggBuiltin) 826 return execinfrapb.WindowerSpec_Func{AggregateFunc: &aggSpec}, nil 827 } else if winBuiltin, ok := execinfrapb.WindowerSpec_WindowFunc_value[funcStr]; ok { 828 winSpec := execinfrapb.WindowerSpec_WindowFunc(winBuiltin) 829 return execinfrapb.WindowerSpec_Func{WindowFunc: &winSpec}, nil 830 } else { 831 return execinfrapb.WindowerSpec_Func{}, errors.Errorf("unknown aggregate/window function %s", funcStr) 832 } 833 } 834 835 var _ execinfrapb.DistSQLSpanStats = &WindowerStats{} 836 837 const windowerTagPrefix = "windower." 838 839 // Stats implements the SpanStats interface. 840 func (ws *WindowerStats) Stats() map[string]string { 841 inputStatsMap := ws.InputStats.Stats(windowerTagPrefix) 842 inputStatsMap[windowerTagPrefix+MaxMemoryTagSuffix] = humanizeutil.IBytes(ws.MaxAllocatedMem) 843 inputStatsMap[windowerTagPrefix+MaxDiskTagSuffix] = humanizeutil.IBytes(ws.MaxAllocatedDisk) 844 return inputStatsMap 845 } 846 847 // StatsForQueryPlan implements the DistSQLSpanStats interface. 
848 func (ws *WindowerStats) StatsForQueryPlan() []string { 849 stats := ws.InputStats.StatsForQueryPlan("" /* prefix */) 850 851 if ws.MaxAllocatedMem != 0 { 852 stats = append(stats, 853 fmt.Sprintf("%s: %s", MaxMemoryQueryPlanSuffix, humanizeutil.IBytes(ws.MaxAllocatedMem))) 854 } 855 856 if ws.MaxAllocatedDisk != 0 { 857 stats = append(stats, 858 fmt.Sprintf("%s: %s", MaxDiskQueryPlanSuffix, humanizeutil.IBytes(ws.MaxAllocatedDisk))) 859 } 860 861 return stats 862 } 863 864 func (w *windower) outputStatsToTrace() { 865 is, ok := getInputStats(w.FlowCtx, w.input) 866 if !ok { 867 return 868 } 869 if sp := opentracing.SpanFromContext(w.Ctx); sp != nil { 870 tracing.SetSpanStats( 871 sp, 872 &WindowerStats{ 873 InputStats: is, 874 MaxAllocatedMem: w.MemMonitor.MaximumBytes(), 875 MaxAllocatedDisk: w.diskMonitor.MaximumBytes(), 876 }, 877 ) 878 } 879 } 880 881 // ChildCount is part of the execinfra.OpNode interface. 882 func (w *windower) ChildCount(verbose bool) int { 883 if _, ok := w.input.(execinfra.OpNode); ok { 884 return 1 885 } 886 return 0 887 } 888 889 // Child is part of the execinfra.OpNode interface. 890 func (w *windower) Child(nth int, verbose bool) execinfra.OpNode { 891 if nth == 0 { 892 if n, ok := w.input.(execinfra.OpNode); ok { 893 return n 894 } 895 panic("input to windower is not an execinfra.OpNode") 896 } 897 panic(fmt.Sprintf("invalid index %d", nth)) 898 }