github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/colexec/sort_chunks.go

// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package colexec

import (
	"context"
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/col/coldata"
	"github.com/cockroachdb/cockroach/pkg/sql/colexecbase"
	"github.com/cockroachdb/cockroach/pkg/sql/colexecbase/colexecerror"
	"github.com/cockroachdb/cockroach/pkg/sql/colmem"
	"github.com/cockroachdb/cockroach/pkg/sql/execinfra"
	"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
	"github.com/cockroachdb/cockroach/pkg/sql/types"
)

// NewSortChunks returns a new sort chunks operator, which sorts its input on
// the columns given in orderingCols. The inputTypes must correspond 1-1 with
// the columns in the input operator. The input tuples must already be sorted
// on the first matchLen columns.
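//
// For example (an illustrative sketch, assuming allocator, input, and
// inputTypes are in scope), an input that is already ordered on column 0 and
// needs to be fully ordered on columns (0, 1) would be handled by
//
//	op, err := NewSortChunks(
//		allocator, input, inputTypes,
//		[]execinfrapb.Ordering_Column{{ColIdx: 0}, {ColIdx: 1}},
//		1, /* matchLen */
//	)
//
// in which case the chunker partitions the input on column 0 and the sorter
// only needs to sort each chunk on column 1.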
func NewSortChunks(
	allocator *colmem.Allocator,
	input colexecbase.Operator,
	inputTypes []*types.T,
	orderingCols []execinfrapb.Ordering_Column,
	matchLen int,
) (colexecbase.Operator, error) {
	if matchLen < 1 || matchLen == len(orderingCols) {
		colexecerror.InternalError(fmt.Sprintf(
			"sort chunks should only be used when the input is "+
				"already ordered on at least one column but not fully ordered; "+
				"num ordering cols = %d, matchLen = %d", len(orderingCols), matchLen))
	}
	alreadySortedCols := make([]uint32, matchLen)
	for i := range alreadySortedCols {
		alreadySortedCols[i] = orderingCols[i].ColIdx
	}
	chunker, err := newChunker(allocator, input, inputTypes, alreadySortedCols)
	if err != nil {
		return nil, err
	}
	sorter, err := newSorter(allocator, chunker, inputTypes, orderingCols[matchLen:])
	if err != nil {
		return nil, err
	}
	return &sortChunksOp{allocator: allocator, input: chunker, sorter: sorter}, nil
}

type sortChunksOp struct {
	allocator *colmem.Allocator
	input     *chunker
	sorter    resettableOperator

	exportedFromBuffer int
	exportedFromBatch  int
	windowedBatch      coldata.Batch
}

var _ colexecbase.Operator = &sortChunksOp{}
var _ bufferingInMemoryOperator = &sortChunksOp{}

func (c *sortChunksOp) ChildCount(verbose bool) int {
	return 1
}

func (c *sortChunksOp) Child(nth int, verbose bool) execinfra.OpNode {
	if nth == 0 {
		return c.input
	}
	colexecerror.InternalError(fmt.Sprintf("invalid index %d", nth))
	// This code is unreachable, but the compiler cannot infer that.
	return nil
}

func (c *sortChunksOp) Init() {
	c.input.init()
	c.sorter.Init()
	// TODO(yuzefovich): switch to calling this method on allocator. This will
	// require plumbing unlimited allocator to work correctly in tests with
	// memory limit of 1.
	c.windowedBatch = coldata.NewMemBatchNoCols(c.input.inputTypes, coldata.BatchSize())
}

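// Next returns the next batch of sorted tuples. The sorter sorts one chunk
// at a time, so a zero-length batch from it means that the current chunk has
// been fully emitted; unless the chunker has consumed all of the input, we
// then move on to the next chunk.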
func (c *sortChunksOp) Next(ctx context.Context) coldata.Batch {
	for {
		batch := c.sorter.Next(ctx)
		if batch.Length() == 0 {
			if c.input.done() {
				// We're done, so return a zero-length batch.
				return batch
			}
			// We're not yet done - we need to process another chunk, so we
			// empty the chunker's buffer and reset the sorter. Note that we do
			// not want to do a full reset of the chunker because we're in the
			// middle of processing the input to sortChunksOp.
			c.input.emptyBuffer()
			c.sorter.reset(ctx)
		} else {
			return batch
		}
	}
}

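// ExportBuffered implements the bufferingInMemoryOperator interface. It is
// used when spilling to disk to retrieve all tuples that this operator has
// buffered but not yet emitted: first the chunker's buffered tuples are
// exported in windows of up to coldata.BatchSize() tuples, then the
// unprocessed tail of the last read batch, and finally coldata.ZeroBatch
// signals that the export is complete.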
func (c *sortChunksOp) ExportBuffered(colexecbase.Operator) coldata.Batch {
	// First, we check whether the chunker has buffered up any tuples, and if
	// so, whether we have exported them all.
	if c.input.bufferedTuples.Length() > 0 {
		if c.exportedFromBuffer < c.input.bufferedTuples.Length() {
			newExportedFromBuffer := c.exportedFromBuffer + coldata.BatchSize()
			if newExportedFromBuffer > c.input.bufferedTuples.Length() {
				newExportedFromBuffer = c.input.bufferedTuples.Length()
			}
			for i := range c.input.inputTypes {
				window := c.input.bufferedTuples.ColVec(i).Window(c.exportedFromBuffer, newExportedFromBuffer)
				c.windowedBatch.ReplaceCol(window, i)
			}
			c.windowedBatch.SetLength(newExportedFromBuffer - c.exportedFromBuffer)
			c.exportedFromBuffer = newExportedFromBuffer
			return c.windowedBatch
		}
	}
	// Next, we check whether there are any unexported tuples in the last read
	// batch.
	// firstTupleIdx indicates the index of the first tuple in the last read
	// batch that hasn't been "processed" and should be the first to be
	// exported.
	firstTupleIdx := c.input.exportState.numProcessedTuplesFromBatch
	if c.input.batch != nil && firstTupleIdx+c.exportedFromBatch < c.input.batch.Length() {
		makeWindowIntoBatch(c.windowedBatch, c.input.batch, firstTupleIdx, c.input.inputTypes)
		c.exportedFromBatch = c.windowedBatch.Length()
		return c.windowedBatch
	}
	return coldata.ZeroBatch
}

// chunkerState represents the state of the chunker spooler.
type chunkerState int

const (
	// chunkerReading is the state of the chunker spooler in which it reads a
	// batch from its input and partitions the batch into chunks. Depending on
	// the current state of the chunker's buffer and the number of chunks in
	// the batch, the chunker might stay in the chunkerReading state or switch
	// to either of the emitting states.
	chunkerReading chunkerState = iota
	// chunkerEmittingFromBuffer is the state of the chunker spooler in which
	// it prepares to "emit" tuples that have been buffered. All the tuples
	// belong to the same chunk ("emit" is in quotes because the chunker does
	// not emit batches as usual - it, instead, implements the spooler
	// interface, and the batches should be accessed through those methods).
	// The chunker transitions to the chunkerEmittingFromBatch state and
	// indicates that the tuples need to be read from the buffer.
	chunkerEmittingFromBuffer
	// chunkerEmittingFromBatch is the state of the chunker spooler in which
	// it prepares to "emit" all chunks that are fully contained within the
	// last read batch (i.e. all chunks except for the last chunk which might
	// include tuples from the next batch). The last chunk within the batch is
	// buffered, the chunker transitions to the chunkerReading state and
	// indicates that the tuples need to be read from s.batch.
	chunkerEmittingFromBatch
)

// chunkerReadingState indicates where the spooler needs to read tuples from
// for emitting.
type chunkerReadingState int

const (
	// chunkerReadFromBuffer indicates that the tuples need to be read from
	// the buffer.
	chunkerReadFromBuffer chunkerReadingState = iota
	// chunkerReadFromBatch indicates that the tuples need to be read from the
	// last read batch directly. Only tuples that are fully contained within
	// the last read batch are "emitted".
	chunkerReadFromBatch
	// chunkerDone indicates that the input has been fully consumed and all
	// tuples have already been emitted.
	chunkerDone
)

// chunker is a spooler that produces chunks from its input when the tuples
// are already ordered on the first matchLen columns. The chunks are not
// emitted in batches as usual when Next()'ed; instead, they should be
// accessed via getValues().
//
// Note 1: the chunker assumes that its input produces batches with no
// selection vector, so it always puts a deselector on top of its input. It
// does the coalescing itself, so it does not use an extra coalescer.
// Note 2: the chunker intentionally does not implement the resetter
// interface (if it did, the sorter would reset it, but we don't want that
// since we're likely in the middle of processing the input). Instead,
// sortChunksOp will empty the buffer when appropriate.
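//
// A consumer drives the chunker roughly as follows (an illustrative sketch,
// not taken from the actual sorter):
//
//	s.spool(ctx)               // prepare the next chunk(s)
//	n := s.getNumTuples()      // number of spooled tuples
//	vec := s.getValues(colIdx) // window into the spooled tuples
//
// repeating the above until s.done() returns true.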
type chunker struct {
	OneInputNode
	NonExplainable

	allocator *colmem.Allocator
	// inputTypes contains the types of all of the columns from input.
	inputTypes []*types.T
	// inputDone indicates whether input has been fully consumed.
	inputDone bool
	// alreadySortedCols indicates the columns on which the input is already
	// ordered.
	alreadySortedCols []uint32

	// batch is the last read batch from input.
	batch coldata.Batch
	// partitioners contains one partitioner for each of the first matchLen
	// already ordered columns.
	partitioners []partitioner
	// partitionCol is a bool slice for partitioners' output to be ORed.
	partitionCol []bool

	// chunks contains the indices of the first tuples within different chunks
	// found in the last read batch. Note: the first chunk might be a part of
	// the chunk that is currently being buffered, and similarly the last
	// chunk might include tuples from the batches to be read.
	chunks []int
	// chunksProcessedIdx indicates which chunk within s.chunks should be
	// processed next.
	chunksProcessedIdx int
	// chunksStartIdx indicates the index of the chunk within s.chunks that is
	// the first one to be emitted directly from s.batch when reading from the
	// batch.
	chunksStartIdx int

	// bufferedTuples is a buffer to store tuples when a chunk is bigger than
	// coldata.BatchSize() or when the chunk is the last in the last read
	// batch (we don't know yet where the end of such a chunk is).
	bufferedTuples *appendOnlyBufferedBatch

	readFrom chunkerReadingState
	state    chunkerState

	exportState struct {
		// numProcessedTuplesFromBatch indicates how many tuples from the
		// current batch have been "processed" for ExportBuffered purposes
		// (here, "processed" means either have been sorted and emitted or
		// have been buffered up into bufferedTuples). This information is
		// needed by sortChunksOp to be able to spill to disk in case of OOM.
		numProcessedTuplesFromBatch int
	}
}

var _ spooler = &chunker{}

func newChunker(
	allocator *colmem.Allocator,
	input colexecbase.Operator,
	inputTypes []*types.T,
	alreadySortedCols []uint32,
) (*chunker, error) {
	var err error
	partitioners := make([]partitioner, len(alreadySortedCols))
	for i, col := range alreadySortedCols {
		partitioners[i], err = newPartitioner(inputTypes[col])
		if err != nil {
			return nil, err
		}
	}
	deselector := NewDeselectorOp(allocator, input, inputTypes)
	return &chunker{
		OneInputNode:      NewOneInputNode(deselector),
		allocator:         allocator,
		inputTypes:        inputTypes,
		alreadySortedCols: alreadySortedCols,
		partitioners:      partitioners,
		state:             chunkerReading,
	}, nil
}

func (s *chunker) init() {
	s.input.Init()
	s.bufferedTuples = newAppendOnlyBufferedBatch(
		s.allocator, s.inputTypes, 0, /* initialSize */
	)
	s.partitionCol = make([]bool, coldata.BatchSize())
	s.chunks = make([]int, 0, 16)
}

// done indicates whether the chunker has fully consumed its input.
func (s *chunker) done() bool {
	return s.readFrom == chunkerDone
}

// prepareNextChunks prepares the chunks for the chunker spooler.
//
// Note: it does not return the batches directly; instead, the chunker
// remembers where the next chunks to be emitted are actually stored. In
// order to access the chunks, getValues() must be used.
func (s *chunker) prepareNextChunks(ctx context.Context) chunkerReadingState {
	for {
		switch s.state {
		case chunkerReading:
			s.batch = s.input.Next(ctx)
			s.exportState.numProcessedTuplesFromBatch = 0
			if s.batch.Length() == 0 {
				s.inputDone = true
				if s.bufferedTuples.Length() > 0 {
					s.state = chunkerEmittingFromBuffer
				} else {
					s.state = chunkerEmittingFromBatch
				}
				continue
			}
			if s.batch.Selection() != nil {
				// We assume that the input has been deselected, so the batch
				// should never have a selection vector set.
				colexecerror.InternalError("unexpected: batch with non-nil selection vector")
			}

			// First, run the partitioners on our pre-sorted columns to
			// determine the boundaries of the chunks (stored in s.chunks) to
			// sort further.
			copy(s.partitionCol, zeroBoolColumn)
			for i, orderedCol := range s.alreadySortedCols {
				s.partitioners[i].partition(s.batch.ColVec(int(orderedCol)), s.partitionCol,
					s.batch.Length())
			}
			s.chunks = boolVecToSel64(s.partitionCol, s.chunks[:0])
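			// For example (illustrative): with a single pre-sorted column
			// containing [a, a, b, b, b, c], the partitioners mark the start
			// of every run in s.partitionCol, which boolVecToSel64 converts
			// into s.chunks = [0, 2, 5] - the index of the first tuple of
			// each chunk within s.batch.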

			if s.bufferedTuples.Length() == 0 {
				// There are no buffered tuples, so a new chunk starts in the
				// current batch.
				if len(s.chunks) > 1 {
					// There is at least one chunk that is fully contained
					// within s.batch, so we proceed to emitting it.
					s.state = chunkerEmittingFromBatch
					continue
				}
				// All tuples in s.batch belong to the same chunk. Possibly
				// tuples from the next batch will also belong to this chunk,
				// so we buffer the full s.batch.
				s.buffer(0 /* start */, s.batch.Length())
				s.state = chunkerReading
				continue
			} else {
				// There are some buffered tuples, so we need to check whether
				// the first tuple of s.batch belongs to the chunk that is
				// being buffered.
				differ := false
				i := 0
				for !differ && i < len(s.alreadySortedCols) {
					differ = valuesDiffer(
						s.bufferedTuples.ColVec(int(s.alreadySortedCols[i])),
						0, /* aValueIdx */
						s.batch.ColVec(int(s.alreadySortedCols[i])),
						0, /* bValueIdx */
					)
					i++
				}
				if differ {
					// The buffered tuples comprise a full chunk, so we
					// proceed to emitting it.
					s.state = chunkerEmittingFromBuffer
					continue
				}

				// The first tuple of s.batch belongs to the chunk that is
				// being buffered.
				if len(s.chunks) == 1 {
					// All tuples in s.batch belong to the same chunk that is
					// being buffered. Possibly tuples from the next batch
					// will also belong to this chunk, so we buffer the full
					// s.batch.
					s.buffer(0 /* start */, s.batch.Length())
					s.state = chunkerReading
					continue
				}
				// The first s.chunks[1] tuples belong to the same chunk that
				// is being buffered, so we buffer them and proceed to
				// emitting all buffered tuples.
				s.buffer(0 /* start */, s.chunks[1])
				s.chunksProcessedIdx = 1
				s.state = chunkerEmittingFromBuffer
				continue
			}
		case chunkerEmittingFromBuffer:
			s.state = chunkerEmittingFromBatch
			return chunkerReadFromBuffer
		case chunkerEmittingFromBatch:
			if s.chunksProcessedIdx < len(s.chunks)-1 {
				// There is at least one chunk that is fully contained within
				// s.batch. We don't know yet whether the tuples from the next
				// batch belong to the last chunk of the current batch, so we
				// will buffer those and can only emit the chunks that are
				// "internal" to s.batch. Additionally, if
				// s.chunksProcessedIdx == 1, then the first chunk was already
				// combined with the buffered tuples and emitted.
				s.chunksStartIdx = s.chunksProcessedIdx
				s.chunksProcessedIdx = len(s.chunks) - 1
				return chunkerReadFromBatch
			} else if s.chunksProcessedIdx == len(s.chunks)-1 {
				// Other tuples might belong to this chunk, so we buffer it.
				s.buffer(s.chunks[s.chunksProcessedIdx], s.batch.Length())
				// All tuples in s.batch have been processed, so we reset
				// s.chunks and the corresponding variables.
				s.chunks = s.chunks[:0]
				s.chunksProcessedIdx = 0
				s.state = chunkerReading
			} else {
				// All tuples in s.batch have been emitted.
				if s.inputDone {
					return chunkerDone
				}
				colexecerror.InternalError("unexpected: chunkerEmittingFromBatch state " +
					"when s.chunks is fully processed and input is not done")
			}
		default:
			colexecerror.InternalError(fmt.Sprintf("invalid chunker spooler state %v", s.state))
		}
	}
}

// buffer appends all tuples in the range [start, end) from s.batch to the
// already buffered tuples.
func (s *chunker) buffer(start int, end int) {
	if start == end {
		return
	}
	s.allocator.PerformOperation(s.bufferedTuples.ColVecs(), func() {
		s.exportState.numProcessedTuplesFromBatch = end
		s.bufferedTuples.append(s.batch, start, end)
	})
}

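// spool implements the spooler interface. It prepares the next chunks and
// records where they need to be read from when emitting.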
func (s *chunker) spool(ctx context.Context) {
	s.readFrom = s.prepareNextChunks(ctx)
}

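// getValues returns the ith column of the spooled tuples, windowed to
// exactly the tuples that belong to the prepared chunk(s).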
func (s *chunker) getValues(i int) coldata.Vec {
	switch s.readFrom {
	case chunkerReadFromBuffer:
		return s.bufferedTuples.ColVec(i).Window(0 /* start */, s.bufferedTuples.Length())
	case chunkerReadFromBatch:
		return s.batch.ColVec(i).Window(s.chunks[s.chunksStartIdx], s.chunks[len(s.chunks)-1])
	default:
		colexecerror.InternalError(fmt.Sprintf("unexpected chunkerReadingState in getValues: %v", s.readFrom))
		// This code is unreachable, but the compiler cannot infer that.
		return nil
	}
}

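// getNumTuples returns the number of tuples that have been spooled and are
// ready to be emitted.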
func (s *chunker) getNumTuples() int {
	switch s.readFrom {
	case chunkerReadFromBuffer:
		return s.bufferedTuples.Length()
	case chunkerReadFromBatch:
		return s.chunks[len(s.chunks)-1] - s.chunks[s.chunksStartIdx]
	case chunkerDone:
		return 0
	default:
		colexecerror.InternalError(fmt.Sprintf("unexpected chunkerReadingState in getNumTuples: %v", s.readFrom))
		// This code is unreachable, but the compiler cannot infer that.
		return 0
	}
}

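// getPartitionsCol returns a boolean column with true set at the index of
// the first tuple of every spooled chunk, or nil if the spooled tuples form
// a single chunk (per the spooler's contract).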
func (s *chunker) getPartitionsCol() []bool {
	switch s.readFrom {
	case chunkerReadFromBuffer:
		// There is a single chunk in the buffer, so, per the spooler's
		// contract, we return nil.
		return nil
	case chunkerReadFromBatch:
		if s.chunksStartIdx+1 == len(s.chunks)-1 {
			// There is a single chunk that is fully contained within s.batch,
			// so, per the spooler's contract, we return nil.
			return nil
		}
		copy(s.partitionCol, zeroBoolColumn)
		for i := s.chunksStartIdx; i < len(s.chunks)-1; i++ {
			// getValues returns a slice starting from
			// s.chunks[s.chunksStartIdx], so we need to account for that by
			// shifting as well.
			s.partitionCol[s.chunks[i]-s.chunks[s.chunksStartIdx]] = true
		}
		return s.partitionCol
	case chunkerDone:
		return nil
	default:
		colexecerror.InternalError(fmt.Sprintf("unexpected chunkerReadingState in getPartitionsCol: %v", s.readFrom))
		// This code is unreachable, but the compiler cannot infer that.
		return nil
	}
}

func (s *chunker) getWindowedBatch(startIdx, endIdx int) coldata.Batch {
	colexecerror.InternalError("getWindowedBatch is not implemented on chunker spooler")
	// This code is unreachable, but the compiler cannot infer that.
	return nil
}

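// emptyBuffer discards all of the buffered tuples without performing a full
// reset of the chunker. It is called by sortChunksOp once the current chunk
// has been fully sorted and emitted, so that the chunker can proceed to
// buffering the next chunk.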
func (s *chunker) emptyBuffer() {
	s.bufferedTuples.SetLength(0)
	s.bufferedTuples.ResetInternalBatch()
}