github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/colexec/partially_ordered_distinct.go

// Copyright 2020 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package colexec

import (
	"context"
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/col/coldata"
	"github.com/cockroachdb/cockroach/pkg/sql/colexecbase"
	"github.com/cockroachdb/cockroach/pkg/sql/colexecbase/colexecerror"
	"github.com/cockroachdb/cockroach/pkg/sql/colmem"
	"github.com/cockroachdb/cockroach/pkg/sql/execinfra"
	"github.com/cockroachdb/cockroach/pkg/sql/types"
	"github.com/cockroachdb/errors"
)

// TODO(yuzefovich): tune.
const partiallyOrderedDistinctNumHashBuckets = 1024

// newPartiallyOrderedDistinct creates a distinct operator on the given
// distinct columns when we have partial ordering on some of the distinct
// columns.
func newPartiallyOrderedDistinct(
	allocator *colmem.Allocator,
	input colexecbase.Operator,
	distinctCols []uint32,
	orderedCols []uint32,
	typs []*types.T,
) (colexecbase.Operator, error) {
	if len(orderedCols) == 0 || len(orderedCols) == len(distinctCols) {
		return nil, errors.AssertionFailedf(
			"partially ordered distinct wrongfully planned: numDistinctCols=%d "+
				"numOrderedCols=%d", len(distinctCols), len(orderedCols))
	}
	chunker, err := newChunker(allocator, input, typs, orderedCols)
	if err != nil {
		return nil, err
	}
	chunkerOperator := newChunkerOperator(allocator, chunker, typs)
	// distinctUnorderedCols will contain distinct columns that are not present
	// among orderedCols. The unordered distinct operator will use these columns
	// to find distinct tuples within "chunks" of tuples that are the same on the
	// ordered columns.
	distinctUnorderedCols := make([]uint32, 0, len(distinctCols)-len(orderedCols))
	for _, distinctCol := range distinctCols {
		isOrdered := false
		for _, orderedCol := range orderedCols {
			if orderedCol == distinctCol {
				isOrdered = true
				break
			}
		}
		if !isOrdered {
			distinctUnorderedCols = append(distinctUnorderedCols, distinctCol)
		}
	}
	distinct := NewUnorderedDistinct(
		allocator, chunkerOperator, distinctUnorderedCols, typs,
		partiallyOrderedDistinctNumHashBuckets,
	)
	return &partiallyOrderedDistinct{
		input:    chunkerOperator,
		distinct: distinct.(resettableOperator),
	}, nil
}
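// As an illustrative sketch of how the constructor might be invoked (the
// allocator, input, and typs values here are hypothetical and assumed to come
// from the surrounding physical plan): with distinctCols = {0, 1, 2} and
// orderedCols = {1}, distinctUnorderedCols becomes {0, 2}, so the unordered
// distinct is applied to columns {0, 2} within every chunk of tuples that
// agree on column 1:
//
//	op, err := newPartiallyOrderedDistinct(
//		allocator, input,
//		[]uint32{0, 1, 2} /* distinctCols */,
//		[]uint32{1} /* orderedCols */,
//		typs,
//	)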
// partiallyOrderedDistinct implements the DISTINCT operation using a
// combination of chunkerOperator and unorderedDistinct. Its only job is to
// check whether the input has been fully processed and, if not, to move on to
// the next chunk (where a "chunk" is all tuples that are equal on the ordered
// columns).
type partiallyOrderedDistinct struct {
	input    *chunkerOperator
	distinct resettableOperator
}

var _ colexecbase.Operator = &partiallyOrderedDistinct{}

func (p *partiallyOrderedDistinct) ChildCount(bool) int {
	return 1
}

func (p *partiallyOrderedDistinct) Child(nth int, _ bool) execinfra.OpNode {
	if nth == 0 {
		return p.input
	}
	colexecerror.InternalError(fmt.Sprintf("invalid index %d", nth))
	// This code is unreachable, but the compiler cannot infer that.
	return nil
}

func (p *partiallyOrderedDistinct) Init() {
	p.distinct.Init()
}

func (p *partiallyOrderedDistinct) Next(ctx context.Context) coldata.Batch {
	for {
		batch := p.distinct.Next(ctx)
		if batch.Length() == 0 {
			if p.input.done() {
				// We're done, so return a zero-length batch.
				return coldata.ZeroBatch
			}
			// p.distinct will reset p.input.
			p.distinct.reset(ctx)
		} else {
			return batch
		}
	}
}

func newChunkerOperator(
	allocator *colmem.Allocator, input *chunker, inputTypes []*types.T,
) *chunkerOperator {
	return &chunkerOperator{
		input:         input,
		inputTypes:    inputTypes,
		windowedBatch: allocator.NewMemBatchNoCols(inputTypes, coldata.BatchSize()),
	}
}

// chunkerOperator is an adapter from chunker to the Operator interface. It
// outputs all tuples from a single chunk followed by zero-length batches until
// it is reset.
// It will have returned all tuples from all of the chunks only when it returns
// a zero-length batch *and* its done() method returns true (i.e. a zero-length
// batch indicates the end of a chunk, but only done() returning true indicates
// that the input has been fully processed).
type chunkerOperator struct {
	input      *chunker
	inputTypes []*types.T
	// haveChunksToEmit indicates whether we have spooled input and there are
	// still more chunks to emit.
	haveChunksToEmit bool
	// numTuplesInChunks stores the number of tuples that are currently spooled
	// by input.
	numTuplesInChunks int
	// currentChunkFinished indicates whether we have emitted all tuples from the
	// current chunk and should be returning a zero-length batch.
	currentChunkFinished bool
	// newChunksCol, when non-nil, stores the boundaries of chunks. Every true
	// value indicates that a new chunk begins at the corresponding index. If
	// newChunksCol is nil, all spooled tuples belong to the same chunk.
	newChunksCol []bool
	// outputTupleStartIdx indicates the index of the first tuple to be included
	// in the output batch.
	outputTupleStartIdx int
	// windowedBatch is the output batch of chunkerOperator. For performance
	// reasons, the spooled tuples are not copied into it; instead, we use a
	// "window" approach.
	windowedBatch coldata.Batch
}

var _ resettableOperator = &chunkerOperator{}
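// A consumer drives a chunkerOperator chunk by chunk, much like
// partiallyOrderedDistinct.Next above drives its distinct/chunker pair. A
// minimal sketch (assuming c is an initialized *chunkerOperator and ctx is a
// context.Context):
//
//	for {
//		batch := c.Next(ctx)
//		if batch.Length() == 0 {
//			if c.done() {
//				break // the input has been fully processed
//			}
//			c.reset(ctx) // advance to the next chunk
//			continue
//		}
//		// All tuples in batch belong to a single chunk, i.e. they are
//		// equal on the ordered columns.
//	}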
func (c *chunkerOperator) ChildCount(bool) int {
	return 1
}

func (c *chunkerOperator) Child(nth int, _ bool) execinfra.OpNode {
	if nth == 0 {
		return c.input
	}
	colexecerror.InternalError(fmt.Sprintf("invalid index %d", nth))
	// This code is unreachable, but the compiler cannot infer that.
	return nil
}

func (c *chunkerOperator) Init() {
	c.input.init()
}

func (c *chunkerOperator) Next(ctx context.Context) coldata.Batch {
	if c.currentChunkFinished {
		return coldata.ZeroBatch
	}
	if !c.haveChunksToEmit {
		// We don't have any chunks to emit, so we need to spool the input.
		c.input.spool(ctx)
		c.haveChunksToEmit = true
		c.numTuplesInChunks = c.input.getNumTuples()
		c.newChunksCol = c.input.getPartitionsCol()
	}
	outputTupleEndIdx := c.numTuplesInChunks
	if c.outputTupleStartIdx == outputTupleEndIdx {
		// The current chunk has been fully output.
		c.currentChunkFinished = true
		return coldata.ZeroBatch
	}
	if c.newChunksCol == nil {
		// When newChunksCol is nil, all tuples that are returned via getValues
		// are equal on the ordered columns, so we simply emit the next "window"
		// of those tuples.
		if outputTupleEndIdx-c.outputTupleStartIdx > coldata.BatchSize() {
			outputTupleEndIdx = c.outputTupleStartIdx + coldata.BatchSize()
		}
	} else {
		// newChunksCol is non-nil, so there are multiple chunks within the
		// current tuples. We will emit a single chunk as a separate batch and
		// then proceed to emitting zero-length batches until we're reset.
		outputTupleEndIdx = c.outputTupleStartIdx + 1
		for outputTupleEndIdx < c.numTuplesInChunks && !c.newChunksCol[outputTupleEndIdx] {
			outputTupleEndIdx++
		}
		c.currentChunkFinished = true
	}
	for i := range c.inputTypes {
		window := c.input.getValues(i).Window(c.outputTupleStartIdx, outputTupleEndIdx)
		c.windowedBatch.ReplaceCol(window, i)
	}
	c.windowedBatch.SetSelection(false)
	c.windowedBatch.SetLength(outputTupleEndIdx - c.outputTupleStartIdx)
	c.outputTupleStartIdx = outputTupleEndIdx
	return c.windowedBatch
}

func (c *chunkerOperator) done() bool {
	return c.input.done()
}

func (c *chunkerOperator) reset(_ context.Context) {
	c.currentChunkFinished = false
	if c.newChunksCol != nil {
		if c.outputTupleStartIdx == c.numTuplesInChunks {
			// We have processed all chunks among the current tuples, so we will
			// need to get new chunks.
			c.haveChunksToEmit = false
		}
	} else {
		// We have processed all current tuples (that comprised a single chunk),
		// so we will need to get new chunks.
		c.haveChunksToEmit = false
	}
	if !c.haveChunksToEmit {
		c.input.emptyBuffer()
		c.outputTupleStartIdx = 0
	}
}
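// To make the interplay between Next and reset concrete, consider a
// hypothetical spool of five tuples with
// newChunksCol = [true, false, false, true, false]:
//
//	Next  -> emits the window [0, 3)  (the first chunk)
//	Next  -> coldata.ZeroBatch        (currentChunkFinished has been set)
//	reset -> outputTupleStartIdx (3) < numTuplesInChunks (5), so the
//	         buffered tuples are kept
//	Next  -> emits the window [3, 5)  (the second chunk)
//	Next  -> coldata.ZeroBatch
//	reset -> outputTupleStartIdx == numTuplesInChunks, so the buffer is
//	         emptied and the next Next call spools new input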