github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/colexec/partially_ordered_distinct.go

// Copyright 2020 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package colexec

import (
	"context"
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/col/coldata"
	"github.com/cockroachdb/cockroach/pkg/sql/colexecbase"
	"github.com/cockroachdb/cockroach/pkg/sql/colexecbase/colexecerror"
	"github.com/cockroachdb/cockroach/pkg/sql/colmem"
	"github.com/cockroachdb/cockroach/pkg/sql/execinfra"
	"github.com/cockroachdb/cockroach/pkg/sql/types"
	"github.com/cockroachdb/errors"
)

// TODO(yuzefovich): tune.
const partiallyOrderedDistinctNumHashBuckets = 1024

// newPartiallyOrderedDistinct creates a distinct operator on the given
// distinct columns when we have partial ordering on some of the distinct
// columns.
func newPartiallyOrderedDistinct(
	allocator *colmem.Allocator,
	input colexecbase.Operator,
	distinctCols []uint32,
	orderedCols []uint32,
	typs []*types.T,
) (colexecbase.Operator, error) {
	if len(orderedCols) == 0 || len(orderedCols) == len(distinctCols) {
		return nil, errors.AssertionFailedf(
			"partially ordered distinct wrongfully planned: numDistinctCols=%d "+
				"numOrderedCols=%d", len(distinctCols), len(orderedCols))
	}
	chunker, err := newChunker(allocator, input, typs, orderedCols)
	if err != nil {
		return nil, err
	}
	chunkerOperator := newChunkerOperator(allocator, chunker, typs)
	// distinctUnorderedCols will contain distinct columns that are not present
	// among orderedCols. The unordered distinct operator will use these columns
	// to find distinct tuples within "chunks" of tuples that are the same on the
	// ordered columns.
	distinctUnorderedCols := make([]uint32, 0, len(distinctCols)-len(orderedCols))
	for _, distinctCol := range distinctCols {
		isOrdered := false
		for _, orderedCol := range orderedCols {
			if orderedCol == distinctCol {
				isOrdered = true
				break
			}
		}
		if !isOrdered {
			distinctUnorderedCols = append(distinctUnorderedCols, distinctCol)
		}
	}
	distinct := NewUnorderedDistinct(
		allocator, chunkerOperator, distinctUnorderedCols, typs,
		partiallyOrderedDistinctNumHashBuckets,
	)
	return &partiallyOrderedDistinct{
		input:    chunkerOperator,
		distinct: distinct.(resettableOperator),
	}, nil
}
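// As an illustrative sketch of how the constructor might be invoked (the
// allocator, input, and typs values here are hypothetical and assumed to come
// from the surrounding physical plan): with distinctCols = {0, 1, 2} and
// orderedCols = {1}, distinctUnorderedCols becomes {0, 2}, so the unordered
// distinct is applied to columns {0, 2} within every chunk of tuples that
// agree on column 1:
//
//	op, err := newPartiallyOrderedDistinct(
//		allocator, input,
//		[]uint32{0, 1, 2} /* distinctCols */,
//		[]uint32{1} /* orderedCols */,
//		typs,
//	)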
// partiallyOrderedDistinct implements the DISTINCT operation using a
// combination of chunkerOperator and unorderedDistinct. Its only job is to
// check whether the input has been fully processed and, if not, to move on to
// the next chunk (where a "chunk" is all tuples that are equal on the ordered
// columns).
type partiallyOrderedDistinct struct {
	input    *chunkerOperator
	distinct resettableOperator
}

var _ colexecbase.Operator = &partiallyOrderedDistinct{}

func (p *partiallyOrderedDistinct) ChildCount(bool) int {
	return 1
}

func (p *partiallyOrderedDistinct) Child(nth int, _ bool) execinfra.OpNode {
	if nth == 0 {
		return p.input
	}
	colexecerror.InternalError(fmt.Sprintf("invalid index %d", nth))
	// This code is unreachable, but the compiler cannot infer that.
	return nil
}

func (p *partiallyOrderedDistinct) Init() {
	p.distinct.Init()
}

func (p *partiallyOrderedDistinct) Next(ctx context.Context) coldata.Batch {
	for {
		batch := p.distinct.Next(ctx)
		if batch.Length() == 0 {
			if p.input.done() {
				// We're done, so return a zero-length batch.
				return coldata.ZeroBatch
			}
			// p.distinct will reset p.input.
			p.distinct.reset(ctx)
		} else {
			return batch
		}
	}
}

func newChunkerOperator(
	allocator *colmem.Allocator, input *chunker, inputTypes []*types.T,
) *chunkerOperator {
	return &chunkerOperator{
		input:         input,
		inputTypes:    inputTypes,
		windowedBatch: allocator.NewMemBatchNoCols(inputTypes, coldata.BatchSize()),
	}
}

// chunkerOperator is an adapter from chunker to the Operator interface. It
// outputs all tuples from a single chunk followed by zero-length batches until
// it is reset.
// It will have returned all tuples from all of the chunks only when it returns
// a zero-length batch *and* its done() method returns true (i.e. a zero-length
// batch indicates the end of a chunk, but only done() returning true indicates
// that the input has been fully processed).
type chunkerOperator struct {
	input      *chunker
	inputTypes []*types.T
	// haveChunksToEmit indicates whether we have spooled input and there are
	// still more chunks to emit.
	haveChunksToEmit bool
	// numTuplesInChunks stores the number of tuples that are currently spooled
	// by input.
	numTuplesInChunks int
	// currentChunkFinished indicates whether we have emitted all tuples from the
	// current chunk and should be returning a zero-length batch.
	currentChunkFinished bool
	// newChunksCol, when non-nil, stores the boundaries of chunks. Every true
	// value indicates that a new chunk begins at the corresponding index. If
	// newChunksCol is nil, all spooled tuples belong to the same chunk.
	newChunksCol []bool
	// outputTupleStartIdx indicates the index of the first tuple to be included
	// in the output batch.
	outputTupleStartIdx int
	// windowedBatch is the output batch of chunkerOperator. For performance
	// reasons, the spooled tuples are not copied into it; instead, we use a
	// "window" approach.
	windowedBatch coldata.Batch
}

var _ resettableOperator = &chunkerOperator{}
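// A consumer drives a chunkerOperator chunk by chunk, much like
// partiallyOrderedDistinct.Next above drives its distinct/chunker pair. A
// minimal sketch (assuming c is an initialized *chunkerOperator and ctx is a
// context.Context):
//
//	for {
//		batch := c.Next(ctx)
//		if batch.Length() == 0 {
//			if c.done() {
//				break // the input has been fully processed
//			}
//			c.reset(ctx) // advance to the next chunk
//			continue
//		}
//		// All tuples in batch belong to a single chunk, i.e. they are
//		// equal on the ordered columns.
//	}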
func (c *chunkerOperator) ChildCount(bool) int {
	return 1
}

func (c *chunkerOperator) Child(nth int, _ bool) execinfra.OpNode {
	if nth == 0 {
		return c.input
	}
	colexecerror.InternalError(fmt.Sprintf("invalid index %d", nth))
	// This code is unreachable, but the compiler cannot infer that.
	return nil
}

func (c *chunkerOperator) Init() {
	c.input.init()
}

func (c *chunkerOperator) Next(ctx context.Context) coldata.Batch {
	if c.currentChunkFinished {
		return coldata.ZeroBatch
	}
	if !c.haveChunksToEmit {
		// We don't have any chunks to emit, so we need to spool the input.
		c.input.spool(ctx)
		c.haveChunksToEmit = true
		c.numTuplesInChunks = c.input.getNumTuples()
		c.newChunksCol = c.input.getPartitionsCol()
	}
	outputTupleEndIdx := c.numTuplesInChunks
	if c.outputTupleStartIdx == outputTupleEndIdx {
		// The current chunk has been fully output.
		c.currentChunkFinished = true
		return coldata.ZeroBatch
	}
	if c.newChunksCol == nil {
		// When newChunksCol is nil, all tuples that are returned via getValues
		// are equal on the ordered columns, so we simply emit the next "window"
		// of those tuples.
		if outputTupleEndIdx-c.outputTupleStartIdx > coldata.BatchSize() {
			outputTupleEndIdx = c.outputTupleStartIdx + coldata.BatchSize()
		}
	} else {
		// newChunksCol is non-nil, so there are multiple chunks within the
		// current tuples. We will emit a single chunk as a separate batch and
		// then proceed to emitting zero-length batches until we're reset.
		outputTupleEndIdx = c.outputTupleStartIdx + 1
		for outputTupleEndIdx < c.numTuplesInChunks && !c.newChunksCol[outputTupleEndIdx] {
			outputTupleEndIdx++
		}
		c.currentChunkFinished = true
	}
	for i := range c.inputTypes {
		window := c.input.getValues(i).Window(c.outputTupleStartIdx, outputTupleEndIdx)
		c.windowedBatch.ReplaceCol(window, i)
	}
	c.windowedBatch.SetSelection(false)
	c.windowedBatch.SetLength(outputTupleEndIdx - c.outputTupleStartIdx)
	c.outputTupleStartIdx = outputTupleEndIdx
	return c.windowedBatch
}

func (c *chunkerOperator) done() bool {
	return c.input.done()
}

func (c *chunkerOperator) reset(_ context.Context) {
	c.currentChunkFinished = false
	if c.newChunksCol != nil {
		if c.outputTupleStartIdx == c.numTuplesInChunks {
			// We have processed all chunks among the current tuples, so we will
			// need to get new chunks.
			c.haveChunksToEmit = false
		}
	} else {
		// We have processed all current tuples (that comprised a single chunk),
		// so we will need to get new chunks.
		c.haveChunksToEmit = false
	}
	if !c.haveChunksToEmit {
		c.input.emptyBuffer()
		c.outputTupleStartIdx = 0
	}
}
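// To make the interplay between Next and reset concrete, consider a
// hypothetical spool of five tuples with
// newChunksCol = [true, false, false, true, false]:
//
//	Next  -> emits the window [0, 3)  (the first chunk)
//	Next  -> coldata.ZeroBatch        (currentChunkFinished has been set)
//	reset -> outputTupleStartIdx (3) < numTuplesInChunks (5), so the
//	         buffered tuples are kept
//	Next  -> emits the window [3, 5)  (the second chunk)
//	Next  -> coldata.ZeroBatch
//	reset -> outputTupleStartIdx == numTuplesInChunks, so the buffer is
//	         emptied and the next Next call spools new input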