github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/colexec/sort.go (about) 1 // Copyright 2018 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package colexec 12 13 import ( 14 "context" 15 "fmt" 16 17 "github.com/cockroachdb/cockroach/pkg/col/coldata" 18 "github.com/cockroachdb/cockroach/pkg/sql/colexecbase" 19 "github.com/cockroachdb/cockroach/pkg/sql/colexecbase/colexecerror" 20 "github.com/cockroachdb/cockroach/pkg/sql/colmem" 21 "github.com/cockroachdb/cockroach/pkg/sql/execinfra" 22 "github.com/cockroachdb/cockroach/pkg/sql/execinfrapb" 23 "github.com/cockroachdb/cockroach/pkg/sql/types" 24 "github.com/cockroachdb/errors" 25 ) 26 27 // NewSorter returns a new sort operator, which sorts its input on the columns 28 // given in orderingCols. The inputTypes must correspond 1-1 with the columns 29 // in the input operator. 30 func NewSorter( 31 allocator *colmem.Allocator, 32 input colexecbase.Operator, 33 inputTypes []*types.T, 34 orderingCols []execinfrapb.Ordering_Column, 35 ) (colexecbase.Operator, error) { 36 return newSorter(allocator, newAllSpooler(allocator, input, inputTypes), inputTypes, orderingCols) 37 } 38 39 func newSorter( 40 allocator *colmem.Allocator, 41 input spooler, 42 inputTypes []*types.T, 43 orderingCols []execinfrapb.Ordering_Column, 44 ) (resettableOperator, error) { 45 partitioners := make([]partitioner, len(orderingCols)-1) 46 47 var err error 48 for i, ord := range orderingCols { 49 if !isSorterSupported(inputTypes[ord.ColIdx], ord.Direction) { 50 return nil, errors.Errorf("sorter for type: %s and direction: %s not supported", inputTypes[ord.ColIdx], ord.Direction) 51 } 52 if i < len(orderingCols)-1 { 53 partitioners[i], err = newPartitioner(inputTypes[ord.ColIdx]) 54 if err != nil { 55 return nil, err 56 } 57 } 58 } 59 60 return &sortOp{ 61 allocator: allocator, 62 input: input, 63 inputTypes: inputTypes, 64 sorters: make([]colSorter, len(orderingCols)), 65 partitioners: partitioners, 66 orderingCols: orderingCols, 67 state: sortSpooling, 68 }, nil 69 } 70 71 // spooler is a column vector operator that spools the data from its input. 72 type spooler interface { 73 execinfra.OpNode 74 75 // init initializes this spooler and will be called once at the setup time. 76 init() 77 // spool performs the actual spooling. 78 spool(context.Context) 79 // getValues returns ith Vec of the already spooled data. 80 getValues(i int) coldata.Vec 81 // getNumTuples returns the number of spooled tuples. 82 getNumTuples() int 83 // getPartitionsCol returns a partitions column vector in which every true 84 // value indicates a start of a different partition (i.e. "chunk") within 85 // spooled tuples. It should return nil if all the tuples belong to the same 86 // partition. 87 getPartitionsCol() []bool 88 // getWindowedBatch returns a batch that is a "window" into all Vecs of the 89 // already spooled data, with tuples in range [startIdx, endIdx). This batch 90 // is not allowed to be modified and is only safe to use until the next call 91 // to this method. 92 // TODO(yuzefovich): one idea we might want to implement at some point is 93 // adding a wrapper on top of a coldata.Batch that is coldata.ImmutableBatch 94 // that returns coldata.ImmutableVecs to enforce immutability. 95 getWindowedBatch(startIdx, endIdx int) coldata.Batch 96 } 97 98 // allSpooler is the spooler that spools all tuples from the input. It is used 99 // by the general sorter over the whole input. 100 type allSpooler struct { 101 OneInputNode 102 NonExplainable 103 104 allocator *colmem.Allocator 105 // inputTypes contains the types of all of the columns from the input. 106 inputTypes []*types.T 107 // bufferedTuples stores all the values from the input after spooling. Each 108 // Vec in this batch is the entire column from the input. 109 bufferedTuples *appendOnlyBufferedBatch 110 // spooled indicates whether spool() has already been called. 111 spooled bool 112 windowedBatch coldata.Batch 113 } 114 115 var _ spooler = &allSpooler{} 116 var _ resetter = &allSpooler{} 117 118 func newAllSpooler( 119 allocator *colmem.Allocator, input colexecbase.Operator, inputTypes []*types.T, 120 ) spooler { 121 return &allSpooler{ 122 OneInputNode: NewOneInputNode(input), 123 allocator: allocator, 124 inputTypes: inputTypes, 125 } 126 } 127 128 func (p *allSpooler) init() { 129 p.input.Init() 130 p.bufferedTuples = newAppendOnlyBufferedBatch( 131 p.allocator, p.inputTypes, 0, /* initialSize */ 132 ) 133 p.windowedBatch = p.allocator.NewMemBatchWithSize(p.inputTypes, 0 /* size */) 134 } 135 136 func (p *allSpooler) spool(ctx context.Context) { 137 if p.spooled { 138 colexecerror.InternalError("spool() is called for the second time") 139 } 140 p.spooled = true 141 for batch := p.input.Next(ctx); batch.Length() != 0; batch = p.input.Next(ctx) { 142 p.allocator.PerformOperation(p.bufferedTuples.ColVecs(), func() { 143 p.bufferedTuples.append(batch, 0 /* startIdx */, batch.Length()) 144 }) 145 } 146 } 147 148 func (p *allSpooler) getValues(i int) coldata.Vec { 149 if !p.spooled { 150 colexecerror.InternalError("getValues() is called before spool()") 151 } 152 return p.bufferedTuples.ColVec(i) 153 } 154 155 func (p *allSpooler) getNumTuples() int { 156 return p.bufferedTuples.Length() 157 } 158 159 func (p *allSpooler) getPartitionsCol() []bool { 160 if !p.spooled { 161 colexecerror.InternalError("getPartitionsCol() is called before spool()") 162 } 163 return nil 164 } 165 166 func (p *allSpooler) getWindowedBatch(startIdx, endIdx int) coldata.Batch { 167 // We don't need to worry about selection vectors here because if these were 168 // present on the original input batches, they have been removed when we were 169 // buffering up tuples. 170 for i := range p.inputTypes { 171 window := p.bufferedTuples.ColVec(i).Window(startIdx, endIdx) 172 p.windowedBatch.ReplaceCol(window, i) 173 } 174 p.windowedBatch.SetSelection(false) 175 p.windowedBatch.SetLength(endIdx - startIdx) 176 return p.windowedBatch 177 } 178 179 func (p *allSpooler) reset(ctx context.Context) { 180 if r, ok := p.input.(resetter); ok { 181 r.reset(ctx) 182 } 183 p.spooled = false 184 p.bufferedTuples.SetLength(0) 185 p.bufferedTuples.ResetInternalBatch() 186 } 187 188 type sortOp struct { 189 allocator *colmem.Allocator 190 input spooler 191 192 // inputTypes contains the types of all of the columns from input. 193 inputTypes []*types.T 194 // orderingCols is the ordered list of column orderings that the sorter should 195 // sort on. 196 orderingCols []execinfrapb.Ordering_Column 197 // sorters contains one colSorter per sort column. The instantiation of 198 // sorters occurs within the sort method rather than during construction 199 // of the sortOp so that we can correctly choose a sorter based on 200 // whether the input has nulls or not. 201 sorters []colSorter 202 // partitioners contains one partitioner per sort column except for the last, 203 // which doesn't need to be partitioned. 204 partitioners []partitioner 205 206 // order maintains the order of tuples in the batch, after sorting. The value 207 // at index i in order is the ordinal value of the tuple in the input that 208 // belongs at index i. For example, if the input column to sort was 209 // [c,b,a,d], the order vector after sorting would be [2,1,0,3]. 210 order []int 211 // emitted is the number of tuples emitted so far. 212 emitted int 213 // state is the current state of the sort. 214 state sortState 215 216 output coldata.Batch 217 218 exported int 219 } 220 221 var _ bufferingInMemoryOperator = &sortOp{} 222 var _ resetter = &sortOp{} 223 224 // colSorter is a single-column sorter, specialized on a particular type. 225 type colSorter interface { 226 // init prepares this sorter, given a particular Vec and an order vector, 227 // which must be the same size as the input Vec and will be permuted with 228 // the same swaps as the column. 229 init(col coldata.Vec, order []int) 230 // sort globally sorts this sorter's column. 231 sort(ctx context.Context) 232 // sortPartitions sorts this sorter's column once for every partition in the 233 // partition slice. 234 sortPartitions(ctx context.Context, partitions []int) 235 } 236 237 func (p *sortOp) Init() { 238 p.input.init() 239 } 240 241 // sortState represents the state of the sort operator. 242 type sortState int 243 244 const ( 245 // sortSpooling is the initial state of the operator, where it spools its 246 // input. 247 sortSpooling sortState = iota 248 // sortSorting is the second state of the operator, where it actually sorts 249 // all the spooled data. 250 sortSorting 251 // sortEmitting is the third state of the operator, indicating that each call 252 // to Next will return another batch of the sorted data. 253 sortEmitting 254 ) 255 256 func (p *sortOp) Next(ctx context.Context) coldata.Batch { 257 switch p.state { 258 case sortSpooling: 259 p.input.spool(ctx) 260 p.state = sortSorting 261 fallthrough 262 case sortSorting: 263 p.sort(ctx) 264 p.state = sortEmitting 265 fallthrough 266 case sortEmitting: 267 newEmitted := p.emitted + coldata.BatchSize() 268 if newEmitted > p.input.getNumTuples() { 269 newEmitted = p.input.getNumTuples() 270 } 271 if newEmitted == p.emitted { 272 return coldata.ZeroBatch 273 } 274 275 p.resetOutput() 276 for j := 0; j < len(p.inputTypes); j++ { 277 // At this point, we have already fully sorted the input. It is ok to do 278 // this Copy outside of the allocator - the work has been done, but 279 // theoretically it is possible to hit the limit here (mainly with 280 // variable-sized types like Bytes). Nonetheless, for performance reasons 281 // it would be sad to fallback to disk at this point. 282 p.output.ColVec(j).Copy( 283 coldata.CopySliceArgs{ 284 SliceArgs: coldata.SliceArgs{ 285 Sel: p.order, 286 Src: p.input.getValues(j), 287 SrcStartIdx: p.emitted, 288 SrcEndIdx: newEmitted, 289 }, 290 }, 291 ) 292 } 293 p.output.SetLength(newEmitted - p.emitted) 294 p.emitted = newEmitted 295 return p.output 296 } 297 colexecerror.InternalError(fmt.Sprintf("invalid sort state %v", p.state)) 298 // This code is unreachable, but the compiler cannot infer that. 299 return nil 300 } 301 302 // sort sorts the spooled tuples, so it must be called after spool() has been 303 // performed. 304 func (p *sortOp) sort(ctx context.Context) { 305 spooledTuples := p.input.getNumTuples() 306 if spooledTuples == 0 { 307 // There is nothing to sort. 308 return 309 } 310 // Allocate p.order and p.workingSpace if it hasn't been allocated yet or the 311 // underlying memory is insufficient. 312 if p.order == nil || cap(p.order) < spooledTuples { 313 p.order = make([]int, spooledTuples) 314 } 315 p.order = p.order[:spooledTuples] 316 317 // Initialize the order vector to the ordinal positions within the input set. 318 for i := 0; i < len(p.order); i++ { 319 p.order[i] = i 320 } 321 322 for i := range p.orderingCols { 323 inputVec := p.input.getValues(int(p.orderingCols[i].ColIdx)) 324 p.sorters[i] = newSingleSorter(p.inputTypes[p.orderingCols[i].ColIdx], p.orderingCols[i].Direction, inputVec.MaybeHasNulls()) 325 p.sorters[i].init(inputVec, p.order) 326 } 327 328 // Now, sort each column in turn. 329 sorters := p.sorters 330 partitionsCol := p.input.getPartitionsCol() 331 omitNextPartitioning := false 332 offset := 0 333 if partitionsCol == nil { 334 // All spooled tuples belong to the same partition, so the first column 335 // doesn't need special treatment - we just globally sort it. 336 p.sorters[0].sort(ctx) 337 if len(p.sorters) == 1 { 338 // We're done sorting. Transition to emitting. 339 return 340 } 341 sorters = sorters[1:] 342 partitionsCol = make([]bool, spooledTuples) 343 } else { 344 // There are at least two partitions already, so the first column needs the 345 // same special treatment as all others. The general sequence is as 346 // follows: global sort -> partition -> sort partitions -> partition -> 347 // -> sort partitions -> partition -> sort partitions -> ..., but in this 348 // case, global sort doesn't make sense and partitioning has already been 349 // done, so we want to skip the first partitioning step and sort partitions 350 // right away. Also, in order to account for not performed global sort, we 351 // introduce an offset of 1 for partitioners. 352 omitNextPartitioning = true 353 offset = 1 354 } 355 356 // The rest of the columns need p sorts, one per partition in the previous 357 // column. For example, in a two column sort: 358 // 359 // 1 b 360 // 2 b 361 // 1 a 362 // 2 a 363 // 364 // We'll first sort the first column: 365 // 366 // 1 b 367 // 1 a 368 // 2 b 369 // 2 a 370 // 371 // Then, for each group in the sorted, first column, we sort the second column: 372 // 373 // 1 a 374 // 1 b 375 // 2 a 376 // 2 b 377 378 partitions := make([]int, 0, 16) 379 for i, sorter := range sorters { 380 if !omitNextPartitioning { 381 // We partition the previous column by running an ordered distinct operation 382 // on it, ORing the results together with each subsequent column. This 383 // produces a distinct vector (a boolean vector that has true in each 384 // position that is different from the last position). 385 p.partitioners[i-offset].partitionWithOrder(p.input.getValues(int(p.orderingCols[i-offset].ColIdx)), p.order, 386 partitionsCol, spooledTuples) 387 } else { 388 omitNextPartitioning = false 389 } 390 // Convert the distinct vector into a selection vector - a vector of indices 391 // that were true in the distinct vector. 392 partitions = boolVecToSel64(partitionsCol, partitions[:0]) 393 // For each partition (set of tuples that are identical in all of the sort 394 // columns we've seen so far), sort based on the new column. 395 sorter.sortPartitions(ctx, partitions) 396 } 397 } 398 399 func (p *sortOp) resetOutput() { 400 if p.output == nil { 401 p.output = p.allocator.NewMemBatch(p.inputTypes) 402 } else { 403 p.output.ResetInternalBatch() 404 } 405 } 406 407 func (p *sortOp) reset(ctx context.Context) { 408 if r, ok := p.input.(resetter); ok { 409 r.reset(ctx) 410 } 411 p.emitted = 0 412 p.exported = 0 413 p.state = sortSpooling 414 } 415 416 func (p *sortOp) ChildCount(verbose bool) int { 417 return 1 418 } 419 420 func (p *sortOp) Child(nth int, verbose bool) execinfra.OpNode { 421 if nth == 0 { 422 return p.input 423 } 424 colexecerror.InternalError(fmt.Sprintf("invalid index %d", nth)) 425 // This code is unreachable, but the compiler cannot infer that. 426 return nil 427 } 428 429 func (p *sortOp) ExportBuffered(colexecbase.Operator) coldata.Batch { 430 if p.exported == p.input.getNumTuples() { 431 return coldata.ZeroBatch 432 } 433 newExported := p.exported + coldata.BatchSize() 434 if newExported > p.input.getNumTuples() { 435 newExported = p.input.getNumTuples() 436 } 437 b := p.input.getWindowedBatch(p.exported, newExported) 438 p.exported = newExported 439 return b 440 }