github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/colexec/sort_chunks.go

// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package colexec

import (
	"context"
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/col/coldata"
	"github.com/cockroachdb/cockroach/pkg/sql/colexecbase"
	"github.com/cockroachdb/cockroach/pkg/sql/colexecbase/colexecerror"
	"github.com/cockroachdb/cockroach/pkg/sql/colmem"
	"github.com/cockroachdb/cockroach/pkg/sql/execinfra"
	"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
	"github.com/cockroachdb/cockroach/pkg/sql/types"
)

// NewSortChunks returns a new sort chunks operator, which sorts its input on
// the columns given in orderingCols. The inputTypes must correspond 1-1 with
// the columns in the input operator. The input tuples must already be sorted
// on the first matchLen columns.
func NewSortChunks(
	allocator *colmem.Allocator,
	input colexecbase.Operator,
	inputTypes []*types.T,
	orderingCols []execinfrapb.Ordering_Column,
	matchLen int,
) (colexecbase.Operator, error) {
	if matchLen < 1 || matchLen == len(orderingCols) {
		colexecerror.InternalError(fmt.Sprintf(
			"sort chunks should only be used when the input is "+
				"already ordered on at least one column but not fully ordered; "+
				"num ordering cols = %d, matchLen = %d", len(orderingCols), matchLen))
	}
	alreadySortedCols := make([]uint32, matchLen)
	for i := range alreadySortedCols {
		alreadySortedCols[i] = orderingCols[i].ColIdx
	}
	chunker, err := newChunker(allocator, input, inputTypes, alreadySortedCols)
	if err != nil {
		return nil, err
	}
	sorter, err := newSorter(allocator, chunker, inputTypes, orderingCols[matchLen:])
	if err != nil {
		return nil, err
	}
	return &sortChunksOp{allocator: allocator, input: chunker, sorter: sorter}, nil
}
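// For illustration, a minimal sketch of wiring up the operator for an input
// that is already ordered on column 0 and must additionally be ordered on
// column 1 (so matchLen = 1). The allocator and input operator are assumed to
// be supplied by the caller; exampleNewSortChunks itself is a hypothetical
// helper, not part of the package.
func exampleNewSortChunks(
	allocator *colmem.Allocator, input colexecbase.Operator,
) (colexecbase.Operator, error) {
	inputTypes := []*types.T{types.Int, types.Int}
	orderingCols := []execinfrapb.Ordering_Column{
		{ColIdx: 0, Direction: execinfrapb.Ordering_Column_ASC},
		{ColIdx: 1, Direction: execinfrapb.Ordering_Column_ASC},
	}
	// The input is already sorted on orderingCols[:1], so matchLen is 1.
	return NewSortChunks(allocator, input, inputTypes, orderingCols, 1 /* matchLen */)
}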
type sortChunksOp struct {
	allocator *colmem.Allocator
	input     *chunker
	sorter    resettableOperator

	exportedFromBuffer int
	exportedFromBatch  int
	windowedBatch      coldata.Batch
}

var _ colexecbase.Operator = &sortChunksOp{}
var _ bufferingInMemoryOperator = &sortChunksOp{}

func (c *sortChunksOp) ChildCount(verbose bool) int {
	return 1
}

func (c *sortChunksOp) Child(nth int, verbose bool) execinfra.OpNode {
	if nth == 0 {
		return c.input
	}
	colexecerror.InternalError(fmt.Sprintf("invalid index %d", nth))
	// This code is unreachable, but the compiler cannot infer that.
	return nil
}

func (c *sortChunksOp) Init() {
	c.input.init()
	c.sorter.Init()
	// TODO(yuzefovich): switch to calling this method on allocator. This will
	// require plumbing unlimited allocator to work correctly in tests with
	// memory limit of 1.
	c.windowedBatch = coldata.NewMemBatchNoCols(c.input.inputTypes, coldata.BatchSize())
}

func (c *sortChunksOp) Next(ctx context.Context) coldata.Batch {
	for {
		batch := c.sorter.Next(ctx)
		if batch.Length() == 0 {
			if c.input.done() {
				// We're done, so return a zero-length batch.
				return batch
			}
			// We're not yet done - we need to process another chunk, so we
			// empty the chunker's buffer and reset the sorter. Note that we
			// do not want to do a full reset of the chunker because we're in
			// the middle of processing the input to sortChunksOp.
			c.input.emptyBuffer()
			c.sorter.reset(ctx)
		} else {
			return batch
		}
	}
}

func (c *sortChunksOp) ExportBuffered(colexecbase.Operator) coldata.Batch {
	// First, we check whether the chunker has buffered up any tuples, and if
	// so, whether we have exported them all.
	if c.input.bufferedTuples.Length() > 0 {
		if c.exportedFromBuffer < c.input.bufferedTuples.Length() {
			newExportedFromBuffer := c.exportedFromBuffer + coldata.BatchSize()
			if newExportedFromBuffer > c.input.bufferedTuples.Length() {
				newExportedFromBuffer = c.input.bufferedTuples.Length()
			}
			for i := range c.input.inputTypes {
				window := c.input.bufferedTuples.ColVec(i).Window(c.exportedFromBuffer, newExportedFromBuffer)
				c.windowedBatch.ReplaceCol(window, i)
			}
			c.windowedBatch.SetLength(newExportedFromBuffer - c.exportedFromBuffer)
			c.exportedFromBuffer = newExportedFromBuffer
			return c.windowedBatch
		}
	}
	// Next, we check whether there are any unexported tuples in the last read
	// batch.
	// firstTupleIdx indicates the index of the first tuple in the last read
	// batch that hasn't been "processed" and should be the first to be
	// exported.
	firstTupleIdx := c.input.exportState.numProcessedTuplesFromBatch
	if c.input.batch != nil && firstTupleIdx+c.exportedFromBatch < c.input.batch.Length() {
		makeWindowIntoBatch(c.windowedBatch, c.input.batch, firstTupleIdx, c.input.inputTypes)
		c.exportedFromBatch = c.windowedBatch.Length()
		return c.windowedBatch
	}
	return coldata.ZeroBatch
}
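// The clamping above bounds each exported window to at most
// coldata.BatchSize() tuples of the buffered data. The same arithmetic as a
// standalone illustration (exportWindowBounds is a hypothetical helper, not
// used by the operator):
func exportWindowBounds(exported, buffered int) (start, end int) {
	end = exported + coldata.BatchSize()
	if end > buffered {
		end = buffered
	}
	return exported, end
}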
// chunkerState represents the state of the chunker spooler.
type chunkerState int

const (
	// chunkerReading is the state of the chunker spooler in which it reads a
	// batch from its input and partitions the batch into chunks. Depending on
	// the current state of the chunker's buffer and the number of chunks in
	// the batch, the chunker might stay in the chunkerReading state or switch
	// to either of the emitting states.
	chunkerReading chunkerState = iota
	// chunkerEmittingFromBuffer is the state of the chunker spooler in which
	// it prepares to "emit" tuples that have been buffered. All the tuples
	// belong to the same chunk ("emit" is in quotes because the chunker does
	// not emit batches as usual - instead, it implements the spooler
	// interface, and the batches should be accessed through those methods).
	// The chunker transitions to the chunkerEmittingFromBatch state and
	// indicates that the tuples need to be read from the buffer.
	chunkerEmittingFromBuffer
	// chunkerEmittingFromBatch is the state of the chunker spooler in which
	// it prepares to "emit" all chunks that are fully contained within the
	// last read batch (i.e. all chunks except for the last chunk, which might
	// include tuples from the next batch). The last chunk within the batch is
	// buffered, the chunker transitions to the chunkerReading state, and it
	// indicates that the tuples need to be read from s.batch.
	chunkerEmittingFromBatch
)

// chunkerReadingState indicates where the spooler needs to read tuples from
// for emitting.
type chunkerReadingState int

const (
	// chunkerReadFromBuffer indicates that the tuples need to be read from
	// the buffer.
	chunkerReadFromBuffer chunkerReadingState = iota
	// chunkerReadFromBatch indicates that the tuples need to be read from the
	// last read batch directly. Only tuples that are fully contained within
	// the last read batch are "emitted".
	chunkerReadFromBatch
	// chunkerDone indicates that the input has been fully consumed and all
	// tuples have already been emitted.
	chunkerDone
)

// chunker is a spooler that produces chunks from its input when the tuples
// are already ordered on the first matchLen columns. The chunks are not
// emitted in batches as usual when Next()'ed; instead, they should be
// accessed via getValues().
//
// Note 1: the chunker assumes that its input produces batches with no
// selection vector, so it always puts a deselector on top of its input. It
// does the coalescing itself, so it does not use an extra coalescer.
// Note 2: the chunker intentionally does not implement the resetter interface
// (if it did, the sorter would reset it, but we don't want that since we're
// likely in the middle of processing the input). Instead, sortChunksOp will
// empty the buffer when appropriate.
type chunker struct {
	OneInputNode
	NonExplainable

	allocator *colmem.Allocator
	// inputTypes contains the types of all of the columns from the input.
	inputTypes []*types.T
	// inputDone indicates whether the input has been fully consumed.
	inputDone bool
	// alreadySortedCols indicates the columns on which the input is already
	// ordered.
	alreadySortedCols []uint32

	// batch is the last read batch from the input.
	batch coldata.Batch
	// partitioners contains one partitioner for each of the first matchLen
	// already ordered columns.
	partitioners []partitioner
	// partitionCol is a bool slice into which the partitioners' output is
	// ORed.
	partitionCol []bool

	// chunks contains the indices of the first tuples within different chunks
	// found in the last read batch. Note: the first chunk might be a part of
	// the chunk that is currently being buffered, and similarly the last
	// chunk might include tuples from the batches yet to be read.
	chunks []int
	// chunksProcessedIdx indicates which chunk within s.chunks should be
	// processed next.
	chunksProcessedIdx int
	// chunksStartIdx indicates the index of the chunk within s.chunks that is
	// the first one to be emitted directly from s.batch when reading from the
	// batch.
	chunksStartIdx int

	// bufferedTuples is a buffer to store tuples when a chunk is bigger than
	// coldata.BatchSize() or when the chunk is the last one in the last read
	// batch (we don't know yet where the end of such a chunk is).
	bufferedTuples *appendOnlyBufferedBatch

	readFrom chunkerReadingState
	state    chunkerState

	exportState struct {
		// numProcessedTuplesFromBatch indicates how many tuples from the
		// current batch have been "processed" for ExportBuffered purposes
		// (here, "processed" means either sorted and emitted or buffered up
		// into bufferedTuples). This information is needed by sortChunksOp to
		// be able to spill to disk in case of OOM.
		numProcessedTuplesFromBatch int
	}
}

var _ spooler = &chunker{}
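// To make the relationship between partitionCol and chunks concrete: a true
// value at index i of partitionCol marks the start of a new chunk at tuple i.
// A minimal sketch of that conversion (chunkStartIndices is a hypothetical,
// illustrative stand-in for the boolVecToSel64 call in prepareNextChunks
// below; it is not used by the operator):
func chunkStartIndices(partitionCol []bool, length int) []int {
	chunks := make([]int, 0, 4)
	for i := 0; i < length; i++ {
		if partitionCol[i] {
			chunks = append(chunks, i)
		}
	}
	return chunks
}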
func newChunker(
	allocator *colmem.Allocator,
	input colexecbase.Operator,
	inputTypes []*types.T,
	alreadySortedCols []uint32,
) (*chunker, error) {
	var err error
	partitioners := make([]partitioner, len(alreadySortedCols))
	for i, col := range alreadySortedCols {
		partitioners[i], err = newPartitioner(inputTypes[col])
		if err != nil {
			return nil, err
		}
	}
	deselector := NewDeselectorOp(allocator, input, inputTypes)
	return &chunker{
		OneInputNode:      NewOneInputNode(deselector),
		allocator:         allocator,
		inputTypes:        inputTypes,
		alreadySortedCols: alreadySortedCols,
		partitioners:      partitioners,
		state:             chunkerReading,
	}, nil
}

func (s *chunker) init() {
	s.input.Init()
	s.bufferedTuples = newAppendOnlyBufferedBatch(
		s.allocator, s.inputTypes, 0 /* initialSize */,
	)
	s.partitionCol = make([]bool, coldata.BatchSize())
	s.chunks = make([]int, 0, 16)
}

// done indicates whether the chunker has fully consumed its input.
func (s *chunker) done() bool {
	return s.readFrom == chunkerDone
}
// prepareNextChunks prepares the chunks for the chunker spooler.
//
// Note: it does not return the batches directly; instead, the chunker
// remembers where the next chunks to be emitted are actually stored. In order
// to access the chunks, getValues() must be used.
func (s *chunker) prepareNextChunks(ctx context.Context) chunkerReadingState {
	for {
		switch s.state {
		case chunkerReading:
			s.batch = s.input.Next(ctx)
			s.exportState.numProcessedTuplesFromBatch = 0
			if s.batch.Length() == 0 {
				s.inputDone = true
				if s.bufferedTuples.Length() > 0 {
					s.state = chunkerEmittingFromBuffer
				} else {
					s.state = chunkerEmittingFromBatch
				}
				continue
			}
			if s.batch.Selection() != nil {
				// We assume that the input has been deselected, so the batch
				// should never have a selection vector set.
				colexecerror.InternalError("unexpected: batch with non-nil selection vector")
			}

			// First, run the partitioners on our pre-sorted columns to
			// determine the boundaries of the chunks (stored in s.chunks) to
			// sort further.
			copy(s.partitionCol, zeroBoolColumn)
			for i, orderedCol := range s.alreadySortedCols {
				s.partitioners[i].partition(s.batch.ColVec(int(orderedCol)), s.partitionCol,
					s.batch.Length())
			}
			s.chunks = boolVecToSel64(s.partitionCol, s.chunks[:0])

			if s.bufferedTuples.Length() == 0 {
				// There are no buffered tuples, so a new chunk starts in the
				// current batch.
				if len(s.chunks) > 1 {
					// There is at least one chunk that is fully contained
					// within s.batch, so we proceed to emitting it.
					s.state = chunkerEmittingFromBatch
					continue
				}
				// All tuples in s.batch belong to the same chunk. Tuples from
				// the next batch might also belong to this chunk, so we
				// buffer the full s.batch.
				s.buffer(0 /* start */, s.batch.Length())
				s.state = chunkerReading
				continue
			} else {
				// There are some buffered tuples, so we need to check whether
				// the first tuple of s.batch belongs to the chunk that is
				// being buffered.
				differ := false
				i := 0
				for !differ && i < len(s.alreadySortedCols) {
					differ = valuesDiffer(
						s.bufferedTuples.ColVec(int(s.alreadySortedCols[i])),
						0, /* aValueIdx */
						s.batch.ColVec(int(s.alreadySortedCols[i])),
						0, /* bValueIdx */
					)
					i++
				}
				if differ {
					// The buffered tuples comprise a full chunk, so we
					// proceed to emitting it.
					s.state = chunkerEmittingFromBuffer
					continue
				}

				// The first tuple of s.batch belongs to the chunk that is
				// being buffered.
				if len(s.chunks) == 1 {
					// All tuples in s.batch belong to the same chunk that is
					// being buffered. Tuples from the next batch might also
					// belong to this chunk, so we buffer the full s.batch.
					s.buffer(0 /* start */, s.batch.Length())
					s.state = chunkerReading
					continue
				}
				// The first s.chunks[1] tuples belong to the same chunk that
				// is being buffered, so we buffer them and proceed to
				// emitting all buffered tuples.
				s.buffer(0 /* start */, s.chunks[1])
				s.chunksProcessedIdx = 1
				s.state = chunkerEmittingFromBuffer
				continue
			}
		case chunkerEmittingFromBuffer:
			s.state = chunkerEmittingFromBatch
			return chunkerReadFromBuffer
		case chunkerEmittingFromBatch:
			if s.chunksProcessedIdx < len(s.chunks)-1 {
				// There is at least one chunk that is fully contained within
				// s.batch. We don't know yet whether the tuples from the next
				// batch belong to the last chunk of the current batch, so we
				// will buffer those and can only emit the chunks "internal"
				// to s.batch. Additionally, if s.chunksProcessedIdx == 1,
				// then the first chunk was already combined with the buffered
				// tuples and emitted.
				s.chunksStartIdx = s.chunksProcessedIdx
				s.chunksProcessedIdx = len(s.chunks) - 1
				return chunkerReadFromBatch
			} else if s.chunksProcessedIdx == len(s.chunks)-1 {
				// Other tuples might belong to this chunk, so we buffer it.
				s.buffer(s.chunks[s.chunksProcessedIdx], s.batch.Length())
				// All tuples in s.batch have been processed, so we reset
				// s.chunks and the corresponding variables.
				s.chunks = s.chunks[:0]
				s.chunksProcessedIdx = 0
				s.state = chunkerReading
			} else {
				// All tuples in s.batch have been emitted.
				if s.inputDone {
					return chunkerDone
				}
				colexecerror.InternalError("unexpected: chunkerEmittingFromBatch state " +
					"when s.chunks is fully processed and input is not done")
			}
		default:
			colexecerror.InternalError(fmt.Sprintf("invalid chunker spooler state %v", s.state))
		}
	}
}
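// The prefix comparison in prepareNextChunks checks whether two tuples agree
// on every already-sorted column, i.e. whether they belong to the same chunk.
// Pulled out as a standalone sketch (sameChunkPrefix is a hypothetical
// helper, not part of the package):
func (s *chunker) sameChunkPrefix(a coldata.Batch, aIdx int, b coldata.Batch, bIdx int) bool {
	for _, col := range s.alreadySortedCols {
		// valuesDiffer reports whether the two values at the given positions
		// of the two vectors differ.
		if valuesDiffer(a.ColVec(int(col)), aIdx, b.ColVec(int(col)), bIdx) {
			return false
		}
	}
	return true
}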
// buffer appends all tuples in the range [start, end) from s.batch to the
// already buffered tuples.
func (s *chunker) buffer(start int, end int) {
	if start == end {
		return
	}
	s.allocator.PerformOperation(s.bufferedTuples.ColVecs(), func() {
		s.exportState.numProcessedTuplesFromBatch = end
		s.bufferedTuples.append(s.batch, start, end)
	})
}

func (s *chunker) spool(ctx context.Context) {
	s.readFrom = s.prepareNextChunks(ctx)
}

func (s *chunker) getValues(i int) coldata.Vec {
	switch s.readFrom {
	case chunkerReadFromBuffer:
		return s.bufferedTuples.ColVec(i).Window(0 /* start */, s.bufferedTuples.Length())
	case chunkerReadFromBatch:
		return s.batch.ColVec(i).Window(s.chunks[s.chunksStartIdx], s.chunks[len(s.chunks)-1])
	default:
		colexecerror.InternalError(fmt.Sprintf("unexpected chunkerReadingState in getValues: %v", s.readFrom))
		// This code is unreachable, but the compiler cannot infer that.
		return nil
	}
}

func (s *chunker) getNumTuples() int {
	switch s.readFrom {
	case chunkerReadFromBuffer:
		return s.bufferedTuples.Length()
	case chunkerReadFromBatch:
		return s.chunks[len(s.chunks)-1] - s.chunks[s.chunksStartIdx]
	case chunkerDone:
		return 0
	default:
		colexecerror.InternalError(fmt.Sprintf("unexpected chunkerReadingState in getNumTuples: %v", s.readFrom))
		// This code is unreachable, but the compiler cannot infer that.
		return 0
	}
}

func (s *chunker) getPartitionsCol() []bool {
	switch s.readFrom {
	case chunkerReadFromBuffer:
		// There is a single chunk in the buffer, so, per the spooler's
		// contract, we return nil.
		return nil
	case chunkerReadFromBatch:
		if s.chunksStartIdx+1 == len(s.chunks)-1 {
			// There is a single chunk that is fully contained within s.batch,
			// so, per the spooler's contract, we return nil.
			return nil
		}
		copy(s.partitionCol, zeroBoolColumn)
		for i := s.chunksStartIdx; i < len(s.chunks)-1; i++ {
			// getValues returns a slice starting from
			// s.chunks[s.chunksStartIdx], so we need to account for that by
			// shifting the indices accordingly.
			s.partitionCol[s.chunks[i]-s.chunks[s.chunksStartIdx]] = true
		}
		return s.partitionCol
	case chunkerDone:
		return nil
	default:
		colexecerror.InternalError(fmt.Sprintf("unexpected chunkerReadingState in getPartitionsCol: %v", s.readFrom))
		// This code is unreachable, but the compiler cannot infer that.
		return nil
	}
}

func (s *chunker) getWindowedBatch(startIdx, endIdx int) coldata.Batch {
	colexecerror.InternalError("getWindowedBatch is not implemented on the chunker spooler")
	// This code is unreachable, but the compiler cannot infer that.
	return nil
}

func (s *chunker) emptyBuffer() {
	s.bufferedTuples.SetLength(0)
	s.bufferedTuples.ResetInternalBatch()
}
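// To summarize the spooler contract implemented above: each spool() call
// prepares one set of chunks, done() reports when the input is exhausted, and
// the consumer empties the buffer once it has consumed a buffered chunk (as
// sortChunksOp does in Next between sorter resets). A minimal sketch of
// driving the chunker directly, under the assumption that the caller owns it
// (drainChunker is a hypothetical helper, not part of the package):
func drainChunker(ctx context.Context, s *chunker) int {
	total := 0
	for {
		s.spool(ctx)
		if s.done() {
			return total
		}
		// getNumTuples reports how many tuples the current emission exposes;
		// getValues(col) would return the corresponding window per column.
		total += s.getNumTuples()
		// Discard the consumed chunk before spooling again, mirroring
		// sortChunksOp.Next.
		s.emptyBuffer()
	}
}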