github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/colexec/external_sort.go

// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package colexec

import (
	"context"
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/col/coldata"
	"github.com/cockroachdb/cockroach/pkg/sql/colcontainer"
	"github.com/cockroachdb/cockroach/pkg/sql/colexecbase"
	"github.com/cockroachdb/cockroach/pkg/sql/colexecbase/colexecerror"
	"github.com/cockroachdb/cockroach/pkg/sql/colmem"
	"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
	"github.com/cockroachdb/cockroach/pkg/sql/types"
	"github.com/cockroachdb/cockroach/pkg/util/mon"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
	"github.com/cockroachdb/errors"
	"github.com/marusama/semaphore"
)

// externalSorterState indicates the current state of the external sorter.
type externalSorterState int

const (
	// externalSorterNewPartition indicates that the next batch we read should
	// start a new partition. A zero-length batch in this state indicates that
	// the input to the external sorter has been fully consumed and we should
	// proceed to merging the partitions.
	externalSorterNewPartition externalSorterState = iota
	// externalSorterSpillPartition indicates that the next batch we read should
	// be added to the last partition so far. A zero-length batch in this state
	// indicates that the end of the partition has been reached and we should
	// transition to starting a new partition. If maxNumberPartitions is reached
	// in this state, the sorter will transition to externalSorterRepeatedMerging
	// to reduce the number of partitions.
	externalSorterSpillPartition
	// externalSorterRepeatedMerging indicates that we need to merge
	// maxNumberPartitions into one and spill that new partition to disk. When
	// finished, the sorter will transition to externalSorterNewPartition.
	externalSorterRepeatedMerging
	// externalSorterFinalMerging indicates that we have fully consumed the input
	// and can merge all of the partitions in one step. We then transition to
	// externalSorterEmitting state.
	externalSorterFinalMerging
	// externalSorterEmitting indicates that we are ready to emit output. A
	// zero-length batch in this state indicates that we have emitted all tuples
	// and should transition to externalSorterFinished state.
	externalSorterEmitting
	// externalSorterFinished indicates that all tuples from all partitions have
	// been emitted and from now on only a zero-length batch will be emitted by
	// the external sorter. This state is also responsible for closing the
	// partitions.
	externalSorterFinished
)

// In order to make progress when merging we have to merge at least two
// partitions into a new third one.
const externalSorterMinPartitions = 3

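// The constants above form a small state machine that is driven by Next below.
// The following sketch is purely illustrative (it is not referenced by the
// production code, and the scenario is hypothetical): with
// maxNumberPartitions = 3, an input that fills two partitions before being
// exhausted would walk through the states roughly in this order.
func exampleExternalSorterStateFlow() []externalSorterState {
	return []externalSorterState{
		externalSorterNewPartition,    // the first batch starts partition 1
		externalSorterSpillPartition,  // subsequent batches are spilled into partition 1
		externalSorterNewPartition,    // memory limit reached: start partition 2
		externalSorterSpillPartition,  // spill into partition 2
		externalSorterRepeatedMerging, // maxNumberPartitions-1 partitions exist: merge them into one
		externalSorterNewPartition,    // a zero-length batch here means the input is exhausted
		externalSorterFinalMerging,    // merge the remaining partitions in one step
		externalSorterEmitting,        // emit the sorted output
		externalSorterFinished,        // all tuples emitted; close the partitions
	}
}
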
// externalSorter is an Operator that performs an external merge sort. It works
// in two stages:
// 1. it will use a combination of an input partitioner and an in-memory sorter
//    to divide up all batches from the input into partitions, sort each
//    partition in memory, and write the sorted partitions to disk;
// 2. it will use an OrderedSynchronizer to merge the partitions.
//
// The (simplified) diagram of the components involved is as follows:
//
//                        input
//                          |
//                          ↓
//                  input partitioner
//                          |
//                          ↓
//                   in-memory sorter
//                          |
//                          ↓
//     ------------------------------------------
//    |             external sorter              |
//    |             ---------------              |
//    |                                          |
//    | partition1   partition2  ...  partitionN |
//    |     |            |               |       |
//    |     ↓            ↓               ↓       |
//    |       merger (ordered synchronizer)      |
//     ------------------------------------------
//                          |
//                          ↓
//                        output
//
// There are a couple of implicit upstream links in the setup:
// - the input partitioner checks the allocator used by the in-memory sorter to
//   see whether a new partition must be started;
// - the external sorter resets the in-memory sorter (which, in turn, resets the
//   input partitioner) once the full partition has been spilled to disk.
//
// What is hidden in the diagram is the fact that at some point we might need
// to merge several partitions into a new one that we spill to disk in order to
// reduce the number of "active" partitions. We have to limit the number of
// "active" partitions because each partition uses some amount of RAM for its
// buffer; the limit is determined by the maxNumberPartitions variable.
type externalSorter struct {
	OneInputNode
	NonExplainable
	closerHelper

	// mu is used to protect against concurrent IdempotentClose and Next calls,
	// which are currently allowed.
	// TODO(asubiotto): Explore calling IdempotentClose from the same goroutine as
	// Next, which will simplify this model.
	mu syncutil.Mutex

	unlimitedAllocator *colmem.Allocator
	state              externalSorterState
	inputTypes         []*types.T
	ordering           execinfrapb.Ordering
	inMemSorter        resettableOperator
	inMemSorterInput   *inputPartitioningOperator
	partitioner        colcontainer.PartitionedQueue
	partitionerCreator func() colcontainer.PartitionedQueue
	// numPartitions is the current number of partitions.
	numPartitions int
	// firstPartitionIdx is the index of the first partition to merge next.
	firstPartitionIdx   int
	maxNumberPartitions int

	// fdState is used to acquire file descriptors up front.
	fdState struct {
		fdSemaphore semaphore.Semaphore
		acquiredFDs int
	}

	emitter colexecbase.Operator

	testingKnobs struct {
		// delegateFDAcquisitions, if true, means that a test wants to force the
		// PartitionedDiskQueues to track the number of file descriptors the
		// external sorter will open/close. This disables the default behavior of
		// acquiring all file descriptors up front in Next.
		delegateFDAcquisitions bool
	}
}

var _ resettableOperator = &externalSorter{}
var _ closableOperator = &externalSorter{}

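// Before the full diagram above is realized with vectorized operators, it can
// help to see the two stages at a scalar level. The function below is a hedged
// illustration only (a hypothetical helper, not used by the production code):
// stage 1 cuts the input into limited-size partitions and sorts each one in
// memory, and stage 2 hands the sorted partitions to a merger (sketched as
// exampleMergeSortedPartitions next to createMergerForPartitions below).
func exampleTwoStageExternalSort(input []int, maxPartitionLen int) []int {
	// Stage 1: "partition" the input by a size limit and sort each partition,
	// standing in for the input partitioner + in-memory sorter pair.
	var partitions [][]int
	for len(input) > 0 {
		n := maxPartitionLen
		if n > len(input) {
			n = len(input)
		}
		partition := append([]int(nil), input[:n]...)
		// Insertion sort keeps the sketch free of extra imports; it stands in
		// for the in-memory sorter.
		for i := 1; i < len(partition); i++ {
			for j := i; j > 0 && partition[j] < partition[j-1]; j-- {
				partition[j], partition[j-1] = partition[j-1], partition[j]
			}
		}
		partitions = append(partitions, partition)
		input = input[n:]
	}
	// Stage 2: merge the sorted partitions.
	return exampleMergeSortedPartitions(partitions)
}
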
// newExternalSorter returns a disk-backed general sort operator.
// - ctx is the same context that standaloneMemAccount was created with.
// - unlimitedAllocator must have been created with a memory account derived
//   from an unlimited memory monitor. It will be used by several internal
//   components of the external sort, which is responsible for making sure that
//   those components stay within the memory limit.
// - standaloneMemAccount must be a memory account derived from an unlimited
//   memory monitor with a standalone budget. It will be used by
//   inputPartitioningOperator to "partition" the input according to the memory
//   limit. The budget *must* be standalone because we don't want to double
//   count the memory (the memory under the batches will be accounted for with
//   the unlimitedAllocator).
// - maxNumberPartitions (when non-zero) overrides the semi-dynamically
//   computed maximum number of partitions to have at once.
// - delegateFDAcquisitions specifies whether the external sorter should let
//   the partitioned disk queue acquire file descriptors instead of acquiring
//   them up front in Next. This should only be true in tests.
func newExternalSorter(
	ctx context.Context,
	unlimitedAllocator *colmem.Allocator,
	standaloneMemAccount *mon.BoundAccount,
	input colexecbase.Operator,
	inputTypes []*types.T,
	ordering execinfrapb.Ordering,
	memoryLimit int64,
	maxNumberPartitions int,
	delegateFDAcquisitions bool,
	diskQueueCfg colcontainer.DiskQueueCfg,
	fdSemaphore semaphore.Semaphore,
	diskAcc *mon.BoundAccount,
) colexecbase.Operator {
	if diskQueueCfg.CacheMode != colcontainer.DiskQueueCacheModeReuseCache {
		colexecerror.InternalError(errors.Errorf("external sorter instantiated with suboptimal disk queue cache mode: %d", diskQueueCfg.CacheMode))
	}
	if diskQueueCfg.BufferSizeBytes > 0 && maxNumberPartitions == 0 {
		// With the default limit of 256 file descriptors, this results in 16
		// partitions. This is a hard maximum of partitions that will be used by
		// the external sorter.
		// TODO(asubiotto): this number should be tuned.
		maxNumberPartitions = fdSemaphore.GetLimit() / 16
	}
	if maxNumberPartitions < externalSorterMinPartitions {
		maxNumberPartitions = externalSorterMinPartitions
	}
	// Each disk queue will use up to BufferSizeBytes of RAM, so we reduce the
	// memoryLimit of the partitions to sort in memory by those cache sizes. To
	// be safe, we also estimate the size of the output batch and subtract that
	// as well.
	batchMemSize := colmem.EstimateBatchSizeBytes(inputTypes, coldata.BatchSize())
	// Reserve a certain amount of memory for the partition caches.
	memoryLimit -= int64((maxNumberPartitions * diskQueueCfg.BufferSizeBytes) + batchMemSize)
	if memoryLimit < 1 {
		// If the memory limit is 0, the input partitioning operator will return
		// a zero-length batch, so make it at least 1.
		memoryLimit = 1
	}
	inputPartitioner := newInputPartitioningOperator(input, standaloneMemAccount, memoryLimit)
	inMemSorter, err := newSorter(
		unlimitedAllocator, newAllSpooler(unlimitedAllocator, inputPartitioner, inputTypes),
		inputTypes, ordering.Columns,
	)
	if err != nil {
		colexecerror.InternalError(err)
	}
	partitionedDiskQueueSemaphore := fdSemaphore
	if !delegateFDAcquisitions {
		// To avoid deadlocks with other disk queues, we manually attempt to
		// acquire the maximum number of descriptors all at once in Next. Passing
		// in a nil semaphore indicates that the caller will do the acquiring.
		partitionedDiskQueueSemaphore = nil
	}
	es := &externalSorter{
		OneInputNode:       NewOneInputNode(inMemSorter),
		unlimitedAllocator: unlimitedAllocator,
		inMemSorter:        inMemSorter,
		inMemSorterInput:   inputPartitioner.(*inputPartitioningOperator),
		partitionerCreator: func() colcontainer.PartitionedQueue {
			return colcontainer.NewPartitionedDiskQueue(inputTypes, diskQueueCfg, partitionedDiskQueueSemaphore, colcontainer.PartitionerStrategyCloseOnNewPartition, diskAcc)
		},
		inputTypes:          inputTypes,
		ordering:            ordering,
		maxNumberPartitions: maxNumberPartitions,
	}
	es.fdState.fdSemaphore = fdSemaphore
	es.testingKnobs.delegateFDAcquisitions = delegateFDAcquisitions
	return es
}

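// The arithmetic below is a hedged, self-contained sketch of the memory budget
// adjustment performed in newExternalSorter above. It is not called by the
// production code, and every number in it is made up purely for illustration
// (a 64MiB budget, 16 partitions with 128KiB disk queue buffers, and a 1MiB
// estimated output batch).
func exampleExternalSortMemoryBudget() int64 {
	const (
		memoryLimit         = int64(64 << 20) // hypothetical budget for the in-memory portion
		maxNumberPartitions = 16              // hypothetical cap on active partitions
		bufferSizeBytes     = 128 << 10       // hypothetical disk queue buffer size
		batchMemSize        = int64(1 << 20)  // hypothetical estimated output batch size
	)
	// Reserve RAM for the partition caches and one output batch, mirroring the
	// adjustment above: 64MiB - 16*128KiB - 1MiB = 61MiB in this sketch.
	adjusted := memoryLimit - int64(maxNumberPartitions*bufferSizeBytes) - batchMemSize
	if adjusted < 1 {
		// Same clamping as above: the input partitioning operator needs a
		// positive limit in order to return at least one batch per partition.
		adjusted = 1
	}
	return adjusted
}
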
func (s *externalSorter) Init() {
	s.input.Init()
	s.state = externalSorterNewPartition
}

func (s *externalSorter) Next(ctx context.Context) coldata.Batch {
	s.mu.Lock()
	defer s.mu.Unlock()
	for {
		switch s.state {
		case externalSorterNewPartition:
			b := s.input.Next(ctx)
			if b.Length() == 0 {
				// The input has been fully exhausted, and it is always the case
				// that the number of partitions is less than the maximum number
				// since externalSorterSpillPartition will check and re-merge if
				// not. Proceed to the final merging state.
				s.state = externalSorterFinalMerging
				continue
			}
			newPartitionIdx := s.firstPartitionIdx + s.numPartitions
			if s.partitioner == nil {
				s.partitioner = s.partitionerCreator()
			}
			if err := s.partitioner.Enqueue(ctx, newPartitionIdx, b); err != nil {
				colexecerror.InternalError(err)
			}
			s.state = externalSorterSpillPartition
			continue
		case externalSorterSpillPartition:
			curPartitionIdx := s.firstPartitionIdx + s.numPartitions
			b := s.input.Next(ctx)
			if b.Length() == 0 {
				// The partition has been fully spilled, so we reset the in-memory
				// sorter (which will do the "shallow" reset of
				// inputPartitioningOperator).
				s.inMemSorterInput.interceptReset = true
				s.inMemSorter.reset(ctx)
				s.numPartitions++
				if s.numPartitions == s.maxNumberPartitions-1 {
					// We have reached the maximum number of active partitions
					// that we know we'll be able to merge without exceeding the
					// limit, so we need to merge all of them and spill the new
					// partition to disk before we can proceed with consuming the
					// input.
					s.state = externalSorterRepeatedMerging
					continue
				}
				s.state = externalSorterNewPartition
				continue
			}
			if !s.testingKnobs.delegateFDAcquisitions && s.fdState.fdSemaphore != nil && s.fdState.acquiredFDs == 0 {
				toAcquire := s.maxNumberPartitions
				if err := s.fdState.fdSemaphore.Acquire(ctx, toAcquire); err != nil {
					colexecerror.InternalError(err)
				}
				s.fdState.acquiredFDs = toAcquire
			}
			if err := s.partitioner.Enqueue(ctx, curPartitionIdx, b); err != nil {
				colexecerror.InternalError(err)
			}
			continue
		case externalSorterRepeatedMerging:
			// We will merge all partitions in the range [s.firstPartitionIdx,
			// s.firstPartitionIdx+s.numPartitions) and will spill all the
			// resulting batches into a new partition with the next available
			// index.
			//
			// The merger will use some amount of RAM, will register it with the
			// unlimited allocator, and will *not* release that memory from the
			// allocator, so we have to do it ourselves.
			before := s.unlimitedAllocator.Used()
			merger, err := s.createMergerForPartitions(s.firstPartitionIdx, s.numPartitions)
			if err != nil {
				colexecerror.InternalError(err)
			}
			merger.Init()
			newPartitionIdx := s.firstPartitionIdx + s.numPartitions
			for b := merger.Next(ctx); b.Length() > 0; b = merger.Next(ctx) {
				if err := s.partitioner.Enqueue(ctx, newPartitionIdx, b); err != nil {
					colexecerror.InternalError(err)
				}
			}
			after := s.unlimitedAllocator.Used()
			s.unlimitedAllocator.ReleaseMemory(after - before)
			// Reclaim disk space by closing the inactive read partitions. Since
			// the merger must have exhausted all of its inputs, these are exactly
			// the partitions that were just read from.
			if err := s.partitioner.CloseInactiveReadPartitions(ctx); err != nil {
				colexecerror.InternalError(err)
			}
			s.firstPartitionIdx += s.numPartitions
			s.numPartitions = 1
			s.state = externalSorterNewPartition
			continue
		case externalSorterFinalMerging:
			if s.numPartitions == 0 {
				s.state = externalSorterFinished
				continue
			} else if s.numPartitions == 1 {
				s.emitter = newPartitionerToOperator(
					s.unlimitedAllocator, s.inputTypes, s.partitioner, s.firstPartitionIdx,
				)
			} else {
				var err error
				s.emitter, err = s.createMergerForPartitions(s.firstPartitionIdx, s.numPartitions)
				if err != nil {
					colexecerror.InternalError(err)
				}
			}
			s.emitter.Init()
			s.state = externalSorterEmitting
			continue
		case externalSorterEmitting:
			b := s.emitter.Next(ctx)
			if b.Length() == 0 {
				s.state = externalSorterFinished
				continue
			}
			return b
		case externalSorterFinished:
			if err := s.internalCloseLocked(ctx); err != nil {
				colexecerror.InternalError(err)
			}
			return coldata.ZeroBatch
		default:
			colexecerror.InternalError(fmt.Sprintf("unexpected externalSorterState %d", s.state))
		}
	}
}

func (s *externalSorter) reset(ctx context.Context) {
	if r, ok := s.input.(resetter); ok {
		r.reset(ctx)
	}
	s.state = externalSorterNewPartition
	s.mu.Lock()
	defer s.mu.Unlock()
	if err := s.internalCloseLocked(ctx); err != nil {
		colexecerror.InternalError(err)
	}
	s.firstPartitionIdx = 0
	s.numPartitions = 0
}

func (s *externalSorter) internalCloseLocked(ctx context.Context) error {
	var lastErr error
	if s.partitioner != nil {
		lastErr = s.partitioner.Close(ctx)
		s.partitioner = nil
	}
	if err := s.inMemSorterInput.Close(ctx); err != nil {
		lastErr = err
	}
	if !s.testingKnobs.delegateFDAcquisitions && s.fdState.fdSemaphore != nil && s.fdState.acquiredFDs > 0 {
		s.fdState.fdSemaphore.Release(s.fdState.acquiredFDs)
		s.fdState.acquiredFDs = 0
	}
	return lastErr
}

func (s *externalSorter) IdempotentClose(ctx context.Context) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	if !s.close() {
		return nil
	}
	return s.internalCloseLocked(ctx)
}

// createMergerForPartitions creates an ordered synchronizer that will merge
// partitions in the [firstIdx, firstIdx+numPartitions) range.
func (s *externalSorter) createMergerForPartitions(
	firstIdx, numPartitions int,
) (colexecbase.Operator, error) {
	syncInputs := make([]colexecbase.Operator, numPartitions)
	for i := range syncInputs {
		syncInputs[i] = newPartitionerToOperator(
			s.unlimitedAllocator, s.inputTypes, s.partitioner, firstIdx+i,
		)
	}
	return NewOrderedSynchronizer(
		s.unlimitedAllocator,
		syncInputs,
		s.inputTypes,
		execinfrapb.ConvertToColumnOrdering(s.ordering),
	)
}

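// createMergerForPartitions above wires the spilled partitions into an
// OrderedSynchronizer. As a hedged, scalar-level sketch of the same idea (a
// hypothetical helper, not the vectorized implementation and not used by this
// package), the function below repeatedly picks the smallest head element
// among several already-sorted inputs, which is exactly the invariant the
// merger relies on: each partition was sorted in memory before being spilled.
func exampleMergeSortedPartitions(partitions [][]int) []int {
	var out []int
	// heads[i] is the index of the next unconsumed element of partitions[i].
	heads := make([]int, len(partitions))
	for {
		best := -1
		for i, p := range partitions {
			if heads[i] == len(p) {
				// This partition has been fully consumed.
				continue
			}
			if best == -1 || p[heads[i]] < partitions[best][heads[best]] {
				best = i
			}
		}
		if best == -1 {
			// Every partition is exhausted; the merge is complete.
			return out
		}
		out = append(out, partitions[best][heads[best]])
		heads[best]++
	}
}
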
func newInputPartitioningOperator(
	input colexecbase.Operator, standaloneMemAccount *mon.BoundAccount, memoryLimit int64,
) resettableOperator {
	return &inputPartitioningOperator{
		OneInputNode:         NewOneInputNode(input),
		standaloneMemAccount: standaloneMemAccount,
		memoryLimit:          memoryLimit,
	}
}

// inputPartitioningOperator is an operator that returns the batches from its
// input until the standalone allocator reaches the memory limit. From that
// point on, the operator returns a zero-length batch (until it is reset).
type inputPartitioningOperator struct {
	OneInputNode
	NonExplainable

	standaloneMemAccount *mon.BoundAccount
	memoryLimit          int64
	// interceptReset determines whether the reset method will be called on
	// the input to this operator when the latter is being reset. This field is
	// managed by externalSorter.
	// NOTE: this field itself is set to 'false' when inputPartitioningOperator
	// is being reset, regardless of the original value.
	//
	// The reason for having this knob is that we need two kinds of behaviors
	// when resetting the inputPartitioningOperator:
	// 1. ("shallow" reset) we need to clear the memory account because the
	// external sorter is moving on to spilling the data into a new partition.
	// However, we *cannot* propagate the reset further up because it might
	// delete the data that the external sorter has not yet spilled. This
	// behavior is needed in externalSorter when resetting the in-memory sorter
	// before spilling the next "chunk" of data into the new partition.
	// 2. ("deep" reset) we need to do the full reset of the whole chain of
	// operators. This behavior is needed when the whole external sorter is
	// being reset.
	interceptReset bool
}

var _ resettableOperator = &inputPartitioningOperator{}

func (o *inputPartitioningOperator) Init() {
	o.input.Init()
}

func (o *inputPartitioningOperator) Next(ctx context.Context) coldata.Batch {
	if o.standaloneMemAccount.Used() >= o.memoryLimit {
		return coldata.ZeroBatch
	}
	b := o.input.Next(ctx)
	if b.Length() == 0 {
		return b
	}
	// We cannot use Allocator.RetainBatch here because that method looks at the
	// capacities of the vectors. However, this operator is an input to sortOp
	// which will spool all the tuples and buffer them (by appending into the
	// buffered batch), so we need to account for memory proportionally to the
	// length of the batch. (Note: this is not exactly true for the Bytes type,
	// but it's ok if we have some deviation. These numbers matter only for
	// deciding when to start a new partition, and the memory will be actually
	// accounted for correctly.)
	batchMemSize := colmem.GetProportionalBatchMemSize(b, int64(b.Length()))
	if err := o.standaloneMemAccount.Grow(ctx, batchMemSize); err != nil {
		colexecerror.InternalError(err)
	}
	return b
}

func (o *inputPartitioningOperator) reset(ctx context.Context) {
	if !o.interceptReset {
		if r, ok := o.input.(resetter); ok {
			r.reset(ctx)
		}
	}
	o.interceptReset = false
	o.standaloneMemAccount.Shrink(ctx, o.standaloneMemAccount.Used())
}

func (o *inputPartitioningOperator) Close(ctx context.Context) error {
	o.standaloneMemAccount.Clear(ctx)
	return nil
}

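// A final hedged sketch (hypothetical type, not part of the production code)
// of the behavior implemented by inputPartitioningOperator: batches flow
// through while a running size estimate stays under the limit, a "zero-length"
// result then marks the partition boundary, and a reset clears the estimate so
// the next partition can start accumulating from zero.
type examplePartitioningCursor struct {
	used  int64 // running size estimate, analogous to standaloneMemAccount.Used()
	limit int64 // analogous to memoryLimit
}

// next reports whether a batch of the given estimated size may still be passed
// through; false plays the role of the zero-length batch above.
func (c *examplePartitioningCursor) next(batchSize int64) bool {
	if c.used >= c.limit {
		return false
	}
	c.used += batchSize
	return true
}

// reset mirrors the "shallow" reset: the estimate is cleared so that a new
// partition can be accumulated.
func (c *examplePartitioningCursor) reset() {
	c.used = 0
}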