github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/colexec/external_sort.go

     1  // Copyright 2019 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package colexec
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  
    17  	"github.com/cockroachdb/cockroach/pkg/col/coldata"
    18  	"github.com/cockroachdb/cockroach/pkg/sql/colcontainer"
    19  	"github.com/cockroachdb/cockroach/pkg/sql/colexecbase"
    20  	"github.com/cockroachdb/cockroach/pkg/sql/colexecbase/colexecerror"
    21  	"github.com/cockroachdb/cockroach/pkg/sql/colmem"
    22  	"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
    23  	"github.com/cockroachdb/cockroach/pkg/sql/types"
    24  	"github.com/cockroachdb/cockroach/pkg/util/mon"
    25  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    26  	"github.com/cockroachdb/errors"
    27  	"github.com/marusama/semaphore"
    28  )
    29  
    30  // externalSorterState indicates the current state of the external sorter.
    31  type externalSorterState int
    32  
    33  const (
    34  	// externalSorterNewPartition indicates that the next batch we read should
    35  	// start a new partition. A zero-length batch in this state indicates that
    36  	// the input to the external sorter has been fully consumed and we should
    37  	// proceed to merging the partitions.
    38  	externalSorterNewPartition externalSorterState = iota
    39  	// externalSorterSpillPartition indicates that the next batch we read should
    40  	// be added to the most recently started partition. A zero-length batch in
    41  	// this state indicates that the end of the partition has been reached and we
    42  	// should transition to starting a new partition. Once maxNumberPartitions-1
    43  	// partitions have accumulated in this state (one slot is reserved for the
    44  	// merge output), the sorter transitions to externalSorterRepeatedMerging.
    45  	externalSorterSpillPartition
    46  	// externalSorterRepeatedMerging indicates that we need to merge all of the
    47  	// currently active partitions into one and spill that new partition to disk.
    48  	// When finished, the sorter will transition to externalSorterNewPartition.
    49  	externalSorterRepeatedMerging
    50  	// externalSorterFinalMerging indicates that we have fully consumed the input
    51  	// and can merge all of the partitions in one step. We then transition to
    52  	// externalSorterEmitting state.
    53  	externalSorterFinalMerging
    54  	// externalSorterEmitting indicates that we are ready to emit output. A zero-
    55  	// length batch in this state indicates that we have emitted all tuples and
    56  	// should transition to externalSorterFinished state.
    57  	externalSorterEmitting
    58  	// externalSorterFinished indicates that all tuples from all partitions have
    59  	// been emitted and from now on only a zero-length batch will be emitted by
    60  	// the external sorter. This state is also responsible for closing the
    61  	// partitions.
    62  	externalSorterFinished
    63  )
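        
        // externalSorterStateName is an illustrative, hypothetical helper (a minimal
        // sketch, not required by the sorter itself): it maps each state above to its
        // name, which can be handy when tracing the transitions described in the
        // comments (NewPartition <-> SpillPartition, with detours through
        // RepeatedMerging, followed by FinalMerging, Emitting, and Finished).
        func externalSorterStateName(s externalSorterState) string {
        	switch s {
        	case externalSorterNewPartition:
        		return "externalSorterNewPartition"
        	case externalSorterSpillPartition:
        		return "externalSorterSpillPartition"
        	case externalSorterRepeatedMerging:
        		return "externalSorterRepeatedMerging"
        	case externalSorterFinalMerging:
        		return "externalSorterFinalMerging"
        	case externalSorterEmitting:
        		return "externalSorterEmitting"
        	case externalSorterFinished:
        		return "externalSorterFinished"
        	default:
        		return fmt.Sprintf("unknown externalSorterState %d", int(s))
        	}
        }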
    64  
    65  // In order to make progress when merging we have to merge at least two
    66  // partitions into a new third one.
    67  const externalSorterMinPartitions = 3
    68  
    69  // externalSorter is an Operator that performs external merge sort. It works in
    70  // two stages:
    71  // 1. it will use a combination of an input partitioner and in-memory sorter to
    72  // divide up all batches from the input into partitions, sort each partition in
    73  // memory, and write sorted partitions to disk
    74  // 2. it will use OrderedSynchronizer to merge the partitions.
    75  //
    76  // The (simplified) diagram of the components involved is as follows:
    77  //
    78  //                      input
    79  //                        |
    80  //                        ↓
    81  //                 input partitioner
    82  //                        |
    83  //                        ↓
    84  //                 in-memory sorter
    85  //                        |
    86  //                        ↓
    87  //    ------------------------------------------
    88  //   |             external sorter              |
    89  //   |             ---------------              |
    90  //   |                                          |
    91  //   | partition1     partition2 ... partitionN |
    92  //   |     |              |              |      |
    93  //   |     ↓              ↓              ↓      |
    94  //   |      merger (ordered synchronizer)       |
    95  //    ------------------------------------------
    96  //                        |
    97  //                        ↓
    98  //                      output
    99  //
   100  // There are a couple of implicit upstream links in the setup:
   101  // - input partitioner checks the allocator used by the in-memory sorter to see
   102  // whether a new partition must be started
   103  // - external sorter resets in-memory sorter (which, in turn, resets input
   104  // partitioner) once the full partition has been spilled to disk.
   105  //
   106  // What is hidden in the diagram is the fact that at some point we might need
   107  // to merge several partitions into a new one that we spill to disk in order
   108  // to reduce the number of "active" partitions. The number of "active"
   109  // partitions has to be limited because each partition uses some amount of RAM
   110  // for its buffer; that limit is given by the maxNumberPartitions field of
   111  // externalSorter.
   112  type externalSorter struct {
   113  	OneInputNode
   114  	NonExplainable
   115  	closerHelper
   116  
   117  	// mu is used to protect against concurrent IdempotentClose and Next calls,
   118  	// which are currently allowed.
   119  	// TODO(asubiotto): Explore calling IdempotentClose from the same goroutine as
   120  	//  Next, which will simplify this model.
   121  	mu syncutil.Mutex
   122  
   123  	unlimitedAllocator *colmem.Allocator
   124  	state              externalSorterState
   125  	inputTypes         []*types.T
   126  	ordering           execinfrapb.Ordering
   127  	inMemSorter        resettableOperator
   128  	inMemSorterInput   *inputPartitioningOperator
   129  	partitioner        colcontainer.PartitionedQueue
   130  	partitionerCreator func() colcontainer.PartitionedQueue
   131  	// numPartitions is the current number of partitions.
   132  	numPartitions int
   133  	// firstPartitionIdx is the index of the first partition to merge next.
   134  	firstPartitionIdx   int
   135  	maxNumberPartitions int
   136  
   137  	// fdState is used to acquire file descriptors up front.
   138  	fdState struct {
   139  		fdSemaphore semaphore.Semaphore
   140  		acquiredFDs int
   141  	}
   142  
   143  	emitter colexecbase.Operator
   144  
   145  	testingKnobs struct {
   146  		// delegateFDAcquisitions, if true, means that a test wants to force the
   147  		// PartitionedDiskQueues to track the number of file descriptors the
   148  		// external sorter will open/close. This disables the default behavior of
   149  		// acquiring all file descriptors up front in Next.
   150  		delegateFDAcquisitions bool
   151  	}
   152  }
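        
        // To make the partition bookkeeping concrete, here is an illustrative trace
        // with hypothetical numbers, assuming maxNumberPartitions = 3 (the minimum):
        //   - partitions 0 and 1 are spilled; numPartitions reaches
        //     maxNumberPartitions-1 = 2, so partitions [0, 2) are merged into a new
        //     partition 2, after which firstPartitionIdx = 2 and numPartitions = 1;
        //   - partition 3 is spilled next; partitions [2, 4) are then merged into
        //     partition 4, after which firstPartitionIdx = 4 and numPartitions = 1;
        //   - once the input is exhausted, the remaining partitions are merged in a
        //     single step and the result is emitted.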
   153  
   154  var _ resettableOperator = &externalSorter{}
   155  var _ closableOperator = &externalSorter{}
   156  
   157  // newExternalSorter returns a disk-backed general sort operator.
   158  // - ctx is the same context that standaloneMemAccount was created with.
   159  // - unlimitedAllocator must have been created with a memory account derived
   160  // from an unlimited memory monitor. It will be used by several internal
   161  // components of the external sort, which is responsible for making sure that
   162  // those components stay within the memory limit.
   163  // - standaloneMemAccount must be a memory account derived from an unlimited
   164  // memory monitor with a standalone budget. It will be used by
   165  // inputPartitioningOperator to "partition" the input according to the memory
   166  // limit. The budget *must* be standalone because we don't want to double
   167  // count the memory (the memory under the batches will be accounted for with
   168  // the unlimitedAllocator).
   169  // - maxNumberPartitions (when non-zero) overrides the semi-dynamically
   170  // computed maximum number of partitions to have at once.
   171  // - delegateFDAcquisitions specifies whether the external sorter should let
   172  // the partitioned disk queue acquire file descriptors instead of acquiring
   173  // them up front in Next. This should only be true in tests.
   174  func newExternalSorter(
   175  	ctx context.Context,
   176  	unlimitedAllocator *colmem.Allocator,
   177  	standaloneMemAccount *mon.BoundAccount,
   178  	input colexecbase.Operator,
   179  	inputTypes []*types.T,
   180  	ordering execinfrapb.Ordering,
   181  	memoryLimit int64,
   182  	maxNumberPartitions int,
   183  	delegateFDAcquisitions bool,
   184  	diskQueueCfg colcontainer.DiskQueueCfg,
   185  	fdSemaphore semaphore.Semaphore,
   186  	diskAcc *mon.BoundAccount,
   187  ) colexecbase.Operator {
   188  	if diskQueueCfg.CacheMode != colcontainer.DiskQueueCacheModeReuseCache {
   189  		colexecerror.InternalError(errors.Errorf("external sorter instantiated with suboptimal disk queue cache mode: %d", diskQueueCfg.CacheMode))
   190  	}
   191  	if diskQueueCfg.BufferSizeBytes > 0 && maxNumberPartitions == 0 {
   192  		// With the default limit of 256 file descriptors, this results in 16
   193  		// partitions. This is a hard maximum on the number of partitions that will
   194  		// be used by the external sorter.
   195  		// TODO(asubiotto): this number should be tuned.
   196  		maxNumberPartitions = fdSemaphore.GetLimit() / 16
   197  	}
   198  	if maxNumberPartitions < externalSorterMinPartitions {
   199  		maxNumberPartitions = externalSorterMinPartitions
   200  	}
   201  	// Each disk queue will use up to BufferSizeBytes of RAM, so we reduce the
   202  	// memory limit available for in-memory sorting by those buffer sizes. To be
   203  	// safe, we also estimate the size of the output batch and subtract that as
   204  	// well.
   205  	batchMemSize := colmem.EstimateBatchSizeBytes(inputTypes, coldata.BatchSize())
   206  	// Reserve a certain amount of memory for the partition caches.
   207  	memoryLimit -= int64((maxNumberPartitions * diskQueueCfg.BufferSizeBytes) + batchMemSize)
   208  	if memoryLimit < 1 {
   209  		// If the memory limit is 0, the input partitioning operator will return a
   210  		// zero-length batch, so make it at least 1.
   211  		memoryLimit = 1
   212  	}
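        	// For illustration only (the numbers here are hypothetical): with a 64 MiB
        	// memoryLimit, maxNumberPartitions = 16, BufferSizeBytes = 128 KiB, and an
        	// estimated batch size of 1 MiB, the in-memory sorting budget becomes
        	// 64 MiB - 16*128 KiB - 1 MiB = 61 MiB.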
   213  	inputPartitioner := newInputPartitioningOperator(input, standaloneMemAccount, memoryLimit)
   214  	inMemSorter, err := newSorter(
   215  		unlimitedAllocator, newAllSpooler(unlimitedAllocator, inputPartitioner, inputTypes),
   216  		inputTypes, ordering.Columns,
   217  	)
   218  	if err != nil {
   219  		colexecerror.InternalError(err)
   220  	}
   221  	partitionedDiskQueueSemaphore := fdSemaphore
   222  	if !delegateFDAcquisitions {
   223  		// To avoid deadlocks with other disk queues, we manually attempt to acquire
   224  		// the maximum number of descriptors all at once in Next. Passing in a nil
   225  		// semaphore indicates that the caller will do the acquiring.
   226  		partitionedDiskQueueSemaphore = nil
   227  	}
   228  	es := &externalSorter{
   229  		OneInputNode:       NewOneInputNode(inMemSorter),
   230  		unlimitedAllocator: unlimitedAllocator,
   231  		inMemSorter:        inMemSorter,
   232  		inMemSorterInput:   inputPartitioner.(*inputPartitioningOperator),
   233  		partitionerCreator: func() colcontainer.PartitionedQueue {
   234  			return colcontainer.NewPartitionedDiskQueue(inputTypes, diskQueueCfg, partitionedDiskQueueSemaphore, colcontainer.PartitionerStrategyCloseOnNewPartition, diskAcc)
   235  		},
   236  		inputTypes:          inputTypes,
   237  		ordering:            ordering,
   238  		maxNumberPartitions: maxNumberPartitions,
   239  	}
   240  	es.fdState.fdSemaphore = fdSemaphore
   241  	es.testingKnobs.delegateFDAcquisitions = delegateFDAcquisitions
   242  	return es
   243  }
   244  
   245  func (s *externalSorter) Init() {
   246  	s.input.Init()
   247  	s.state = externalSorterNewPartition
   248  }
   249  
   250  func (s *externalSorter) Next(ctx context.Context) coldata.Batch {
   251  	s.mu.Lock()
   252  	defer s.mu.Unlock()
   253  	for {
   254  		switch s.state {
   255  		case externalSorterNewPartition:
   256  			b := s.input.Next(ctx)
   257  			if b.Length() == 0 {
   258  				// The input has been fully exhausted, and it is always the case that
   259  				// the number of partitions is less than the maximum number since
   260  				// externalSorterSpillPartition will check and re-merge if not.
   261  				// Proceed to the final merging state.
   262  				s.state = externalSorterFinalMerging
   263  				continue
   264  			}
   265  			newPartitionIdx := s.firstPartitionIdx + s.numPartitions
   266  			if s.partitioner == nil {
   267  				s.partitioner = s.partitionerCreator()
   268  			}
   269  			if err := s.partitioner.Enqueue(ctx, newPartitionIdx, b); err != nil {
   270  				colexecerror.InternalError(err)
   271  			}
   272  			s.state = externalSorterSpillPartition
   273  			continue
   274  		case externalSorterSpillPartition:
   275  			curPartitionIdx := s.firstPartitionIdx + s.numPartitions
   276  			b := s.input.Next(ctx)
   277  			if b.Length() == 0 {
   278  				// The partition has been fully spilled, so we reset the in-memory
   279  				// sorter (which will do the "shallow" reset of
   280  				// inputPartitioningOperator).
   281  				s.inMemSorterInput.interceptReset = true
   282  				s.inMemSorter.reset(ctx)
   283  				s.numPartitions++
   284  				if s.numPartitions == s.maxNumberPartitions-1 {
   285  				// We have reached the maximum number of active partitions that we
   286  				// know we'll be able to merge without exceeding the limit, so we
   287  				// need to merge all of them and spill the new partition to disk
   288  				// before we can continue consuming the input.
   289  					s.state = externalSorterRepeatedMerging
   290  					continue
   291  				}
   292  				s.state = externalSorterNewPartition
   293  				continue
   294  			}
   295  			if !s.testingKnobs.delegateFDAcquisitions && s.fdState.fdSemaphore != nil && s.fdState.acquiredFDs == 0 {
   296  				toAcquire := s.maxNumberPartitions
   297  				if err := s.fdState.fdSemaphore.Acquire(ctx, toAcquire); err != nil {
   298  					colexecerror.InternalError(err)
   299  				}
   300  				s.fdState.acquiredFDs = toAcquire
   301  			}
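        			// Note that these descriptors are released in internalCloseLocked
        			// once the sorter is finished or closed.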
   302  			if err := s.partitioner.Enqueue(ctx, curPartitionIdx, b); err != nil {
   303  				colexecerror.InternalError(err)
   304  			}
   305  			continue
   306  		case externalSorterRepeatedMerging:
   307  			// We will merge all partitions in range [s.firstPartitionIdx,
   308  			// s.firstPartitionIdx+s.numPartitions) and will spill all the
   309  			// resulting batches into a new partition with the next available
   310  			// index.
   311  			//
   312  			// The merger will be using some amount of RAM, will register it
   313  			// with the unlimited allocator and will *not* release that memory
   314  			// from the allocator, so we have to do it ourselves.
   315  			before := s.unlimitedAllocator.Used()
   316  			merger, err := s.createMergerForPartitions(s.firstPartitionIdx, s.numPartitions)
   317  			if err != nil {
   318  				colexecerror.InternalError(err)
   319  			}
   320  			merger.Init()
   321  			newPartitionIdx := s.firstPartitionIdx + s.numPartitions
   322  			for b := merger.Next(ctx); b.Length() > 0; b = merger.Next(ctx) {
   323  				if err := s.partitioner.Enqueue(ctx, newPartitionIdx, b); err != nil {
   324  					colexecerror.InternalError(err)
   325  				}
   326  			}
   327  			after := s.unlimitedAllocator.Used()
   328  			s.unlimitedAllocator.ReleaseMemory(after - before)
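        			// The delta (after - before) is the memory that the merger registered
        			// with the unlimited allocator during this merge, so releasing it
        			// returns the allocator to its pre-merge accounting.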
   329  			// Reclaim disk space by closing the inactive read partitions. Since the
   330  			// merger must have exhausted all of its inputs, these are exactly the
   331  			// partitions that were just read from.
   332  			if err := s.partitioner.CloseInactiveReadPartitions(ctx); err != nil {
   333  				colexecerror.InternalError(err)
   334  			}
   335  			s.firstPartitionIdx += s.numPartitions
   336  			s.numPartitions = 1
   337  			s.state = externalSorterNewPartition
   338  			continue
   339  		case externalSorterFinalMerging:
   340  			if s.numPartitions == 0 {
   341  				s.state = externalSorterFinished
   342  				continue
   343  			} else if s.numPartitions == 1 {
   344  				s.emitter = newPartitionerToOperator(
   345  					s.unlimitedAllocator, s.inputTypes, s.partitioner, s.firstPartitionIdx,
   346  				)
   347  			} else {
   348  				var err error
   349  				s.emitter, err = s.createMergerForPartitions(s.firstPartitionIdx, s.numPartitions)
   350  				if err != nil {
   351  					colexecerror.InternalError(err)
   352  				}
   353  			}
   354  			s.emitter.Init()
   355  			s.state = externalSorterEmitting
   356  			continue
   357  		case externalSorterEmitting:
   358  			b := s.emitter.Next(ctx)
   359  			if b.Length() == 0 {
   360  				s.state = externalSorterFinished
   361  				continue
   362  			}
   363  			return b
   364  		case externalSorterFinished:
   365  			if err := s.internalCloseLocked(ctx); err != nil {
   366  				colexecerror.InternalError(err)
   367  			}
   368  			return coldata.ZeroBatch
   369  		default:
   370  			colexecerror.InternalError(fmt.Sprintf("unexpected externalSorterState %d", s.state))
   371  		}
   372  	}
   373  }
   374  
   375  func (s *externalSorter) reset(ctx context.Context) {
   376  	if r, ok := s.input.(resetter); ok {
   377  		r.reset(ctx)
   378  	}
   379  	s.state = externalSorterNewPartition
   380  	s.mu.Lock()
   381  	defer s.mu.Unlock()
   382  	if err := s.internalCloseLocked(ctx); err != nil {
   383  		colexecerror.InternalError(err)
   384  	}
   385  	s.firstPartitionIdx = 0
   386  	s.numPartitions = 0
   387  }
   388  
   389  func (s *externalSorter) internalCloseLocked(ctx context.Context) error {
   390  	var lastErr error
   391  	if s.partitioner != nil {
   392  		lastErr = s.partitioner.Close(ctx)
   393  		s.partitioner = nil
   394  	}
   395  	if err := s.inMemSorterInput.Close(ctx); err != nil {
   396  		lastErr = err
   397  	}
   398  	if !s.testingKnobs.delegateFDAcquisitions && s.fdState.fdSemaphore != nil && s.fdState.acquiredFDs > 0 {
   399  		s.fdState.fdSemaphore.Release(s.fdState.acquiredFDs)
   400  		s.fdState.acquiredFDs = 0
   401  	}
   402  	return lastErr
   403  }
   404  
   405  func (s *externalSorter) IdempotentClose(ctx context.Context) error {
   406  	s.mu.Lock()
   407  	defer s.mu.Unlock()
   408  	if !s.close() {
   409  		return nil
   410  	}
   411  	return s.internalCloseLocked(ctx)
   412  }
   413  
   414  // createMergerForPartitions creates an ordered synchronizer that will merge
   415  // partitions in [firstIdx, firstIdx+numPartitions) range.
   416  func (s *externalSorter) createMergerForPartitions(
   417  	firstIdx, numPartitions int,
   418  ) (colexecbase.Operator, error) {
   419  	syncInputs := make([]colexecbase.Operator, numPartitions)
   420  	for i := range syncInputs {
   421  		syncInputs[i] = newPartitionerToOperator(
   422  			s.unlimitedAllocator, s.inputTypes, s.partitioner, firstIdx+i,
   423  		)
   424  	}
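        	// Each synchronizer input reads back one sorted on-disk partition, so the
        	// ordered synchronizer below effectively performs a numPartitions-way merge
        	// according to s.ordering.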
   425  	return NewOrderedSynchronizer(
   426  		s.unlimitedAllocator,
   427  		syncInputs,
   428  		s.inputTypes,
   429  		execinfrapb.ConvertToColumnOrdering(s.ordering),
   430  	)
   431  }
   432  
   433  func newInputPartitioningOperator(
   434  	input colexecbase.Operator, standaloneMemAccount *mon.BoundAccount, memoryLimit int64,
   435  ) resettableOperator {
   436  	return &inputPartitioningOperator{
   437  		OneInputNode:         NewOneInputNode(input),
   438  		standaloneMemAccount: standaloneMemAccount,
   439  		memoryLimit:          memoryLimit,
   440  	}
   441  }
   442  
   443  // inputPartitioningOperator is an operator that returns the batches from its
   444  // input until the standalone memory account reaches the memory limit. From
   445  // that point, the operator returns a zero-length batch (until it is reset).
   446  type inputPartitioningOperator struct {
   447  	OneInputNode
   448  	NonExplainable
   449  
   450  	standaloneMemAccount *mon.BoundAccount
   451  	memoryLimit          int64
   452  	// interceptReset determines whether the reset method will be called on
   453  	// the input to this operator when the latter is being reset. This field is
   454  	// managed by externalSorter.
   455  	// NOTE: this field itself is set to 'false' when inputPartitioningOperator
   456  	// is being reset, regardless of the original value.
   457  	//
   458  	// The reason for having this knob is that we need two kinds of behaviors
   459  	// when resetting the inputPartitioningOperator:
   460  	// 1. ("shallow" reset) we need to clear the memory account because the
   461  	// external sorter is moving on to spilling the data into a new partition.
   462  	// However, we *cannot* propagate the reset further up because it might
   463  	// delete the data that the external sorter has not yet spilled. This
   464  	// behavior is needed when the external sorter resets the in-memory sorter
   465  	// before spilling the next "chunk" of data into a new partition.
   466  	// 2. ("deep" reset) we need to do the full reset of the whole chain of
   467  	// operators. This behavior is needed when the whole external sorter is
   468  	// being reset.
   469  	interceptReset bool
   470  }
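        
        // An illustrative summary of the two reset paths described above:
        //   - "shallow": externalSorter sets inMemSorterInput.interceptReset to true
        //     and resets the in-memory sorter, so reset below only clears the
        //     standalone memory account without touching o.input;
        //   - "deep": externalSorter.reset resets its whole input chain without
        //     setting the flag, so reset below also propagates to o.input.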
   471  
   472  var _ resettableOperator = &inputPartitioningOperator{}
   473  
   474  func (o *inputPartitioningOperator) Init() {
   475  	o.input.Init()
   476  }
   477  
   478  func (o *inputPartitioningOperator) Next(ctx context.Context) coldata.Batch {
   479  	if o.standaloneMemAccount.Used() >= o.memoryLimit {
   480  		return coldata.ZeroBatch
   481  	}
   482  	b := o.input.Next(ctx)
   483  	if b.Length() == 0 {
   484  		return b
   485  	}
   486  	// We cannot use Allocator.RetainBatch here because that method looks at the
   487  	// capacities of the vectors. However, this operator is an input to sortOp
   488  	// which will spool all the tuples and buffer them (by appending into the
   489  	// buffered batch), so we need to account for memory proportionally to the
   490  	// length of the batch. (Note: this is not exactly true for the Bytes type,
   491  	// but it's ok if we have some deviation. These numbers matter only for
   492  	// deciding when to start a new partition, and the memory will actually be
   493  	// accounted for correctly.)
   494  	batchMemSize := colmem.GetProportionalBatchMemSize(b, int64(b.Length()))
   495  	if err := o.standaloneMemAccount.Grow(ctx, batchMemSize); err != nil {
   496  		colexecerror.InternalError(err)
   497  	}
   498  	return b
   499  }
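        
        // As a rough, illustrative example of the proportional accounting above: a
        // batch whose vectors have capacity 1024 but which currently holds only 256
        // tuples would be charged roughly a quarter of its full memory footprint
        // (modulo the Bytes caveat noted above).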
   500  
   501  func (o *inputPartitioningOperator) reset(ctx context.Context) {
   502  	if !o.interceptReset {
   503  		if r, ok := o.input.(resetter); ok {
   504  			r.reset(ctx)
   505  		}
   506  	}
   507  	o.interceptReset = false
   508  	o.standaloneMemAccount.Shrink(ctx, o.standaloneMemAccount.Used())
   509  }
   510  
   511  func (o *inputPartitioningOperator) Close(ctx context.Context) error {
   512  	o.standaloneMemAccount.Clear(ctx)
   513  	return nil
   514  }