github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/colexec/spilling_queue.go (about)

     1  // Copyright 2020 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package colexec
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  
    17  	"github.com/cockroachdb/cockroach/pkg/col/coldata"
    18  	"github.com/cockroachdb/cockroach/pkg/sql/colcontainer"
    19  	"github.com/cockroachdb/cockroach/pkg/sql/colexecbase/colexecerror"
    20  	"github.com/cockroachdb/cockroach/pkg/sql/colmem"
    21  	"github.com/cockroachdb/cockroach/pkg/sql/types"
    22  	"github.com/cockroachdb/cockroach/pkg/util/log"
    23  	"github.com/cockroachdb/cockroach/pkg/util/mon"
    24  	"github.com/cockroachdb/errors"
    25  	"github.com/marusama/semaphore"
    26  )
    27  
// spillingQueue is a Queue that uses a fixed-size in-memory circular buffer
// and spills to disk if spillingQueue.items has no more slots available to hold
// a reference to an enqueued batch or the allocator reports that more memory
// than the caller-provided maxMemoryLimit is in use.
// When spilling to disk, a DiskQueue will be created. When spilling batches to
// disk, their memory will first be released using the allocator. When batches
// are read from disk back into memory, that memory will be reclaimed.
// NOTE: When a batch is returned, that batch's memory will still be tracked
// using the allocator. Since the memory in use is fixed, a previously returned
// batch may be overwritten by a batch read from disk. This new batch's memory
// footprint will replace the footprint of the previously returned batch. Since
// batches are unsafe for reuse, it is assumed that the previously returned
// batch is not kept around and thus its referenced memory will be GCed as soon
// as the batch is updated.
type spillingQueue struct {
	// unlimitedAllocator is consulted for current memory usage and is used to
	// release/retain the memory of batches moving to/from disk. maxMemoryLimit
	// is the caller-provided limit reduced by the disk queue's buffer size
	// (clamped at zero); enqueue spills once the allocator's usage exceeds it.
	unlimitedAllocator *colmem.Allocator
	maxMemoryLimit     int64

	// typs is the schema of the enqueued batches. items is the in-memory
	// circular buffer: curHeadIdx is the slot that the next in-memory dequeue
	// reads and curTailIdx is the slot that the next in-memory write fills.
	// numInMemoryItems counts the batches held in items and numOnDiskItems
	// counts the non-empty batches written to the disk queue. closed is set by
	// close once the disk queue has been closed successfully.
	typs             []*types.T
	items            []coldata.Batch
	curHeadIdx       int
	curTailIdx       int
	numInMemoryItems int
	numOnDiskItems   int
	closed           bool

	// diskQueue is nil until the first spill and is created from diskQueueCfg.
	// fdSemaphore, if non-nil, is used to acquire file descriptors before the
	// disk queue is created and to release them on close. dequeueScratch is the
	// batch that disk-resident batches are deserialized into.
	diskQueueCfg   colcontainer.DiskQueueCfg
	diskQueue      colcontainer.Queue
	fdSemaphore    semaphore.Semaphore
	dequeueScratch coldata.Batch

	// rewindable indicates that the queue was created by
	// newRewindableSpillingQueue; in that mode dequeue does not remove items
	// and instead tracks its progress in rewindableState.
	rewindable      bool
	rewindableState struct {
		// numItemsDequeued is the number of batches returned by dequeue since
		// the queue was created, last rewound, or last reset.
		numItemsDequeued int
	}

	// diskAcc is passed to the disk queue when it is created.
	diskAcc *mon.BoundAccount
}
    66  
    67  // newSpillingQueue creates a new spillingQueue. An unlimited allocator must be
    68  // passed in. The spillingQueue will use this allocator to check whether memory
    69  // usage exceeds the given memory limit and use disk if so.
    70  // If fdSemaphore is nil, no Acquire or Release calls will happen. The caller
    71  // may want to do this if requesting FDs up front.
    72  func newSpillingQueue(
    73  	unlimitedAllocator *colmem.Allocator,
    74  	typs []*types.T,
    75  	memoryLimit int64,
    76  	cfg colcontainer.DiskQueueCfg,
    77  	fdSemaphore semaphore.Semaphore,
    78  	batchSize int,
    79  	diskAcc *mon.BoundAccount,
    80  ) *spillingQueue {
    81  	// Reduce the memory limit by what the DiskQueue may need to buffer
    82  	// writes/reads.
    83  	memoryLimit -= int64(cfg.BufferSizeBytes)
    84  	if memoryLimit < 0 {
    85  		memoryLimit = 0
    86  	}
    87  	itemsLen := memoryLimit / int64(colmem.EstimateBatchSizeBytes(typs, batchSize))
    88  	if itemsLen == 0 {
    89  		// Make items at least of length 1. Even though batches will spill to disk
    90  		// directly (this can only happen with a very low memory limit), it's nice
    91  		// to have at least one item in order to be able to deserialize from disk
    92  		// into this slice.
    93  		itemsLen = 1
    94  	}
    95  	return &spillingQueue{
    96  		unlimitedAllocator: unlimitedAllocator,
    97  		maxMemoryLimit:     memoryLimit,
    98  		typs:               typs,
    99  		items:              make([]coldata.Batch, itemsLen),
   100  		diskQueueCfg:       cfg,
   101  		fdSemaphore:        fdSemaphore,
   102  		dequeueScratch:     unlimitedAllocator.NewMemBatchWithSize(typs, coldata.BatchSize()),
   103  		diskAcc:            diskAcc,
   104  	}
   105  }
   106  
   107  // newRewindableSpillingQueue creates a new spillingQueue that can be rewinded
   108  // in order to dequeue all enqueued batches all over again. An unlimited
   109  // allocator must be passed in. The queue will use this allocator to check
   110  // whether memory usage exceeds the given memory limit and use disk if so.
   111  func newRewindableSpillingQueue(
   112  	unlimitedAllocator *colmem.Allocator,
   113  	typs []*types.T,
   114  	memoryLimit int64,
   115  	cfg colcontainer.DiskQueueCfg,
   116  	fdSemaphore semaphore.Semaphore,
   117  	batchSize int,
   118  	diskAcc *mon.BoundAccount,
   119  ) *spillingQueue {
   120  	q := newSpillingQueue(unlimitedAllocator, typs, memoryLimit, cfg, fdSemaphore, batchSize, diskAcc)
   121  	q.rewindable = true
   122  	return q
   123  }
   124  
   125  func (q *spillingQueue) enqueue(ctx context.Context, batch coldata.Batch) error {
   126  	if batch.Length() == 0 {
   127  		if q.diskQueue != nil {
   128  			if err := q.diskQueue.Enqueue(ctx, batch); err != nil {
   129  				return err
   130  			}
   131  		}
   132  		return nil
   133  	}
   134  
   135  	if q.numOnDiskItems > 0 || q.unlimitedAllocator.Used() > q.maxMemoryLimit || q.numInMemoryItems == len(q.items) {
   136  		// In this case, there is not enough memory available to keep this batch in
   137  		// memory, or the in-memory circular buffer has no slots available (we do
   138  		// an initial estimate of how many batches would fit into the buffer, which
   139  		// might be wrong). The tail of the queue might also already be on disk, in
   140  		// which case that is where the batch must be enqueued to maintain order.
   141  		if err := q.maybeSpillToDisk(ctx); err != nil {
   142  			return err
   143  		}
   144  		q.unlimitedAllocator.ReleaseBatch(batch)
   145  		if err := q.diskQueue.Enqueue(ctx, batch); err != nil {
   146  			return err
   147  		}
   148  		q.numOnDiskItems++
   149  		return nil
   150  	}
   151  
   152  	q.items[q.curTailIdx] = batch
   153  	q.curTailIdx++
   154  	if q.curTailIdx == len(q.items) {
   155  		q.curTailIdx = 0
   156  	}
   157  	q.numInMemoryItems++
   158  	return nil
   159  }
   160  
   161  func (q *spillingQueue) dequeue(ctx context.Context) (coldata.Batch, error) {
   162  	if q.empty() {
   163  		return coldata.ZeroBatch, nil
   164  	}
   165  
   166  	if (q.rewindable && q.numInMemoryItems <= q.rewindableState.numItemsDequeued) ||
   167  		(!q.rewindable && q.numInMemoryItems == 0) {
   168  		// No more in-memory items. Fill the circular buffer as much as possible.
   169  		// Note that there must be at least one element on disk.
   170  		if !q.rewindable && q.curHeadIdx != q.curTailIdx {
   171  			colexecerror.InternalError(fmt.Sprintf("assertion failed in spillingQueue: curHeadIdx != curTailIdx, %d != %d", q.curHeadIdx, q.curTailIdx))
   172  		}
   173  		// NOTE: Only one item is dequeued from disk since a deserialized batch is
   174  		// only valid until the next call to Dequeue. In practice we could Dequeue
   175  		// up until a new file region is loaded (which will overwrite the memory of
   176  		// the previous batches), but Dequeue calls are already amortized, so this
   177  		// is acceptable.
   178  		// Release a batch to make space for a new batch from disk.
   179  		q.unlimitedAllocator.ReleaseBatch(q.dequeueScratch)
   180  		ok, err := q.diskQueue.Dequeue(ctx, q.dequeueScratch)
   181  		if err != nil {
   182  			return nil, err
   183  		}
   184  		if !ok {
   185  			// There was no batch to dequeue from disk. This should not really
   186  			// happen, as it should have been caught by the q.empty() check above.
   187  			colexecerror.InternalError("disk queue was not empty but failed to dequeue element in spillingQueue")
   188  		}
   189  		// Account for this batch's memory.
   190  		q.unlimitedAllocator.RetainBatch(q.dequeueScratch)
   191  		if q.rewindable {
   192  			q.rewindableState.numItemsDequeued++
   193  			return q.dequeueScratch, nil
   194  		}
   195  		q.numOnDiskItems--
   196  		q.numInMemoryItems++
   197  		q.items[q.curTailIdx] = q.dequeueScratch
   198  		q.curTailIdx++
   199  		if q.curTailIdx == len(q.items) {
   200  			q.curTailIdx = 0
   201  		}
   202  	}
   203  
   204  	res := q.items[q.curHeadIdx]
   205  	q.curHeadIdx++
   206  	if q.curHeadIdx == len(q.items) {
   207  		q.curHeadIdx = 0
   208  	}
   209  	if q.rewindable {
   210  		q.rewindableState.numItemsDequeued++
   211  	} else {
   212  		q.numInMemoryItems--
   213  	}
   214  	return res, nil
   215  }
   216  
   217  func (q *spillingQueue) numFDsOpenAtAnyGivenTime() int {
   218  	if q.diskQueueCfg.CacheMode != colcontainer.DiskQueueCacheModeDefault {
   219  		// The access pattern must be write-everything then read-everything so
   220  		// either a read FD or a write FD are open at any one point.
   221  		return 1
   222  	}
   223  	// Otherwise, both will be open.
   224  	return 2
   225  }
   226  
   227  func (q *spillingQueue) maybeSpillToDisk(ctx context.Context) error {
   228  	if q.diskQueue != nil {
   229  		return nil
   230  	}
   231  	var err error
   232  	// Acquire two file descriptors for the DiskQueue: one for the write file and
   233  	// one for the read file.
   234  	if q.fdSemaphore != nil {
   235  		if err = q.fdSemaphore.Acquire(ctx, q.numFDsOpenAtAnyGivenTime()); err != nil {
   236  			return err
   237  		}
   238  	}
   239  	log.VEvent(ctx, 1, "spilled to disk")
   240  	var diskQueue colcontainer.Queue
   241  	if q.rewindable {
   242  		diskQueue, err = colcontainer.NewRewindableDiskQueue(ctx, q.typs, q.diskQueueCfg, q.diskAcc)
   243  	} else {
   244  		diskQueue, err = colcontainer.NewDiskQueue(ctx, q.typs, q.diskQueueCfg, q.diskAcc)
   245  	}
   246  	if err != nil {
   247  		return err
   248  	}
   249  	// Only assign q.diskQueue if there was no error, otherwise the returned value
   250  	// may be non-nil but invalid.
   251  	q.diskQueue = diskQueue
   252  	return nil
   253  }
   254  
   255  // empty returns whether there are currently no items to be dequeued.
   256  func (q *spillingQueue) empty() bool {
   257  	if q.rewindable {
   258  		return q.numInMemoryItems+q.numOnDiskItems == q.rewindableState.numItemsDequeued
   259  	}
   260  	return q.numInMemoryItems == 0 && q.numOnDiskItems == 0
   261  }
   262  
   263  func (q *spillingQueue) spilled() bool {
   264  	return q.diskQueue != nil
   265  }
   266  
   267  func (q *spillingQueue) close(ctx context.Context) error {
   268  	if q.closed {
   269  		return nil
   270  	}
   271  	if q.diskQueue != nil {
   272  		if err := q.diskQueue.Close(ctx); err != nil {
   273  			return err
   274  		}
   275  		if q.fdSemaphore != nil {
   276  			q.fdSemaphore.Release(q.numFDsOpenAtAnyGivenTime())
   277  		}
   278  		q.closed = true
   279  		return nil
   280  	}
   281  	return nil
   282  }
   283  
   284  func (q *spillingQueue) rewind() error {
   285  	if !q.rewindable {
   286  		return errors.Newf("unexpectedly rewind() called when spilling queue is not rewindable")
   287  	}
   288  	if q.diskQueue != nil {
   289  		if err := q.diskQueue.(colcontainer.RewindableQueue).Rewind(); err != nil {
   290  			return err
   291  		}
   292  	}
   293  	q.curHeadIdx = 0
   294  	q.rewindableState.numItemsDequeued = 0
   295  	return nil
   296  }
   297  
   298  func (q *spillingQueue) reset(ctx context.Context) {
   299  	if err := q.close(ctx); err != nil {
   300  		colexecerror.InternalError(err)
   301  	}
   302  	q.diskQueue = nil
   303  	q.closed = false
   304  	q.numInMemoryItems = 0
   305  	q.numOnDiskItems = 0
   306  	q.curHeadIdx = 0
   307  	q.curTailIdx = 0
   308  	q.rewindableState.numItemsDequeued = 0
   309  }