github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/colcontainer/diskqueue.go

     1  // Copyright 2019 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package colcontainer
    12  
    13  import (
    14  	"bytes"
    15  	"context"
    16  	"io"
    17  	"path/filepath"
    18  	"strconv"
    19  
    20  	"github.com/cockroachdb/cockroach/pkg/col/coldata"
    21  	"github.com/cockroachdb/cockroach/pkg/col/colserde"
    22  	"github.com/cockroachdb/cockroach/pkg/sql/types"
    23  	"github.com/cockroachdb/cockroach/pkg/storage/fs"
    24  	"github.com/cockroachdb/cockroach/pkg/util/mon"
    25  	"github.com/cockroachdb/cockroach/pkg/util/uuid"
    26  	"github.com/cockroachdb/errors"
    27  	"github.com/golang/snappy"
    28  )
    29  
    30  const (
    31  	// compressionSizeReductionThreshold is the factor used to decide whether to
    32  	// write compressed or uncompressed bytes. If the compressed output is not at
    33  	// least 1/compressionSizeReductionThreshold (i.e. 12.5%) smaller than the
    34  	// original, the uncompressed bytes are written. This avoids paying the cost
    35  	// of decompression if the space savings are not sufficient.
    36  	compressionSizeReductionThreshold = 8
    37  	// bytesPerSync is the number of bytes written to a file before Sync is
    38  	// called (implemented by using a vfs.SyncingFile).
    39  	bytesPerSync = 512 << 10 /* 512 KiB */
    40  )
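        // For example, with compressionSizeReductionThreshold = 8, a 1 MiB buffered
        // region is only written compressed if snappy produces fewer than
        // 1<<20 - (1<<20)/8 = 917504 bytes, i.e. at least a 12.5% size reduction;
        // otherwise the raw bytes are written, prefixed by a one-byte "uncompressed"
        // block type.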
    41  
    42  // file represents in-memory state used by a diskQueue to keep track of the
    43  // state of a file.
    44  type file struct {
    45  	name string
    46  	// offsets represent the starts and ends of logical regions of a file to be
    47  	// read at once. This allows a region of coldata.Batches to be deserialized
    48  	// without reading a whole file into memory.
    49  	offsets []int
    50  	// curOffsetIdx is an index into offsets.
    51  	curOffsetIdx int
    52  	totalSize    int
    53  	// finishedWriting specifies whether this file will be written to in the
    54  	// future or not. If finishedWriting is true and the reader reaches the end
    55  	// of the file, the file represented by this struct should be closed and
    56  	// (if the disk queue is not rewindable) removed.
    57  	finishedWriting bool
    58  }
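        // As an illustration of the fields above: a file whose two flushed regions
        // are 512 and 988 bytes long has offsets = [0, 512, 1500] and
        // totalSize = 1500. With curOffsetIdx = 1, the next region to deserialize is
        // bytes [512, 1500), and once curOffsetIdx reaches len(offsets)-1 there is
        // nothing more to read until the writer flushes another region or rotates to
        // a new file.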
    59  
    60  // diskQueueWriter is an object that encapsulates the writing logic of a
    61  // diskQueue. As bytes are written to it, they are buffered until
    62  // compressAndFlush is called, which compresses all bytes and writes them to the
    63  // wrapped io.Writer.
    64  type diskQueueWriter struct {
    65  	// testingKnobAlwaysCompress specifies whether the writer should always
    66  	// compress writes (i.e. don't bother measuring whether compression passes
    67  	// a certain threshold of size improvement before writing compressed bytes).
    68  	testingKnobAlwaysCompress bool
    69  	buffer                    bytes.Buffer
    70  	wrapped                   io.Writer
    71  	scratch                   struct {
    72  		// blockType is a single byte that specifies whether the following block on
    73  		// disk (i.e. compressedBuf in memory) is compressed or not. It is an array
    74  		// due to having to pass this byte in as a slice to Write.
    75  		blockType     [1]byte
    76  		compressedBuf []byte
    77  	}
    78  }
    79  
    80  const (
    81  	snappyUncompressedBlock byte = 0
    82  	snappyCompressedBlock   byte = 1
    83  )
    84  
    85  func (w *diskQueueWriter) Write(p []byte) (int, error) {
    86  	return w.buffer.Write(p)
    87  }
    88  
    89  // reset resets the diskQueueWriter's wrapped writer and discards any buffered
    90  // bytes.
    91  func (w *diskQueueWriter) reset(wrapped io.Writer) {
    92  	w.wrapped = wrapped
    93  	w.buffer.Reset()
    94  }
    95  
    96  // compressAndFlush compresses all buffered bytes and writes them to the wrapped
    97  // io.Writer. The total number of bytes written to the wrapped writer is
    98  // returned if no error occurred; otherwise 0 and the error are returned.
    99  func (w *diskQueueWriter) compressAndFlush() (int, error) {
   100  	b := w.buffer.Bytes()
   101  	compressed := snappy.Encode(w.scratch.compressedBuf, b)
   102  	w.scratch.compressedBuf = compressed[:cap(compressed)]
   103  
   104  	blockType := snappyUncompressedBlock
   105  	// Discard result if < 12.5% size reduction. All code that uses snappy
   106  	// compression (including pebble and the higher-level snappy implementation)
   107  	// has this threshold in place.
   108  	if w.testingKnobAlwaysCompress || len(compressed) < len(b)-len(b)/compressionSizeReductionThreshold {
   109  		blockType = snappyCompressedBlock
   110  		b = compressed
   111  	}
   112  
   113  	// Write whether this data is compressed or not.
   114  	w.scratch.blockType[0] = blockType
   115  	nType, err := w.wrapped.Write(w.scratch.blockType[:])
   116  	if err != nil {
   117  		return 0, err
   118  	}
   119  
   120  	nBody, err := w.wrapped.Write(b)
   121  	if err != nil {
   122  		return 0, err
   123  	}
   124  	w.buffer.Reset()
   125  	return nType + nBody, err
   126  }
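        // The on-disk layout produced by compressAndFlush is thus a single block
        // type byte followed by the payload:
        //
        //   +--------------------+-------------------------------------------+
        //   | blockType (1 byte) | payload (snappy-compressed or raw buffer) |
        //   +--------------------+-------------------------------------------+
        //
        // The returned count, 1 + len(payload), is what the diskQueue records in the
        // current file's offsets so that a reader can later ReadAt exactly this
        // region.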
   127  
   128  func (w *diskQueueWriter) numBytesBuffered() int {
   129  	return w.buffer.Len()
   130  }
   131  
   132  // diskQueueState describes the current state of the disk queue. Used to assert
   133  // that an invalid state transition doesn't happen when a DiskQueue is in
   134  // DiskQueueCacheMode{ClearAnd}ReuseCache.
   135  type diskQueueState int
   136  
   137  const (
   138  	diskQueueStateEnqueueing diskQueueState = iota
   139  	diskQueueStateDequeueing
   140  )
   141  
   142  // diskQueue is an on-disk queue of coldata.Batches that implements the Queue
   143  // interface. coldata.Batches are serialized and buffered up, after which they
   144  // are compressed and flushed to a file. A directory with a random UUID name
   145  // will be created in cfg.Path, and files will be created in that directory
   146  // using sequence numbers.
   147  // When a file reaches DiskQueueCfg.MaxFileSizeBytes, a new file is created with
   148  // the next sequential file number to store the next batches in the queue.
   149  // Note that, if diskQueue is not rewindable, files will be cleaned up as
   150  // coldata.Batches are dequeued from the diskQueue. The queue's directory under
   151  // DiskQueueCfg.Path will also be removed on Close, deleting all files.
   152  // A diskQueue will never use more memory than cfg.BufferSizeBytes, but not all
   153  // of that memory is dedicated to buffering writes. Refer to the
   154  // DiskQueueCacheMode comment for how cfg.BufferSizeBytes is divided in each
   155  // mode.
   156  type diskQueue struct {
   157  	// dirName is the directory in cfg.Path that holds this queue's files.
   158  	dirName string
   159  
   160  	typs  []*types.T
   161  	cfg   DiskQueueCfg
   162  	files []file
   163  	seqNo int
   164  
   165  	state      diskQueueState
   166  	rewindable bool
   167  
   168  	// done is set when a coldata.ZeroBatch has been Enqueued.
   169  	done bool
   170  
   171  	serializer *colserde.FileSerializer
   172  	// numBufferedBatches is the number of batches buffered that haven't been
   173  	// flushed to disk. This is useful for a reader to determine whether to flush
   174  	// or not, since the number of buffered bytes will always be > 0 (due to
   175  	// serialization metadata) even when no batches have been enqueued.
   176  	numBufferedBatches int
   177  	writer             *diskQueueWriter
   178  	// writeBufferLimit is the limit on the number of uncompressed write bytes
   179  	// written before a compress and flush.
   180  	writeBufferLimit  int
   181  	writeFileIdx      int
   182  	writeFile         fs.File
   183  	deserializerState struct {
   184  		*colserde.FileDeserializer
   185  		curBatch int
   186  	}
   187  	// readFileIdx is the index of the file in files that the deserializer is
   188  	// currently reading from.
   189  	readFileIdx                  int
   190  	readFile                     fs.File
   191  	scratchDecompressedReadBytes []byte
   192  
   193  	diskAcc *mon.BoundAccount
   194  }
   195  
   196  var _ RewindableQueue = &diskQueue{}
   197  
   198  // Queue describes a simple queue interface to which coldata.Batches can be
   199  // Enqueued and Dequeued.
   200  type Queue interface {
   201  	// Enqueue enqueues a coldata.Batch to this queue. A zero-length batch should
   202  	// be enqueued when no more elements will be enqueued.
   203  	// WARNING: Selection vectors are ignored.
   204  	Enqueue(context.Context, coldata.Batch) error
   205  	// Dequeue dequeues a coldata.Batch from the queue into the batch that is
   206  	// passed in. The boolean returned specifies whether the queue was not empty
   207  	// (i.e. whether there was a batch to Dequeue). If true is returned and the
   208  	// batch has a length of zero, the Queue is finished and will not be Enqueued
   209  	// to. If an error is returned, the batch and boolean returned are
   210  	// meaningless.
   211  	Dequeue(context.Context, coldata.Batch) (bool, error)
   212  	// CloseRead closes the read file descriptor. If Dequeue is called again, the
   213  	// file may be reopened.
   214  	CloseRead() error
   215  	// Close closes any resources associated with the Queue.
   216  	Close(context.Context) error
   217  }
   218  
   219  // RewindableQueue is a Queue that can be read from multiple times. Note that
   220  // in order for this Queue to return the same data after rewinding, all
   221  // Enqueueing *must* occur before any Dequeueing.
   222  type RewindableQueue interface {
   223  	Queue
   224  	// Rewind resets the Queue so that it Dequeues all Enqueued batches from the
   225  	// start.
   226  	Rewind() error
   227  }
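        // drainQueue is a usage sketch (not part of the original file) of the Queue
        // contract documented above: Dequeue is called repeatedly into a reusable
        // batch until a zero-length batch indicates that nothing more will be
        // enqueued. The process callback is hypothetical and stands in for whatever
        // the caller does with each batch.
        func drainQueue(
        	ctx context.Context, q Queue, b coldata.Batch, process func(coldata.Batch) error,
        ) error {
        	for {
        		ok, err := q.Dequeue(ctx, b)
        		if err != nil {
        			return err
        		}
        		if !ok {
        			// The queue is currently empty; this sketch simply stops, though a
        			// real caller might retry later since more batches may be enqueued.
        			return nil
        		}
        		if b.Length() == 0 {
        			// A zero-length batch was dequeued: the queue is finished.
        			return nil
        		}
        		if err := process(b); err != nil {
        			return err
        		}
        	}
        }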
   228  
   229  const (
   230  	// defaultBufferSizeBytesDefaultCacheMode is the default buffer size used when
   231  	// the DiskQueue is in DiskQueueCacheModeDefault.
   232  	// This value was chosen by running BenchmarkQueue.
   233  	defaultBufferSizeBytesDefaultCacheMode = 128 << 10 /* 128 KiB */
   234  	// defaultBufferSizeBytesReuseCacheMode is the default buffer size used when
   235  	// the DiskQueue is in DiskQueueCacheMode{ClearAnd}ReuseCache.
   236  	defaultBufferSizeBytesReuseCacheMode = 64 << 10 /* 64 KiB */
   237  	// defaultMaxFileSizeBytes is the default maximum file size after which the
   238  	// DiskQueue rolls over to a new file. This value was chosen by running
   239  	// BenchmarkQueue.
   240  	defaultMaxFileSizeBytes = 32 << 20 /* 32 MiB */
   241  )
   242  
   243  // DiskQueueCacheMode specifies a pattern that a DiskQueue should use regarding
   244  // its cache.
   245  type DiskQueueCacheMode int
   246  
   247  const (
   248  	// DiskQueueCacheModeDefault is the default mode for DiskQueue cache behavior.
   249  	// The cache (DiskQueueCfg.BufferSizeBytes) will be divided as follows:
   250  	// - 1/3 for buffered writes (before compression)
   251  	// - 1/3 for compressed writes; this is distinct from the previous 1/3 because
   252  	//   the snappy library requires that the compressed memory not overlap with
   253  	//   the uncompressed memory. This memory is reused to read compressed bytes
   254  	//   from disk.
   255  	// - 1/3 for buffered reads after decompression. Kept separate from the write
   256  	//   memory to allow for Enqueues to come in while unread batches are held in
   257  	//   memory.
   258  	// In this mode, Enqueues and Dequeues may happen in any order.
   259  	DiskQueueCacheModeDefault DiskQueueCacheMode = iota
   260  	// DiskQueueCacheModeReuseCache imposes the limitation that all Enqueues happen
   261  	// before all Dequeues in order to reuse more memory. In this mode the cache
   262  	// will be divided as follows:
   263  	// - 1/2 for buffered writes and buffered reads.
   264  	// - 1/2 for compressed writes and reads (given the limitation that this memory
   265  	//   has to be non-overlapping).
   266  	DiskQueueCacheModeReuseCache
   267  	// DiskQueueCacheModeClearAndReuseCache is the same as
   268  	// DiskQueueCacheModeReuseCache with the additional behavior that when a
   269  	// coldata.ZeroBatch is Enqueued, the cache will be released to the GC.
   270  	DiskQueueCacheModeClearAndReuseCache
   271  )
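        // As a concrete illustration of the divisions above (see also newDiskQueue,
        // which sets writeBufferLimit accordingly): with BufferSizeBytes = 96 KiB,
        // DiskQueueCacheModeDefault budgets roughly 32 KiB each for the uncompressed
        // write buffer, the compressed scratch buffer, and the decompressed read
        // buffer, while the two ReuseCache modes budget roughly 48 KiB for the shared
        // write/read buffer and 48 KiB for the compressed scratch buffer.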
   272  
   273  // DiskQueueCfg is a struct holding the configuration options for a DiskQueue.
   274  type DiskQueueCfg struct {
   275  	// FS is the filesystem interface to use.
   276  	FS fs.FS
   277  	// Path is where the temporary directory that will contain this DiskQueue's
   278  	// files should be created. The directory name will be a UUID.
   279  	Path string
   280  	// CacheMode defines the way a DiskQueue should use its cache. Refer to the
   281  	// comment of DiskQueueCacheMode for more information.
   282  	CacheMode DiskQueueCacheMode
   283  	// BufferSizeBytes is the number of bytes to buffer before compressing and
   284  	// writing to disk.
   285  	BufferSizeBytes int
   286  	// MaxFileSizeBytes is the maximum size an on-disk file should reach before
   287  	// rolling over to a new one.
   288  	MaxFileSizeBytes int
   289  
   290  	// OnNewDiskQueueCb is an optional callback function that will be called when
   291  	// NewDiskQueue is called.
   292  	OnNewDiskQueueCb func()
   293  
   294  	// TestingKnobs are used to test the queue implementation.
   295  	TestingKnobs struct {
   296  		// AlwaysCompress, if true, will skip the check that determines, based on
   297  		// the percentage size improvement, whether compression is used for a given
   298  		// write. This allows us to test compression.
   299  		AlwaysCompress bool
   300  	}
   301  }
   302  
   303  // EnsureDefaults ensures that optional fields are set to reasonable defaults.
   304  // If any necessary options have been elided, an error is returned.
   305  func (cfg *DiskQueueCfg) EnsureDefaults() error {
   306  	if cfg.FS == nil {
   307  		return errors.New("FS unset on DiskQueueCfg")
   308  	}
   309  	if cfg.BufferSizeBytes == 0 {
   310  		cfg.SetDefaultBufferSizeBytesForCacheMode()
   311  	}
   312  	if cfg.MaxFileSizeBytes == 0 {
   313  		cfg.MaxFileSizeBytes = defaultMaxFileSizeBytes
   314  	}
   315  	return nil
   316  }
   317  
   318  // SetDefaultBufferSizeBytesForCacheMode sets the default BufferSizeBytes
   319  // according to the set CacheMode.
   320  func (cfg *DiskQueueCfg) SetDefaultBufferSizeBytesForCacheMode() {
   321  	if cfg.CacheMode == DiskQueueCacheModeDefault {
   322  		cfg.BufferSizeBytes = defaultBufferSizeBytesDefaultCacheMode
   323  	} else {
   324  		cfg.BufferSizeBytes = defaultBufferSizeBytesReuseCacheMode
   325  	}
   326  }
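        // A minimal configuration sketch (not part of the original file), assuming
        // the caller already has a temporary-storage filesystem tempFS and a
        // directory tempPath for it:
        //
        //	cfg := DiskQueueCfg{
        //		FS:        tempFS,
        //		Path:      tempPath,
        //		CacheMode: DiskQueueCacheModeReuseCache,
        //	}
        //	if err := cfg.EnsureDefaults(); err != nil {
        //		return err
        //	}
        //	// BufferSizeBytes is now defaultBufferSizeBytesReuseCacheMode (64 KiB)
        //	// and MaxFileSizeBytes is defaultMaxFileSizeBytes (32 MiB).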
   327  
   328  // NewDiskQueue creates a Queue that spills to disk.
   329  func NewDiskQueue(
   330  	ctx context.Context, typs []*types.T, cfg DiskQueueCfg, diskAcc *mon.BoundAccount,
   331  ) (Queue, error) {
   332  	return newDiskQueue(ctx, typs, cfg, diskAcc)
   333  }
   334  
   335  // NewRewindableDiskQueue creates a RewindableQueue that spills to disk.
   336  func NewRewindableDiskQueue(
   337  	ctx context.Context, typs []*types.T, cfg DiskQueueCfg, diskAcc *mon.BoundAccount,
   338  ) (RewindableQueue, error) {
   339  	d, err := newDiskQueue(ctx, typs, cfg, diskAcc)
   340  	if err != nil {
   341  		return nil, err
   342  	}
   343  	d.rewindable = true
   344  	return d, nil
   345  }
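        // rewindAndReread is a usage sketch (not part of the original file) of the
        // RewindableQueue contract: every batch is Enqueued before any Dequeue, and
        // Rewind allows the same data to be read again from the start. The batches
        // slice is assumed to already end with a zero-length batch, and scratch is a
        // caller-provided batch that Dequeue deserializes into.
        func rewindAndReread(
        	ctx context.Context, q RewindableQueue, batches []coldata.Batch, scratch coldata.Batch,
        ) error {
        	for _, b := range batches {
        		if err := q.Enqueue(ctx, b); err != nil {
        			return err
        		}
        	}
        	// Read everything twice; Rewind resets the queue to the first batch.
        	for pass := 0; pass < 2; pass++ {
        		for {
        			ok, err := q.Dequeue(ctx, scratch)
        			if err != nil {
        				return err
        			}
        			if !ok || scratch.Length() == 0 {
        				break
        			}
        		}
        		if err := q.Rewind(); err != nil {
        			return err
        		}
        	}
        	return q.Close(ctx)
        }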
   346  
   347  func newDiskQueue(
   348  	ctx context.Context, typs []*types.T, cfg DiskQueueCfg, diskAcc *mon.BoundAccount,
   349  ) (*diskQueue, error) {
   350  	if err := cfg.EnsureDefaults(); err != nil {
   351  		return nil, err
   352  	}
   353  	if cfg.OnNewDiskQueueCb != nil {
   354  		cfg.OnNewDiskQueueCb()
   355  	}
   356  	d := &diskQueue{
   357  		dirName:          uuid.FastMakeV4().String(),
   358  		typs:             typs,
   359  		cfg:              cfg,
   360  		files:            make([]file, 0, 4),
   361  		writeBufferLimit: cfg.BufferSizeBytes / 3,
   362  		diskAcc:          diskAcc,
   363  	}
   364  	// Refer to the DiskQueueCacheMode comment for why BufferSizeBytes is divided
   365  	// this way.
   366  	if d.cfg.CacheMode != DiskQueueCacheModeDefault {
   367  		d.writeBufferLimit = d.cfg.BufferSizeBytes / 2
   368  	}
   369  	if err := cfg.FS.MkdirAll(filepath.Join(cfg.Path, d.dirName)); err != nil {
   370  		return nil, err
   371  	}
   372  	// rotateFile will create a new file to write to.
   373  	return d, d.rotateFile(ctx)
   374  }
   375  
   376  func (d *diskQueue) CloseRead() error {
   377  	if d.readFile != nil {
   378  		if err := d.readFile.Close(); err != nil {
   379  			return err
   380  		}
   381  		d.readFile = nil
   382  	}
   383  	return nil
   384  }
   385  
   386  func (d *diskQueue) closeFileDeserializer() error {
   387  	if d.deserializerState.FileDeserializer != nil {
   388  		if err := d.deserializerState.Close(); err != nil {
   389  			return err
   390  		}
   391  	}
   392  	d.deserializerState.FileDeserializer = nil
   393  	return nil
   394  }
   395  
   396  func (d *diskQueue) Close(ctx context.Context) error {
   397  	if d.serializer != nil {
   398  		if err := d.writeFooterAndFlush(ctx); err != nil {
   399  			return err
   400  		}
   401  		d.serializer = nil
   402  	}
   403  	if err := d.closeFileDeserializer(); err != nil {
   404  		return err
   405  	}
   406  	if d.writeFile != nil {
   407  		if err := d.writeFile.Close(); err != nil {
   408  			return err
   409  		}
   410  		d.writeFile = nil
   411  	}
   412  	// The readFile will be removed below by the RemoveAll call.
   413  	if err := d.CloseRead(); err != nil {
   414  		return err
   415  	}
   416  	if err := d.cfg.FS.RemoveAll(filepath.Join(d.cfg.Path, d.dirName)); err != nil {
   417  		return err
   418  	}
   419  	totalSize := int64(0)
   420  	leftOverFileIdx := 0
   421  	if !d.rewindable {
   422  		leftOverFileIdx = d.readFileIdx
   423  	}
   424  	for _, file := range d.files[leftOverFileIdx : d.writeFileIdx+1] {
   425  		totalSize += int64(file.totalSize)
   426  	}
   427  	if totalSize > d.diskAcc.Used() {
   428  		totalSize = d.diskAcc.Used()
   429  	}
   430  	d.diskAcc.Shrink(ctx, totalSize)
   431  	return nil
   432  }
   433  
   434  // rotateFile performs file rotation for the diskQueue, i.e. it creates a new
   435  // file to write to and sets the diskQueue state up to write to that file when
   436  // Enqueue is called.
   437  // It is valid to call rotateFile when the diskQueue is not currently writing to
   438  // any file (i.e. during initialization). This will simply create the first file
   439  // to write to.
   440  func (d *diskQueue) rotateFile(ctx context.Context) error {
   441  	fName := filepath.Join(d.cfg.Path, d.dirName, strconv.Itoa(d.seqNo))
   442  	f, err := d.cfg.FS.CreateWithSync(fName, bytesPerSync)
   443  	if err != nil {
   444  		return err
   445  	}
   446  	d.seqNo++
   447  
   448  	if d.serializer == nil {
   449  		writer := &diskQueueWriter{testingKnobAlwaysCompress: d.cfg.TestingKnobs.AlwaysCompress, wrapped: f}
   450  		d.serializer, err = colserde.NewFileSerializer(writer, d.typs)
   451  		if err != nil {
   452  			return err
   453  		}
   454  		d.writer = writer
   455  	} else {
   456  		if err := d.writeFooterAndFlush(ctx); err != nil {
   457  			return err
   458  		}
   459  		if err := d.resetWriters(f); err != nil {
   460  			return err
   461  		}
   462  	}
   463  
   464  	if d.writeFile != nil {
   465  		d.files[d.writeFileIdx].finishedWriting = true
   466  		if err := d.writeFile.Close(); err != nil {
   467  			return err
   468  		}
   469  	}
   470  
   471  	d.writeFileIdx = len(d.files)
   472  	d.files = append(d.files, file{name: fName, offsets: make([]int, 1, 16)})
   473  	d.writeFile = f
   474  	return nil
   475  }
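        // For example, with cfg.Path set to some temporary directory (say
        // /tmp/queues) and a generated dirName of "<uuid>", successive calls to
        // rotateFile create /tmp/queues/<uuid>/0, /tmp/queues/<uuid>/1, and so on,
        // each growing to roughly cfg.MaxFileSizeBytes before Enqueue triggers the
        // next rotation.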
   476  
   477  func (d *diskQueue) resetWriters(f fs.File) error {
   478  	d.writer.reset(f)
   479  	return d.serializer.Reset(d.writer)
   480  }
   481  
   482  func (d *diskQueue) writeFooterAndFlush(ctx context.Context) error {
   483  	err := d.serializer.Finish()
   484  	if err != nil {
   485  		return err
   486  	}
   487  	written, err := d.writer.compressAndFlush()
   488  	if err != nil {
   489  		return err
   490  	}
   491  	d.numBufferedBatches = 0
   492  	d.files[d.writeFileIdx].totalSize += written
   493  	if err := d.diskAcc.Grow(ctx, int64(written)); err != nil {
   494  		return err
   495  	}
   496  	// Append offset for the readers.
   497  	d.files[d.writeFileIdx].offsets = append(d.files[d.writeFileIdx].offsets, d.files[d.writeFileIdx].totalSize)
   498  	return nil
   499  }
   500  
   501  func (d *diskQueue) Enqueue(ctx context.Context, b coldata.Batch) error {
   502  	if d.state == diskQueueStateDequeueing {
   503  		if d.cfg.CacheMode != DiskQueueCacheModeDefault {
   504  			return errors.Errorf("attempted to Enqueue to DiskQueue in mode that disallows it: %d", d.cfg.CacheMode)
   505  		}
   506  		if d.rewindable {
   507  			return errors.Errorf("attempted to Enqueue to RewindableDiskQueue after Dequeue has been called")
   508  		}
   509  	}
   510  	d.state = diskQueueStateEnqueueing
   511  	if b.Length() == 0 {
   512  		if d.done {
   513  			// Already done.
   514  			return nil
   515  		}
   516  		if err := d.writeFooterAndFlush(ctx); err != nil {
   517  			return err
   518  		}
   519  		if err := d.writeFile.Close(); err != nil {
   520  			return err
   521  		}
   522  		d.files[d.writeFileIdx].finishedWriting = true
   523  		d.writeFile = nil
   524  		// Done with the serializer. Not setting this to nil would cause us to
   525  		// attempt to flush the serializer again on Close.
   526  		d.serializer = nil
   527  		// The write file was already closed above; mark the queue as done.
   528  		d.done = true
   529  		if d.cfg.CacheMode == DiskQueueCacheModeClearAndReuseCache {
   530  			// Clear the cache. d.scratchDecompressedReadBytes should already be nil
   531  			// since we don't allow writes once reads happen in this mode.
   532  			d.scratchDecompressedReadBytes = nil
   533  			// Clear the write side of the cache.
   534  			d.writer.buffer = bytes.Buffer{}
   535  			d.writer.scratch.compressedBuf = nil
   536  		}
   537  		return nil
   538  	}
   539  	if err := d.serializer.AppendBatch(b); err != nil {
   540  		return err
   541  	}
   542  	d.numBufferedBatches++
   543  
   544  	bufferSizeLimitReached := d.writer.numBytesBuffered() > d.writeBufferLimit
   545  	fileSizeLimitReached := d.files[d.writeFileIdx].totalSize+d.writer.numBytesBuffered() > d.cfg.MaxFileSizeBytes
   546  	if bufferSizeLimitReached || fileSizeLimitReached {
   547  		if fileSizeLimitReached {
   548  			// rotateFile will flush and reset writers.
   549  			return d.rotateFile(ctx)
   550  		}
   551  		if err := d.writeFooterAndFlush(ctx); err != nil {
   552  			return err
   553  		}
   554  		return d.resetWriters(d.writeFile)
   555  	}
   556  	return nil
   557  }
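        // To make the flush/rotate decision above concrete: under the defaults for
        // DiskQueueCacheModeDefault (BufferSizeBytes = 128 KiB, so writeBufferLimit
        // is about 43 KiB, and MaxFileSizeBytes = 32 MiB), Enqueue keeps appending
        // serialized batches until roughly 43 KiB are buffered, then compresses and
        // flushes that region to the current file; only when the file's totalSize
        // plus the buffered bytes would exceed 32 MiB does it rotate to a new file
        // instead.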
   558  
   559  func (d *diskQueue) maybeInitDeserializer(ctx context.Context) (bool, error) {
   560  	if d.deserializerState.FileDeserializer != nil {
   561  		return true, nil
   562  	}
   563  	if d.readFileIdx >= len(d.files) {
   564  		// There is no valid file to read from. Either more data will be enqueued or
   565  		// not, but the behavior there depends on the caller.
   566  		return false, nil
   567  	}
   568  	fileToRead := d.files[d.readFileIdx]
   569  	if fileToRead.curOffsetIdx == len(fileToRead.offsets)-1 {
   570  		// The current offset index is the last element in offsets. This means that
   571  		// either the region to read from next is currently being written to or the
   572  		// writer has rotated to a new file.
   573  		if fileToRead.finishedWriting {
   574  			// Close current file.
   575  			if err := d.CloseRead(); err != nil {
   576  				return false, err
   577  			}
   578  			if !d.rewindable {
   579  				// Remove current file.
   580  				if err := d.cfg.FS.Remove(d.files[d.readFileIdx].name); err != nil {
   581  					return false, err
   582  				}
   583  				fileSize := int64(d.files[d.readFileIdx].totalSize)
   584  				if fileSize > d.diskAcc.Used() {
   585  					fileSize = d.diskAcc.Used()
   586  				}
   587  				d.diskAcc.Shrink(ctx, fileSize)
   588  			}
   589  			d.readFile = nil
   590  			// Read next file.
   591  			d.readFileIdx++
   592  			return d.maybeInitDeserializer(ctx)
   593  		}
   594  		// Not finished writing; there is currently no data to read.
   595  		return false, nil
   596  	}
   597  	if d.readFile == nil {
   598  		// File is not open.
   599  		f, err := d.cfg.FS.Open(fileToRead.name)
   600  		if err != nil {
   601  			return false, err
   602  		}
   603  		d.readFile = f
   604  	}
   605  	readRegionStart := fileToRead.offsets[fileToRead.curOffsetIdx]
   606  	readRegionLength := fileToRead.offsets[fileToRead.curOffsetIdx+1] - readRegionStart
   607  	if cap(d.writer.scratch.compressedBuf) < readRegionLength {
   608  		// Not enough capacity; we have to allocate a new compressedBuf.
   609  		d.writer.scratch.compressedBuf = make([]byte, readRegionLength)
   610  	}
   611  	// Slice the compressedBuf to be of the desired length, encoded in
   612  	// readRegionLength.
   613  	d.writer.scratch.compressedBuf = d.writer.scratch.compressedBuf[0:readRegionLength]
   614  	// Read the desired length starting at readRegionStart.
   615  	n, err := d.readFile.ReadAt(d.writer.scratch.compressedBuf, int64(readRegionStart))
   616  	if err != nil && err != io.EOF {
   617  		return false, err
   618  	}
   619  	if n != len(d.writer.scratch.compressedBuf) {
   620  		return false, errors.Errorf("expected to read %d bytes but read %d", len(d.writer.scratch.compressedBuf), n)
   621  	}
   622  
   623  	blockType := d.writer.scratch.compressedBuf[0]
   624  	compressedBytes := d.writer.scratch.compressedBuf[1:]
   625  	var decompressedBytes []byte
   626  	if blockType == snappyCompressedBlock {
   627  		decompressedBytes, err = snappy.Decode(d.scratchDecompressedReadBytes, compressedBytes)
   628  		if err != nil {
   629  			return false, err
   630  		}
   631  		d.scratchDecompressedReadBytes = decompressedBytes[:cap(decompressedBytes)]
   632  	} else {
   633  		// Copy the result for safety since we're reusing the diskQueueWriter's
   634  		// compressed write buffer. If an Enqueue were to arrive between Dequeue
   635  		// calls while buffered coldata.Batches remain to be returned, the memory
   636  		// would be corrupted. The following code ensures that
   637  		// scratchDecompressedReadBytes is of the required capacity.
   638  		if cap(d.scratchDecompressedReadBytes) < len(compressedBytes) {
   639  			d.scratchDecompressedReadBytes = make([]byte, len(compressedBytes))
   640  		}
   641  		// Slice up to the length of compressedBytes so that the copy below will
   642  		// copy all desired bytes.
   643  		d.scratchDecompressedReadBytes = d.scratchDecompressedReadBytes[:len(compressedBytes)]
   644  		copy(d.scratchDecompressedReadBytes, compressedBytes)
   645  		decompressedBytes = d.scratchDecompressedReadBytes
   646  	}
   647  
   648  	deserializer, err := colserde.NewFileDeserializerFromBytes(d.typs, decompressedBytes)
   649  	if err != nil {
   650  		return false, err
   651  	}
   652  	d.deserializerState.FileDeserializer = deserializer
   653  	d.deserializerState.curBatch = 0
   654  	if d.deserializerState.NumBatches() == 0 {
   655  		// Zero batches to deserialize in this region. This shouldn't happen but we
   656  		// might as well handle it.
   657  		if err := d.closeFileDeserializer(); err != nil {
   658  			return false, err
   659  		}
   660  		d.files[d.readFileIdx].curOffsetIdx++
   661  		return d.maybeInitDeserializer(ctx)
   662  	}
   663  	return true, nil
   664  }
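        // Tying this together with the earlier file.offsets illustration: if the
        // current region spans offsets [512, 1500), maybeInitDeserializer issues a
        // ReadAt of 988 bytes at file offset 512, interprets byte 0 as the block
        // type, snappy-decodes (or copies) the remaining 987 bytes, and hands the
        // result to a FileDeserializer that serves this region's batches to Dequeue.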
   665  
   666  // Dequeue dequeues a batch from disk and deserializes it into b. Note that the
   667  // deserialized batch is only valid until the next call to Dequeue.
   668  func (d *diskQueue) Dequeue(ctx context.Context, b coldata.Batch) (bool, error) {
   669  	if d.serializer != nil && d.numBufferedBatches > 0 {
   670  		if err := d.writeFooterAndFlush(ctx); err != nil {
   671  			return false, err
   672  		}
   673  		if err := d.resetWriters(d.writeFile); err != nil {
   674  			return false, err
   675  		}
   676  	}
   677  	if d.state == diskQueueStateEnqueueing && d.cfg.CacheMode != DiskQueueCacheModeDefault {
   678  		// This is the first Dequeue after Enqueues, so reuse the write cache for
   679  		// reads. Note that the buffer for compressed reads is reused in
   680  		// maybeInitDeserializer in either case, so there is nothing to do here for
   681  		// that.
   682  		d.writer.buffer.Reset()
   683  		d.scratchDecompressedReadBytes = d.writer.buffer.Bytes()
   684  	}
   685  	d.state = diskQueueStateDequeueing
   686  
   687  	if d.deserializerState.FileDeserializer != nil && d.deserializerState.curBatch >= d.deserializerState.NumBatches() {
   688  		// Finished all the batches; set the deserializer to nil to initialize a new
   689  		// one to read the next region.
   690  		if err := d.closeFileDeserializer(); err != nil {
   691  			return false, err
   692  		}
   693  		d.files[d.readFileIdx].curOffsetIdx++
   694  	}
   695  
   696  	if dataToRead, err := d.maybeInitDeserializer(ctx); err != nil {
   697  		return false, err
   698  	} else if !dataToRead {
   699  		// No data to read.
   700  		if !d.done {
   701  			// Data might still be added.
   702  			return false, nil
   703  		}
   704  		// No data will be added.
   705  		b.SetLength(0)
   706  	} else {
   707  		if d.deserializerState.curBatch == 0 {
   708  			// It is possible that the caller has appended more columns to the
   709  			// batch than it provided types during diskQueue's creation. We
   710  			// will only be touching the prefix of the batch that we have been
   711  			// told about.
   712  			vecs := b.ColVecs()[:len(d.typs)]
   713  			for i := range vecs {
   714  			// When we deserialize a new memory region, we allocate a new null
   715  			// bitmap for the batch, which the deserializer will write to. If we
   716  			// naively allowed the arrow batch converter to directly overwrite the
   717  			// null bitmap of each column, it could lead to memory corruption. Doing
   718  			// this avoids reallocating a new scratchDecompressedReadBytes every time
   719  			// we perform a read from the file and constrains the downside to
   720  			// allocating a new null bitmap every couple of batches.
   721  				nulls := coldata.NewNulls(coldata.BatchSize())
   722  				vecs[i].SetNulls(&nulls)
   723  			}
   724  		}
   725  		if err := d.deserializerState.GetBatch(d.deserializerState.curBatch, b); err != nil {
   726  			return false, err
   727  		}
   728  		d.deserializerState.curBatch++
   729  	}
   730  
   731  	return true, nil
   732  }
   733  
   734  // Rewind is part of the RewindableQueue interface.
   735  func (d *diskQueue) Rewind() error {
   736  	if err := d.closeFileDeserializer(); err != nil {
   737  		return err
   738  	}
   739  	if err := d.CloseRead(); err != nil {
   740  		return err
   741  	}
   742  	d.deserializerState.curBatch = 0
   743  	d.readFile = nil
   744  	d.readFileIdx = 0
   745  	for i := range d.files {
   746  		d.files[i].curOffsetIdx = 0
   747  	}
   748  	return nil
   749  }