github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/colcontainer/partitionedqueue.go

// Copyright 2020 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package colcontainer

import (
	"context"
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/col/coldata"
	"github.com/cockroachdb/cockroach/pkg/sql/colexecbase/colexecerror"
	"github.com/cockroachdb/cockroach/pkg/sql/types"
	"github.com/cockroachdb/cockroach/pkg/util/mon"
	"github.com/cockroachdb/errors"
	"github.com/marusama/semaphore"
)

// PartitionedQueue is the abstraction for on-disk storage.
type PartitionedQueue interface {
	// Enqueue adds the batch to the end of the partitionIdx'th partition. If a
	// partition at that index does not exist, a new one is created. Existing
	// partitions may not be Enqueued to after calling
	// CloseAllOpenWriteFileDescriptors.
	Enqueue(ctx context.Context, partitionIdx int, batch coldata.Batch) error
	// Dequeue removes and returns the batch from the front of the
	// partitionIdx'th partition. If the partition is empty, or no partition at
	// that index was Enqueued to, a zero-length batch is returned. Note that
	// it is illegal to call Enqueue on a partition after Dequeue.
	Dequeue(ctx context.Context, partitionIdx int, batch coldata.Batch) error
	// CloseAllOpenWriteFileDescriptors notifies the PartitionedQueue that it can
	// close all open write file descriptors. After this point, only new
	// partitions may be Enqueued to.
	CloseAllOpenWriteFileDescriptors(ctx context.Context) error
	// CloseAllOpenReadFileDescriptors closes the open read file descriptors
	// belonging to partitions. These partitions may still be Dequeued from,
	// although this will trigger files to be reopened.
	CloseAllOpenReadFileDescriptors() error
	// CloseInactiveReadPartitions closes all partitions that have been Dequeued
	// from and have either been temporarily closed through
	// CloseAllOpenReadFileDescriptors or have returned a coldata.ZeroBatch from
	// Dequeue. This close removes the underlying files.
	CloseInactiveReadPartitions(ctx context.Context) error
	// Close closes all partitions created.
	Close(ctx context.Context) error
}
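
// examplePartitionedQueueLifecycle is a minimal usage sketch of the
// PartitionedQueue interface documented above: Enqueue batches to partitions,
// release the write file descriptors, drain each partition with Dequeue until
// a zero-length batch is returned, and finally Close to remove the underlying
// files. The batches and scratch arguments are assumed to be provided by the
// caller; the function itself is purely illustrative.
func examplePartitionedQueueLifecycle(
	ctx context.Context, q PartitionedQueue, batches []coldata.Batch, scratch coldata.Batch,
) error {
	// Write phase: each batch goes to its own partition.
	for i, b := range batches {
		if err := q.Enqueue(ctx, i, b); err != nil {
			return err
		}
	}
	// Switch to the read phase. Existing partitions may no longer be Enqueued to.
	if err := q.CloseAllOpenWriteFileDescriptors(ctx); err != nil {
		return err
	}
	// Read phase: drain each partition until it returns a zero-length batch.
	for i := range batches {
		for {
			if err := q.Dequeue(ctx, i, scratch); err != nil {
				return err
			}
			if scratch.Length() == 0 {
				break
			}
		}
	}
	// Close removes all files created by the partitions.
	return q.Close(ctx)
}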

// partitionState is the state a partition is in.
type partitionState int

const (
	// partitionStateWriting is the initial state of a partition. A partition will
	// always transition to partitionStateClosedForWriting next.
	partitionStateWriting partitionState = iota
	// partitionStateClosedForWriting is the state a partition is in when it is
	// closed for writing. The next possible state is for reads to happen, and
	// thus a transition to partitionStateReading. Note that if a partition is
	// in this state when entering a method of PartitionedDiskQueue, it is always
	// true that the file descriptor for this partition has been released.
	partitionStateClosedForWriting
	// partitionStateReading is the state a partition is in when Dequeue has been
	// called. It is always the case that this partition has been closed for
	// writing and may not transition back to partitionStateWriting or
	// partitionStateClosedForWriting. The read file descriptor for this partition
	// may be closed in this state, although it will be reacquired on the next
	// read.
	partitionStateReading
	// partitionStateClosedForReading is the state a partition is in when a
	// partition was in partitionStateReading and CloseAllOpenReadFileDescriptors
	// was called. If Dequeued, this partition will reacquire a file descriptor
	// and transition back to partitionStateReading.
	partitionStateClosedForReading
	// partitionStatePermanentlyClosed is the state a partition is in when its
	// underlying DiskQueue has been Closed.
	partitionStatePermanentlyClosed
)
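
// Taken together, the comments above describe the following transitions:
//
//	partitionStateWriting -> partitionStateClosedForWriting (write FD released
//	    or handed off)
//	partitionStateClosedForWriting -> partitionStateReading (first Dequeue)
//	partitionStateReading <-> partitionStateClosedForReading (read FD released,
//	    then reacquired on the next Dequeue)
//	any state -> partitionStatePermanentlyClosed (Close, or
//	    CloseInactiveReadPartitions for partitions closed for reading)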

// partition is a simple wrapper over a Queue used by the PartitionedDiskQueue.
type partition struct {
	Queue
	state partitionState
}

// PartitionerStrategy describes a strategy used by the PartitionedQueue during
// its operation.
type PartitionerStrategy int

const (
	// PartitionerStrategyDefault is a partitioner strategy in which the
	// PartitionedQueue will keep all partitions open for writing.
	// Note that this uses up as many file descriptors as there are partitions.
	PartitionerStrategyDefault PartitionerStrategy = iota
	// PartitionerStrategyCloseOnNewPartition is a partitioner strategy that
	// closes an open partition for writing when a new partition is created. This
	// ensures that at most one write file descriptor is open at any time.
	// However, note that closed partitions may never be written to again, only
	// read.
	PartitionerStrategyCloseOnNewPartition
)
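
// As a concrete illustration of the trade-off: with PartitionerStrategyDefault,
// writing to N distinct partitions keeps N write file descriptors open at the
// same time, while with PartitionerStrategyCloseOnNewPartition only the most
// recently created partition's write file descriptor stays open, at the cost
// of never being able to write to earlier partitions again.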

// PartitionedDiskQueue is a PartitionedQueue whose partitions are on-disk.
type PartitionedDiskQueue struct {
	typs     []*types.T
	strategy PartitionerStrategy
	cfg      DiskQueueCfg

	partitionIdxToIndex map[int]int
	partitions          []partition

	// lastEnqueuedPartitionIdx is the index of the last enqueued partition. Set
	// to -1 during initialization.
	lastEnqueuedPartitionIdx int

	numOpenFDs  int
	fdSemaphore semaphore.Semaphore
	diskAcc     *mon.BoundAccount
}

var _ PartitionedQueue = &PartitionedDiskQueue{}

// NewPartitionedDiskQueue creates a PartitionedDiskQueue whose partitions are
// all on-disk queues. Note that DiskQueues are created lazily when enqueueing
// to a new partition. Each new partition uses cfg.BufferSizeBytes, so memory
// usage may grow without bound if partitions are created carelessly. File
// descriptors are acquired through fdSemaphore. If fdSemaphore is nil, the
// partitioned disk queue will not Acquire or Release file descriptors; pass
// nil if the caller knows that it will use a constant maximum number of file
// descriptors and wishes to acquire these up front. Note that the number of
// file descriptors actually open may be less than, but never more than, the
// number acquired through the semaphore.
func NewPartitionedDiskQueue(
	typs []*types.T,
	cfg DiskQueueCfg,
	fdSemaphore semaphore.Semaphore,
	partitionerStrategy PartitionerStrategy,
	diskAcc *mon.BoundAccount,
) *PartitionedDiskQueue {
	if len(typs) == 0 {
		// DiskQueues cannot serialize zero length schemas, so catch this error
		// early.
		// TODO(asubiotto): We could support this, but not sure we need to.
		colexecerror.InternalError("zero length schema unsupported")
	}
	return &PartitionedDiskQueue{
		typs:                     typs,
		strategy:                 partitionerStrategy,
		cfg:                      cfg,
		partitionIdxToIndex:      make(map[int]int),
		partitions:               make([]partition, 0),
		lastEnqueuedPartitionIdx: -1,
		fdSemaphore:              fdSemaphore,
		diskAcc:                  diskAcc,
	}
}
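
// For instance (a minimal sketch of the nil-semaphore option described above;
// fdSemaphore, cfg, diskAcc, and maxPartitions are hypothetical caller-provided
// values, with maxPartitions the constant maximum number of file descriptors
// the caller expects to use):
//
//	if err := fdSemaphore.Acquire(ctx, maxPartitions); err != nil {
//		return err
//	}
//	defer fdSemaphore.Release(maxPartitions)
//	// Pass a nil semaphore since the file descriptors were acquired up front.
//	q := NewPartitionedDiskQueue(typs, cfg, nil /* fdSemaphore */, PartitionerStrategyDefault, diskAcc)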

type closeWritePartitionArgument int

const (
	retainFD closeWritePartitionArgument = iota
	releaseFD
)

// closeWritePartition enqueues a coldata.ZeroBatch to the partition at index
// idx, resulting in it closing its write file descriptor. This partition may
// still be read from, but never written to again. Note that releaseFDOption
// should be retainFD whenever a new file is opened in the same scope, to avoid
// having to re-enter the semaphore; it should be releaseFD only when no new
// file will be opened in the same scope.
func (p *PartitionedDiskQueue) closeWritePartition(
	ctx context.Context, idx int, releaseFDOption closeWritePartitionArgument,
) error {
	if p.partitions[idx].state != partitionStateWriting {
		colexecerror.InternalError(fmt.Sprintf("illegal state change from %d to partitionStateClosedForWriting, only partitionStateWriting allowed", p.partitions[idx].state))
	}
	if err := p.partitions[idx].Enqueue(ctx, coldata.ZeroBatch); err != nil {
		return err
	}
	if releaseFDOption == releaseFD && p.fdSemaphore != nil {
		p.fdSemaphore.Release(1)
		p.numOpenFDs--
	}
	p.partitions[idx].state = partitionStateClosedForWriting
	return nil
}
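
// Within this file, retainFD is passed when the freed write file descriptor is
// immediately repurposed in the same scope (Enqueue handing it to a newly
// created partition, or Dequeue turning it into the partition's read file
// descriptor), while releaseFD is passed by CloseAllOpenWriteFileDescriptors,
// which opens no new file afterwards.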

func (p *PartitionedDiskQueue) closeReadPartition(idx int) error {
	if p.partitions[idx].state != partitionStateReading {
		colexecerror.InternalError(fmt.Sprintf("illegal state change from %d to partitionStateClosedForReading, only partitionStateReading allowed", p.partitions[idx].state))
	}
	if err := p.partitions[idx].CloseRead(); err != nil {
		return err
	}
	if p.fdSemaphore != nil {
		p.fdSemaphore.Release(1)
		p.numOpenFDs--
	}
	p.partitions[idx].state = partitionStateClosedForReading
	return nil
}

func (p *PartitionedDiskQueue) acquireNewFD(ctx context.Context) error {
	if p.fdSemaphore == nil {
		return nil
	}
	if err := p.fdSemaphore.Acquire(ctx, 1); err != nil {
		return err
	}
	p.numOpenFDs++
	return nil
}

// Enqueue enqueues a batch at partition partitionIdx.
func (p *PartitionedDiskQueue) Enqueue(
	ctx context.Context, partitionIdx int, batch coldata.Batch,
) error {
	idx, ok := p.partitionIdxToIndex[partitionIdx]
	if !ok {
		needToAcquireFD := true
		if p.strategy == PartitionerStrategyCloseOnNewPartition && p.lastEnqueuedPartitionIdx != -1 {
			idxToClose, found := p.partitionIdxToIndex[p.lastEnqueuedPartitionIdx]
			if !found {
				// This would be unexpected.
				return errors.New("PartitionerStrategyCloseOnNewPartition unable to find last Enqueued partition")
			}
			if p.partitions[idxToClose].state == partitionStateWriting {
				// Close the last enqueued partition. No need to release or acquire a new
				// file descriptor, since the acquired FD will represent the new
				// partition's FD opened in Enqueue below.
				if err := p.closeWritePartition(ctx, idxToClose, retainFD); err != nil {
					return err
				}
				needToAcquireFD = false
			} else {
				// The partition that was last enqueued to is not open for writing, so
				// we need to acquire a new FD for this new partition.
				needToAcquireFD = true
			}
		}

		if needToAcquireFD {
			// Acquire only one file descriptor. This will represent the write file
			// descriptor. When we start Dequeueing from this partition, this will
			// represent the read file descriptor.
			if err := p.acquireNewFD(ctx); err != nil {
				return err
			}
		}
		// Partition has not been created yet.
		q, err := NewDiskQueue(ctx, p.typs, p.cfg, p.diskAcc)
		if err != nil {
			return err
		}
		idx = len(p.partitions)
		p.partitions = append(p.partitions, partition{Queue: q})
		p.partitionIdxToIndex[partitionIdx] = idx
	}
	if state := p.partitions[idx].state; state != partitionStateWriting {
		if state == partitionStatePermanentlyClosed {
			return errors.Errorf("partition at index %d permanently closed, cannot Enqueue", partitionIdx)
		}
		return errors.New("Enqueue illegally called after Dequeue or CloseAllOpenWriteFileDescriptors")
	}
	p.lastEnqueuedPartitionIdx = partitionIdx
	return p.partitions[idx].Enqueue(ctx, batch)
}

// Dequeue dequeues a batch from the partition at index partitionIdx, returning
// a zero-length batch if that partition does not exist. If the partition
// exists and a zero-length batch is returned, that partition is closed.
func (p *PartitionedDiskQueue) Dequeue(
	ctx context.Context, partitionIdx int, batch coldata.Batch,
) error {
	idx, ok := p.partitionIdxToIndex[partitionIdx]
	if !ok {
		batch.SetLength(0)
		return nil
	}
	switch state := p.partitions[idx].state; state {
	case partitionStateWriting:
		// Close this partition for writing, but keep the acquired file
		// descriptor: it now represents the read file descriptor opened for
		// Dequeue.
		if err := p.closeWritePartition(ctx, idx, retainFD); err != nil {
			return err
		}
		p.partitions[idx].state = partitionStateReading
	case partitionStateClosedForWriting, partitionStateClosedForReading:
		// There is never a file descriptor acquired for a partition in this
		// state, so acquire one.
		if err := p.acquireNewFD(ctx); err != nil {
			return err
		}
		p.partitions[idx].state = partitionStateReading
	case partitionStateReading:
	// Do nothing.
	case partitionStatePermanentlyClosed:
		return errors.Errorf("partition at index %d permanently closed, cannot Dequeue", partitionIdx)
	default:
		colexecerror.InternalError(fmt.Sprintf("unhandled state %d", state))
	}
	notEmpty, err := p.partitions[idx].Dequeue(ctx, batch)
	if err != nil {
		return err
	}
	if batch.Length() == 0 {
		// The partition is finished; release the acquired file descriptor.
		if err := p.closeReadPartition(idx); err != nil {
			return err
		}
	}
	if !notEmpty {
		// This should never happen. It would mean that there was no batch to
		// Dequeue but more batches will be added in the future (i.e. a zero batch
		// was never enqueued). Since partitions must be closed for writing before
		// reading, this state is unexpected.
		colexecerror.InternalError("DiskQueue unexpectedly returned that more data will be added")
	}
	return nil
}
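
// A caller that has fully drained some partitions can reclaim their disk space
// before the PartitionedDiskQueue as a whole is closed. A minimal sketch (p and
// ctx assumed to be in scope, and the relevant partitions already Dequeued down
// to a zero-length batch):
//
//	// Partitions that returned a zero-length batch from Dequeue are now
//	// inactive; this removes their underlying files.
//	if err := p.CloseInactiveReadPartitions(ctx); err != nil {
//		return err
//	}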

// CloseAllOpenWriteFileDescriptors closes all open write file descriptors
// belonging to partitions that are being Enqueued to. Once this method is
// called, existing partitions may not be enqueued to again.
func (p *PartitionedDiskQueue) CloseAllOpenWriteFileDescriptors(ctx context.Context) error {
	for i, q := range p.partitions {
		if q.state != partitionStateWriting {
			continue
		}
		// closeWritePartition will Release the file descriptor.
		if err := p.closeWritePartition(ctx, i, releaseFD); err != nil {
			return err
		}
	}
	return nil
}

// CloseAllOpenReadFileDescriptors closes all open read file descriptors
// belonging to partitions that are being Dequeued from. If Dequeue is called
// on a closed partition, it will be reopened.
func (p *PartitionedDiskQueue) CloseAllOpenReadFileDescriptors() error {
	for i, q := range p.partitions {
		if q.state != partitionStateReading {
			continue
		}
		// closeReadPartition will Release the file descriptor.
		if err := p.closeReadPartition(i); err != nil {
			return err
		}
	}
	return nil
}

// CloseInactiveReadPartitions closes all partitions that were Dequeued from
// and either Dequeued a coldata.ZeroBatch or were closed through
// CloseAllOpenReadFileDescriptors. This method Closes the underlying
// DiskQueues to remove their files, so these partitions may never be used
// again.
func (p *PartitionedDiskQueue) CloseInactiveReadPartitions(ctx context.Context) error {
	var lastErr error
	for i, q := range p.partitions {
		if q.state != partitionStateClosedForReading {
			continue
		}
		lastErr = q.Close(ctx)
		p.partitions[i].state = partitionStatePermanentlyClosed
	}
	return lastErr
}

// Close closes all the PartitionedDiskQueue's partitions. If an error is
// encountered, the PartitionedDiskQueue will attempt to close all partitions
// anyway and return the last error encountered.
func (p *PartitionedDiskQueue) Close(ctx context.Context) error {
	var lastErr error
	for i, q := range p.partitions {
		if q.state == partitionStatePermanentlyClosed {
			// Already closed.
			continue
		}
		lastErr = q.Close(ctx)
		p.partitions[i].state = partitionStatePermanentlyClosed
	}
	if p.numOpenFDs != 0 {
		// Note that if p.numOpenFDs is non-zero, it must be the case that
		// fdSemaphore is non-nil.
		p.fdSemaphore.Release(p.numOpenFDs)
		p.numOpenFDs = 0
	}
	return lastErr
}