github.com/cockroachdb/cockroachdb-parser@v0.23.3-0.20240213214944-911057d40c9a/pkg/col/coldata/batch.go

// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package coldata

import (
	"fmt"
	"math"
	"strings"
	"sync/atomic"

	"github.com/cockroachdb/cockroachdb-parser/pkg/sql/types"
	"github.com/cockroachdb/cockroachdb-parser/pkg/util"
	"github.com/cockroachdb/errors"
)

// Batch is the type that columnar operators receive and produce. It
// represents a set of column vectors (partial data columns) as well as
// metadata about a batch, like the selection vector (which rows in the column
// batch are selected).
type Batch interface {
	// Length returns the number of values in the columns in the batch.
	Length() int
	// SetLength sets the number of values in the columns in the batch. Note
	// that if the selection vector will be set or updated on the batch, it
	// must be set **before** setting the length.
	SetLength(int)
	// Capacity returns the maximum number of values that can be stored in the
	// columns in the batch. Note that it could be a lower bound meaning some
	// of the Vecs could actually have larger underlying capacity (for example,
	// if they have been appended to).
	Capacity() int
	// Width returns the number of columns in the batch.
	Width() int
	// ColVec returns the ith Vec in this batch.
	ColVec(i int) Vec
	// ColVecs returns all of the underlying Vecs in this batch.
	ColVecs() []Vec
	// Selection, if not nil, returns the selection vector on this batch: a
	// densely-packed list of the *increasing* indices in each column that have
	// not been filtered out by a previous step.
	// TODO(yuzefovich): consider ensuring that the length of the returned
	// slice equals the length of the batch.
	Selection() []int
	// SetSelection sets whether this batch is using its selection vector or
	// not.
	SetSelection(bool)
	// AppendCol appends the given Vec to this batch.
	AppendCol(Vec)
	// ReplaceCol replaces the current Vec at the provided index with the
	// provided Vec. The original and the replacement vectors *must* be of the
	// same type.
	ReplaceCol(Vec, int)
	// Reset modifies the caller in-place to have the given length and columns
	// with the given types. If it's possible, Reset will reuse the existing
	// columns and allocations, invalidating existing references to the Batch
	// or its Vecs. However, Reset does _not_ zero out the column data.
	//
	// NOTE: Reset can allocate a new Batch, so when calling from the
	// vectorized engine consider either allocating a new Batch explicitly via
	// colmem.Allocator or calling ResetInternalBatch.
	Reset(typs []*types.T, length int, factory ColumnFactory)
	// ResetInternalBatch resets a batch and its underlying Vecs for reuse.
	// It's important for callers to call ResetInternalBatch if they own
	// internal batches that they reuse as not doing this could result in
	// correctness or memory blowup issues. It unsets the selection and sets
	// the length to 0.
	ResetInternalBatch()
	// String returns a pretty representation of this batch.
	String() string
}
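
// exampleSelectionBeforeLength is an illustrative sketch added for
// documentation purposes only; it is not part of the upstream file and the
// function name is hypothetical. It shows the call pattern described on
// SetLength and Selection above: populate and enable the selection vector
// *before* calling SetLength, and translate logical positions through
// Selection() when reading. It assumes the batch has an int64 column at
// index 0 with at least three populated tuples.
func exampleSelectionBeforeLength(b Batch) []int64 {
	col := b.ColVec(0).Int64()
	// Keep only physical tuples 0 and 2: the selection vector is set first...
	b.SetSelection(true)
	copy(b.Selection(), []int{0, 2})
	// ...and only then is the length updated to the number of selected tuples.
	b.SetLength(2)
	// Consumers go through the selection vector to find the physical rows.
	sel := b.Selection()
	out := make([]int64, 0, b.Length())
	for i := 0; i < b.Length(); i++ {
		out = append(out, col[sel[i]])
	}
	return out
}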

var _ Batch = &MemBatch{}

// DefaultColdataBatchSize is the default value of coldata-batch-size.
const DefaultColdataBatchSize = 1024

// defaultBatchSize is the size of batches that is used in the non-test
// setting. Initially, 1024 was picked based on the MonetDB/X100 paper and was
// later confirmed to be very good using the tpchvec/bench benchmark on TPC-H
// queries (the best number according to that benchmark was 1280, but it was
// negligibly better, so we decided to keep 1024 as it is a power of 2).
var defaultBatchSize = int64(util.ConstantWithMetamorphicTestRange(
	"coldata-batch-size",
	DefaultColdataBatchSize, /* defaultValue */
	// min is set to 3 to match colexec's minBatchSize setting.
	3, /* min */
	MaxBatchSize,
))

var batchSize = defaultBatchSize

// BatchSize is the maximum number of tuples that fit in a column batch.
func BatchSize() int {
	return int(atomic.LoadInt64(&batchSize))
}

// MaxBatchSize is the maximum acceptable size of batches.
const MaxBatchSize = 4096

// SetBatchSizeForTests modifies the batchSize variable. It should only be
// used in tests. Batch sizes greater than MaxBatchSize will return an error.
func SetBatchSizeForTests(newBatchSize int) error {
	if newBatchSize > MaxBatchSize {
		return errors.Errorf("batch size %d greater than maximum allowed batch size %d", newBatchSize, MaxBatchSize)
	}
	atomic.SwapInt64(&batchSize, int64(newBatchSize))
	return nil
}

// NewMemBatch allocates a new in-memory Batch.
// TODO(jordan): pool these allocations.
func NewMemBatch(typs []*types.T, factory ColumnFactory) Batch {
	return NewMemBatchWithCapacity(typs, BatchSize(), factory)
}

// NewMemBatchWithCapacity allocates a new in-memory Batch with the given
// column size. Use for operators that have a precisely-sized output batch.
func NewMemBatchWithCapacity(typs []*types.T, capacity int, factory ColumnFactory) Batch {
	b := NewMemBatchNoCols(typs, capacity).(*MemBatch)
	cols := make([]memColumn, len(typs))
	for i, t := range typs {
		col := &cols[i]
		col.init(t, capacity, factory)
		b.b[i] = col
	}
	return b
}

// NewMemBatchNoCols creates a "skeleton" of a new in-memory Batch. It
// allocates memory for the selection vector but does *not* allocate any
// memory for the column vectors - those will have to be added separately.
func NewMemBatchNoCols(typs []*types.T, capacity int) Batch {
	if max := math.MaxUint16; capacity > max {
		panic(fmt.Sprintf(`batches cannot have capacity larger than %d; requested %d`, max, capacity))
	}
	b := &MemBatch{}
	b.capacity = capacity
	b.b = make([]Vec, len(typs))
	b.sel = make([]int, capacity)
	return b
}

// ZeroBatch is a schema-less Batch of length 0.
var ZeroBatch = &zeroBatch{
	MemBatch: NewMemBatchWithCapacity(
		nil /* typs */, 0 /* capacity */, StandardColumnFactory,
	).(*MemBatch),
}
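
// exampleBuildBatch is an illustrative sketch, not part of the upstream file,
// showing how the constructors above are typically used: a batch with a
// single int64 column is allocated with an exact capacity via
// NewMemBatchWithCapacity, filled, and its length is set last. The function
// name is hypothetical; types.Int and StandardColumnFactory come from this
// repository.
func exampleBuildBatch(vals []int64) Batch {
	typs := []*types.T{types.Int}
	b := NewMemBatchWithCapacity(typs, len(vals), StandardColumnFactory)
	col := b.ColVec(0).Int64()
	for i, v := range vals {
		col[i] = v
	}
	b.SetLength(len(vals))
	return b
}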

// zeroBatch is a wrapper around MemBatch that prohibits modifications of the
// batch.
type zeroBatch struct {
	*MemBatch
}

var _ Batch = &zeroBatch{}

func (b *zeroBatch) Length() int {
	return 0
}

func (b *zeroBatch) Capacity() int {
	return 0
}

func (b *zeroBatch) SetLength(int) {
	panic("length should not be changed on zero batch")
}

func (b *zeroBatch) SetSelection(bool) {
	panic("selection should not be changed on zero batch")
}

func (b *zeroBatch) AppendCol(Vec) {
	panic("no columns should be appended to zero batch")
}

func (b *zeroBatch) ReplaceCol(Vec, int) {
	panic("no columns should be replaced in zero batch")
}

func (b *zeroBatch) Reset([]*types.T, int, ColumnFactory) {
	panic("zero batch should not be reset")
}

// MemBatch is an in-memory implementation of Batch.
type MemBatch struct {
	// length is the length of the batch or sel in tuples.
	length int
	// capacity is the maximum number of tuples that can be stored in this
	// MemBatch.
	capacity int
	// b is the slice of columns in this batch.
	b      []Vec
	useSel bool
	// sel is - if useSel is true - a selection vector from upstream. A
	// selection vector is a list of selected tuple indices in this MemBatch's
	// columns (tuples for which indices are not in sel are considered to be
	// "not present").
	sel []int
}

// Length implements the Batch interface.
func (m *MemBatch) Length() int {
	return m.length
}

// Capacity implements the Batch interface.
func (m *MemBatch) Capacity() int {
	return m.capacity
}

// Width implements the Batch interface.
func (m *MemBatch) Width() int {
	return len(m.b)
}

// ColVec implements the Batch interface.
func (m *MemBatch) ColVec(i int) Vec {
	return m.b[i]
}

// ColVecs implements the Batch interface.
func (m *MemBatch) ColVecs() []Vec {
	return m.b
}

// Selection implements the Batch interface.
func (m *MemBatch) Selection() []int {
	if !m.useSel {
		return nil
	}
	return m.sel
}

// SetSelection implements the Batch interface.
func (m *MemBatch) SetSelection(b bool) {
	m.useSel = b
}

// SetLength implements the Batch interface.
func (m *MemBatch) SetLength(length int) {
	m.length = length
}

// AppendCol implements the Batch interface.
func (m *MemBatch) AppendCol(col Vec) {
	m.b = append(m.b, col)
}

// ReplaceCol implements the Batch interface.
func (m *MemBatch) ReplaceCol(col Vec, colIdx int) {
	if m.b[colIdx] != nil && !m.b[colIdx].Type().Identical(col.Type()) {
		panic(fmt.Sprintf("unexpected replacement: original vector is %s "+
			"whereas the replacement is %s", m.b[colIdx].Type(), col.Type()))
	}
	m.b[colIdx] = col
}
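
// exampleReuseScratchBatch is an illustrative sketch, not part of the
// upstream file, of the reuse pattern described on ResetInternalBatch: an
// operator that owns a scratch batch resets it before refilling it on every
// call instead of allocating a fresh batch. The function name is
// hypothetical, and the scratch batch is assumed to have an int64 column at
// index 0.
func exampleReuseScratchBatch(scratch Batch, vals []int64) Batch {
	// Clear the selection vector, length, and null bitmaps left over from the
	// previous use; the column memory itself is kept and reused.
	scratch.ResetInternalBatch()
	col := scratch.ColVec(0).Int64()
	n := len(vals)
	if n > scratch.Capacity() {
		n = scratch.Capacity()
	}
	for i := 0; i < n; i++ {
		col[i] = vals[i]
	}
	scratch.SetLength(n)
	return scratch
}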

// Reset implements the Batch interface.
func (m *MemBatch) Reset(typs []*types.T, length int, factory ColumnFactory) {
	cannotReuse := m == nil || m.Capacity() < length || m.Width() < len(typs)
	for i := 0; i < len(typs) && !cannotReuse; i++ {
		// TODO(yuzefovich): change this when DatumVec is introduced.
		// TODO(yuzefovich): requiring that types are "identical" might be an
		// overkill - the vectors could have the same physical representation
		// but non-identical types. Think through this more.
		if !m.ColVec(i).Type().Identical(typs[i]) {
			cannotReuse = true
			break
		}
	}
	if cannotReuse {
		*m = *NewMemBatchWithCapacity(typs, length, factory).(*MemBatch)
		m.SetLength(length)
		return
	}
	// Yay! We can reuse m. NB It's not specified in the Reset contract, but
	// probably a good idea to keep all modifications below this line.
	//
	// Note that we're intentionally not calling m.SetLength() here because
	// that would update offsets in the bytes vectors which is not necessary
	// since those will get reset in ResetInternalBatch anyway.
	m.b = m.b[:len(typs)]
	m.sel = m.sel[:length]
	m.ResetInternalBatch()
	m.SetLength(length)
}

// ResetInternalBatch implements the Batch interface.
func (m *MemBatch) ResetInternalBatch() {
	m.SetLength(0 /* length */)
	m.SetSelection(false)
	for _, v := range m.b {
		if v.CanonicalTypeFamily() != types.UnknownFamily {
			v.Nulls().UnsetNulls()
			ResetIfBytesLike(v)
		}
	}
}

// String returns a pretty representation of this batch.
func (m *MemBatch) String() string {
	if m.Length() == 0 {
		return "[zero-length batch]"
	}
	if VecsToStringWithRowPrefix == nil {
		panic("need to inject the implementation from sql/colconv package")
	}
	return strings.Join(VecsToStringWithRowPrefix(m.ColVecs(), m.Length(), m.Selection(), "" /* prefix */), "\n")
}

// VecsToStringWithRowPrefix returns a pretty representation of the vectors.
// This method will convert all vectors to datums in order to print everything
// in the same manner as the tree.Datum representation does. Each row is
// printed in a separate string.
//
// The implementation lives in the colconv package and is injected during
// initialization.
var VecsToStringWithRowPrefix func(vecs []Vec, length int, sel []int, prefix string) []string

// GetBatchMemSize returns the total memory footprint of the batch.
//
// The implementation lives in the sql/colmem package since it depends on
// sem/tree, and we don't want to make coldata depend on that.
var GetBatchMemSize func(Batch) int64
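
// exampleBatchMemSize is an illustrative sketch, not part of the upstream
// file, showing one way a caller might use the injected GetBatchMemSize hook
// above: the function variable stays nil until another package (sql/colmem in
// this repository) assigns it at initialization, so a guard is shown here for
// contexts where the injection may not have happened. The function name and
// the zero fallback are assumptions of this sketch, not upstream behavior.
func exampleBatchMemSize(b Batch) int64 {
	if GetBatchMemSize == nil {
		// The hook has not been injected yet; report nothing rather than panic.
		return 0
	}
	return GetBatchMemSize(b)
}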