github.com/cockroachdb/cockroachdb-parser@v0.23.3-0.20240213214944-911057d40c9a/pkg/col/coldata/batch.go

// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package coldata

import (
	"fmt"
	"math"
	"strings"
	"sync/atomic"

	"github.com/cockroachdb/cockroachdb-parser/pkg/sql/types"
	"github.com/cockroachdb/cockroachdb-parser/pkg/util"
	"github.com/cockroachdb/errors"
)

// Batch is the type that columnar operators receive and produce. It
// represents a set of column vectors (partial data columns) as well as
// metadata about a batch, like the selection vector (which rows in the column
// batch are selected).
type Batch interface {
	// Length returns the number of values in the columns in the batch.
	Length() int
	// SetLength sets the number of values in the columns in the batch. Note
	// that if the selection vector will be set or updated on the batch, it
	// must be set **before** setting the length.
	SetLength(int)
	// Capacity returns the maximum number of values that can be stored in the
	// columns in the batch. Note that it could be a lower bound meaning some
	// of the Vecs could actually have larger underlying capacity (for example,
	// if they have been appended to).
	Capacity() int
	// Width returns the number of columns in the batch.
	Width() int
	// ColVec returns the ith Vec in this batch.
	ColVec(i int) Vec
	// ColVecs returns all of the underlying Vecs in this batch.
	ColVecs() []Vec
	// Selection, if not nil, returns the selection vector on this batch: a
	// densely-packed list of the *increasing* indices in each column that have
	// not been filtered out by a previous step.
	// TODO(yuzefovich): consider ensuring that the length of the returned
	// slice equals the length of the batch.
	Selection() []int
	// SetSelection sets whether this batch is using its selection vector or
	// not.
	SetSelection(bool)
	// AppendCol appends the given Vec to this batch.
	AppendCol(Vec)
	// ReplaceCol replaces the current Vec at the provided index with the
	// provided Vec. The original and the replacement vectors *must* be of the
	// same type.
	ReplaceCol(Vec, int)
	// Reset modifies the caller in-place to have the given length and columns
	// with the given types. If it's possible, Reset will reuse the existing
	// columns and allocations, invalidating existing references to the Batch
	// or its Vecs. However, Reset does _not_ zero out the column data.
	//
	// NOTE: Reset can allocate a new Batch, so when calling from the
	// vectorized engine consider either allocating a new Batch explicitly via
	// colmem.Allocator or calling ResetInternalBatch.
	Reset(typs []*types.T, length int, factory ColumnFactory)
	// ResetInternalBatch resets a batch and its underlying Vecs for reuse.
	// It's important for callers to call ResetInternalBatch if they own
	// internal batches that they reuse as not doing this could result in
	// correctness or memory blowup issues. It unsets the selection and sets
	// the length to 0.
	ResetInternalBatch()
	// String returns a pretty representation of this batch.
	String() string
}
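
// exampleSelectionBeforeLength is an illustrative sketch added for
// documentation purposes only; it is not part of the upstream file and the
// function name is hypothetical. It shows the call pattern described on
// SetLength and Selection above: populate and enable the selection vector
// *before* calling SetLength, and translate logical positions through
// Selection() when reading. It assumes the batch has an int64 column at
// index 0 with at least three populated tuples.
func exampleSelectionBeforeLength(b Batch) []int64 {
	col := b.ColVec(0).Int64()
	// Keep only physical tuples 0 and 2: the selection vector is set first...
	b.SetSelection(true)
	copy(b.Selection(), []int{0, 2})
	// ...and only then is the length updated to the number of selected tuples.
	b.SetLength(2)
	// Consumers go through the selection vector to find the physical rows.
	sel := b.Selection()
	out := make([]int64, 0, b.Length())
	for i := 0; i < b.Length(); i++ {
		out = append(out, col[sel[i]])
	}
	return out
}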

var _ Batch = &MemBatch{}

// DefaultColdataBatchSize is the default value of coldata-batch-size.
const DefaultColdataBatchSize = 1024

// defaultBatchSize is the size of batches that is used in the non-test
// setting. Initially, 1024 was picked based on the MonetDB/X100 paper and was
// later confirmed to be very good using the tpchvec/bench benchmark on TPC-H
// queries (the best number according to that benchmark was 1280, but it was
// negligibly better, so we decided to keep 1024 as it is a power of 2).
var defaultBatchSize = int64(util.ConstantWithMetamorphicTestRange(
	"coldata-batch-size",
	DefaultColdataBatchSize, /* defaultValue */
	// min is set to 3 to match colexec's minBatchSize setting.
	3, /* min */
	MaxBatchSize,
))

var batchSize = defaultBatchSize

// BatchSize is the maximum number of tuples that fit in a column batch.
func BatchSize() int {
	return int(atomic.LoadInt64(&batchSize))
}

// MaxBatchSize is the maximum acceptable size of batches.
const MaxBatchSize = 4096

// SetBatchSizeForTests modifies the batchSize variable. It should only be
// used in tests. Batch sizes greater than MaxBatchSize will return an error.
func SetBatchSizeForTests(newBatchSize int) error {
	if newBatchSize > MaxBatchSize {
		return errors.Errorf("batch size %d greater than maximum allowed batch size %d", newBatchSize, MaxBatchSize)
	}
	atomic.SwapInt64(&batchSize, int64(newBatchSize))
	return nil
}

// NewMemBatch allocates a new in-memory Batch.
// TODO(jordan): pool these allocations.
func NewMemBatch(typs []*types.T, factory ColumnFactory) Batch {
	return NewMemBatchWithCapacity(typs, BatchSize(), factory)
}

// NewMemBatchWithCapacity allocates a new in-memory Batch with the given
// column size. Use for operators that have a precisely-sized output batch.
func NewMemBatchWithCapacity(typs []*types.T, capacity int, factory ColumnFactory) Batch {
	b := NewMemBatchNoCols(typs, capacity).(*MemBatch)
	cols := make([]memColumn, len(typs))
	for i, t := range typs {
		col := &cols[i]
		col.init(t, capacity, factory)
		b.b[i] = col
	}
	return b
}

// NewMemBatchNoCols creates a "skeleton" of a new in-memory Batch. It
// allocates memory for the selection vector but does *not* allocate any
// memory for the column vectors - those will have to be added separately.
func NewMemBatchNoCols(typs []*types.T, capacity int) Batch {
	if max := math.MaxUint16; capacity > max {
		panic(fmt.Sprintf(`batches cannot have capacity larger than %d; requested %d`, max, capacity))
	}
	b := &MemBatch{}
	b.capacity = capacity
	b.b = make([]Vec, len(typs))
	b.sel = make([]int, capacity)
	return b
}

// ZeroBatch is a schema-less Batch of length 0.
var ZeroBatch = &zeroBatch{
	MemBatch: NewMemBatchWithCapacity(
		nil /* typs */, 0 /* capacity */, StandardColumnFactory,
	).(*MemBatch),
}
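
// exampleBuildBatch is an illustrative sketch, not part of the upstream file,
// showing how the constructors above are typically used: a batch with a
// single int64 column is allocated with an exact capacity via
// NewMemBatchWithCapacity, filled, and its length is set last. The function
// name is hypothetical; types.Int and StandardColumnFactory come from this
// repository.
func exampleBuildBatch(vals []int64) Batch {
	typs := []*types.T{types.Int}
	b := NewMemBatchWithCapacity(typs, len(vals), StandardColumnFactory)
	col := b.ColVec(0).Int64()
	for i, v := range vals {
		col[i] = v
	}
	b.SetLength(len(vals))
	return b
}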

// zeroBatch is a wrapper around MemBatch that prohibits modifications of the
// batch.
type zeroBatch struct {
	*MemBatch
}

var _ Batch = &zeroBatch{}

func (b *zeroBatch) Length() int {
	return 0
}

func (b *zeroBatch) Capacity() int {
	return 0
}

func (b *zeroBatch) SetLength(int) {
	panic("length should not be changed on zero batch")
}

func (b *zeroBatch) SetSelection(bool) {
	panic("selection should not be changed on zero batch")
}

func (b *zeroBatch) AppendCol(Vec) {
	panic("no columns should be appended to zero batch")
}

func (b *zeroBatch) ReplaceCol(Vec, int) {
	panic("no columns should be replaced in zero batch")
}

func (b *zeroBatch) Reset([]*types.T, int, ColumnFactory) {
	panic("zero batch should not be reset")
}

// MemBatch is an in-memory implementation of Batch.
type MemBatch struct {
	// length is the length of the batch or sel in tuples.
	length int
	// capacity is the maximum number of tuples that can be stored in this
	// MemBatch.
	capacity int
	// b is the slice of columns in this batch.
	b      []Vec
	useSel bool
	// sel is - if useSel is true - a selection vector from upstream. A
	// selection vector is a list of selected tuple indices in this MemBatch's
	// columns (tuples for which indices are not in sel are considered to be
	// "not present").
	sel []int
}

// Length implements the Batch interface.
func (m *MemBatch) Length() int {
	return m.length
}

// Capacity implements the Batch interface.
func (m *MemBatch) Capacity() int {
	return m.capacity
}

// Width implements the Batch interface.
func (m *MemBatch) Width() int {
	return len(m.b)
}

// ColVec implements the Batch interface.
func (m *MemBatch) ColVec(i int) Vec {
	return m.b[i]
}

// ColVecs implements the Batch interface.
func (m *MemBatch) ColVecs() []Vec {
	return m.b
}

// Selection implements the Batch interface.
func (m *MemBatch) Selection() []int {
	if !m.useSel {
		return nil
	}
	return m.sel
}

// SetSelection implements the Batch interface.
func (m *MemBatch) SetSelection(b bool) {
	m.useSel = b
}

// SetLength implements the Batch interface.
func (m *MemBatch) SetLength(length int) {
	m.length = length
}

// AppendCol implements the Batch interface.
func (m *MemBatch) AppendCol(col Vec) {
	m.b = append(m.b, col)
}

// ReplaceCol implements the Batch interface.
func (m *MemBatch) ReplaceCol(col Vec, colIdx int) {
	if m.b[colIdx] != nil && !m.b[colIdx].Type().Identical(col.Type()) {
		panic(fmt.Sprintf("unexpected replacement: original vector is %s "+
			"whereas the replacement is %s", m.b[colIdx].Type(), col.Type()))
	}
	m.b[colIdx] = col
}
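
// exampleReuseScratchBatch is an illustrative sketch, not part of the
// upstream file, of the reuse pattern described on ResetInternalBatch: an
// operator that owns a scratch batch resets it before refilling it on every
// call instead of allocating a fresh batch. The function name is
// hypothetical, and the scratch batch is assumed to have an int64 column at
// index 0.
func exampleReuseScratchBatch(scratch Batch, vals []int64) Batch {
	// Clear the selection vector, length, and null bitmaps left over from the
	// previous use; the column memory itself is kept and reused.
	scratch.ResetInternalBatch()
	col := scratch.ColVec(0).Int64()
	n := len(vals)
	if n > scratch.Capacity() {
		n = scratch.Capacity()
	}
	for i := 0; i < n; i++ {
		col[i] = vals[i]
	}
	scratch.SetLength(n)
	return scratch
}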

// Reset implements the Batch interface.
func (m *MemBatch) Reset(typs []*types.T, length int, factory ColumnFactory) {
	cannotReuse := m == nil || m.Capacity() < length || m.Width() < len(typs)
	for i := 0; i < len(typs) && !cannotReuse; i++ {
		// TODO(yuzefovich): change this when DatumVec is introduced.
		// TODO(yuzefovich): requiring that types are "identical" might be an
		// overkill - the vectors could have the same physical representation
		// but non-identical types. Think through this more.
		if !m.ColVec(i).Type().Identical(typs[i]) {
			cannotReuse = true
			break
		}
	}
	if cannotReuse {
		*m = *NewMemBatchWithCapacity(typs, length, factory).(*MemBatch)
		m.SetLength(length)
		return
	}
	// Yay! We can reuse m. NB It's not specified in the Reset contract, but
	// probably a good idea to keep all modifications below this line.
	//
	// Note that we're intentionally not calling m.SetLength() here because
	// that would update offsets in the bytes vectors which is not necessary
	// since those will get reset in ResetInternalBatch anyway.
	m.b = m.b[:len(typs)]
	m.sel = m.sel[:length]
	m.ResetInternalBatch()
	m.SetLength(length)
}

// ResetInternalBatch implements the Batch interface.
func (m *MemBatch) ResetInternalBatch() {
	m.SetLength(0 /* length */)
	m.SetSelection(false)
	for _, v := range m.b {
		if v.CanonicalTypeFamily() != types.UnknownFamily {
			v.Nulls().UnsetNulls()
			ResetIfBytesLike(v)
		}
	}
}

// String returns a pretty representation of this batch.
func (m *MemBatch) String() string {
	if m.Length() == 0 {
		return "[zero-length batch]"
	}
	if VecsToStringWithRowPrefix == nil {
		panic("need to inject the implementation from sql/colconv package")
	}
	return strings.Join(VecsToStringWithRowPrefix(m.ColVecs(), m.Length(), m.Selection(), "" /* prefix */), "\n")
}

// VecsToStringWithRowPrefix returns a pretty representation of the vectors.
// This method will convert all vectors to datums in order to print everything
// in the same manner as the tree.Datum representation does. Each row is
// printed in a separate string.
//
// The implementation lives in the colconv package and is injected during
// initialization.
var VecsToStringWithRowPrefix func(vecs []Vec, length int, sel []int, prefix string) []string

// GetBatchMemSize returns the total memory footprint of the batch.
//
// The implementation lives in the sql/colmem package since it depends on
// sem/tree, and we don't want to make coldata depend on that.
var GetBatchMemSize func(Batch) int64
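
// exampleBatchMemSize is an illustrative sketch, not part of the upstream
// file, showing one way a caller might use the injected GetBatchMemSize hook
// above: the function variable stays nil until another package (sql/colmem in
// this repository) assigns it at initialization, so a guard is shown here for
// contexts where the injection may not have happened. The function name and
// the zero fallback are assumptions of this sketch, not upstream behavior.
func exampleBatchMemSize(b Batch) int64 {
	if GetBatchMemSize == nil {
		// The hook has not been injected yet; report nothing rather than panic.
		return 0
	}
	return GetBatchMemSize(b)
}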