github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/val/tuple.go (about)

     1  // Copyright 2021 Dolthub, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package val
    16  
    17  import (
    18  	"math"
    19  
    20  	"github.com/dolthub/dolt/go/store/hash"
    21  
    22  	"github.com/dolthub/dolt/go/store/pool"
    23  )
    24  
const (
	// MaxTupleFields is the largest number of fields NewTuple accepts; it
	// panics when given more. countSize is the width in bytes of the field
	// count stored in the last bytes of every Tuple. nodeCountSize and
	// treeLevelSize are the widths subtracted for the node count and tree
	// level when deriving MaxTupleDataSize below.
	MaxTupleFields          = 4096
	countSize      ByteSize = 2
	nodeCountSize           = uint64Size
	treeLevelSize           = uint8Size

	// MaxTupleDataSize is the maximum KV length considering the extra
	// flatbuffer metadata required to serialize the message. This number
	// implicitly checks the "last row" size that will append chunk level
	// metadata. Key and value offsets per field are excluded from this number.
	// (uint16) - (field count) - (content hash) - (node count) - (tree level)
	MaxTupleDataSize ByteSize = math.MaxUint16 - countSize - hash.ByteLen - nodeCountSize - treeLevelSize
)
    38  
// A Tuple is a vector of fields encoded as a byte slice. Key-Value Tuple pairs
// are used to store row data within clustered and secondary indexes in Dolt.
//
// The encoding format for Tuples starts with field values packed contiguously from
// the front of the Tuple, followed by field offsets, and finally a field count:
//
//	+---------+---------+-----+---------+----------+-----+----------+-------+
//	| Value 0 | Value 1 | ... | Value K | Offset 1 | ... | Offset K | Count |
//	+---------+---------+-----+---------+----------+-----+----------+-------+
//
// Field offsets encode the byte-offset from the front of the Tuple to the beginning
// of the corresponding field in the Tuple. The offset for the first field is always
// zero and is therefore omitted. Offsets and the field count are little-endian
// encoded uint16 values.
//
// Tuples read and write field values as byte slices. Interpreting these encoded
// values is left up to TupleDesc which knows about a Tuple's schema and associated
// field encodings. Zero-length fields are interpreted as NULL values, so all
// non-NULL values must be encoded with non-zero length. For this reason,
// variable-length strings are encoded with a NUL terminator (see codec.go).
//
// Accessing the ith field where i >= count will return a NULL value. This allows us
// to implicitly add nullable columns to the end of a schema without needing to
// rewrite index storage. However, because Dolt storage is content-addressed, we
// must have a single canonical encoding for any given Tuple. For this reason, the
// NULL suffix of a Tuple is explicitly truncated and the field count reduced.
type Tuple []byte
    66  
// EmptyTuple is a Tuple with no fields: it holds only the two-byte field
// count, which is zero.
var EmptyTuple = Tuple([]byte{0, 0})
    68  
    69  func NewTuple(pool pool.BuffPool, values ...[]byte) Tuple {
    70  	values = trimNullSuffix(values)
    71  
    72  	var count int
    73  	var pos ByteSize
    74  	for _, v := range values {
    75  		if isNull(v) {
    76  			continue
    77  		}
    78  		count++
    79  		pos += sizeOf(v)
    80  	}
    81  	if len(values) > MaxTupleFields {
    82  		panic("tuple field maxIdx exceeds maximum")
    83  	}
    84  	if pos > MaxTupleDataSize {
    85  		panic("tuple data size exceeds maximum")
    86  	}
    87  
    88  	tup, offs := allocateTuple(pool, pos, len(values))
    89  
    90  	count = 0
    91  	pos = ByteSize(0)
    92  	for _, v := range values {
    93  		writeOffset(count, pos, offs)
    94  		count++
    95  
    96  		if isNull(v) {
    97  			continue
    98  		}
    99  
   100  		copy(tup[pos:pos+sizeOf(v)], v)
   101  		pos += sizeOf(v)
   102  	}
   103  
   104  	return tup
   105  }
   106  
   107  func trimNullSuffix(values [][]byte) [][]byte {
   108  	n := len(values)
   109  	for i := len(values) - 1; i >= 0; i-- {
   110  		if values[i] != nil {
   111  			break
   112  		}
   113  		n--
   114  	}
   115  	return values[:n]
   116  }
   117  
   118  func cloneTuple(pool pool.BuffPool, tup Tuple) Tuple {
   119  	buf := pool.Get(uint64(len(tup)))
   120  	copy(buf, tup)
   121  	return buf
   122  }
   123  
   124  func allocateTuple(pool pool.BuffPool, bufSz ByteSize, fields int) (tup Tuple, offs offsets) {
   125  	offSz := offsetsSize(fields)
   126  	tup = pool.Get(uint64(bufSz + offSz + countSize))
   127  
   128  	writeFieldCount(tup, fields)
   129  	offs = offsets(tup[bufSz : bufSz+offSz])
   130  
   131  	return
   132  }
   133  
   134  func (tup Tuple) GetOffset(i int) (int, bool) {
   135  	cnt := tup.Count()
   136  	if i >= cnt {
   137  		return 0, false
   138  	}
   139  
   140  	sz := ByteSize(len(tup))
   141  	split := sz - uint16Size*ByteSize(cnt)
   142  	offs := tup[split : sz-countSize]
   143  
   144  	start, stop := uint16(0), uint16(split)
   145  	if i*2 < len(offs) {
   146  		pos := i * 2
   147  		stop = ReadUint16(offs[pos : pos+2])
   148  	}
   149  	if i > 0 {
   150  		pos := (i - 1) * 2
   151  		start = ReadUint16(offs[pos : pos+2])
   152  	}
   153  
   154  	return int(start), start != stop
   155  }
   156  
   157  // GetField returns the value for field |i|.
   158  func (tup Tuple) GetField(i int) []byte {
   159  	cnt := tup.Count()
   160  	if i >= cnt {
   161  		return nil
   162  	}
   163  
   164  	sz := ByteSize(len(tup))
   165  	split := sz - uint16Size*ByteSize(cnt)
   166  	offs := tup[split : sz-countSize]
   167  
   168  	start, stop := uint16(0), uint16(split)
   169  	if i*2 < len(offs) {
   170  		pos := i * 2
   171  		stop = ReadUint16(offs[pos : pos+2])
   172  	}
   173  	if i > 0 {
   174  		pos := (i - 1) * 2
   175  		start = ReadUint16(offs[pos : pos+2])
   176  	}
   177  
   178  	if start == stop {
   179  		return nil // NULL
   180  	}
   181  
   182  	return tup[start:stop]
   183  }
   184  
// FieldIsNull reports whether field |i| is NULL. Because it defers to
// GetField, it is also true when |i| is at or beyond the stored field count.
func (tup Tuple) FieldIsNull(i int) bool {
	return tup.GetField(i) == nil
}
   188  
   189  func (tup Tuple) Count() int {
   190  	sl := tup[len(tup)-int(countSize):]
   191  	return int(ReadUint16(sl))
   192  }
   193  
// isNull reports whether |val| is a nil slice. An empty but non-nil slice is
// not considered NULL here.
func isNull(val []byte) bool {
	return val == nil
}
   197  
// sizeOf returns the length of |val| converted to ByteSize.
// NOTE(review): the conversion would truncate a length that does not fit in
// ByteSize — confirm callers bound the input.
func sizeOf(val []byte) ByteSize {
	return ByteSize(len(val))
}
   201  
   202  func writeFieldCount(tup Tuple, count int) {
   203  	sl := tup[len(tup)-int(countSize):]
   204  	WriteUint16(sl, uint16(count))
   205  }
   206  
// offsets is the region of a Tuple that holds the two-byte field offsets.
// The first field's offset is not stored, so slot k holds the offset of
// field k+1.
type offsets []byte
   208  
   209  // offsetsSize returns the number of bytes needed to
   210  // store |fieldCount| offsets.
   211  func offsetsSize(count int) ByteSize {
   212  	if count == 0 {
   213  		return 0
   214  	}
   215  	return ByteSize((count - 1) * 2)
   216  }
   217  
   218  // writeOffset writes offset |pos| at index |i|.
   219  func writeOffset(i int, off ByteSize, arr offsets) {
   220  	if i == 0 {
   221  		return
   222  	}
   223  	start := (i - 1) * 2
   224  	WriteUint16(arr[start:start+2], uint16(off))
   225  }