// Copyright 2021 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package val

import (
	"math"

	"github.com/dolthub/dolt/go/store/hash"

	"github.com/dolthub/dolt/go/store/pool"
)

const (
	// MaxTupleFields is the largest number of fields NewTuple accepts
	// (counted after trailing NULLs are trimmed); it panics beyond this.
	MaxTupleFields = 4096
	// countSize is the width in bytes of the field count stored at the
	// very end of every Tuple.
	countSize ByteSize = 2
	// nodeCountSize and treeLevelSize are the widths of the node count and
	// tree level that are subtracted from MaxTupleDataSize below.
	nodeCountSize = uint64Size
	treeLevelSize = uint8Size

	// MaxTupleDataSize is the maximum KV length considering the extra
	// flatbuffer metadata required to serialize the message. This number
	// implicitly checks the "last row" size that will append chunk level
	// metadata. Key and value offsets per field are excluded from this number.
	// (uint16) - (field count) - (content hash) - (node count) - (tree level)
	MaxTupleDataSize ByteSize = math.MaxUint16 - countSize - hash.ByteLen - nodeCountSize - treeLevelSize
)

// A Tuple is a vector of fields encoded as a byte slice. Key-Value Tuple pairs
// are used to store row data within clustered and secondary indexes in Dolt.
//
// The encoding format for Tuples starts with field values packed contiguously from
// the front of the Tuple, followed by field offsets, and finally a field count:
//
//	+---------+---------+-----+---------+----------+-----+----------+-------+
//	| Value 0 | Value 1 | ... | Value K | Offset 1 | ... | Offset K | Count |
//	+---------+---------+-----+---------+----------+-----+----------+-------+
//
// Field offsets encode the byte-offset from the front of the Tuple to the beginning
// of the corresponding field in the Tuple. The offset for the first field is always
// zero and is therefore omitted. Offsets and the field count are little-endian
// encoded uint16 values.
//
// Tuples read and write field values as byte slices. Interpreting these encoded
// values is left up to TupleDesc which knows about a Tuple's schema and associated
// field encodings. Zero-length fields are interpreted as NULL values, all non-NULL
// values must be encoded with non-zero length. For this reason, variable-length
// strings are encoded with a NUL terminator (see codec.go).
//
// Accessing the ith field where i >= count will return a NULL value. This allows us
// to implicitly add nullable columns to the end of a schema without needing to
// rewrite index storage. However, because Dolt storage is content-addressed, we
// must have a single canonical encoding for any given Tuple. For this reason, the
// NULL suffix of a Tuple is explicitly truncated and the field count reduced.
65 type Tuple []byte 66 67 var EmptyTuple = Tuple([]byte{0, 0}) 68 69 func NewTuple(pool pool.BuffPool, values ...[]byte) Tuple { 70 values = trimNullSuffix(values) 71 72 var count int 73 var pos ByteSize 74 for _, v := range values { 75 if isNull(v) { 76 continue 77 } 78 count++ 79 pos += sizeOf(v) 80 } 81 if len(values) > MaxTupleFields { 82 panic("tuple field maxIdx exceeds maximum") 83 } 84 if pos > MaxTupleDataSize { 85 panic("tuple data size exceeds maximum") 86 } 87 88 tup, offs := allocateTuple(pool, pos, len(values)) 89 90 count = 0 91 pos = ByteSize(0) 92 for _, v := range values { 93 writeOffset(count, pos, offs) 94 count++ 95 96 if isNull(v) { 97 continue 98 } 99 100 copy(tup[pos:pos+sizeOf(v)], v) 101 pos += sizeOf(v) 102 } 103 104 return tup 105 } 106 107 func trimNullSuffix(values [][]byte) [][]byte { 108 n := len(values) 109 for i := len(values) - 1; i >= 0; i-- { 110 if values[i] != nil { 111 break 112 } 113 n-- 114 } 115 return values[:n] 116 } 117 118 func cloneTuple(pool pool.BuffPool, tup Tuple) Tuple { 119 buf := pool.Get(uint64(len(tup))) 120 copy(buf, tup) 121 return buf 122 } 123 124 func allocateTuple(pool pool.BuffPool, bufSz ByteSize, fields int) (tup Tuple, offs offsets) { 125 offSz := offsetsSize(fields) 126 tup = pool.Get(uint64(bufSz + offSz + countSize)) 127 128 writeFieldCount(tup, fields) 129 offs = offsets(tup[bufSz : bufSz+offSz]) 130 131 return 132 } 133 134 func (tup Tuple) GetOffset(i int) (int, bool) { 135 cnt := tup.Count() 136 if i >= cnt { 137 return 0, false 138 } 139 140 sz := ByteSize(len(tup)) 141 split := sz - uint16Size*ByteSize(cnt) 142 offs := tup[split : sz-countSize] 143 144 start, stop := uint16(0), uint16(split) 145 if i*2 < len(offs) { 146 pos := i * 2 147 stop = ReadUint16(offs[pos : pos+2]) 148 } 149 if i > 0 { 150 pos := (i - 1) * 2 151 start = ReadUint16(offs[pos : pos+2]) 152 } 153 154 return int(start), start != stop 155 } 156 157 // GetField returns the value for field |i|. 
158 func (tup Tuple) GetField(i int) []byte { 159 cnt := tup.Count() 160 if i >= cnt { 161 return nil 162 } 163 164 sz := ByteSize(len(tup)) 165 split := sz - uint16Size*ByteSize(cnt) 166 offs := tup[split : sz-countSize] 167 168 start, stop := uint16(0), uint16(split) 169 if i*2 < len(offs) { 170 pos := i * 2 171 stop = ReadUint16(offs[pos : pos+2]) 172 } 173 if i > 0 { 174 pos := (i - 1) * 2 175 start = ReadUint16(offs[pos : pos+2]) 176 } 177 178 if start == stop { 179 return nil // NULL 180 } 181 182 return tup[start:stop] 183 } 184 185 func (tup Tuple) FieldIsNull(i int) bool { 186 return tup.GetField(i) == nil 187 } 188 189 func (tup Tuple) Count() int { 190 sl := tup[len(tup)-int(countSize):] 191 return int(ReadUint16(sl)) 192 } 193 194 func isNull(val []byte) bool { 195 return val == nil 196 } 197 198 func sizeOf(val []byte) ByteSize { 199 return ByteSize(len(val)) 200 } 201 202 func writeFieldCount(tup Tuple, count int) { 203 sl := tup[len(tup)-int(countSize):] 204 WriteUint16(sl, uint16(count)) 205 } 206 207 type offsets []byte 208 209 // offsetsSize returns the number of bytes needed to 210 // store |fieldCount| offsets. 211 func offsetsSize(count int) ByteSize { 212 if count == 0 { 213 return 0 214 } 215 return ByteSize((count - 1) * 2) 216 } 217 218 // writeOffset writes offset |pos| at index |i|. 219 func writeOffset(i int, off ByteSize, arr offsets) { 220 if i == 0 { 221 return 222 } 223 start := (i - 1) * 2 224 WriteUint16(arr[start:start+2], uint16(off)) 225 }