github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/prolly/tree/blob_builder.go

// Copyright 2022 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tree

import (
	"bytes"
	"context"
	"errors"
	"io"

	"github.com/dolthub/go-mysql-server/sql"
	sqltypes "github.com/dolthub/go-mysql-server/sql/types"
	"github.com/goccy/go-json"

	"github.com/dolthub/dolt/go/store/hash"
	"github.com/dolthub/dolt/go/store/prolly/message"
	"github.com/dolthub/dolt/go/store/types"
)

const DefaultFixedChunkLength = 4000

var ErrInvalidChunkSize = errors.New("invalid chunkSize; value must be a multiple of hash.ByteLen (20)")

func mustNewBlobBuilder(chunkSize int) *BlobBuilder {
	b, _ := NewBlobBuilder(chunkSize)
	return b
}

// NewBlobBuilder returns a BlobBuilder that stores blob content as an
// append-only tree. |chunkSize| fixes the split size of leaf and intermediate
// node chunks, and must be a multiple of hash.ByteLen.
func NewBlobBuilder(chunkSize int) (*BlobBuilder, error) {
	if chunkSize%hash.ByteLen != 0 {
		return nil, ErrInvalidChunkSize
	}

	keys := make([][]byte, chunkSize/hash.ByteLen)
	for i := range keys {
		keys[i] = zeroKey
	}
	return &BlobBuilder{
		chunkSize: chunkSize,
		keys:      keys,
	}, nil
}

type blobNodeWriter interface {
	Write(ctx context.Context, r io.Reader) (hash.Hash, uint64, error)
}

type BlobBuilder struct {
	ns        NodeStore
	S         message.Serializer
	chunkSize int
	keys      [][]byte
	wr        blobNodeWriter
	lastN     Node
	topLevel  int

	levelCap int
	buf      []byte
	vals     [][]byte
	subtrees []uint64
}

func (b *BlobBuilder) SetNodeStore(ns NodeStore) {
	b.ns = ns
	b.S = message.NewBlobSerializer(ns.Pool())
}

// Reset clears the BlobBuilder for re-use.
func (b *BlobBuilder) Reset() {
	b.wr = nil
	b.topLevel = 0
}
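// buildBlobSketch is an illustrative sketch, not part of the original file:
// it shows the intended call sequence for BlobBuilder. Construct with a chunk
// size that is a multiple of hash.ByteLen, attach a NodeStore, size the tree
// with Init, then Chunk the payload. The helper name and the |ns| and |data|
// parameters are assumptions made for the example.
func buildBlobSketch(ctx context.Context, ns NodeStore, data []byte) (Node, hash.Hash, error) {
	bb, err := NewBlobBuilder(DefaultFixedChunkLength) // 4000 = 200 * hash.ByteLen
	if err != nil {
		return Node{}, hash.Hash{}, err
	}
	bb.SetNodeStore(ns)
	bb.Init(len(data)) // computes tree height and wires up the writer chain
	return bb.Chunk(ctx, bytes.NewReader(data))
}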
// Init calculates tree dimensions for a given blob and constructs the chain
// of level writers used to build it.
func (b *BlobBuilder) Init(dataSize int) {
	b.Reset()

	if dataSize == 0 {
		return
	}

	if dataSize <= b.chunkSize {
		b.wr = &blobLeafWriter{
			bb:  b,
			buf: make([]byte, dataSize),
		}
		return
	}

	b.wr = &blobLeafWriter{
		bb:  b,
		buf: make([]byte, b.chunkSize),
	}

	numAddrs := b.chunkSize / hash.ByteLen
	dataSize = dataSize / b.chunkSize
	for dataSize > 0 {
		dataSize = dataSize / numAddrs
		b.topLevel += 1
	}

	// Allocate everything we need in one batch, then slice it up below.
	if b.levelCap < b.topLevel {
		b.expand(numAddrs)
		b.levelCap = b.topLevel
	}

	writers := make([]blobLevelWriter, b.topLevel)
	for i, addrs := 0, 0; i < b.topLevel; i, addrs = i+1, addrs+numAddrs {
		wr := &writers[i]
		wr.bb = b
		wr.child = b.wr
		wr.buf = b.buf[addrs*hash.ByteLen : (addrs+numAddrs)*hash.ByteLen]
		wr.vals = b.vals[addrs : addrs+numAddrs]
		wr.subtrees = b.subtrees[addrs : addrs+numAddrs]
		wr.level = i + 1
		wr.sz = numAddrs
		b.wr = wr
	}
}

func (b *BlobBuilder) expand(numAddrs int) {
	b.buf = make([]byte, b.topLevel*numAddrs*hash.ByteLen)
	b.vals = make([][]byte, numAddrs*b.topLevel)
	b.subtrees = make([]uint64, numAddrs*b.topLevel)
}

// Chunk builds the blob tree by passing the Reader to the chain of level
// writers, terminating in a leaf writer. The leaf writer reads chunks from the
// Reader and writes them, returning their hashes to its parent level writer.
// When the parent level writer fills up with addresses, it writes a chunk and
// returns that address to its parent. This continues until the Reader returns
// io.EOF, at which point every writer in the chain completes its chunk and we
// return the root node.
func (b *BlobBuilder) Chunk(ctx context.Context, r io.Reader) (Node, hash.Hash, error) {
	if b.wr == nil {
		return Node{}, hash.Hash{}, nil
	}
	h, _, err := b.wr.Write(ctx, r)
	if err != nil && err != io.EOF {
		return Node{}, hash.Hash{}, err
	}
	return b.lastN, h, nil
}

// blobLeafWriter writes leaf chunks of the blob, with max capacity len(buf),
// for every call to Write().
type blobLeafWriter struct {
	bb  *BlobBuilder
	buf []byte
}

var zeroKey = []byte{0}
var zeroKeys = [][]byte{zeroKey}
var leafSubtrees = []uint64{1}

func (lw *blobLeafWriter) Write(ctx context.Context, r io.Reader) (hash.Hash, uint64, error) {
	// A single Read supplies each leaf; n may be less than len(buf) for
	// readers that return short reads.
	n, err := r.Read(lw.buf)
	if err != nil {
		return hash.Hash{}, 0, err
	}
	h, err := lw.bb.write(ctx, zeroKeys, [][]byte{lw.buf[:n]}, leafSubtrees, 0)
	return h, 1, err
}

// blobLevelWriter writes internal chunks of a blob, using its |child| to
// write the level below it. On a call to |Write|, it repeatedly calls
// |child.Write|, accumulating addresses to its children, until it fills up or
// the Reader is exhausted. In either case, it then writes its node and
// returns.
type blobLevelWriter struct {
	bb       *BlobBuilder
	child    blobNodeWriter
	buf      []byte
	vals     [][]byte
	subtrees []uint64
	sz       int
	level    int
}

func (lw *blobLevelWriter) Write(ctx context.Context, r io.Reader) (hash.Hash, uint64, error) {
	i, off, totalCount := 0, 0, uint64(0)
	for {
		// Sketchy hack to elide a copy here...
		//h := (*hash.Hash)(unsafe.Pointer(&lw.buf[off]))
		//var n uint64
		//var err error
		h, n, err := lw.child.Write(ctx, r)
		if err != nil && err != io.EOF {
			return hash.Hash{}, 0, err
		}
		if n != 0 {
			totalCount += n
			copy(lw.buf[off:], h[:])
			lw.subtrees[i] = n
			lw.vals[i] = lw.buf[off : off+hash.ByteLen]
			i += 1
			off += hash.ByteLen
		}
		if i >= lw.sz || err == io.EOF {
			h, nerr := lw.bb.write(ctx, lw.bb.keys[:i], lw.vals[:i], lw.subtrees[:i], lw.level)
			if nerr != nil {
				return hash.Hash{}, 0, nerr
			}
			return h, totalCount, err
		}
	}
}
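// blobTreeHeightSketch is an illustrative sketch, not part of the original
// file: it mirrors the level arithmetic in Init. Each internal node holds
// numAddrs = chunkSize/hash.ByteLen child addresses (200 with the default
// 4000-byte chunks), so the number of internal levels grows with log base
// numAddrs of the leaf count. For example, a 1 MiB blob with the default
// chunk size splits into 263 leaves and needs two internal levels.
func blobTreeHeightSketch(dataSize, chunkSize int) int {
	levels := 0
	numAddrs := chunkSize / hash.ByteLen
	remaining := dataSize / chunkSize // approximate (floor) leaf count
	for remaining > 0 {
		remaining = remaining / numAddrs
		levels++
	}
	return levels
}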
// write writes a blob node at |level|. Called by level and leaf writers.
// Stores lastN when |level| corresponds to the root level.
func (b *BlobBuilder) write(ctx context.Context, keys, vals [][]byte, subtrees []uint64, level int) (hash.Hash, error) {
	msg := b.S.Serialize(keys, vals, subtrees, level)
	node, err := NodeFromBytes(msg)
	if err != nil {
		return hash.Hash{}, err
	}
	h, err := b.ns.Write(ctx, node)
	if err != nil {
		return hash.Hash{}, err
	}
	if level == b.topLevel {
		b.lastN = node
	}
	return h, nil
}

const bytePeekLength = 128

type ByteArray struct {
	ImmutableTree
}

func NewByteArray(addr hash.Hash, ns NodeStore) *ByteArray {
	return &ByteArray{ImmutableTree{Addr: addr, ns: ns}}
}

func (b *ByteArray) ToBytes(ctx context.Context) ([]byte, error) {
	return b.bytes(ctx)
}

func (b *ByteArray) ToString(ctx context.Context) (string, error) {
	buf, err := b.bytes(ctx)
	if err != nil {
		return "", err
	}
	toShow := bytePeekLength
	if len(buf) < toShow {
		toShow = len(buf)
	}
	return string(buf[:toShow]), nil
}

type JSONDoc struct {
	ImmutableTree
}

func NewJSONDoc(addr hash.Hash, ns NodeStore) *JSONDoc {
	return &JSONDoc{ImmutableTree{Addr: addr, ns: ns}}
}

func (b *JSONDoc) ToJSONDocument(ctx context.Context) (sqltypes.JSONDocument, error) {
	buf, err := b.bytes(ctx)
	if err != nil {
		return sqltypes.JSONDocument{}, err
	}
	var doc sqltypes.JSONDocument
	err = json.Unmarshal(buf, &doc.Val)
	if err != nil {
		return sqltypes.JSONDocument{}, err
	}
	return doc, nil
}

func (b *JSONDoc) ToLazyJSONDocument(ctx context.Context) (sql.JSONWrapper, error) {
	buf, err := b.bytes(ctx)
	if err != nil {
		return sqltypes.JSONDocument{}, err
	}
	buf = types.UnescapeHTMLCodepoints(buf)
	return sqltypes.NewLazyJSONDocument(buf), nil
}

func (b *JSONDoc) ToString(ctx context.Context) (string, error) {
	buf, err := b.bytes(ctx)
	if err != nil {
		return "", err
	}
	toShow := bytePeekLength
	if len(buf) < toShow {
		toShow = len(buf)
	}
	return string(buf[:toShow]), nil
}

type TextStorage struct {
	ImmutableTree
}

func NewTextStorage(addr hash.Hash, ns NodeStore) *TextStorage {
	return &TextStorage{ImmutableTree{Addr: addr, ns: ns}}
}

func (b *TextStorage) ToBytes(ctx context.Context) ([]byte, error) {
	return b.bytes(ctx)
}

func (b *TextStorage) ToString(ctx context.Context) (string, error) {
	buf, err := b.bytes(ctx)
	if err != nil {
		return "", err
	}
	return string(buf), nil
}

type ImmutableTree struct {
	Addr hash.Hash
	buf  []byte
	ns   NodeStore
}

func (t *ImmutableTree) load(ctx context.Context) error {
	if t.Addr.IsEmpty() {
		t.buf = []byte{}
		return nil
	}
	n, err := t.ns.Read(ctx, t.Addr)
	if err != nil {
		return err
	}

	return WalkNodes(ctx, n, t.ns, func(ctx context.Context, n Node) error {
		if n.IsLeaf() {
			t.buf = append(t.buf, n.GetValue(0)...)
		}
		return nil
	})
}
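// readBlobSketch is an illustrative sketch, not part of the original file: it
// shows the read-side round trip. Given the root address returned by
// BlobBuilder.Chunk, a ByteArray walks the tree and reassembles the payload
// by concatenating leaf values, as implemented in load above. The helper name
// and parameters are assumptions made for the example.
func readBlobSketch(ctx context.Context, ns NodeStore, addr hash.Hash) ([]byte, error) {
	ba := NewByteArray(addr, ns)
	return ba.ToBytes(ctx) // lazily loads on first access, then serves from buf
}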
func (t *ImmutableTree) bytes(ctx context.Context) ([]byte, error) {
	if t.buf == nil {
		err := t.load(ctx)
		if err != nil {
			return nil, err
		}
	}
	return t.buf[:], nil
}

func (t *ImmutableTree) next() (Node, error) {
	panic("not implemented")
}

func (t *ImmutableTree) close() error {
	panic("not implemented")
}

func (t *ImmutableTree) Read(_ bytes.Buffer) (int, error) {
	panic("not implemented")
}
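// lazyJSONSketch is an illustrative sketch, not part of the original file: it
// shows the deferred-parsing path for stored JSON. ToLazyJSONDocument returns
// the unescaped bytes behind a sql.JSONWrapper without calling json.Unmarshal,
// so parsing cost is paid only if the SQL layer actually inspects the value;
// use ToJSONDocument when an eagerly parsed document is required.
func lazyJSONSketch(ctx context.Context, ns NodeStore, addr hash.Hash) (sql.JSONWrapper, error) {
	doc := NewJSONDoc(addr, ns)
	return doc.ToLazyJSONDocument(ctx)
}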