github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/prolly/tree/blob_builder.go (about)

     1  // Copyright 2022 Dolthub, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tree
    16  
    17  import (
    18  	"bytes"
    19  	"context"
    20  	"errors"
    21  	"io"
    22  
    23  	"github.com/dolthub/go-mysql-server/sql"
    24  	sqltypes "github.com/dolthub/go-mysql-server/sql/types"
    25  	"github.com/goccy/go-json"
    26  
    27  	"github.com/dolthub/dolt/go/store/hash"
    28  	"github.com/dolthub/dolt/go/store/prolly/message"
    29  	"github.com/dolthub/dolt/go/store/types"
    30  )
    31  
// DefaultFixedChunkLength is a fixed chunk length.
// NOTE(review): not referenced in this file; presumably the chunkSize callers
// pass to NewBlobBuilder by default — confirm against callers.
const DefaultFixedChunkLength = 4000

// ErrInvalidChunkSize is returned by NewBlobBuilder when |chunkSize| is not a
// multiple of hash.ByteLen (the message assumes hash.ByteLen is 20).
var ErrInvalidChunkSize = errors.New("invalid chunkSize; value must be a multiple of 20")
    35  
    36  func mustNewBlobBuilder(chunkSize int) *BlobBuilder {
    37  	b, _ := NewBlobBuilder(chunkSize)
    38  	return b
    39  }
    40  
    41  // NewBlobBuilder writes the contents of |reader| as an append-only
    42  // tree, returning the root node or an error if applicable. |chunkSize|
    43  // fixes the split size of leaf and intermediate node chunks.
    44  func NewBlobBuilder(chunkSize int) (*BlobBuilder, error) {
    45  	if chunkSize%hash.ByteLen != 0 {
    46  		return nil, ErrInvalidChunkSize
    47  	}
    48  
    49  	keys := make([][]byte, chunkSize/hash.ByteLen)
    50  	for i := range keys {
    51  		keys[i] = zeroKey
    52  	}
    53  	return &BlobBuilder{
    54  		chunkSize: chunkSize,
    55  		keys:      keys,
    56  	}, nil
    57  }
    58  
// blobNodeWriter is one link in the chain of writers that builds a blob tree.
// Write consumes data from |r| and returns the hash of the node it wrote, a
// count that parent writers record as that node's subtree count, and any
// error. The implementations in this file are blobLeafWriter and
// blobLevelWriter.
type blobNodeWriter interface {
	Write(ctx context.Context, r io.Reader) (hash.Hash, uint64, error)
}
    62  
// BlobBuilder writes a stream of bytes as a tree of fixed-size chunks. It is
// configured per blob with Init and driven with Chunk.
type BlobBuilder struct {
	// ns is the store that serialized nodes are written to.
	ns NodeStore
	// S serializes keys, values and subtree counts into a node message.
	S message.Serializer
	// chunkSize is the leaf buffer length and, divided by hash.ByteLen, the
	// number of addresses per internal node.
	chunkSize int
	// keys holds chunkSize/hash.ByteLen copies of zeroKey; a prefix of it is
	// passed as the key list of every internal node.
	keys [][]byte
	// wr is the top of the writer chain built by Init; nil for an empty blob.
	wr blobNodeWriter
	// lastN is the most recent node written at level topLevel.
	lastN Node
	// topLevel is the number of internal levels computed by Init (0 when the
	// blob fits in a single leaf).
	topLevel int

	// levelCap is the number of levels the scratch slices below were sized for.
	levelCap int
	// buf, vals and subtrees are scratch space that Init slices up between
	// the level writers: address bytes, per-address views into buf, and
	// per-address subtree counts respectively.
	buf      []byte
	vals     [][]byte
	subtrees []uint64
}
    77  
    78  func (b *BlobBuilder) SetNodeStore(ns NodeStore) {
    79  	b.ns = ns
    80  	b.S = message.NewBlobSerializer(ns.Pool())
    81  }
    82  
    83  // Reset clears the BlobBuilder for re-use.
    84  func (b *BlobBuilder) Reset() {
    85  	b.wr = nil
    86  	b.topLevel = 0
    87  }
    88  
    89  // Init calculates tree dimensions for a given blob.
    90  func (b *BlobBuilder) Init(dataSize int) {
    91  	b.Reset()
    92  
    93  	if dataSize == 0 {
    94  		return
    95  	}
    96  
    97  	if dataSize <= b.chunkSize {
    98  		b.wr = &blobLeafWriter{
    99  			bb:  b,
   100  			buf: make([]byte, dataSize),
   101  		}
   102  		return
   103  	}
   104  
   105  	b.wr = &blobLeafWriter{
   106  		bb:  b,
   107  		buf: make([]byte, b.chunkSize),
   108  	}
   109  
   110  	numAddrs := b.chunkSize / hash.ByteLen
   111  	dataSize = dataSize / b.chunkSize
   112  	for dataSize > 0 {
   113  		dataSize = dataSize / numAddrs
   114  		b.topLevel += 1
   115  	}
   116  
   117  	// Allocate everything we need in batch, slice them up down below.
   118  	if b.levelCap < b.topLevel {
   119  		b.expand(numAddrs)
   120  		b.levelCap = b.topLevel
   121  	}
   122  
   123  	writers := make([]blobLevelWriter, b.topLevel)
   124  	for i, addrs := 0, 0; i < b.topLevel; i, addrs = i+1, addrs+numAddrs {
   125  		wr := &writers[i]
   126  		wr.bb = b
   127  		wr.child = b.wr
   128  		wr.buf = b.buf[addrs*hash.ByteLen : (addrs+numAddrs)*hash.ByteLen]
   129  		wr.vals = b.vals[addrs : addrs+numAddrs]
   130  		wr.subtrees = b.subtrees[addrs : addrs+numAddrs]
   131  		wr.level = i + 1
   132  		wr.sz = numAddrs
   133  		b.wr = wr
   134  	}
   135  }
   136  
   137  func (b *BlobBuilder) expand(numAddrs int) {
   138  	b.buf = make([]byte, b.topLevel*numAddrs*hash.ByteLen)
   139  	b.vals = make([][]byte, numAddrs*b.topLevel)
   140  	b.subtrees = make([]uint64, numAddrs*b.topLevel)
   141  }
   142  
   143  // Chunk builds the blob tree by passing the Reader to the chain of level
   144  // writers, terminated in a leaf writer. The leaf writer reads chunks from the
   145  // Reader and writes them, returning their hashes to its parent level writer.
   146  // When the parent level writer fills up with addresses, it writes a chunk and
   147  // returns that address to its parent. This continues until the Reader returns
   148  // io.EOF, when every writer in the chain completes its chunk and we return the
   149  // root node.
   150  func (b *BlobBuilder) Chunk(ctx context.Context, r io.Reader) (Node, hash.Hash, error) {
   151  	if b.wr == nil {
   152  		return Node{}, hash.Hash{}, nil
   153  	}
   154  	h, _, err := b.wr.Write(ctx, r)
   155  	if err != nil && err != io.EOF {
   156  		return Node{}, hash.Hash{}, err
   157  	}
   158  	return b.lastN, h, nil
   159  }
   160  
// blobLeafWriter writes leaf chunks of the blob, with max capacity len(buf),
// for every call to Write().
type blobLeafWriter struct {
	// bb is the owning builder, used to serialize and store each leaf.
	bb *BlobBuilder
	// buf is the read buffer; its length bounds the size of a leaf.
	buf []byte
}
   167  
// zeroKey is the placeholder key used for every entry of a blob node.
var zeroKey = []byte{0}

// zeroKeys is the single-entry key list passed when writing a leaf node.
var zeroKeys = [][]byte{zeroKey}

// leafSubtrees is the subtree-count list passed when writing a leaf node.
var leafSubtrees = []uint64{1}
   171  
   172  func (lw *blobLeafWriter) Write(ctx context.Context, r io.Reader) (hash.Hash, uint64, error) {
   173  	n, err := r.Read(lw.buf)
   174  	if err != nil {
   175  		return hash.Hash{}, 0, err
   176  	}
   177  	h, err := lw.bb.write(ctx, zeroKeys, [][]byte{lw.buf[:n]}, leafSubtrees, 0)
   178  	return h, 1, err
   179  }
   180  
// blobLevelWriter writes internal chunks of a blob, using its |child| to
// write the level below it. On a call to |Write|, it repeatedly calls
// |child.Write|, accumulating addresses to its children, until it fills up or
// the Reader is exhausted. In either case, it then writes its node and
// returns.
type blobLevelWriter struct {
	// bb is the owning builder, used to serialize and store each node.
	bb *BlobBuilder
	// child writes the level directly below this one.
	child blobNodeWriter
	// buf stores the child addresses back to back, hash.ByteLen bytes each.
	buf []byte
	// vals holds one slice per child address, each a view into buf.
	vals [][]byte
	// subtrees holds the count returned by each child Write.
	subtrees []uint64
	// sz is the maximum number of child addresses per node.
	sz int
	// level is this writer's height in the tree; Init assigns 1 to the
	// writer directly above the leaf writer.
	level int
}
   195  
   196  func (lw *blobLevelWriter) Write(ctx context.Context, r io.Reader) (hash.Hash, uint64, error) {
   197  	i, off, totalCount := 0, 0, uint64(0)
   198  	for {
   199  		// Sketchy hack to elide a copy here...
   200  		//h := (*hash.Hash)(unsafe.Pointer(&lw.buf[off]))
   201  		//var n uint64
   202  		//var err error
   203  		h, n, err := lw.child.Write(ctx, r)
   204  		if err != nil && err != io.EOF {
   205  			return hash.Hash{}, 0, err
   206  		}
   207  		if n != 0 {
   208  			totalCount += n
   209  			copy(lw.buf[off:], h[:])
   210  			lw.subtrees[i] = n
   211  			lw.vals[i] = lw.buf[off : off+hash.ByteLen]
   212  			i += 1
   213  			off += hash.ByteLen
   214  		}
   215  		if i >= lw.sz || err == io.EOF {
   216  			h, nerr := lw.bb.write(ctx, lw.bb.keys[:i], lw.vals[:i], lw.subtrees[:i], lw.level)
   217  			if nerr != nil {
   218  				return hash.Hash{}, 0, nerr
   219  			}
   220  			return h, totalCount, err
   221  		}
   222  	}
   223  }
   224  
   225  // Write the blob node. Called by level and leaf writers. Will store lastN if
   226  // the level corresponds to our root level.
   227  func (b *BlobBuilder) write(ctx context.Context, keys, vals [][]byte, subtrees []uint64, level int) (hash.Hash, error) {
   228  	msg := b.S.Serialize(keys, vals, subtrees, level)
   229  	node, err := NodeFromBytes(msg)
   230  	if err != nil {
   231  		return hash.Hash{}, err
   232  	}
   233  	h, err := b.ns.Write(ctx, node)
   234  	if err != nil {
   235  		return hash.Hash{}, err
   236  	}
   237  	if level == b.topLevel {
   238  		b.lastN = node
   239  	}
   240  	return h, nil
   241  }
   242  
// bytePeekLength is the maximum number of bytes that ByteArray.ToString and
// JSONDoc.ToString include in the string they return.
const bytePeekLength = 128
   244  
// ByteArray is an ImmutableTree whose contents are exposed as raw bytes
// (ToBytes) or as a string truncated to bytePeekLength bytes (ToString).
type ByteArray struct {
	ImmutableTree
}
   248  
   249  func NewByteArray(addr hash.Hash, ns NodeStore) *ByteArray {
   250  	return &ByteArray{ImmutableTree{Addr: addr, ns: ns}}
   251  }
   252  
   253  func (b *ByteArray) ToBytes(ctx context.Context) ([]byte, error) {
   254  	return b.bytes(ctx)
   255  }
   256  
   257  func (b *ByteArray) ToString(ctx context.Context) (string, error) {
   258  	buf, err := b.bytes(ctx)
   259  	if err != nil {
   260  		return "", err
   261  	}
   262  	toShow := bytePeekLength
   263  	if len(buf) < toShow {
   264  		toShow = len(buf)
   265  	}
   266  	return string(buf[:toShow]), nil
   267  }
   268  
// JSONDoc is an ImmutableTree whose contents are decoded as a JSON document.
type JSONDoc struct {
	ImmutableTree
}
   272  
   273  func NewJSONDoc(addr hash.Hash, ns NodeStore) *JSONDoc {
   274  	return &JSONDoc{ImmutableTree{Addr: addr, ns: ns}}
   275  }
   276  
   277  func (b *JSONDoc) ToJSONDocument(ctx context.Context) (sqltypes.JSONDocument, error) {
   278  	buf, err := b.bytes(ctx)
   279  	if err != nil {
   280  		return sqltypes.JSONDocument{}, err
   281  	}
   282  	var doc sqltypes.JSONDocument
   283  	err = json.Unmarshal(buf, &doc.Val)
   284  	if err != nil {
   285  		return sqltypes.JSONDocument{}, err
   286  	}
   287  	return doc, err
   288  }
   289  
   290  func (b *JSONDoc) ToLazyJSONDocument(ctx context.Context) (sql.JSONWrapper, error) {
   291  	buf, err := b.bytes(ctx)
   292  	if err != nil {
   293  		return sqltypes.JSONDocument{}, err
   294  	}
   295  	buf = types.UnescapeHTMLCodepoints(buf)
   296  	return sqltypes.NewLazyJSONDocument(buf), nil
   297  }
   298  
   299  func (b *JSONDoc) ToString(ctx context.Context) (string, error) {
   300  	buf, err := b.bytes(ctx)
   301  	if err != nil {
   302  		return "", err
   303  	}
   304  	toShow := bytePeekLength
   305  	if len(buf) < toShow {
   306  		toShow = len(buf)
   307  	}
   308  	return string(buf[:toShow]), nil
   309  }
   310  
// TextStorage is an ImmutableTree whose contents are exposed as raw bytes
// (ToBytes) or as a complete, untruncated string (ToString).
type TextStorage struct {
	ImmutableTree
}
   314  
   315  func NewTextStorage(addr hash.Hash, ns NodeStore) *TextStorage {
   316  	return &TextStorage{ImmutableTree{Addr: addr, ns: ns}}
   317  }
   318  
   319  func (b *TextStorage) ToBytes(ctx context.Context) ([]byte, error) {
   320  	return b.bytes(ctx)
   321  }
   322  
   323  func (b *TextStorage) ToString(ctx context.Context) (string, error) {
   324  	buf, err := b.bytes(ctx)
   325  	if err != nil {
   326  		return "", err
   327  	}
   328  	return string(buf), nil
   329  }
   330  
// ImmutableTree is a handle on a blob tree stored in a NodeStore. Its bytes
// are fetched on first access and cached in buf.
type ImmutableTree struct {
	// Addr is the address of the tree's root node; an empty address denotes
	// an empty value.
	Addr hash.Hash
	// buf caches the concatenated leaf values; nil until loaded.
	buf []byte
	// ns is the store the tree's nodes are read from.
	ns NodeStore
}
   336  
   337  func (t *ImmutableTree) load(ctx context.Context) error {
   338  	if t.Addr.IsEmpty() {
   339  		t.buf = []byte{}
   340  		return nil
   341  	}
   342  	n, err := t.ns.Read(ctx, t.Addr)
   343  	if err != nil {
   344  		return err
   345  	}
   346  
   347  	return WalkNodes(ctx, n, t.ns, func(ctx context.Context, n Node) error {
   348  		if n.IsLeaf() {
   349  			t.buf = append(t.buf, n.GetValue(0)...)
   350  		}
   351  		return nil
   352  	})
   353  }
   354  
   355  func (t *ImmutableTree) bytes(ctx context.Context) ([]byte, error) {
   356  	if t.buf == nil {
   357  		err := t.load(ctx)
   358  		if err != nil {
   359  			return nil, err
   360  		}
   361  	}
   362  	return t.buf[:], nil
   363  }
   364  
// next is an unimplemented placeholder; calling it always panics.
func (t *ImmutableTree) next() (Node, error) {
	panic("not implemented")
}
   368  
// close is an unimplemented placeholder; calling it always panics.
func (t *ImmutableTree) close() error {
	panic("not implemented")
}
   372  
// Read is an unimplemented placeholder; calling it always panics.
// NOTE(review): the parameter is a bytes.Buffer passed by value rather than a
// []byte, so this method does not satisfy io.Reader — confirm the intended
// signature before implementing.
func (t *ImmutableTree) Read(_ bytes.Buffer) (int, error) {
	panic("not implemented")
}