github.com/hasnat/dolt/go@v0.0.0-20210628190320-9eb5d843fbb7/store/types/blob.go (about)

     1  // Copyright 2019 Dolthub, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  //
    15  // This file incorporates work covered by the following copyright and
    16  // permission notice:
    17  //
    18  // Copyright 2016 Attic Labs, Inc. All rights reserved.
    19  // Licensed under the Apache License, version 2.0:
    20  // http://www.apache.org/licenses/LICENSE-2.0
    21  
    22  package types
    23  
    24  import (
    25  	"context"
    26  	"errors"
    27  	"io"
    28  	"runtime"
    29  	"sync"
    30  
    31  	"github.com/dolthub/dolt/go/store/atomicerr"
    32  
    33  	"github.com/dolthub/dolt/go/store/d"
    34  )
    35  
// Blob represents an arbitrarily large sequence of bytes, stored as a
// prolly tree whose leaves are blobLeafSequence chunks. It embeds sequence,
// so generic tree operations (Len, Kind, Hash, ...) are available directly.
type Blob struct {
	sequence
}
    40  
    41  func newBlob(seq sequence) Blob {
    42  	return Blob{seq}
    43  }
    44  
    45  func NewEmptyBlob(vrw ValueReadWriter) (Blob, error) {
    46  	seq, err := newBlobLeafSequence(vrw, []byte{})
    47  
    48  	if err != nil {
    49  		return Blob{}, err
    50  	}
    51  
    52  	return Blob{seq}, nil
    53  }
    54  
// ReadAt implements the ReaderAt interface. Eagerly loads requested byte-range from the blob p-tree.
// Reads up to len(p) bytes starting at byte offset |off| into |p|. Returns the
// number of bytes copied and io.EOF when the read reaches the end of the blob.
func (b Blob) ReadAt(ctx context.Context, p []byte, off int64) (n int, err error) {
	// TODO: Support negative off?
	d.PanicIfTrue(off < 0)

	startIdx := uint64(off)
	if startIdx >= b.Len() {
		return 0, io.EOF
	}

	// Clamp the requested range to the blob's length.
	endIdx := startIdx + uint64(len(p))
	if endIdx > b.Len() {
		endIdx = b.Len()
	}

	// The read reaches the last byte of the blob, so io.EOF is returned along
	// with whatever data is copied below.
	var isEOF bool
	if endIdx == b.Len() {
		isEOF = true
	}

	// Empty request (len(p) == 0); nothing to copy.
	if startIdx == endIdx {
		return
	}

	// Load all leaf chunks covering [startIdx, endIdx). localStart is the
	// offset of startIdx within the first returned leaf.
	leaves, localStart, err := LoadLeafNodes(ctx, []Collection{b}, startIdx, endIdx)

	if err != nil {
		return 0, err
	}

	// Rebase the byte range to be relative to the first leaf.
	endIdx = localStart + endIdx - startIdx
	startIdx = localStart

	for _, leaf := range leaves {
		bl := leaf.asSequence().(blobLeafSequence)

		// Copy [startIdx, min(endIdx, len(leaf))) out of this leaf. After the
		// first leaf, startIdx is reset to 0 and endIdx is reduced by the
		// length consumed, so the remaining range stays leaf-relative.
		localEnd := endIdx
		data := bl.data()
		leafLength := uint64(len(data))
		if localEnd > leafLength {
			localEnd = leafLength
		}
		src := data[startIdx:localEnd]

		copy(p[n:], src)
		n += len(src)
		endIdx -= localEnd
		startIdx = 0
	}

	if isEOF {
		err = io.EOF
	}

	return n, err
}
   111  
   112  func (b Blob) Reader(ctx context.Context) *BlobReader {
   113  	return &BlobReader{b, 0, ctx}
   114  }
   115  
   116  func (b Blob) Copy(ctx context.Context, w io.Writer) (int64, error) {
   117  	return b.CopyReadAhead(ctx, w, 1<<23 /* 8MB */, 6)
   118  }
   119  
// CopyReadAhead copies the entire contents of |b| to |w|, and attempts to stay
// |concurrency| |chunkSize| blocks of bytes ahead of the last byte written to
// |w|.
//
// It uses a channel-of-channels pipeline: the producer goroutine enqueues one
// result channel per chunk (preserving chunk order), and a per-chunk goroutine
// reads that chunk and delivers it on its channel. The bounded buffer on
// |bChan| is what limits read-ahead to |concurrency| chunks.
func (b Blob) CopyReadAhead(ctx context.Context, w io.Writer, chunkSize uint64, concurrency int) (int64, error) {
	ae := atomicerr.New()
	bChan := make(chan chan []byte, concurrency)

	go func() {
		defer close(bChan)
		for idx, l := uint64(0), b.Len(); idx < l; {
			// Stop scheduling new reads once any error has been recorded.
			if ae.IsSet() {
				break
			}

			bc := make(chan []byte)
			bChan <- bc

			// Each chunk covers [start, start+blockLength); the final chunk
			// may be shorter than chunkSize.
			start := idx
			blockLength := b.Len() - start
			if blockLength > chunkSize {
				blockLength = chunkSize
			}
			idx += blockLength

			go func() {
				defer close(bc)
				buff := make([]byte, blockLength)
				n, err := b.ReadAt(ctx, buff, int64(start))

				if err != nil && err != io.EOF {
					ae.SetIfError(err)
				} else if n > 0 {
					// NOTE(review): unbuffered send — if the consumer below
					// breaks early on error and only drains bChan (not each
					// bc), this goroutine blocks forever. Potential goroutine
					// leak on the error path; confirm and consider a
					// buffered bc or select on a done signal.
					bc <- buff
				}
			}()
		}
	}()

	// Ensure read-ahead goroutines can exit
	defer func() {
		for range bChan {
		}
	}()

	var n int64
	// Note: the loop variable shadows the receiver |b| here; it is a chunk
	// result channel, not a Blob.
	for b := range bChan {
		if ae.IsSet() {
			break
		}

		bytes, ok := <-b

		// Channel closed without a payload (read error or empty chunk).
		if !ok {
			continue
		}

		ln, err := w.Write(bytes)
		n += int64(ln)
		if err != nil {
			ae.SetIfError(err)
		}
	}

	return n, ae.Get()
}
   185  
   186  // Concat returns a new Blob comprised of this joined with other. It only needs
   187  // to visit the rightmost prolly tree chunks of this Blob, and the leftmost
   188  // prolly tree chunks of other, so it's efficient.
   189  func (b Blob) Concat(ctx context.Context, other Blob) (Blob, error) {
   190  	seq, err := concat(ctx, b.sequence, other.sequence, func(cur *sequenceCursor, vrw ValueReadWriter) (*sequenceChunker, error) {
   191  		return b.newChunker(ctx, cur, vrw)
   192  	})
   193  
   194  	if err != nil {
   195  		return Blob{}, err
   196  	}
   197  
   198  	return newBlob(seq), nil
   199  }
   200  
   201  func (b Blob) newChunker(ctx context.Context, cur *sequenceCursor, vrw ValueReadWriter) (*sequenceChunker, error) {
   202  	return newSequenceChunker(ctx, cur, 0, vrw, makeBlobLeafChunkFn(vrw), newIndexedMetaSequenceChunkFn(BlobKind, vrw), hashValueByte)
   203  }
   204  
// asSequence exposes the underlying sequence for generic collection code.
func (b Blob) asSequence() sequence {
	return b.sequence
}
   208  
// Value interface

// Value returns the Blob itself; a Blob is already a Value.
func (b Blob) Value(ctx context.Context) (Value, error) {
	return b, nil
}
   213  
// isPrimitive reports that Blob is treated as a primitive value type.
func (b Blob) isPrimitive() bool {
	return true
}
   217  
   218  func (b Blob) Kind() NomsKind {
   219  	if b.sequence == nil {
   220  		return BlobKind
   221  	}
   222  	return b.sequence.Kind()
   223  }
   224  
// WalkValues is a no-op: a Blob contains raw bytes, not child Values, so
// there is nothing to pass to |cb|.
func (b Blob) WalkValues(ctx context.Context, cb ValueCallback) error {
	return nil
}
   228  
// BlobReader provides a stateful io.Reader/io.Seeker view over a Blob.
type BlobReader struct {
	b   Blob            // the blob being read
	pos int64           // current read offset in bytes
	ctx context.Context // context passed to underlying ReadAt calls
}
   234  
   235  func (cbr *BlobReader) Read(p []byte) (n int, err error) {
   236  	n, err = cbr.b.ReadAt(cbr.ctx, p, cbr.pos)
   237  	cbr.pos += int64(n)
   238  	return
   239  }
   240  
   241  func (cbr *BlobReader) Seek(offset int64, whence int) (int64, error) {
   242  	abs := int64(cbr.pos)
   243  
   244  	switch whence {
   245  	case 0:
   246  		abs = offset
   247  	case 1:
   248  		abs += offset
   249  	case 2:
   250  		abs = int64(cbr.b.Len()) + offset
   251  	default:
   252  		return 0, errors.New("Blob.Reader.Seek: invalid whence")
   253  	}
   254  
   255  	if abs < 0 {
   256  		return 0, errors.New("Blob.Reader.Seek: negative position")
   257  	}
   258  
   259  	cbr.pos = int64(abs)
   260  	return abs, nil
   261  }
   262  
   263  func makeBlobLeafChunkFn(vrw ValueReadWriter) makeChunkFn {
   264  	return func(level uint64, items []sequenceItem) (Collection, orderedKey, uint64, error) {
   265  		d.PanicIfFalse(level == 0)
   266  		buff := make([]byte, len(items))
   267  
   268  		for i, v := range items {
   269  			buff[i] = v.(byte)
   270  		}
   271  
   272  		return chunkBlobLeaf(vrw, buff)
   273  	}
   274  }
   275  
   276  func chunkBlobLeaf(vrw ValueReadWriter, buff []byte) (Collection, orderedKey, uint64, error) {
   277  	seq, err := newBlobLeafSequence(vrw, buff)
   278  
   279  	if err != nil {
   280  		return nil, orderedKey{}, 0, err
   281  	}
   282  
   283  	blob := newBlob(seq)
   284  
   285  	ordKey, err := orderedKeyFromInt(len(buff), vrw.Format())
   286  
   287  	if err != nil {
   288  		return nil, orderedKey{}, 0, err
   289  	}
   290  
   291  	return blob, ordKey, uint64(len(buff)), nil
   292  }
   293  
// NewBlob creates a Blob by reading from every Reader in rs and
// concatenating the result. NewBlob uses one goroutine per Reader.
// With no readers it returns an empty Blob.
func NewBlob(ctx context.Context, vrw ValueReadWriter, rs ...io.Reader) (Blob, error) {
	return readBlobsP(ctx, vrw, rs...)
}
   299  
   300  func readBlobsP(ctx context.Context, vrw ValueReadWriter, rs ...io.Reader) (Blob, error) {
   301  	switch len(rs) {
   302  	case 0:
   303  		return NewEmptyBlob(vrw)
   304  	case 1:
   305  		return readBlob(ctx, rs[0], vrw)
   306  	}
   307  
   308  	blobs := make([]Blob, len(rs))
   309  
   310  	ae := atomicerr.New()
   311  	wg := &sync.WaitGroup{}
   312  	wg.Add(len(rs))
   313  
   314  	for i, r := range rs {
   315  		if ae.IsSet() {
   316  			break
   317  		}
   318  
   319  		i2, r2 := i, r
   320  		go func() {
   321  			defer wg.Done()
   322  
   323  			if !ae.IsSet() {
   324  				var err error
   325  				blobs[i2], err = readBlob(ctx, r2, vrw)
   326  				ae.SetIfError(err)
   327  			}
   328  		}()
   329  	}
   330  
   331  	wg.Wait()
   332  
   333  	if ae.IsSet() {
   334  		return Blob{}, ae.Get()
   335  	}
   336  
   337  	b := blobs[0]
   338  	for i := 1; i < len(blobs); i++ {
   339  		var err error
   340  		b, err = b.Concat(ctx, blobs[i])
   341  
   342  		if err != nil {
   343  			return Blob{}, err
   344  		}
   345  	}
   346  	return b, nil
   347  }
   348  
// readBlob chunks the contents of |r| into a prolly-tree Blob. Leaf chunks
// are produced by a custom byte-oriented chunker (below) to avoid boxing
// every byte as a sequenceItem; completed leaves are written concurrently
// and their metaTuples appended, in order, to the parent-level chunker.
func readBlob(ctx context.Context, r io.Reader, vrw ValueReadWriter) (Blob, error) {
	sc, err := newEmptySequenceChunker(ctx, vrw, makeBlobLeafChunkFn(vrw), newIndexedMetaSequenceChunkFn(BlobKind, vrw), func(item sequenceItem, rv *rollingValueHasher) error {
		rv.HashByte(item.(byte))
		return nil
	})

	if err != nil {
		return Blob{}, err
	}

	// TODO: The code below is temporary. It's basically a custom leaf-level chunker for blobs. There are substantial
	// perf gains by doing it this way as it avoids the cost of boxing every single byte which is chunked.
	chunkBuff := [8192]byte{}
	chunkBytes := chunkBuff[:]
	rv := newRollingValueHasher(vrw.Format(), 0)
	offset := 0
	// addByte appends b to the in-progress chunk, doubling the buffer when
	// full, and reports whether the rolling hash crossed a chunk boundary.
	addByte := func(b byte) bool {
		if offset >= len(chunkBytes) {
			tmp := make([]byte, len(chunkBytes)*2)
			copy(tmp, chunkBytes)
			chunkBytes = tmp
		}
		chunkBytes[offset] = b
		offset++
		rv.hashByte(b, uint32(offset))
		return rv.crossedBoundary
	}

	ae := atomicerr.New()
	// mtChan carries one result channel per leaf chunk; the channel-of-channels
	// pattern preserves chunk order while leaf writes run concurrently.
	mtChan := make(chan chan metaTuple, runtime.NumCPU())

	// makeChunk snapshots the current buffer, enqueues its result channel, and
	// writes the leaf chunk in a goroutine. Resets |offset| for the next chunk.
	makeChunk := func() {
		rv.Reset()
		cp := make([]byte, offset)
		copy(cp, chunkBytes[0:offset])

		ch := make(chan metaTuple)
		mtChan <- ch

		go func(ch chan metaTuple, cp []byte) {
			// Closing without sending signals failure to the consumer; it
			// detects this via the !ok receive.
			defer close(ch)

			col, key, numLeaves, err := chunkBlobLeaf(vrw, cp)

			if err != nil {
				ae.SetIfError(err)
				return
			}

			val, err := vrw.WriteValue(ctx, col)

			if ae.SetIfError(err) {
				return
			}

			mt, err := newMetaTuple(val, key, numLeaves)

			if ae.SetIfError(err) {
				return
			}

			// NOTE(review): unbuffered send — if the consumer loop below has
			// already broken out on error, this send (and makeChunk's send on
			// mtChan) can block forever. Potential goroutine leak on the
			// error path; confirm intended behavior.
			ch <- mt
		}(ch, cp)

		offset = 0
	}

	// Producer: read |r| in 8KB blocks, feeding bytes to the rolling-hash
	// chunker and cutting a chunk at each boundary (and at EOF if any bytes
	// remain buffered).
	go func() {
		defer close(mtChan)
		readBuff := [8192]byte{}
		for {
			if ae.IsSet() {
				break
			}

			n, err := r.Read(readBuff[:])

			isEOF := err == io.EOF
			if err != nil && err != io.EOF {
				ae.SetIfError(err)
				break
			}

			for i := 0; i < n; i++ {
				if addByte(readBuff[i]) {
					makeChunk()
				}
			}

			if isEOF {
				if offset > 0 {
					makeChunk()
				}
				break
			}
		}
	}()

	// Consumer: receive metaTuples in chunk order and append them to the
	// parent-level sequence chunker.
	for ch := range mtChan {
		if ae.IsSet() {
			break
		}

		mt, ok := <-ch

		// Channel closed without a value: that chunk failed; the error is
		// already recorded in ae.
		if !ok {
			continue
		}

		if sc.parent == nil {
			err := sc.createParent(ctx)

			if ae.SetIfError(err) {
				continue
			}
		}

		_, err := sc.parent.Append(ctx, mt)
		ae.SetIfError(err)
	}

	seq, err := sc.Done(ctx)

	if err != nil {
		return Blob{}, err
	}

	return newBlob(seq), nil
}
   478  
// readFrom is not expected to be called on Blob; it panics if reached.
func (b Blob) readFrom(nbf *NomsBinFormat, bnr *binaryNomsReader) (Value, error) {
	panic("unreachable")
}
   482  
// skip is not expected to be called on Blob; it panics if reached.
func (b Blob) skip(nbf *NomsBinFormat, bnr *binaryNomsReader) {
	panic("unreachable")
}
   486  
// String is not expected to be called on Blob; it panics if reached.
func (b Blob) String() string {
	panic("unreachable")
}
   490  
// HumanReadableString is not expected to be called on Blob; it panics if reached.
func (b Blob) HumanReadableString() string {
	panic("unreachable")
}