github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/nbs/byte_sink.go (about)

     1  // Copyright 2019 Dolthub, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package nbs
    16  
    17  import (
    18  	"bytes"
    19  	"crypto/md5"
    20  	"crypto/sha512"
    21  	"errors"
    22  	"hash"
    23  	"io"
    24  	"os"
    25  	"sync"
    26  
    27  	"github.com/dolthub/dolt/go/libraries/utils/file"
    28  	"github.com/dolthub/dolt/go/libraries/utils/iohelp"
    29  	"github.com/dolthub/dolt/go/store/atomicerr"
    30  	"github.com/dolthub/dolt/go/store/util/tempfiles"
    31  )
    32  
    33  func flushSinkToFile(sink ByteSink, path string) (err error) {
    34  	var f *os.File
    35  	f, err = os.OpenFile(path, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, os.ModePerm)
    36  
    37  	if err != nil {
    38  		return err
    39  	}
    40  
    41  	defer func() {
    42  		closeErr := f.Close()
    43  
    44  		if err == nil {
    45  			err = closeErr
    46  		}
    47  	}()
    48  
    49  	err = sink.Flush(f)
    50  	return err
    51  }
    52  
// A ByteSink is an interface for writing bytes which can later be flushed to a writer
type ByteSink interface {
	io.Writer

	// Flush writes all the data that was written to the ByteSink to the supplied writer
	Flush(wr io.Writer) error

	// FlushToFile writes all the data that was written to the ByteSink to a file at the given path
	FlushToFile(path string) error

	// Reader returns an io.ReadCloser over the data that was written to the ByteSink. The caller
	// should close the returned reader when done with it.
	Reader() (io.ReadCloser, error)
}
    65  
// ErrBufferFull is returned by FixedBufferByteSink.Write when the data being written does not fit in
// the space remaining in the fixed buffer.
var ErrBufferFull = errors.New("buffer full")
    68  
// FixedBufferByteSink is a ByteSink implementation with a buffer whose size will not change.  Writing more
// data than the fixed buffer can hold will result in an error
type FixedBufferByteSink struct {
	// buff is the caller-supplied backing storage; it is never grown or reallocated.
	buff []byte
	// pos is the number of bytes written so far, i.e. the offset in buff of the next write.
	pos uint64
}
    75  
    76  // NewFixedBufferByteSink creates a FixedBufferTableSink which will use the supplied buffer
    77  func NewFixedBufferByteSink(buff []byte) *FixedBufferByteSink {
    78  	if len(buff) == 0 {
    79  		panic("must provide a buffer")
    80  	}
    81  
    82  	return &FixedBufferByteSink{buff: buff}
    83  }
    84  
    85  // Write writes a byte array to the sink.
    86  func (sink *FixedBufferByteSink) Write(src []byte) (int, error) {
    87  	dest := sink.buff[sink.pos:]
    88  	destLen := len(dest)
    89  	srcLen := len(src)
    90  
    91  	if destLen < srcLen {
    92  		return 0, ErrBufferFull
    93  	}
    94  
    95  	copy(dest, src)
    96  
    97  	sink.pos += uint64(srcLen)
    98  	return srcLen, nil
    99  }
   100  
   101  // Flush writes all the data that was written to the ByteSink to the supplied writer
   102  func (sink *FixedBufferByteSink) Flush(wr io.Writer) error {
   103  	return iohelp.WriteAll(wr, sink.buff[:sink.pos])
   104  }
   105  
   106  // FlushToFile writes all the data that was written to the ByteSink to a file at the given path
   107  func (sink *FixedBufferByteSink) FlushToFile(path string) (err error) {
   108  	return flushSinkToFile(sink, path)
   109  }
   110  
   111  func (sink *FixedBufferByteSink) Reader() (io.ReadCloser, error) {
   112  	return io.NopCloser(bytes.NewReader(sink.buff)), nil
   113  }
   114  
// BlockBufferByteSink allocates blocks of data with a given block size to store the bytes written to the sink. New
// blocks are allocated as needed in order to handle all the data of the Write calls.
type BlockBufferByteSink struct {
	// blockSize is the capacity requested for each newly allocated block.
	blockSize int
	// pos is the total number of bytes written to the sink.
	pos uint64
	// blocks holds the written data in order; only the last block is appended to.
	blocks [][]byte
}
   122  
   123  // NewBlockBufferByteSink creates a BlockBufferByteSink with the provided block size.
   124  func NewBlockBufferByteSink(blockSize int) *BlockBufferByteSink {
   125  	block := make([]byte, 0, blockSize)
   126  	return &BlockBufferByteSink{blockSize, 0, [][]byte{block}}
   127  }
   128  
   129  // Write writes a byte array to the sink.
   130  func (sink *BlockBufferByteSink) Write(src []byte) (int, error) {
   131  	srcLen := len(src)
   132  	currBlockIdx := len(sink.blocks) - 1
   133  	currBlock := sink.blocks[currBlockIdx]
   134  	remaining := cap(currBlock) - len(currBlock)
   135  
   136  	if remaining >= srcLen {
   137  		currBlock = append(currBlock, src...)
   138  		sink.blocks[currBlockIdx] = currBlock
   139  	} else {
   140  		if remaining > 0 {
   141  			currBlock = append(currBlock, src[:remaining]...)
   142  			sink.blocks[currBlockIdx] = currBlock
   143  		}
   144  
   145  		newBlock := make([]byte, 0, sink.blockSize)
   146  		newBlock = append(newBlock, src[remaining:]...)
   147  		sink.blocks = append(sink.blocks, newBlock)
   148  	}
   149  
   150  	sink.pos += uint64(srcLen)
   151  	return srcLen, nil
   152  }
   153  
   154  // Flush writes all the data that was written to the ByteSink to the supplied writer
   155  func (sink *BlockBufferByteSink) Flush(wr io.Writer) (err error) {
   156  	return iohelp.WriteAll(wr, sink.blocks...)
   157  }
   158  
   159  // FlushToFile writes all the data that was written to the ByteSink to a file at the given path
   160  func (sink *BlockBufferByteSink) FlushToFile(path string) (err error) {
   161  	return flushSinkToFile(sink, path)
   162  }
   163  
   164  func (sink *BlockBufferByteSink) Reader() (io.ReadCloser, error) {
   165  	rs := make([]io.Reader, len(sink.blocks))
   166  	for i := range sink.blocks {
   167  		rs[i] = bytes.NewReader(sink.blocks[i])
   168  	}
   169  	return io.NopCloser(io.MultiReader(rs...)), nil
   170  }
   171  
// BufferedFileByteSink is a ByteSink implementation that buffers some amount of data before it passes it
// to a background writing thread to be flushed to a file.
type BufferedFileByteSink struct {
	// blockSize is the capacity requested for each in-memory block.
	blockSize int
	// pos is the total number of bytes passed to Write.
	pos uint64
	// currentBlock is the block currently being filled by Write; it has not yet been handed to
	// the background writer.
	currentBlock []byte

	// writeCh carries blocks to the background writer. finish() closes it and then sets it to
	// nil as a marker that the sink has already been finished.
	writeCh chan []byte
	// ae records errors encountered by the background writer.
	ae *atomicerr.AtomicError
	// wg is used to wait for the background writer goroutine to exit.
	wg *sync.WaitGroup

	// wr is the file the background writer writes blocks to and closes when writeCh is closed.
	wr io.WriteCloser
	// path is the name of the file behind wr.
	path string
}
   186  
   187  // NewBufferedFileByteSink creates a BufferedFileByteSink
   188  func NewBufferedFileByteSink(tempDir string, blockSize, chBufferSize int) (*BufferedFileByteSink, error) {
   189  	f, err := tempfiles.MovableTempFileProvider.NewFile(tempDir, "buffered_file_byte_sink_")
   190  
   191  	if err != nil {
   192  		return nil, err
   193  	}
   194  
   195  	sink := &BufferedFileByteSink{
   196  		blockSize:    blockSize,
   197  		currentBlock: make([]byte, blockSize),
   198  		writeCh:      make(chan []byte, chBufferSize),
   199  		ae:           atomicerr.New(),
   200  		wg:           &sync.WaitGroup{},
   201  		wr:           f,
   202  		path:         f.Name(),
   203  	}
   204  
   205  	sink.wg.Add(1)
   206  	go func() {
   207  		defer sink.wg.Done()
   208  		sink.backgroundWrite()
   209  	}()
   210  
   211  	return sink, nil
   212  }
   213  
   214  // Write writes a byte array to the sink.
   215  func (sink *BufferedFileByteSink) Write(src []byte) (int, error) {
   216  	srcLen := len(src)
   217  	remaining := cap(sink.currentBlock) - len(sink.currentBlock)
   218  
   219  	if remaining >= srcLen {
   220  		sink.currentBlock = append(sink.currentBlock, src...)
   221  
   222  		if remaining == srcLen {
   223  			sink.writeCh <- sink.currentBlock
   224  			sink.currentBlock = nil
   225  		}
   226  	} else {
   227  		if remaining > 0 {
   228  			sink.currentBlock = append(sink.currentBlock, src[:remaining]...)
   229  			sink.writeCh <- sink.currentBlock
   230  		}
   231  
   232  		newBlock := make([]byte, 0, sink.blockSize)
   233  		newBlock = append(newBlock, src[remaining:]...)
   234  		sink.currentBlock = newBlock
   235  	}
   236  
   237  	sink.pos += uint64(srcLen)
   238  	return srcLen, nil
   239  }
   240  
   241  func (sink *BufferedFileByteSink) backgroundWrite() {
   242  	var err error
   243  	for buff := range sink.writeCh {
   244  		if err != nil {
   245  			continue // drain
   246  		}
   247  
   248  		err = iohelp.WriteAll(sink.wr, buff)
   249  		sink.ae.SetIfError(err)
   250  	}
   251  
   252  	err = sink.wr.Close()
   253  	sink.ae.SetIfError(err)
   254  }
   255  
   256  func (sink *BufferedFileByteSink) finish() error {
   257  	// |finish()| is not thread-safe. We just use writeCh == nil as a
   258  	// sentinel to mean we've been called again from Reader() as part of a
   259  	// retry or something.
   260  	if sink.writeCh != nil {
   261  		toWrite := len(sink.currentBlock)
   262  		if toWrite > 0 {
   263  			sink.writeCh <- sink.currentBlock[:toWrite]
   264  		}
   265  
   266  		close(sink.writeCh)
   267  		sink.wg.Wait()
   268  
   269  		sink.writeCh = nil
   270  	}
   271  	return sink.ae.Get()
   272  }
   273  
   274  // Flush writes all the data that was written to the ByteSink to the supplied writer
   275  func (sink *BufferedFileByteSink) Flush(wr io.Writer) (err error) {
   276  	err = sink.finish()
   277  	if err != nil {
   278  		return err
   279  	}
   280  
   281  	var f *os.File
   282  	f, err = os.Open(sink.path)
   283  
   284  	if err != nil {
   285  		return err
   286  	}
   287  
   288  	defer func() {
   289  		closeErr := f.Close()
   290  
   291  		if err == nil {
   292  			err = closeErr
   293  		}
   294  	}()
   295  
   296  	_, err = io.Copy(wr, f)
   297  
   298  	return err
   299  }
   300  
   301  // FlushToFile writes all the data that was written to the ByteSink to a file at the given path
   302  func (sink *BufferedFileByteSink) FlushToFile(path string) (err error) {
   303  	err = sink.finish()
   304  	if err != nil {
   305  		return err
   306  	}
   307  
   308  	return file.Rename(sink.path, path)
   309  }
   310  
   311  func (sink *BufferedFileByteSink) Reader() (io.ReadCloser, error) {
   312  	err := sink.finish()
   313  	if err != nil {
   314  		return nil, err
   315  	}
   316  	return os.Open(sink.path)
   317  }
   318  
// HashingByteSink is a ByteSink that wraps another ByteSink and keeps a running hash (MD5 or SHA-512,
// depending on which constructor was used) of all the data written to it.
type HashingByteSink struct {
	// backingSink receives the data and serves Flush, FlushToFile and Reader.
	backingSink ByteSink
	// hasher accumulates the hash of the written data.
	hasher hash.Hash
	// size is the number of bytes written through this sink.
	size uint64
}
   325  
   326  func NewSHA512HashingByteSink(backingSink ByteSink) *HashingByteSink {
   327  	return &HashingByteSink{backingSink: backingSink, hasher: sha512.New(), size: 0}
   328  }
   329  
   330  func NewMD5HashingByteSink(backingSink ByteSink) *HashingByteSink {
   331  	return &HashingByteSink{backingSink: backingSink, hasher: md5.New(), size: 0}
   332  }
   333  
   334  // Write writes a byte array to the sink.
   335  func (sink *HashingByteSink) Write(src []byte) (int, error) {
   336  	nWritten, err := sink.backingSink.Write(src)
   337  
   338  	if err != nil {
   339  		return 0, err
   340  	}
   341  
   342  	nHashed, err := sink.hasher.Write(src[:nWritten])
   343  
   344  	if err != nil {
   345  		return 0, err
   346  	} else if nWritten != nHashed {
   347  		return 0, errors.New("failed to hash all the data that was written to the byte sink.")
   348  	}
   349  
   350  	sink.size += uint64(nWritten)
   351  
   352  	return nWritten, nil
   353  }
   354  
   355  // Flush writes all the data that was written to the ByteSink to the supplied writer
   356  func (sink *HashingByteSink) Flush(wr io.Writer) error {
   357  	return sink.backingSink.Flush(wr)
   358  }
   359  
   360  // FlushToFile writes all the data that was written to the ByteSink to a file at the given path
   361  func (sink *HashingByteSink) FlushToFile(path string) error {
   362  	return sink.backingSink.FlushToFile(path)
   363  }
   364  
   365  func (sink *HashingByteSink) Reader() (io.ReadCloser, error) {
   366  	return sink.backingSink.Reader()
   367  }
   368  
   369  // Execute the hasher.Sum() function and return the result
   370  func (sink *HashingByteSink) GetSum() []byte {
   371  	return sink.hasher.Sum(nil)
   372  }
   373  
   374  // ResetHasher resets the hasher to allow for checksums at various points in the data stream. The expectation is that
   375  // you would call GetSum prior to calling this function.
   376  func (sink *HashingByteSink) ResetHasher() {
   377  	sink.hasher.Reset()
   378  }
   379  
   380  // Size gets the number of bytes written to the sink
   381  func (sink *HashingByteSink) Size() uint64 {
   382  	return sink.size
   383  }