github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/nbs/cmp_chunk_table_writer.go

// Copyright 2019 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package nbs

import (
	"crypto/sha512"
	"encoding/binary"
	"errors"
	gohash "hash"
	"io"
	"os"
	"sort"

	"github.com/golang/snappy"

	"github.com/dolthub/dolt/go/store/hash"
)

const defaultTableSinkBlockSize = 2 * 1024 * 1024
const defaultChBufferSize = 32 * 1024

// ErrNotFinished is returned by a CmpChunkTableWriter when FlushToFile, Flush,
// or Reader is called before Finish.
var ErrNotFinished = errors.New("not finished")

// ErrAlreadyFinished is returned if Finish is called more than once on a CmpChunkTableWriter.
var ErrAlreadyFinished = errors.New("already Finished")

// ErrDuplicateChunkWritten is returned by Finish if the same chunk was given to the writer multiple times.
var ErrDuplicateChunkWritten = errors.New("duplicate chunks written")

// CmpChunkTableWriter writes CompressedChunks to a table file.
type CmpChunkTableWriter struct {
	sink                  *HashingByteSink // destination for chunk, index, and footer bytes; hashes all written data for GetMD5
	totalCompressedData   uint64
	totalUncompressedData uint64
	prefixes              prefixIndexSlice // one record per chunk; sorted by address prefix in writeIndex
	blockAddr             *hash.Hash       // set by Finish; also serves as the "finished" flag
	path                  string           // path of the temp file backing the sink
}

// NewCmpChunkTableWriter creates a new CmpChunkTableWriter instance with a default ByteSink.
func NewCmpChunkTableWriter(tempDir string) (*CmpChunkTableWriter, error) {
	s, err := NewBufferedFileByteSink(tempDir, defaultTableSinkBlockSize, defaultChBufferSize)
	if err != nil {
		return nil, err
	}

	return &CmpChunkTableWriter{NewMD5HashingByteSink(s), 0, 0, nil, nil, s.path}, nil
}
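
// A rough usage sketch (error handling elided; tempDir, cmpChunks, and
// destPath are placeholders for illustration, not names from this package):
//
//	tw, _ := NewCmpChunkTableWriter(tempDir)
//	for _, cc := range cmpChunks { // cmpChunks is a []CompressedChunk
//		_ = tw.AddCmpChunk(cc)
//	}
//	id, _ := tw.Finish()         // writes the index + footer, returns the file ID
//	_ = tw.FlushToFile(destPath) // e.g. a path ending in the returned id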

// ChunkCount returns the number of chunks that have been added to the writer.
func (tw *CmpChunkTableWriter) ChunkCount() int {
	return len(tw.prefixes)
}

// ContentLength returns the size of the entire table file in bytes.
func (tw *CmpChunkTableWriter) ContentLength() uint64 {
	return tw.sink.Size()
}

// GetMD5 returns the MD5 of the entire table file.
func (tw *CmpChunkTableWriter) GetMD5() []byte {
	return tw.sink.GetSum()
}

// AddCmpChunk adds a compressed chunk to the table file.
func (tw *CmpChunkTableWriter) AddCmpChunk(c CompressedChunk) error {
	if len(c.CompressedData) == 0 {
		panic("NBS blocks cannot be zero length")
	}

	uncmpLen, err := snappy.DecodedLen(c.CompressedData)
	if err != nil {
		return err
	}

	fullLen := len(c.FullCompressedChunk)
	_, err = tw.sink.Write(c.FullCompressedChunk)
	if err != nil {
		return err
	}

	tw.totalCompressedData += uint64(len(c.CompressedData))
	tw.totalUncompressedData += uint64(uncmpLen)

	// Records are kept in insertion order here; writeIndex sorts them by
	// address prefix before the index is serialized.
	tw.prefixes = append(tw.prefixes, prefixIndexRec{
		c.H,
		uint32(len(tw.prefixes)),
		uint32(fullLen),
	})

	return nil
}
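
// A hedged sketch of producing a CompressedChunk to feed this method. It
// assumes this package's ChunkToCompressedChunk helper (which snappy-compresses
// a chunks.Chunk and records its address); if the helper differs in this
// version, substitute the equivalent constructor:
//
//	chk := chunks.NewChunk(data) // data is a []byte
//	cc := ChunkToCompressedChunk(chk)
//	if err := tw.AddCmpChunk(cc); err != nil {
//		// handle the error
//	}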

// Finish writes the index and footer of the table file and returns the ID of the file.
func (tw *CmpChunkTableWriter) Finish() (string, error) {
	if tw.blockAddr != nil {
		return "", ErrAlreadyFinished
	}

	blockHash, err := tw.writeIndex()
	if err != nil {
		return "", err
	}

	err = tw.writeFooter()
	if err != nil {
		return "", err
	}

	var h []byte
	h = blockHash.Sum(h)
	blockAddr := hash.New(h[:hash.ByteLen])

	tw.blockAddr = &blockAddr
	return tw.blockAddr.String(), nil
}
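
// Note how the file ID is derived, reading the code above: writeIndex feeds
// only the concatenated address suffixes into a SHA-512 hasher, and Finish
// truncates that digest to hash.ByteLen bytes to form the table file's
// address, whose string form is returned as the file ID.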

// FlushToFile can be called after Finish in order to write the data out to the path provided.
func (tw *CmpChunkTableWriter) FlushToFile(path string) error {
	if tw.blockAddr == nil {
		return ErrNotFinished
	}

	return tw.sink.FlushToFile(path)
}

// Flush can be called after Finish in order to write the data out to the writer provided.
func (tw *CmpChunkTableWriter) Flush(wr io.Writer) error {
	if tw.blockAddr == nil {
		return ErrNotFinished
	}

	return tw.sink.Flush(wr)
}

// Reader returns an io.ReadCloser over the written table file data. It
// returns ErrNotFinished if Finish has not been called.
func (tw *CmpChunkTableWriter) Reader() (io.ReadCloser, error) {
	if tw.blockAddr == nil {
		return nil, ErrNotFinished
	}
	return tw.sink.Reader()
}

// Remove deletes the temporary file backing the writer's sink.
func (tw *CmpChunkTableWriter) Remove() error {
	return os.Remove(tw.path)
}

func containsDuplicates(prefixes prefixIndexSlice) bool {
	if len(prefixes) == 0 {
		return false
	}
	for i := 0; i < len(prefixes); i++ {
		curr := prefixes[i]
		// The list is sorted by prefix, so any duplicate address must sit in
		// a contiguous run of records sharing curr's prefix. Comparing curr
		// against every later member of its run is quadratic in the run
		// length, but prefix collisions are rare enough in real-world data
		// that this is not a concern.
		for j := i + 1; j < len(prefixes); j++ {
			cmp := prefixes[j]
			if cmp.addr.Prefix() != curr.addr.Prefix() {
				break
			}
			if cmp.addr == curr.addr {
				return true
			}
		}
	}
	return false
}
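
// For example (a hedged illustration with shortened addresses): given records
// sorted as [aa..01, aa..02, aa..02, bb..07], the inner loop compares the two
// aa..02 entries within the shared aa-prefix run and reports a duplicate,
// while aa..01 vs aa..02 share a prefix but differ in their suffixes and pass.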

func (tw *CmpChunkTableWriter) writeIndex() (gohash.Hash, error) {
	sort.Sort(tw.prefixes)

	// We do a sanity check here to assert that we are never writing duplicate chunks into
	// a table file using this interface.
	if containsDuplicates(tw.prefixes) {
		return nil, ErrDuplicateChunkWritten
	}

	pfxScratch := [hash.PrefixLen]byte{}
	blockHash := sha512.New()

	numRecords := uint32(len(tw.prefixes))
	lengthsOffset := lengthsOffset(numRecords)   // skip prefix and ordinal for each record
	suffixesOffset := suffixesOffset(numRecords) // skip size for each record
	suffixesLen := uint64(numRecords) * hash.SuffixLen
	buff := make([]byte, suffixesLen+suffixesOffset)

	var pos uint64
	for _, pi := range tw.prefixes {
		binary.BigEndian.PutUint64(pfxScratch[:], pi.addr.Prefix())

		// hash prefix
		n := uint64(copy(buff[pos:], pfxScratch[:]))
		if n != hash.PrefixLen {
			return nil, errors.New("failed to copy all data")
		}
		pos += hash.PrefixLen

		// order
		binary.BigEndian.PutUint32(buff[pos:], pi.order)
		pos += ordinalSize

		// length
		offset := lengthsOffset + uint64(pi.order)*lengthSize
		binary.BigEndian.PutUint32(buff[offset:], pi.size)

		// hash suffix
		offset = suffixesOffset + uint64(pi.order)*hash.SuffixLen
		n = uint64(copy(buff[offset:], pi.addr.Suffix()))
		if n != hash.SuffixLen {
			return nil, errors.New("failed to copy all bytes")
		}
	}

	blockHash.Write(buff[suffixesOffset:])
	_, err := tw.sink.Write(buff)
	if err != nil {
		return nil, err
	}

	return blockHash, nil
}
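
// Reading the offsets and sizes used above, the serialized index has three
// sections: n prefix tuples of (prefix uint64, ordinal uint32) in sorted
// order, then n chunk lengths (uint32 each) indexed by ordinal, then n
// address suffixes (hash.SuffixLen bytes each) indexed by ordinal. Only the
// suffix section is fed to the SHA-512 blockHash from which Finish derives
// the table file ID.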

func (tw *CmpChunkTableWriter) writeFooter() error {
	// chunk count
	err := binary.Write(tw.sink, binary.BigEndian, uint32(len(tw.prefixes)))
	if err != nil {
		return err
	}

	// total uncompressed chunk data
	err = binary.Write(tw.sink, binary.BigEndian, tw.totalUncompressedData)
	if err != nil {
		return err
	}

	// magic number
	_, err = tw.sink.Write([]byte(magicNumber))
	if err != nil {
		return err
	}

	return nil
}
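
// The footer written above is a fixed-size tail: a big-endian uint32 chunk
// count, a big-endian uint64 total of uncompressed chunk bytes, and the
// magicNumber bytes. Readers can locate the index by parsing this tail first,
// though those details live outside this writer.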