github.com/hasnat/dolt/go@v0.0.0-20210628190320-9eb5d843fbb7/store/nbs/cmp_chunk_table_writer.go

// Copyright 2019 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package nbs

import (
	"crypto/sha512"
	"encoding/binary"
	"errors"
	"hash"
	"io"
	"sort"

	"github.com/golang/snappy"

	nomshash "github.com/dolthub/dolt/go/store/hash"
)

const defaultTableSinkBlockSize = 2 * 1024 * 1024
const defaultChBufferSize = 32 * 1024

// ErrNotFinished is returned by a CmpChunkTableWriter if Flush or FlushToFile
// is called before Finish.
var ErrNotFinished = errors.New("not finished")

// ErrAlreadyFinished is returned if Finish is called more than once on a
// CmpChunkTableWriter.
var ErrAlreadyFinished = errors.New("already Finished")

// ErrChunkAlreadyWritten is returned by AddCmpChunk when a chunk with the
// same hash has already been added to this writer.
var ErrChunkAlreadyWritten = errors.New("chunk already written")

// CmpChunkTableWriter writes CompressedChunks to a table file
type CmpChunkTableWriter struct {
	sink                  *HashingByteSink
	totalCompressedData   uint64
	totalUncompressedData uint64
	prefixes              prefixIndexSlice // TODO: This is in danger of exploding memory
	blockAddr             *addr
	chunkHashes           nomshash.HashSet
}

// NewCmpChunkTableWriter creates a new CmpChunkTableWriter instance with a default ByteSink
func NewCmpChunkTableWriter(tempDir string) (*CmpChunkTableWriter, error) {
	s, err := NewBufferedFileByteSink(tempDir, defaultTableSinkBlockSize, defaultChBufferSize)

	if err != nil {
		return nil, err
	}

	return &CmpChunkTableWriter{NewHashingByteSink(s), 0, 0, nil, nil, nomshash.NewHashSet()}, nil
}
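
// Sketch of the intended lifecycle (illustrative only; error handling is
// elided, and compressedChunks/dir are assumed to exist in the caller):
//
//	tw, _ := NewCmpChunkTableWriter(os.TempDir())
//	for _, cc := range compressedChunks {
//		_ = tw.AddCmpChunk(cc)
//	}
//	id, _ := tw.Finish()                       // writes index + footer, returns the file id
//	_ = tw.FlushToFile(filepath.Join(dir, id)) // table files are conventionally named by id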

// Size returns the number of compressed chunks that have been added
func (tw *CmpChunkTableWriter) Size() int {
	return len(tw.prefixes)
}

// ChunkCount returns the number of chunks added so far as a uint32.
func (tw *CmpChunkTableWriter) ChunkCount() uint32 {
	return uint32(len(tw.prefixes))
}

// ContentLength returns the size of the entire table file in bytes
func (tw *CmpChunkTableWriter) ContentLength() uint64 {
	return tw.sink.Size()
}

// GetMD5 returns the MD5 of the entire table file
func (tw *CmpChunkTableWriter) GetMD5() []byte {
	return tw.sink.GetMD5()
}

// AddCmpChunk adds a compressed chunk
func (tw *CmpChunkTableWriter) AddCmpChunk(c CompressedChunk) error {
	if len(c.CompressedData) == 0 {
		panic("NBS blocks cannot be zero length")
	}

	if tw.chunkHashes.Has(c.H) {
		return ErrChunkAlreadyWritten
	}

	tw.chunkHashes.Insert(c.H)
	uncmpLen, err := snappy.DecodedLen(c.CompressedData)

	if err != nil {
		return err
	}

	fullLen := len(c.FullCompressedChunk)
	_, err = tw.sink.Write(c.FullCompressedChunk)

	if err != nil {
		return err
	}

	tw.totalCompressedData += uint64(len(c.CompressedData))
	tw.totalUncompressedData += uint64(uncmpLen)

	a := addr(c.H)
	// Stored in insertion order
	tw.prefixes = append(tw.prefixes, prefixIndexRec{
		a.Prefix(),
		a[addrPrefixSize:],
		uint32(len(tw.prefixes)),
		uint32(fullLen),
	})

	return nil
}
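
// ErrChunkAlreadyWritten is a dedup signal, not a failure; callers that may
// feed the same chunk twice can skip the duplicate and continue. A sketch:
//
//	if err := tw.AddCmpChunk(cc); err != nil && !errors.Is(err, ErrChunkAlreadyWritten) {
//		return err // real error; duplicates are silently skipped
//	}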

// Finish will write the index and footer of the table file and return the id of the file.
func (tw *CmpChunkTableWriter) Finish() (string, error) {
	if tw.blockAddr != nil {
		return "", ErrAlreadyFinished
	}

	blockHash, err := tw.writeIndex()

	if err != nil {
		return "", err
	}

	err = tw.writeFooter()

	if err != nil {
		return "", err
	}

	var h []byte
	h = blockHash.Sum(h)

	var blockAddr addr
	copy(blockAddr[:], h)

	tw.blockAddr = &blockAddr
	return tw.blockAddr.String(), nil
}

// FlushToFile can be called after Finish in order to write the data out to the path provided.
func (tw *CmpChunkTableWriter) FlushToFile(path string) error {
	if tw.blockAddr == nil {
		return ErrNotFinished
	}

	return tw.sink.FlushToFile(path)
}

// Flush can be called after Finish in order to write the data out to the writer provided.
func (tw *CmpChunkTableWriter) Flush(wr io.Writer) error {
	if tw.blockAddr == nil {
		return ErrNotFinished
	}

	return tw.sink.Flush(wr)
}
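
// Flush streams the finished table to any io.Writer, which allows uploading
// the data without first materializing a file. A sketch using an io.Pipe
// (the consumer side of the pipe is assumed):
//
//	if _, err := tw.Finish(); err != nil {
//		return err
//	}
//	pr, pw := io.Pipe()
//	go func() { pw.CloseWithError(tw.Flush(pw)) }()
//	// ... hand pr to an uploader or reader ...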

func (tw *CmpChunkTableWriter) writeIndex() (hash.Hash, error) {
	sort.Sort(tw.prefixes)

	pfxScratch := [addrPrefixSize]byte{}
	blockHash := sha512.New()

	numRecords := uint32(len(tw.prefixes))
	lengthsOffset := lengthsOffset(numRecords)   // skip prefix and ordinal for each record
	suffixesOffset := suffixesOffset(numRecords) // skip size for each record
	suffixesLen := uint64(numRecords) * addrSuffixSize
	buff := make([]byte, suffixesLen+suffixesOffset)

	var pos uint64
	for _, pi := range tw.prefixes {
		binary.BigEndian.PutUint64(pfxScratch[:], pi.prefix)

		// hash prefix
		n := uint64(copy(buff[pos:], pfxScratch[:]))
		if n != addrPrefixSize {
			return nil, errors.New("failed to copy all data")
		}

		pos += n

		// order
		binary.BigEndian.PutUint32(buff[pos:], pi.order)
		pos += ordinalSize

		// length
		offset := lengthsOffset + uint64(pi.order)*lengthSize
		binary.BigEndian.PutUint32(buff[offset:], pi.size)

		// hash suffix
		offset = suffixesOffset + uint64(pi.order)*addrSuffixSize
		n = uint64(copy(buff[offset:], pi.suffix))

		if n != addrSuffixSize {
			return nil, errors.New("failed to copy all bytes")
		}
	}

	// only the suffix section contributes to the hash Finish uses as the file id
	blockHash.Write(buff[suffixesOffset:])
	_, err := tw.sink.Write(buff)

	if err != nil {
		return nil, err
	}

	return blockHash, nil
}
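
// Index layout written above, for n records (field widths are the
// addrPrefixSize/ordinalSize/lengthSize/addrSuffixSize constants):
//
//	+-------------------------+--------------------+--------------------+
//	| n x (prefix, ordinal)   | n x length         | n x address suffix |
//	| sorted by prefix        | indexed by ordinal | indexed by ordinal |
//	+-------------------------+--------------------+--------------------+
//
// The (prefix, ordinal) tuples are laid out in sorted order for binary
// search, while lengths and suffixes stay addressable by insertion order.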

func (tw *CmpChunkTableWriter) writeFooter() error {
	// chunk count
	err := binary.Write(tw.sink, binary.BigEndian, uint32(len(tw.prefixes)))

	if err != nil {
		return err
	}

	// total uncompressed chunk data
	err = binary.Write(tw.sink, binary.BigEndian, tw.totalUncompressedData)

	if err != nil {
		return err
	}

	// magic number
	_, err = tw.sink.Write([]byte(magicNumber))

	if err != nil {
		return err
	}

	return nil
}
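
// Resulting footer, appended after the index (all integers big-endian):
//
//	+--------------------+---------------------------------+--------------+
//	| chunk count uint32 | total uncompressed bytes uint64 | magic number |
//	+--------------------+---------------------------------+--------------+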