github.com/jbendotnet/noms@v0.0.0-20190904222105-c43e4293ea92/go/nbs/table_writer.go (about)

     1  // Copyright 2016 Attic Labs, Inc. All rights reserved.
     2  // Licensed under the Apache License, version 2.0:
     3  // http://www.apache.org/licenses/LICENSE-2.0
     4  
     5  package nbs
     6  
     7  import (
     8  	"crypto/sha512"
     9  	"encoding/binary"
    10  	"fmt"
    11  	"hash"
    12  	"sort"
    13  
    14  	"github.com/attic-labs/noms/go/d"
    15  	"github.com/golang/snappy"
    16  )
    17  
// tableWriter encodes a collection of byte stream chunks into a nbs table. NOT goroutine safe.
type tableWriter struct {
	buff                  []byte           // destination buffer; len(buff) must be >= maxTableSize(numChunks, totalData)
	pos                   uint64           // current write cursor into buff
	totalCompressedData   uint64           // running total of snappy-compressed chunk bytes written
	totalUncompressedData uint64           // running total of raw chunk bytes; recorded in the table footer
	prefixes              prefixIndexSlice // TODO: This is in danger of exploding memory
	blockHash             hash.Hash        // sha512 accumulated over the suffixes section; truncated into the table's addr in finish()

	snapper snappyEncoder
}
    29  
// snappyEncoder is the compression hook used by tableWriter; newTableWriter
// substitutes realSnappyEncoder when the caller passes nil.
type snappyEncoder interface {
	Encode(dst, src []byte) []byte
}
    33  
// realSnappyEncoder implements snappyEncoder by delegating to the snappy package.
type realSnappyEncoder struct{}

// Encode compresses src, writing into dst when it is large enough, and
// returns the slice holding the encoded bytes.
func (r realSnappyEncoder) Encode(dst, src []byte) []byte {
	return snappy.Encode(dst, src)
}
    39  
    40  func maxTableSize(numChunks, totalData uint64) uint64 {
    41  	avgChunkSize := totalData / numChunks
    42  	d.Chk.True(avgChunkSize < maxChunkSize)
    43  	maxSnappySize := snappy.MaxEncodedLen(int(avgChunkSize))
    44  	d.Chk.True(maxSnappySize > 0)
    45  	return numChunks*(prefixTupleSize+lengthSize+addrSuffixSize+checksumSize+uint64(maxSnappySize)) + footerSize
    46  }
    47  
    48  func indexSize(numChunks uint32) uint64 {
    49  	return uint64(numChunks) * (addrSuffixSize + lengthSize + prefixTupleSize)
    50  }
    51  
    52  func lengthsOffset(numChunks uint32) uint64 {
    53  	return uint64(numChunks) * prefixTupleSize
    54  }
    55  
    56  func suffixesOffset(numChunks uint32) uint64 {
    57  	return uint64(numChunks) * (prefixTupleSize + lengthSize)
    58  }
    59  
    60  // len(buff) must be >= maxTableSize(numChunks, totalData)
    61  func newTableWriter(buff []byte, snapper snappyEncoder) *tableWriter {
    62  	if snapper == nil {
    63  		snapper = realSnappyEncoder{}
    64  	}
    65  	return &tableWriter{
    66  		buff:      buff,
    67  		blockHash: sha512.New(),
    68  		snapper:   snapper,
    69  	}
    70  }
    71  
    72  func (tw *tableWriter) addChunk(h addr, data []byte) bool {
    73  	if len(data) == 0 {
    74  		panic("NBS blocks cannont be zero length")
    75  	}
    76  
    77  	// Compress data straight into tw.buff
    78  	compressed := tw.snapper.Encode(tw.buff[tw.pos:], data)
    79  	dataLength := uint64(len(compressed))
    80  	tw.totalCompressedData += dataLength
    81  
    82  	// BUG 3156 indicated that, sometimes, snappy decided that there's not enough space in tw.buff[tw.pos:] to encode into.
    83  	// This _should never happen anymore be_, because we iterate over all chunks to be added and sum the max amount of space that snappy says it might need.
    84  	// Since we know that |data| can't be 0-length, we also know that the compressed version of |data| has length greater than zero. The first element in a snappy-encoded blob is a Uvarint indicating how much data is present. Therefore, if there's a Uvarint-encoded 0 at tw.buff[tw.pos:], we know that snappy did not write anything there and we have a problem.
    85  	if v, n := binary.Uvarint(tw.buff[tw.pos:]); v == 0 {
    86  		d.Chk.True(n != 0)
    87  		panic(fmt.Errorf("BUG 3156: unbuffered chunk %s: uncompressed %d, compressed %d, snappy max %d, tw.buff %d\n", h.String(), len(data), dataLength, snappy.MaxEncodedLen(len(data)), len(tw.buff[tw.pos:])))
    88  	}
    89  
    90  	tw.pos += dataLength
    91  	tw.totalUncompressedData += uint64(len(data))
    92  
    93  	// checksum (4 LSBytes, big-endian)
    94  	binary.BigEndian.PutUint32(tw.buff[tw.pos:], crc(compressed))
    95  	tw.pos += checksumSize
    96  
    97  	// Stored in insertion order
    98  	tw.prefixes = append(tw.prefixes, prefixIndexRec{
    99  		h.Prefix(),
   100  		h[addrPrefixSize:],
   101  		uint32(len(tw.prefixes)),
   102  		uint32(checksumSize + dataLength),
   103  	})
   104  
   105  	return true
   106  }
   107  
   108  func (tw *tableWriter) finish() (uncompressedLength uint64, blockAddr addr) {
   109  	tw.writeIndex()
   110  	tw.writeFooter()
   111  	uncompressedLength = tw.pos
   112  
   113  	var h []byte
   114  	h = tw.blockHash.Sum(h) // Appends hash to h
   115  	copy(blockAddr[:], h)
   116  	return
   117  }
   118  
// prefixIndexRec is one in-memory index entry, built up as chunks are added.
// NOTE: addChunk constructs these with a positional (unkeyed) literal, so the
// field order here is load-bearing.
type prefixIndexRec struct {
	prefix      uint64 // first addrPrefixSize bytes of the chunk addr (h.Prefix()); serialized big-endian
	suffix      []byte // remaining bytes of the chunk addr (h[addrPrefixSize:])
	order, size uint32 // insertion order; bytes occupied in the table (compressed data + checksum)
}
   124  
   125  type prefixIndexSlice []prefixIndexRec
   126  
   127  func (hs prefixIndexSlice) Len() int           { return len(hs) }
   128  func (hs prefixIndexSlice) Less(i, j int) bool { return hs[i].prefix < hs[j].prefix }
   129  func (hs prefixIndexSlice) Swap(i, j int)      { hs[i], hs[j] = hs[j], hs[i] }
   130  
// writeIndex writes the table index starting at tw.pos: first the
// (prefix, ordinal) tuples sorted by prefix, then the chunk lengths, then the
// address suffixes — the latter two stored at insertion-order (ordinal)
// slots. It also feeds the suffixes section into tw.blockHash and leaves
// tw.pos at the end of the index.
func (tw *tableWriter) writeIndex() {
	// Sort records by address prefix; each record still carries its original
	// insertion position in pi.order.
	sort.Sort(tw.prefixes)

	pfxScratch := [addrPrefixSize]byte{}

	numRecords := uint32(len(tw.prefixes))
	lengthsOffset := tw.pos + lengthsOffset(numRecords)   // skip prefix and ordinal for each record
	suffixesOffset := tw.pos + suffixesOffset(numRecords) // skip size for each record
	for _, pi := range tw.prefixes {
		binary.BigEndian.PutUint64(pfxScratch[:], pi.prefix)

		// hash prefix (written sequentially, in sorted order)
		n := uint64(copy(tw.buff[tw.pos:], pfxScratch[:]))
		d.Chk.True(n == addrPrefixSize)
		tw.pos += n

		// order (insertion ordinal, alongside its prefix)
		binary.BigEndian.PutUint32(tw.buff[tw.pos:], pi.order)
		tw.pos += ordinalSize

		// length — written at the record's insertion-order slot, not sequentially
		offset := lengthsOffset + uint64(pi.order)*lengthSize
		binary.BigEndian.PutUint32(tw.buff[offset:], pi.size)

		// hash suffix — likewise at the insertion-order slot
		offset = suffixesOffset + uint64(pi.order)*addrSuffixSize
		n = uint64(copy(tw.buff[offset:], pi.suffix))
		d.Chk.True(n == addrSuffixSize)
	}
	suffixesLen := uint64(numRecords) * addrSuffixSize
	// Only the suffixes section contributes to the table's address hash.
	tw.blockHash.Write(tw.buff[suffixesOffset : suffixesOffset+suffixesLen])
	tw.pos = suffixesOffset + suffixesLen
}
   164  
   165  func (tw *tableWriter) writeFooter() {
   166  	tw.pos += writeFooter(tw.buff[tw.pos:], uint32(len(tw.prefixes)), tw.totalUncompressedData)
   167  }
   168  
   169  func writeFooter(dst []byte, chunkCount uint32, uncData uint64) (consumed uint64) {
   170  	// chunk count
   171  	binary.BigEndian.PutUint32(dst[consumed:], chunkCount)
   172  	consumed += uint32Size
   173  
   174  	// total uncompressed chunk data
   175  	binary.BigEndian.PutUint64(dst[consumed:], uncData)
   176  	consumed += uint64Size
   177  
   178  	// magic number
   179  	copy(dst[consumed:], magicNumber)
   180  	consumed += magicNumberSize
   181  	return
   182  }