github.com/jbendotnet/noms@v0.0.0-20190904222105-c43e4293ea92/go/nbs/table_writer.go (about) 1 // Copyright 2016 Attic Labs, Inc. All rights reserved. 2 // Licensed under the Apache License, version 2.0: 3 // http://www.apache.org/licenses/LICENSE-2.0 4 5 package nbs 6 7 import ( 8 "crypto/sha512" 9 "encoding/binary" 10 "fmt" 11 "hash" 12 "sort" 13 14 "github.com/attic-labs/noms/go/d" 15 "github.com/golang/snappy" 16 ) 17 18 // tableWriter encodes a collection of byte stream chunks into a nbs table. NOT goroutine safe. 19 type tableWriter struct { 20 buff []byte 21 pos uint64 22 totalCompressedData uint64 23 totalUncompressedData uint64 24 prefixes prefixIndexSlice // TODO: This is in danger of exploding memory 25 blockHash hash.Hash 26 27 snapper snappyEncoder 28 } 29 30 type snappyEncoder interface { 31 Encode(dst, src []byte) []byte 32 } 33 34 type realSnappyEncoder struct{} 35 36 func (r realSnappyEncoder) Encode(dst, src []byte) []byte { 37 return snappy.Encode(dst, src) 38 } 39 40 func maxTableSize(numChunks, totalData uint64) uint64 { 41 avgChunkSize := totalData / numChunks 42 d.Chk.True(avgChunkSize < maxChunkSize) 43 maxSnappySize := snappy.MaxEncodedLen(int(avgChunkSize)) 44 d.Chk.True(maxSnappySize > 0) 45 return numChunks*(prefixTupleSize+lengthSize+addrSuffixSize+checksumSize+uint64(maxSnappySize)) + footerSize 46 } 47 48 func indexSize(numChunks uint32) uint64 { 49 return uint64(numChunks) * (addrSuffixSize + lengthSize + prefixTupleSize) 50 } 51 52 func lengthsOffset(numChunks uint32) uint64 { 53 return uint64(numChunks) * prefixTupleSize 54 } 55 56 func suffixesOffset(numChunks uint32) uint64 { 57 return uint64(numChunks) * (prefixTupleSize + lengthSize) 58 } 59 60 // len(buff) must be >= maxTableSize(numChunks, totalData) 61 func newTableWriter(buff []byte, snapper snappyEncoder) *tableWriter { 62 if snapper == nil { 63 snapper = realSnappyEncoder{} 64 } 65 return &tableWriter{ 66 buff: buff, 67 blockHash: sha512.New(), 68 snapper: snapper, 69 } 70 } 71 72 func (tw *tableWriter) addChunk(h addr, data []byte) bool { 73 if len(data) == 0 { 74 panic("NBS blocks cannont be zero length") 75 } 76 77 // Compress data straight into tw.buff 78 compressed := tw.snapper.Encode(tw.buff[tw.pos:], data) 79 dataLength := uint64(len(compressed)) 80 tw.totalCompressedData += dataLength 81 82 // BUG 3156 indicated that, sometimes, snappy decided that there's not enough space in tw.buff[tw.pos:] to encode into. 83 // This _should never happen anymore be_, because we iterate over all chunks to be added and sum the max amount of space that snappy says it might need. 84 // Since we know that |data| can't be 0-length, we also know that the compressed version of |data| has length greater than zero. The first element in a snappy-encoded blob is a Uvarint indicating how much data is present. Therefore, if there's a Uvarint-encoded 0 at tw.buff[tw.pos:], we know that snappy did not write anything there and we have a problem. 85 if v, n := binary.Uvarint(tw.buff[tw.pos:]); v == 0 { 86 d.Chk.True(n != 0) 87 panic(fmt.Errorf("BUG 3156: unbuffered chunk %s: uncompressed %d, compressed %d, snappy max %d, tw.buff %d\n", h.String(), len(data), dataLength, snappy.MaxEncodedLen(len(data)), len(tw.buff[tw.pos:]))) 88 } 89 90 tw.pos += dataLength 91 tw.totalUncompressedData += uint64(len(data)) 92 93 // checksum (4 LSBytes, big-endian) 94 binary.BigEndian.PutUint32(tw.buff[tw.pos:], crc(compressed)) 95 tw.pos += checksumSize 96 97 // Stored in insertion order 98 tw.prefixes = append(tw.prefixes, prefixIndexRec{ 99 h.Prefix(), 100 h[addrPrefixSize:], 101 uint32(len(tw.prefixes)), 102 uint32(checksumSize + dataLength), 103 }) 104 105 return true 106 } 107 108 func (tw *tableWriter) finish() (uncompressedLength uint64, blockAddr addr) { 109 tw.writeIndex() 110 tw.writeFooter() 111 uncompressedLength = tw.pos 112 113 var h []byte 114 h = tw.blockHash.Sum(h) // Appends hash to h 115 copy(blockAddr[:], h) 116 return 117 } 118 119 type prefixIndexRec struct { 120 prefix uint64 121 suffix []byte 122 order, size uint32 123 } 124 125 type prefixIndexSlice []prefixIndexRec 126 127 func (hs prefixIndexSlice) Len() int { return len(hs) } 128 func (hs prefixIndexSlice) Less(i, j int) bool { return hs[i].prefix < hs[j].prefix } 129 func (hs prefixIndexSlice) Swap(i, j int) { hs[i], hs[j] = hs[j], hs[i] } 130 131 func (tw *tableWriter) writeIndex() { 132 sort.Sort(tw.prefixes) 133 134 pfxScratch := [addrPrefixSize]byte{} 135 136 numRecords := uint32(len(tw.prefixes)) 137 lengthsOffset := tw.pos + lengthsOffset(numRecords) // skip prefix and ordinal for each record 138 suffixesOffset := tw.pos + suffixesOffset(numRecords) // skip size for each record 139 for _, pi := range tw.prefixes { 140 binary.BigEndian.PutUint64(pfxScratch[:], pi.prefix) 141 142 // hash prefix 143 n := uint64(copy(tw.buff[tw.pos:], pfxScratch[:])) 144 d.Chk.True(n == addrPrefixSize) 145 tw.pos += n 146 147 // order 148 binary.BigEndian.PutUint32(tw.buff[tw.pos:], pi.order) 149 tw.pos += ordinalSize 150 151 // length 152 offset := lengthsOffset + uint64(pi.order)*lengthSize 153 binary.BigEndian.PutUint32(tw.buff[offset:], pi.size) 154 155 // hash suffix 156 offset = suffixesOffset + uint64(pi.order)*addrSuffixSize 157 n = uint64(copy(tw.buff[offset:], pi.suffix)) 158 d.Chk.True(n == addrSuffixSize) 159 } 160 suffixesLen := uint64(numRecords) * addrSuffixSize 161 tw.blockHash.Write(tw.buff[suffixesOffset : suffixesOffset+suffixesLen]) 162 tw.pos = suffixesOffset + suffixesLen 163 } 164 165 func (tw *tableWriter) writeFooter() { 166 tw.pos += writeFooter(tw.buff[tw.pos:], uint32(len(tw.prefixes)), tw.totalUncompressedData) 167 } 168 169 func writeFooter(dst []byte, chunkCount uint32, uncData uint64) (consumed uint64) { 170 // chunk count 171 binary.BigEndian.PutUint32(dst[consumed:], chunkCount) 172 consumed += uint32Size 173 174 // total uncompressed chunk data 175 binary.BigEndian.PutUint64(dst[consumed:], uncData) 176 consumed += uint64Size 177 178 // magic number 179 copy(dst[consumed:], magicNumber) 180 consumed += magicNumberSize 181 return 182 }