github.com/hasnat/dolt/go@v0.0.0-20210628190320-9eb5d843fbb7/store/nbs/cmp_chunk_table_writer.go (about) 1 // Copyright 2019 Dolthub, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package nbs 16 17 import ( 18 "crypto/sha512" 19 "encoding/binary" 20 "errors" 21 "hash" 22 "io" 23 "sort" 24 25 "github.com/golang/snappy" 26 27 nomshash "github.com/dolthub/dolt/go/store/hash" 28 ) 29 30 const defaultTableSinkBlockSize = 2 * 1024 * 1024 31 const defaultChBufferSize = 32 * 1024 32 33 // ErrNotFinished is an error returned by a CmpChunkTableWriter when a call to Flush* is called before Finish is called 34 var ErrNotFinished = errors.New("not finished") 35 36 // ErrAlreadyFinished is an error returned if Finish is called more than once on a CmpChunkTableWriter 37 var ErrAlreadyFinished = errors.New("already Finished") 38 39 var ErrChunkAlreadyWritten = errors.New("chunk already written") 40 41 // CmpChunkTableWriter writes CompressedChunks to a table file 42 type CmpChunkTableWriter struct { 43 sink *HashingByteSink 44 totalCompressedData uint64 45 totalUncompressedData uint64 46 prefixes prefixIndexSlice // TODO: This is in danger of exploding memory 47 blockAddr *addr 48 chunkHashes nomshash.HashSet 49 } 50 51 // NewCmpChunkTableWriter creates a new CmpChunkTableWriter instance with a default ByteSink 52 func NewCmpChunkTableWriter(tempDir string) (*CmpChunkTableWriter, error) { 53 s, err := NewBufferedFileByteSink(tempDir, defaultTableSinkBlockSize, defaultChBufferSize) 54 55 if err != nil { 56 return nil, err 57 } 58 59 return &CmpChunkTableWriter{NewHashingByteSink(s), 0, 0, nil, nil, nomshash.NewHashSet()}, nil 60 } 61 62 // Size returns the number of compressed chunks that have been added 63 func (tw *CmpChunkTableWriter) Size() int { 64 return len(tw.prefixes) 65 } 66 67 func (tw *CmpChunkTableWriter) ChunkCount() uint32 { 68 return uint32(len(tw.prefixes)) 69 } 70 71 // Gets the size of the entire table file in bytes 72 func (tw *CmpChunkTableWriter) ContentLength() uint64 { 73 return tw.sink.Size() 74 } 75 76 // Gets the MD5 of the entire table file 77 func (tw *CmpChunkTableWriter) GetMD5() []byte { 78 return tw.sink.GetMD5() 79 } 80 81 // AddCmpChunk adds a compressed chunk 82 func (tw *CmpChunkTableWriter) AddCmpChunk(c CompressedChunk) error { 83 if len(c.CompressedData) == 0 { 84 panic("NBS blocks cannot be zero length") 85 } 86 87 if tw.chunkHashes.Has(c.H) { 88 return ErrChunkAlreadyWritten 89 } 90 91 tw.chunkHashes.Insert(c.H) 92 uncmpLen, err := snappy.DecodedLen(c.CompressedData) 93 94 if err != nil { 95 return err 96 } 97 98 fullLen := len(c.FullCompressedChunk) 99 _, err = tw.sink.Write(c.FullCompressedChunk) 100 101 if err != nil { 102 return err 103 } 104 105 tw.totalCompressedData += uint64(len(c.CompressedData)) 106 tw.totalUncompressedData += uint64(uncmpLen) 107 108 a := addr(c.H) 109 // Stored in insertion order 110 tw.prefixes = append(tw.prefixes, prefixIndexRec{ 111 a.Prefix(), 112 a[addrPrefixSize:], 113 uint32(len(tw.prefixes)), 114 uint32(fullLen), 115 }) 116 117 return nil 118 } 119 120 // Finish will write the index and footer of the table file and return the id of the file. 121 func (tw *CmpChunkTableWriter) Finish() (string, error) { 122 if tw.blockAddr != nil { 123 return "", ErrAlreadyFinished 124 } 125 126 blockHash, err := tw.writeIndex() 127 128 if err != nil { 129 return "", err 130 } 131 132 err = tw.writeFooter() 133 134 if err != nil { 135 return "", err 136 } 137 138 var h []byte 139 h = blockHash.Sum(h) 140 141 var blockAddr addr 142 copy(blockAddr[:], h) 143 144 tw.blockAddr = &blockAddr 145 return tw.blockAddr.String(), nil 146 } 147 148 // FlushToFile can be called after Finish in order to write the data out to the path provided. 149 func (tw *CmpChunkTableWriter) FlushToFile(path string) error { 150 if tw.blockAddr == nil { 151 return ErrNotFinished 152 } 153 154 return tw.sink.FlushToFile(path) 155 } 156 157 // Flush can be called after Finish in order to write the data out to the writer provided. 158 func (tw *CmpChunkTableWriter) Flush(wr io.Writer) error { 159 if tw.blockAddr == nil { 160 return ErrNotFinished 161 } 162 163 err := tw.sink.Flush(wr) 164 165 if err != nil { 166 return err 167 } 168 169 return nil 170 } 171 172 func (tw *CmpChunkTableWriter) writeIndex() (hash.Hash, error) { 173 sort.Sort(tw.prefixes) 174 175 pfxScratch := [addrPrefixSize]byte{} 176 blockHash := sha512.New() 177 178 numRecords := uint32(len(tw.prefixes)) 179 lengthsOffset := lengthsOffset(numRecords) // skip prefix and ordinal for each record 180 suffixesOffset := suffixesOffset(numRecords) // skip size for each record 181 suffixesLen := uint64(numRecords) * addrSuffixSize 182 buff := make([]byte, suffixesLen+suffixesOffset) 183 184 var pos uint64 185 for _, pi := range tw.prefixes { 186 binary.BigEndian.PutUint64(pfxScratch[:], pi.prefix) 187 188 // hash prefix 189 n := uint64(copy(buff[pos:], pfxScratch[:])) 190 if n != addrPrefixSize { 191 return nil, errors.New("failed to copy all data") 192 } 193 194 pos += n 195 196 // order 197 binary.BigEndian.PutUint32(buff[pos:], pi.order) 198 pos += ordinalSize 199 200 // length 201 offset := lengthsOffset + uint64(pi.order)*lengthSize 202 binary.BigEndian.PutUint32(buff[offset:], pi.size) 203 204 // hash suffix 205 offset = suffixesOffset + uint64(pi.order)*addrSuffixSize 206 n = uint64(copy(buff[offset:], pi.suffix)) 207 208 if n != addrSuffixSize { 209 return nil, errors.New("failed to copy all bytes") 210 } 211 } 212 213 blockHash.Write(buff[suffixesOffset:]) 214 _, err := tw.sink.Write(buff) 215 216 if err != nil { 217 return nil, err 218 } 219 220 return blockHash, nil 221 } 222 223 func (tw *CmpChunkTableWriter) writeFooter() error { 224 // chunk count 225 err := binary.Write(tw.sink, binary.BigEndian, uint32(len(tw.prefixes))) 226 227 if err != nil { 228 return err 229 } 230 231 // total uncompressed chunk data 232 err = binary.Write(tw.sink, binary.BigEndian, tw.totalUncompressedData) 233 234 if err != nil { 235 return err 236 } 237 238 // magic number 239 _, err = tw.sink.Write([]byte(magicNumber)) 240 241 if err != nil { 242 return err 243 } 244 245 return nil 246 }