github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/nbs/cmp_chunk_table_writer.go

// Copyright 2019 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package nbs

import (
	"crypto/sha512"
	"encoding/binary"
	"errors"
	gohash "hash"
	"io"
	"os"
	"sort"

	"github.com/golang/snappy"

	"github.com/dolthub/dolt/go/store/hash"
)

const defaultTableSinkBlockSize = 2 * 1024 * 1024
const defaultChBufferSize = 32 * 1024

// ErrNotFinished is returned by a CmpChunkTableWriter when a Flush* method is called before Finish.
var ErrNotFinished = errors.New("not finished")

// ErrAlreadyFinished is returned if Finish is called more than once on a CmpChunkTableWriter.
var ErrAlreadyFinished = errors.New("already Finished")

// ErrDuplicateChunkWritten is returned by Finish if the same chunk was given to the writer multiple times.
var ErrDuplicateChunkWritten = errors.New("duplicate chunks written")

// CmpChunkTableWriter writes CompressedChunks to a table file.
type CmpChunkTableWriter struct {
	sink                  *HashingByteSink
	totalCompressedData   uint64
	totalUncompressedData uint64
	prefixes              prefixIndexSlice
	blockAddr             *hash.Hash
	path                  string
}

// NewCmpChunkTableWriter creates a new CmpChunkTableWriter instance with a default ByteSink.
func NewCmpChunkTableWriter(tempDir string) (*CmpChunkTableWriter, error) {
	s, err := NewBufferedFileByteSink(tempDir, defaultTableSinkBlockSize, defaultChBufferSize)
	if err != nil {
		return nil, err
	}

	return &CmpChunkTableWriter{NewMD5HashingByteSink(s), 0, 0, nil, nil, s.path}, nil
}

// ChunkCount returns the number of chunks added to the writer so far.
func (tw *CmpChunkTableWriter) ChunkCount() int {
	return len(tw.prefixes)
}

// ContentLength returns the size of the entire table file in bytes.
func (tw *CmpChunkTableWriter) ContentLength() uint64 {
	return tw.sink.Size()
}

// GetMD5 returns the MD5 of the entire table file.
func (tw *CmpChunkTableWriter) GetMD5() []byte {
	return tw.sink.GetSum()
}

// AddCmpChunk adds a compressed chunk.
func (tw *CmpChunkTableWriter) AddCmpChunk(c CompressedChunk) error {
	if len(c.CompressedData) == 0 {
		panic("NBS blocks cannot be zero length")
	}

	uncmpLen, err := snappy.DecodedLen(c.CompressedData)

	if err != nil {
		return err
	}

	fullLen := len(c.FullCompressedChunk)
	_, err = tw.sink.Write(c.FullCompressedChunk)

	if err != nil {
		return err
	}

	tw.totalCompressedData += uint64(len(c.CompressedData))
	tw.totalUncompressedData += uint64(uncmpLen)

	// Stored in insertion order
	tw.prefixes = append(tw.prefixes, prefixIndexRec{
		c.H,
		uint32(len(tw.prefixes)),
		uint32(fullLen),
	})

	return nil
}
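
// Note on the two byte slices used above (explanatory addition): AddCmpChunk
// writes c.FullCompressedChunk, the complete on-disk record, to the sink,
// while the running totals are computed from c.CompressedData, the snappy
// payload. The index record stores len(c.FullCompressedChunk), so index
// lengths describe on-disk record sizes rather than payload sizes. That
// FullCompressedChunk wraps CompressedData with extra framing (e.g. a
// trailing checksum) is an assumption inferred from how the fields are used
// here, not something this file states.
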
// Finish will write the index and footer of the table file and return the id of the file.
func (tw *CmpChunkTableWriter) Finish() (string, error) {
	if tw.blockAddr != nil {
		return "", ErrAlreadyFinished
	}

	blockHash, err := tw.writeIndex()

	if err != nil {
		return "", err
	}

	err = tw.writeFooter()

	if err != nil {
		return "", err
	}

	var h []byte
	h = blockHash.Sum(h)
	blockAddr := hash.New(h[:hash.ByteLen])

	tw.blockAddr = &blockAddr
	return tw.blockAddr.String(), nil
}

// FlushToFile can be called after Finish in order to write the data out to the path provided.
func (tw *CmpChunkTableWriter) FlushToFile(path string) error {
	if tw.blockAddr == nil {
		return ErrNotFinished
	}

	return tw.sink.FlushToFile(path)
}

// Flush can be called after Finish in order to write the data out to the writer provided.
func (tw *CmpChunkTableWriter) Flush(wr io.Writer) error {
	if tw.blockAddr == nil {
		return ErrNotFinished
	}

	err := tw.sink.Flush(wr)

	if err != nil {
		return err
	}

	return nil
}

// Reader returns an io.ReadCloser over the table file's contents. It can only be called after Finish.
func (tw *CmpChunkTableWriter) Reader() (io.ReadCloser, error) {
	if tw.blockAddr == nil {
		return nil, ErrNotFinished
	}
	return tw.sink.Reader()
}

// Remove deletes the temporary file backing the writer's sink.
func (tw *CmpChunkTableWriter) Remove() error {
	return os.Remove(tw.path)
}

func containsDuplicates(prefixes prefixIndexSlice) bool {
	if len(prefixes) == 0 {
		return false
	}
	for i := 0; i < len(prefixes); i++ {
		curr := prefixes[i]
		// The list is sorted by prefixes. We have to perform n^2
		// checks against every run of matching prefixes. For all
		// shapes of real world data this is not a concern.
		for j := i + 1; j < len(prefixes); j++ {
			cmp := prefixes[j]
			if cmp.addr.Prefix() != curr.addr.Prefix() {
				break
			}
			if cmp.addr == curr.addr {
				return true
			}
		}
	}
	return false
}

func (tw *CmpChunkTableWriter) writeIndex() (gohash.Hash, error) {
	sort.Sort(tw.prefixes)
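	// Index layout sketch (explanatory addition, inferred from the offset
	// arithmetic below rather than from a separate format spec). For n
	// records the index buffer holds, in order:
	//
	//	n tuples of (hash prefix, ordinal), sorted by prefix (the sort above);
	//	n uint32 chunk lengths at lengthsOffset, indexed by ordinal, i.e. insertion order;
	//	n hash suffixes at suffixesOffset, also indexed by ordinal.
	//
	// Note that blockHash below is computed over buff[suffixesOffset:], so the
	// table file id returned by Finish is derived from the suffix section alone.
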
	// We do a sanity check here to assert that we are never writing duplicate chunks into
	// a table file using this interface.
	if containsDuplicates(tw.prefixes) {
		return nil, ErrDuplicateChunkWritten
	}

	pfxScratch := [hash.PrefixLen]byte{}
	blockHash := sha512.New()

	numRecords := uint32(len(tw.prefixes))
	lengthsOffset := lengthsOffset(numRecords)   // skip prefix and ordinal for each record
	suffixesOffset := suffixesOffset(numRecords) // skip size for each record
	suffixesLen := uint64(numRecords) * hash.SuffixLen
	buff := make([]byte, suffixesLen+suffixesOffset)

	var pos uint64
	for _, pi := range tw.prefixes {
		binary.BigEndian.PutUint64(pfxScratch[:], pi.addr.Prefix())

		// hash prefix
		n := uint64(copy(buff[pos:], pfxScratch[:]))
		if n != hash.PrefixLen {
			return nil, errors.New("failed to copy all data")
		}
		pos += hash.PrefixLen

		// order
		binary.BigEndian.PutUint32(buff[pos:], pi.order)
		pos += ordinalSize

		// length
		offset := lengthsOffset + uint64(pi.order)*lengthSize
		binary.BigEndian.PutUint32(buff[offset:], pi.size)

		// hash suffix
		offset = suffixesOffset + uint64(pi.order)*hash.SuffixLen
		n = uint64(copy(buff[offset:], pi.addr.Suffix()))

		if n != hash.SuffixLen {
			return nil, errors.New("failed to copy all bytes")
		}
	}

	blockHash.Write(buff[suffixesOffset:])
	_, err := tw.sink.Write(buff)

	if err != nil {
		return nil, err
	}

	return blockHash, nil
}

func (tw *CmpChunkTableWriter) writeFooter() error {
	// chunk count
	err := binary.Write(tw.sink, binary.BigEndian, uint32(len(tw.prefixes)))

	if err != nil {
		return err
	}

	// total uncompressed chunk data
	err = binary.Write(tw.sink, binary.BigEndian, tw.totalUncompressedData)

	if err != nil {
		return err
	}

	// magic number
	_, err = tw.sink.Write([]byte(magicNumber))

	if err != nil {
		return err
	}

	return nil
}
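
// Usage sketch (illustrative addition, not part of the original file),
// assuming a CompressedChunk value cc obtained elsewhere in this package:
//
//	tw, err := NewCmpChunkTableWriter(tempDir)
//	if err != nil { /* handle */ }
//	if err = tw.AddCmpChunk(cc); err != nil { /* handle */ }
//	id, err := tw.Finish() // writes index + footer, returns the table file id
//	if err != nil { /* handle */ }
//	err = tw.FlushToFile(filepath.Join(destDir, id)) // or tw.Flush(w) / tw.Reader()
//
// Using the returned id as the destination file name is an assumption here;
// tempDir, destDir, and cc are placeholders.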