github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/nbs/byte_sink.go (about) 1 // Copyright 2019 Dolthub, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package nbs 16 17 import ( 18 "bytes" 19 "crypto/md5" 20 "crypto/sha512" 21 "errors" 22 "hash" 23 "io" 24 "os" 25 "sync" 26 27 "github.com/dolthub/dolt/go/libraries/utils/file" 28 "github.com/dolthub/dolt/go/libraries/utils/iohelp" 29 "github.com/dolthub/dolt/go/store/atomicerr" 30 "github.com/dolthub/dolt/go/store/util/tempfiles" 31 ) 32 33 func flushSinkToFile(sink ByteSink, path string) (err error) { 34 var f *os.File 35 f, err = os.OpenFile(path, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, os.ModePerm) 36 37 if err != nil { 38 return err 39 } 40 41 defer func() { 42 closeErr := f.Close() 43 44 if err == nil { 45 err = closeErr 46 } 47 }() 48 49 err = sink.Flush(f) 50 return err 51 } 52 53 // A ByteSink is an interface for writing bytes which can later be flushed to a writer 54 type ByteSink interface { 55 io.Writer 56 57 // Flush writes all the data that was written to the ByteSink to the supplied writer 58 Flush(wr io.Writer) error 59 60 // FlushToFile writes all the data that was written to the ByteSink to a file at the given path 61 FlushToFile(path string) error 62 63 Reader() (io.ReadCloser, error) 64 } 65 66 // ErrBuffFull used by the FixedBufferSink when the data written is larger than the buffer allocated. 67 var ErrBufferFull = errors.New("buffer full") 68 69 // FixedBufferByteSink is a ByteSink implementation with a buffer whose size will not change. Writing more 70 // data than the fixed buffer can hold will result in an error 71 type FixedBufferByteSink struct { 72 buff []byte 73 pos uint64 74 } 75 76 // NewFixedBufferByteSink creates a FixedBufferTableSink which will use the supplied buffer 77 func NewFixedBufferByteSink(buff []byte) *FixedBufferByteSink { 78 if len(buff) == 0 { 79 panic("must provide a buffer") 80 } 81 82 return &FixedBufferByteSink{buff: buff} 83 } 84 85 // Write writes a byte array to the sink. 86 func (sink *FixedBufferByteSink) Write(src []byte) (int, error) { 87 dest := sink.buff[sink.pos:] 88 destLen := len(dest) 89 srcLen := len(src) 90 91 if destLen < srcLen { 92 return 0, ErrBufferFull 93 } 94 95 copy(dest, src) 96 97 sink.pos += uint64(srcLen) 98 return srcLen, nil 99 } 100 101 // Flush writes all the data that was written to the ByteSink to the supplied writer 102 func (sink *FixedBufferByteSink) Flush(wr io.Writer) error { 103 return iohelp.WriteAll(wr, sink.buff[:sink.pos]) 104 } 105 106 // FlushToFile writes all the data that was written to the ByteSink to a file at the given path 107 func (sink *FixedBufferByteSink) FlushToFile(path string) (err error) { 108 return flushSinkToFile(sink, path) 109 } 110 111 func (sink *FixedBufferByteSink) Reader() (io.ReadCloser, error) { 112 return io.NopCloser(bytes.NewReader(sink.buff)), nil 113 } 114 115 // BlockBufferByteSink allocates blocks of data with a given block size to store the bytes written to the sink. New 116 // blocks are allocated as needed in order to handle all the data of the Write calls. 117 type BlockBufferByteSink struct { 118 blockSize int 119 pos uint64 120 blocks [][]byte 121 } 122 123 // NewBlockBufferByteSink creates a BlockBufferByteSink with the provided block size. 124 func NewBlockBufferByteSink(blockSize int) *BlockBufferByteSink { 125 block := make([]byte, 0, blockSize) 126 return &BlockBufferByteSink{blockSize, 0, [][]byte{block}} 127 } 128 129 // Write writes a byte array to the sink. 130 func (sink *BlockBufferByteSink) Write(src []byte) (int, error) { 131 srcLen := len(src) 132 currBlockIdx := len(sink.blocks) - 1 133 currBlock := sink.blocks[currBlockIdx] 134 remaining := cap(currBlock) - len(currBlock) 135 136 if remaining >= srcLen { 137 currBlock = append(currBlock, src...) 138 sink.blocks[currBlockIdx] = currBlock 139 } else { 140 if remaining > 0 { 141 currBlock = append(currBlock, src[:remaining]...) 142 sink.blocks[currBlockIdx] = currBlock 143 } 144 145 newBlock := make([]byte, 0, sink.blockSize) 146 newBlock = append(newBlock, src[remaining:]...) 147 sink.blocks = append(sink.blocks, newBlock) 148 } 149 150 sink.pos += uint64(srcLen) 151 return srcLen, nil 152 } 153 154 // Flush writes all the data that was written to the ByteSink to the supplied writer 155 func (sink *BlockBufferByteSink) Flush(wr io.Writer) (err error) { 156 return iohelp.WriteAll(wr, sink.blocks...) 157 } 158 159 // FlushToFile writes all the data that was written to the ByteSink to a file at the given path 160 func (sink *BlockBufferByteSink) FlushToFile(path string) (err error) { 161 return flushSinkToFile(sink, path) 162 } 163 164 func (sink *BlockBufferByteSink) Reader() (io.ReadCloser, error) { 165 rs := make([]io.Reader, len(sink.blocks)) 166 for i := range sink.blocks { 167 rs[i] = bytes.NewReader(sink.blocks[i]) 168 } 169 return io.NopCloser(io.MultiReader(rs...)), nil 170 } 171 172 // BufferedFileByteSink is a ByteSink implementation that buffers some amount of data before it passes it 173 // to a background writing thread to be flushed to a file. 174 type BufferedFileByteSink struct { 175 blockSize int 176 pos uint64 177 currentBlock []byte 178 179 writeCh chan []byte 180 ae *atomicerr.AtomicError 181 wg *sync.WaitGroup 182 183 wr io.WriteCloser 184 path string 185 } 186 187 // NewBufferedFileByteSink creates a BufferedFileByteSink 188 func NewBufferedFileByteSink(tempDir string, blockSize, chBufferSize int) (*BufferedFileByteSink, error) { 189 f, err := tempfiles.MovableTempFileProvider.NewFile(tempDir, "buffered_file_byte_sink_") 190 191 if err != nil { 192 return nil, err 193 } 194 195 sink := &BufferedFileByteSink{ 196 blockSize: blockSize, 197 currentBlock: make([]byte, blockSize), 198 writeCh: make(chan []byte, chBufferSize), 199 ae: atomicerr.New(), 200 wg: &sync.WaitGroup{}, 201 wr: f, 202 path: f.Name(), 203 } 204 205 sink.wg.Add(1) 206 go func() { 207 defer sink.wg.Done() 208 sink.backgroundWrite() 209 }() 210 211 return sink, nil 212 } 213 214 // Write writes a byte array to the sink. 215 func (sink *BufferedFileByteSink) Write(src []byte) (int, error) { 216 srcLen := len(src) 217 remaining := cap(sink.currentBlock) - len(sink.currentBlock) 218 219 if remaining >= srcLen { 220 sink.currentBlock = append(sink.currentBlock, src...) 221 222 if remaining == srcLen { 223 sink.writeCh <- sink.currentBlock 224 sink.currentBlock = nil 225 } 226 } else { 227 if remaining > 0 { 228 sink.currentBlock = append(sink.currentBlock, src[:remaining]...) 229 sink.writeCh <- sink.currentBlock 230 } 231 232 newBlock := make([]byte, 0, sink.blockSize) 233 newBlock = append(newBlock, src[remaining:]...) 234 sink.currentBlock = newBlock 235 } 236 237 sink.pos += uint64(srcLen) 238 return srcLen, nil 239 } 240 241 func (sink *BufferedFileByteSink) backgroundWrite() { 242 var err error 243 for buff := range sink.writeCh { 244 if err != nil { 245 continue // drain 246 } 247 248 err = iohelp.WriteAll(sink.wr, buff) 249 sink.ae.SetIfError(err) 250 } 251 252 err = sink.wr.Close() 253 sink.ae.SetIfError(err) 254 } 255 256 func (sink *BufferedFileByteSink) finish() error { 257 // |finish()| is not thread-safe. We just use writeCh == nil as a 258 // sentinel to mean we've been called again from Reader() as part of a 259 // retry or something. 260 if sink.writeCh != nil { 261 toWrite := len(sink.currentBlock) 262 if toWrite > 0 { 263 sink.writeCh <- sink.currentBlock[:toWrite] 264 } 265 266 close(sink.writeCh) 267 sink.wg.Wait() 268 269 sink.writeCh = nil 270 } 271 return sink.ae.Get() 272 } 273 274 // Flush writes all the data that was written to the ByteSink to the supplied writer 275 func (sink *BufferedFileByteSink) Flush(wr io.Writer) (err error) { 276 err = sink.finish() 277 if err != nil { 278 return err 279 } 280 281 var f *os.File 282 f, err = os.Open(sink.path) 283 284 if err != nil { 285 return err 286 } 287 288 defer func() { 289 closeErr := f.Close() 290 291 if err == nil { 292 err = closeErr 293 } 294 }() 295 296 _, err = io.Copy(wr, f) 297 298 return err 299 } 300 301 // FlushToFile writes all the data that was written to the ByteSink to a file at the given path 302 func (sink *BufferedFileByteSink) FlushToFile(path string) (err error) { 303 err = sink.finish() 304 if err != nil { 305 return err 306 } 307 308 return file.Rename(sink.path, path) 309 } 310 311 func (sink *BufferedFileByteSink) Reader() (io.ReadCloser, error) { 312 err := sink.finish() 313 if err != nil { 314 return nil, err 315 } 316 return os.Open(sink.path) 317 } 318 319 // HashingByteSink is a ByteSink that keeps an md5 hash of all the data written to it. 320 type HashingByteSink struct { 321 backingSink ByteSink 322 hasher hash.Hash 323 size uint64 324 } 325 326 func NewSHA512HashingByteSink(backingSink ByteSink) *HashingByteSink { 327 return &HashingByteSink{backingSink: backingSink, hasher: sha512.New(), size: 0} 328 } 329 330 func NewMD5HashingByteSink(backingSink ByteSink) *HashingByteSink { 331 return &HashingByteSink{backingSink: backingSink, hasher: md5.New(), size: 0} 332 } 333 334 // Write writes a byte array to the sink. 335 func (sink *HashingByteSink) Write(src []byte) (int, error) { 336 nWritten, err := sink.backingSink.Write(src) 337 338 if err != nil { 339 return 0, err 340 } 341 342 nHashed, err := sink.hasher.Write(src[:nWritten]) 343 344 if err != nil { 345 return 0, err 346 } else if nWritten != nHashed { 347 return 0, errors.New("failed to hash all the data that was written to the byte sink.") 348 } 349 350 sink.size += uint64(nWritten) 351 352 return nWritten, nil 353 } 354 355 // Flush writes all the data that was written to the ByteSink to the supplied writer 356 func (sink *HashingByteSink) Flush(wr io.Writer) error { 357 return sink.backingSink.Flush(wr) 358 } 359 360 // FlushToFile writes all the data that was written to the ByteSink to a file at the given path 361 func (sink *HashingByteSink) FlushToFile(path string) error { 362 return sink.backingSink.FlushToFile(path) 363 } 364 365 func (sink *HashingByteSink) Reader() (io.ReadCloser, error) { 366 return sink.backingSink.Reader() 367 } 368 369 // Execute the hasher.Sum() function and return the result 370 func (sink *HashingByteSink) GetSum() []byte { 371 return sink.hasher.Sum(nil) 372 } 373 374 // ResetHasher resets the hasher to allow for checksums at various points in the data stream. The expectation is that 375 // you would call GetSum prior to calling this function. 376 func (sink *HashingByteSink) ResetHasher() { 377 sink.hasher.Reset() 378 } 379 380 // Size gets the number of bytes written to the sink 381 func (sink *HashingByteSink) Size() uint64 { 382 return sink.size 383 }