github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/datas/pull/pull_table_file_writer.go

// Copyright 2024 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package pull

import (
	"context"
	"io"
	"sync"
	"sync/atomic"

	"golang.org/x/sync/errgroup"

	"github.com/dolthub/dolt/go/store/nbs"
)

// A PullTableFileWriter is used by the Puller to manage compressed table file
// writers for a pull or push process. It manages writing compressed chunks
// into table files and adding them to the sink database for a Pull.
//
// It can be configured with:
// * Target file size for uploaded table files.
// * Number of concurrent table file uploads.
// * Number of pending table files awaiting upload.
//
// For the last configuration point, the basic observation is that pushes are
// not currently resumable across `dolt push`/`call dolt_push` invocations. It
// is not necessarily in a user's best interest to buffer lots and lots of
// table files to the local disk while a user awaits the upload of the existing
// buffered table files to the remote database. In the worst case, it can cause
// 2x disk utilization on a pushing host, which is not what the user expects.
//
// Note that, as currently implemented, the limit on the number of pending
// table files applies only to table files which are not currently being
// uploaded. So the total number of table files possible is the number of
// concurrent uploads plus the number of pending table files.
//
// A PullTableFileWriter must be |Close()|d at the end of delivering all of its
// chunks, since it needs to finalize the last in-flight table file and finish
// uploading all remaining table files. The error from |Close()| must be
// checked, since it will include any failure to upload the files.
type PullTableFileWriter struct {
	cfg PullTableFileWriterConfig

	addChunkCh  chan nbs.CompressedChunk
	newWriterCh chan *nbs.CmpChunkTableWriter
	egCtx       context.Context
	eg          *errgroup.Group

	bufferedSendBytes uint64
	finishedSendBytes uint64
}

type PullTableFileWriterConfig struct {
	ConcurrentUploads int

	ChunksPerFile int

	MaximumBufferedFiles int

	TempDir string

	DestStore DestTableFileStore
}

type DestTableFileStore interface {
	WriteTableFile(ctx context.Context, id string, numChunks int, contentHash []byte, getRd func() (io.ReadCloser, uint64, error)) error
	AddTableFilesToManifest(ctx context.Context, fileIdToNumChunks map[string]int) error
}

type PullTableFileWriterStats struct {
	// Bytes which are queued up to be sent to the destination but have not
	// yet gone out on the wire.
	BufferedSendBytes uint64

	// Bytes which we have sent to the destination. These have been delivered
	// to the operating system to be sent to the destination database. This
	// number never goes down. In the case that we have to retry an upload,
	// for example, BufferedSendBytes will instead go up.
	FinishedSendBytes uint64
}

func NewPullTableFileWriter(ctx context.Context, cfg PullTableFileWriterConfig) *PullTableFileWriter {
	ret := &PullTableFileWriter{
		cfg:         cfg,
		addChunkCh:  make(chan nbs.CompressedChunk),
		newWriterCh: make(chan *nbs.CmpChunkTableWriter, cfg.MaximumBufferedFiles),
	}
	ret.eg, ret.egCtx = errgroup.WithContext(ctx)
	ret.eg.Go(ret.uploadAndFinalizeThread)
	ret.eg.Go(ret.addChunkThread)
	return ret
}
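
// The function below is an illustrative usage sketch, not part of the
// upstream API: it shows the intended lifecycle of a PullTableFileWriter
// (construct, add chunks, close), assuming the caller already has a
// DestTableFileStore and a slice of compressed chunks. The configuration
// values are placeholders, not recommended defaults.
func examplePushChunksSketch(ctx context.Context, dest DestTableFileStore, tempDir string, chunks []nbs.CompressedChunk) error {
	wr := NewPullTableFileWriter(ctx, PullTableFileWriterConfig{
		ConcurrentUploads:    2,
		ChunksPerFile:        64 * 1024,
		MaximumBufferedFiles: 8,
		TempDir:              tempDir,
		DestStore:            dest,
	})
	for _, chk := range chunks {
		// AddCompressedChunk may block while buffered table files drain.
		if err := wr.AddCompressedChunk(ctx, chk); err != nil {
			// Close still needs to run so in-flight work is torn down; the
			// AddCompressedChunk error takes precedence here.
			_ = wr.Close()
			return err
		}
	}
	// Close finalizes the last table file, waits for all uploads, and adds
	// the uploaded files to the destination manifest.
	return wr.Close()
}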

func (w *PullTableFileWriter) GetStats() PullTableFileWriterStats {
	return PullTableFileWriterStats{
		FinishedSendBytes: atomic.LoadUint64(&w.finishedSendBytes),
		BufferedSendBytes: atomic.LoadUint64(&w.bufferedSendBytes),
	}
}

// AddCompressedChunk adds the compressed chunk to the table files to be
// uploaded to the destination store.
//
// If there is a terminal error with uploading a table file, this method will
// start returning a non-nil |error|.
//
// This method may block for arbitrary amounts of time if there are already a
// lot of buffered table files and we are waiting for uploads to succeed before
// creating more table files.
func (w *PullTableFileWriter) AddCompressedChunk(ctx context.Context, chk nbs.CompressedChunk) error {
	select {
	case w.addChunkCh <- chk:
		return nil
	case <-ctx.Done():
		return context.Cause(ctx)
	case <-w.egCtx.Done():
		return w.eg.Wait()
	}
}

// This thread coordinates the threads which do the actual uploading. It spawns
// the appropriate number of upload threads and waits for them all to exit,
// which they typically do after newWriterCh is closed, but may also do if
// their context is canceled. This thread reads the response channel from the
// upload threads and accumulates the manifest updates.
//
// When all upload threads have exited, it finishes reading the response
// channel, and then, if all upload threads exited successfully, it applies any
// necessary manifest updates to the DestStore.
func (w *PullTableFileWriter) uploadAndFinalizeThread() (err error) {
	respCh := make(chan tempTblFile)

	uploadEg, uploadCtx := errgroup.WithContext(w.egCtx)
	for i := 0; i < w.cfg.ConcurrentUploads; i++ {
		uploadEg.Go(func() error {
			return w.uploadThread(uploadCtx, w.newWriterCh, respCh)
		})
	}

	// After all upload threads are done, the response channel is closed.
	go func() {
		uploadEg.Wait()
		close(respCh)
	}()

	// We don't need too much coordination here, since respCh is guaranteed
	// to always be closed after uploadEg is done and we are going to check
	// for errors later.
	manifestUpdates := make(map[string]int)
	var manifestWg sync.WaitGroup
	manifestWg.Add(1)
	go func() {
		defer manifestWg.Done()
		for ttf := range respCh {
			manifestUpdates[ttf.id] = ttf.numChunks
		}
	}()

	manifestWg.Wait()
	err = uploadEg.Wait()
	if err != nil {
		return err
	}

	if len(manifestUpdates) > 0 {
		return w.cfg.DestStore.AddTableFilesToManifest(w.egCtx, manifestUpdates)
	} else {
		return nil
	}
}

// This thread reads from addChunkCh and writes the chunks to table files.
// When a table file gets big enough, it stops reading from addChunkCh
// temporarily and hands the file off to the upload threads, before it goes
// back to reading from addChunkCh.
//
// Once addChunkCh closes, it sends along the last table file, if any, and then
// closes newWriterCh and exits itself.
func (w *PullTableFileWriter) addChunkThread() (err error) {
	var curWr *nbs.CmpChunkTableWriter

	defer func() {
		if curWr != nil {
			// Clean up the dangling writer, whose contents will never be used.
			curWr.Finish()
			rd, _ := curWr.Reader()
			if rd != nil {
				rd.Close()
			}
		}
	}()

	sendTableFile := func() error {
		select {
		case <-w.egCtx.Done():
			return context.Cause(w.egCtx)
		case w.newWriterCh <- curWr:
			curWr = nil
			return nil
		}
	}

LOOP:
	for {
		if curWr != nil && curWr.ChunkCount() >= w.cfg.ChunksPerFile {
			if err := sendTableFile(); err != nil {
				return err
			}
			continue
		}

		select {
		case <-w.egCtx.Done():
			return context.Cause(w.egCtx)
		case newChnk, ok := <-w.addChunkCh:
			if !ok {
				break LOOP
			}

			if curWr == nil {
				curWr, err = nbs.NewCmpChunkTableWriter(w.cfg.TempDir)
				if err != nil {
					return err
				}
			}

			// Add the chunk to the writer.
			err = curWr.AddCmpChunk(newChnk)
			if err != nil {
				return err
			}
			atomic.AddUint64(&w.bufferedSendBytes, uint64(len(newChnk.FullCompressedChunk)))
		}
	}

	// Send the last writer, if there is one.
	if curWr != nil {
		if err := sendTableFile(); err != nil {
			return err
		}
	}

	close(w.newWriterCh)

	return nil
}

// Close finalizes any in-flight table file writes and adds all the uploaded
// table files to the destination database.
//
// Returns any errors encountered on uploading or adding the table files to the
// destination database.
func (w *PullTableFileWriter) Close() error {
	close(w.addChunkCh)
	return w.eg.Wait()
}

func (w *PullTableFileWriter) uploadThread(ctx context.Context, reqCh chan *nbs.CmpChunkTableWriter, respCh chan tempTblFile) error {
	for {
		select {
		case wr, ok := <-reqCh:
			if !ok {
				return nil
			}
			// Capture the content length before we finish the write, which
			// will add the index and the table file footer.
			chunksLen := wr.ContentLength()

			id, err := wr.Finish()
			if err != nil {
				return err
			}

			ttf := tempTblFile{
				id:          id,
				read:        wr,
				numChunks:   wr.ChunkCount(),
				chunksLen:   chunksLen,
				contentLen:  wr.ContentLength(),
				contentHash: wr.GetMD5(),
			}
			err = w.uploadTempTableFile(ctx, ttf)

			// Always remove the file...
			wr.Remove()

			if err != nil {
				return err
			}

			select {
			case respCh <- ttf:
			case <-ctx.Done():
				return context.Cause(ctx)
			}
		case <-ctx.Done():
			return context.Cause(ctx)
		}
	}
}

func (w *PullTableFileWriter) uploadTempTableFile(ctx context.Context, tmpTblFile tempTblFile) error {
	fileSize := tmpTblFile.contentLen

	// So far, we've added all the bytes for the compressed chunk data.
	// We add the remaining bytes here --- bytes for the index and the
	// table file footer.
	atomic.AddUint64(&w.bufferedSendBytes, uint64(fileSize)-tmpTblFile.chunksLen)

	// Tracks the number of bytes we have uploaded through the ReadCloser
	// handed out by the getRd callback of a WriteTableFile call. If the
	// upload gets retried by WriteTableFile, then the callback gets called
	// more than once, and we account for the already-uploaded bytes as
	// having been rebuffered.
	var uploaded uint64

	return w.cfg.DestStore.WriteTableFile(ctx, tmpTblFile.id, tmpTblFile.numChunks, tmpTblFile.contentHash, func() (io.ReadCloser, uint64, error) {
		rc, err := tmpTblFile.read.Reader()
		if err != nil {
			return nil, 0, err
		}

		if uploaded != 0 {
			// A retry. We treat it as if what was already uploaded was rebuffered.
			atomic.AddUint64(&w.bufferedSendBytes, uint64(uploaded))
			uploaded = 0
		}

		fWithStats := countingReader{countingReader{rc, &uploaded}, &w.finishedSendBytes}

		return fWithStats, uint64(fileSize), nil
	})
}
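
// discardTableFileStore is an illustrative, hypothetical DestTableFileStore
// (not part of the upstream code): it drains each table file into io.Discard
// and records manifest updates in memory. It sketches the contract the writer
// relies on: WriteTableFile pulls bytes from the reader returned by getRd, and
// AddTableFilesToManifest receives the fileId -> numChunks map accumulated by
// uploadAndFinalizeThread. This stub performs no retries, so it calls getRd
// exactly once; a retrying implementation would call it again to obtain a
// fresh reader.
type discardTableFileStore struct {
	mu       sync.Mutex
	manifest map[string]int
}

func (s *discardTableFileStore) WriteTableFile(ctx context.Context, id string, numChunks int, contentHash []byte, getRd func() (io.ReadCloser, uint64, error)) error {
	rd, _, err := getRd()
	if err != nil {
		return err
	}
	defer rd.Close()
	// Consume the table file contents; a real store would upload them.
	_, err = io.Copy(io.Discard, rd)
	return err
}

func (s *discardTableFileStore) AddTableFilesToManifest(ctx context.Context, fileIdToNumChunks map[string]int) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.manifest == nil {
		s.manifest = make(map[string]int)
	}
	for id, n := range fileIdToNumChunks {
		s.manifest[id] = n
	}
	return nil
}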