github.com/hasnat/dolt/go@v0.0.0-20210628190320-9eb5d843fbb7/store/datas/puller.go

// Copyright 2019 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package datas

import (
	"context"
	"errors"
	"fmt"
	"os"
	"path/filepath"
	"sync"

	"github.com/dolthub/dolt/go/store/atomicerr"
	"github.com/dolthub/dolt/go/store/chunks"
	"github.com/dolthub/dolt/go/store/hash"
	"github.com/dolthub/dolt/go/store/nbs"
	"github.com/dolthub/dolt/go/store/types"
)

type FileReaderWithSize struct {
	*os.File
	size int64
}

func (rd FileReaderWithSize) Size() int64 {
	return rd.size
}

// ErrDBUpToDate is the error code returned from NewPuller in the event that there is no work to do.
var ErrDBUpToDate = errors.New("the database does not need to be pulled as it's already up to date")

// ErrIncompatibleSourceChunkStore is the error code returned from NewPuller in
// the event that the source ChunkStore does not implement `NBSCompressedChunkStore`.
var ErrIncompatibleSourceChunkStore = errors.New("the chunk store of the source database does not implement NBSCompressedChunkStore.")

const (
	maxChunkWorkers = 2
)

// FilledWriters holds a CmpChunkTableWriter that has been filled and is ready to be flushed. In the future we will
// likely add the md5 of the data to this structure to be used to verify table upload calls.
type FilledWriters struct {
	wr *nbs.CmpChunkTableWriter
}

// CmpChnkAndRefs holds a CompressedChunk and all of its references
type CmpChnkAndRefs struct {
	cmpChnk nbs.CompressedChunk
	refs    map[hash.Hash]int
}

type NBSCompressedChunkStore interface {
	chunks.ChunkStore
	GetManyCompressed(context.Context, hash.HashSet, func(nbs.CompressedChunk)) error
}

// Puller is used to sync data between two Databases
type Puller struct {
	fmt *types.NomsBinFormat

	srcDB         Database
	srcChunkStore NBSCompressedChunkStore
	sinkDB        Database
	rootChunkHash hash.Hash
	downloaded    hash.HashSet

	wr          *nbs.CmpChunkTableWriter
	tempDir     string
	chunksPerTF int

	eventCh chan PullerEvent
}

type PullerEventType int

const (
	NewLevelTWEvent PullerEventType = iota
	DestDBHasTWEvent
	LevelUpdateTWEvent
	LevelDoneTWEvent
	StartUploadTableFile
	EndUpdateTableFile
)

type TreeWalkEventDetails struct {
	TreeLevel           int
	ChunksInLevel       int
	ChunksAlreadyHad    int
	ChunksBuffered      int
	ChildrenFound       int
	TableFilesGenerated int
}

type TableFileEventDetails struct {
	TableFileCount     int
	TableFilesUploaded int
	CurrentFileSize    int64
}

type PullerEvent struct {
	EventType      PullerEventType
	TWEventDetails TreeWalkEventDetails
	TFEventDetails TableFileEventDetails
}

func NewTWPullerEvent(et PullerEventType, details *TreeWalkEventDetails) PullerEvent {
	return PullerEvent{EventType: et, TWEventDetails: *details}
}

func NewTFPullerEvent(et PullerEventType, details *TableFileEventDetails) PullerEvent {
	return PullerEvent{EventType: et, TFEventDetails: *details}
}

// NewPuller creates a new Puller instance to do the syncing. If the sinkDB already has the requested root chunk
// there is nothing to pull, and ErrDBUpToDate is returned.
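//
// A minimal usage sketch; ctx, tempDir, srcDB, sinkDB, and rootHash are assumed to be in scope, and the
// chunks-per-table-file count is an arbitrary choice for illustration. Events are drained in a separate goroutine
// so Pull never blocks sending progress updates:
//
//	eventCh := make(chan PullerEvent, 128)
//	go func() {
//		for range eventCh {
//			// report progress
//		}
//	}()
//	plr, err := NewPuller(ctx, tempDir, 256*1024, srcDB, sinkDB, rootHash, eventCh)
//	if err == ErrDBUpToDate {
//		return nil // nothing to do
//	} else if err != nil {
//		return err
//	}
//	err = plr.Pull(ctx)
//	close(eventCh)
//	return err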
func NewPuller(ctx context.Context, tempDir string, chunksPerTF int, srcDB, sinkDB Database, rootChunkHash hash.Hash, eventCh chan PullerEvent) (*Puller, error) {
	if eventCh == nil {
		panic("eventCh is required")
	}

	// Sanity Check
	exists, err := srcDB.chunkStore().Has(ctx, rootChunkHash)

	if err != nil {
		return nil, err
	}

	if !exists {
		return nil, errors.New("not found")
	}

	exists, err = sinkDB.chunkStore().Has(ctx, rootChunkHash)

	if err != nil {
		return nil, err
	}

	if exists {
		return nil, ErrDBUpToDate
	}

	if srcDB.chunkStore().Version() != sinkDB.chunkStore().Version() {
		return nil, fmt.Errorf("cannot pull from src to sink; src version is %v and sink version is %v", srcDB.chunkStore().Version(), sinkDB.chunkStore().Version())
	}

	srcChunkStore, ok := srcDB.chunkStore().(NBSCompressedChunkStore)
	if !ok {
		return nil, ErrIncompatibleSourceChunkStore
	}

	wr, err := nbs.NewCmpChunkTableWriter(tempDir)

	if err != nil {
		return nil, err
	}

	return &Puller{
		fmt:           srcDB.Format(),
		srcDB:         srcDB,
		srcChunkStore: srcChunkStore,
		sinkDB:        sinkDB,
		rootChunkHash: rootChunkHash,
		downloaded:    hash.HashSet{},
		tempDir:       tempDir,
		wr:            wr,
		chunksPerTF:   chunksPerTF,
		eventCh:       eventCh,
	}, nil
}

func (p *Puller) processCompletedTables(ctx context.Context, ae *atomicerr.AtomicError, completedTables <-chan FilledWriters) {
	type tempTblFile struct {
		id          string
		path        string
		numChunks   int
		contentLen  uint64
		contentHash []byte
	}

	var tblFiles []tempTblFile

	var err error
	for tblFile := range completedTables {
		if err != nil {
			continue // drain
		}

		var id string
		id, err = tblFile.wr.Finish()

		if ae.SetIfError(err) {
			continue
		}

		path := filepath.Join(p.tempDir, id)
		err = tblFile.wr.FlushToFile(path)

		if ae.SetIfError(err) {
			continue
		}

		tblFiles = append(tblFiles, tempTblFile{
			id:          id,
			path:        path,
			numChunks:   tblFile.wr.Size(),
			contentLen:  tblFile.wr.ContentLength(),
			contentHash: tblFile.wr.GetMD5(),
		})
	}

	if ae.IsSet() {
		return
	}

	details := &TableFileEventDetails{TableFileCount: len(tblFiles)}

	// Write tables in reverse order so that on a partial success, it will still be true that if a db has a chunk, it
	// also has all of that chunk's references.
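	// The tree walk in Pull proceeds from the root downward, so earlier entries in tblFiles hold chunks closer to
	// the root and later entries hold their children. Uploading the later files first means any chunk the sink ends
	// up with already has the chunks it references, even if the remaining uploads never complete.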
	for i := len(tblFiles) - 1; i >= 0; i-- {
		tmpTblFile := tblFiles[i]

		fi, err := os.Stat(tmpTblFile.path)

		if ae.SetIfError(err) {
			return
		}

		f, err := os.Open(tmpTblFile.path)

		if ae.SetIfError(err) {
			return
		}

		details.CurrentFileSize = fi.Size()
		p.eventCh <- NewTFPullerEvent(StartUploadTableFile, details)

		fWithSize := FileReaderWithSize{f, fi.Size()}
		err = p.sinkDB.chunkStore().(nbs.TableFileStore).WriteTableFile(ctx, tmpTblFile.id, tmpTblFile.numChunks, fWithSize, tmpTblFile.contentLen, tmpTblFile.contentHash)

		go func() {
			_ = os.Remove(tmpTblFile.path)
		}()

		if ae.SetIfError(err) {
			return
		}

		details.TableFilesUploaded++
		p.eventCh <- NewTFPullerEvent(EndUpdateTableFile, details)
	}
}

// Pull executes the sync operation
func (p *Puller) Pull(ctx context.Context) error {
	twDetails := &TreeWalkEventDetails{TreeLevel: -1}

	leaves := make(hash.HashSet)
	absent := make(hash.HashSet)
	absent.Insert(p.rootChunkHash)

	ae := atomicerr.New()
	wg := &sync.WaitGroup{}
	completedTables := make(chan FilledWriters, 8)

	wg.Add(1)
	go func() {
		defer wg.Done()
		p.processCompletedTables(ctx, ae, completedTables)
	}()

	for len(absent) > 0 {
		limitToNewChunks(absent, p.downloaded)

		chunksInLevel := len(absent)
		twDetails.ChunksInLevel = chunksInLevel
		p.eventCh <- NewTWPullerEvent(NewLevelTWEvent, twDetails)

		var err error
		absent, err = p.sinkDB.chunkStore().HasMany(ctx, absent)

		if ae.SetIfError(err) {
			break
		}

		twDetails.ChunksAlreadyHad = chunksInLevel - len(absent)
		p.eventCh <- NewTWPullerEvent(DestDBHasTWEvent, twDetails)

		if len(absent) > 0 {
			leaves, absent, err = p.getCmp(ctx, twDetails, leaves, absent, completedTables)

			if ae.SetIfError(err) {
				break
			}
		}
	}
	if !ae.IsSet() && p.wr.Size() > 0 {
		// p.wr may be nil in the error case
		completedTables <- FilledWriters{p.wr}
	}

	close(completedTables)

	wg.Wait()
	return ae.Get()
}

func limitToNewChunks(absent hash.HashSet, downloaded hash.HashSet) {
	smaller := absent
	longer := downloaded
	if len(absent) > len(downloaded) {
		smaller = downloaded
		longer = absent
	}

	for k := range smaller {
		if longer.Has(k) {
			absent.Remove(k)
		}
	}
}

func (p *Puller) getCmp(ctx context.Context, twDetails *TreeWalkEventDetails, leaves, batch hash.HashSet, completedTables chan FilledWriters) (hash.HashSet, hash.HashSet, error) {
	found := make(chan nbs.CompressedChunk, 4096)
	processed := make(chan CmpChnkAndRefs, 4096)

	ae := atomicerr.New()
	go func() {
		defer close(found)
		err := p.srcChunkStore.GetManyCompressed(ctx, batch, func(c nbs.CompressedChunk) { found <- c })
		ae.SetIfError(err)
	}()

	batchSize := len(batch)
	numChunkWorkers := (batchSize / 1024) + 1
	if numChunkWorkers > maxChunkWorkers {
		numChunkWorkers = maxChunkWorkers
	}

	go func() {
		defer close(processed)
		for cmpChnk := range found {
			if ae.IsSet() {
				break
			}

			p.downloaded.Insert(cmpChnk.H)

			if leaves.Has(cmpChnk.H) {
				processed <- CmpChnkAndRefs{cmpChnk: cmpChnk}
			} else {
				chnk, err := cmpChnk.ToChunk()

				if ae.SetIfError(err) {
					return
				}

				refs := make(map[hash.Hash]int)
				if err := types.WalkRefs(chnk, p.fmt, func(r types.Ref) error {
					refs[r.TargetHash()] = int(r.Height())
					return nil
				}); ae.SetIfError(err) {
					return
				}

				processed <- CmpChnkAndRefs{cmpChnk: cmpChnk, refs: refs}
			}
		}
	}()

	var err error
	var maxHeight int
	nextLeaves := make(hash.HashSet, batchSize)
	nextLevel := make(hash.HashSet, batchSize)

	twDetails.ChunksBuffered = 0
	for cmpAndRef := range processed {
		if err != nil {
			// drain to prevent deadlock
			continue
		}

		twDetails.ChunksBuffered++

		if twDetails.ChunksBuffered%1000 == 0 {
			p.eventCh <- NewTWPullerEvent(LevelUpdateTWEvent, twDetails)
		}

		err = p.wr.AddCmpChunk(cmpAndRef.cmpChnk)

		if ae.SetIfError(err) {
			continue
		}

		if p.wr.Size() >= p.chunksPerTF {
			completedTables <- FilledWriters{p.wr}
			p.wr, err = nbs.NewCmpChunkTableWriter(p.tempDir)

			if ae.SetIfError(err) {
				continue
			}
		}

		for h, height := range cmpAndRef.refs {
			nextLevel.Insert(h)
			twDetails.ChildrenFound++

			if height == 1 {
				nextLeaves.Insert(h)
			}

			if height > maxHeight {
				maxHeight = height
			}
		}
	}

	if err := ae.Get(); err != nil {
		return nil, nil, err
	}

	if twDetails.ChunksBuffered != len(batch) {
		return nil, nil, errors.New("failed to get all chunks.")
	}

	p.eventCh <- NewTWPullerEvent(LevelDoneTWEvent, twDetails)

	twDetails.TreeLevel = maxHeight
	return nextLeaves, nextLevel, nil
}
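
// drainPullerEvents is a minimal illustrative sketch, not part of the upstream API: it shows one way a caller might
// consume PullerEvents emitted during Pull so that the Puller's sends on eventCh never block. The function name and
// the logging choices are assumptions made for the example. A caller would typically close eventCh after Pull
// returns and then wait on the returned channel before exiting.
func drainPullerEvents(eventCh <-chan PullerEvent) <-chan struct{} {
	done := make(chan struct{})
	go func() {
		defer close(done)
		for evt := range eventCh {
			switch evt.EventType {
			case NewLevelTWEvent:
				fmt.Printf("new tree level: %d chunks to fetch\n", evt.TWEventDetails.ChunksInLevel)
			case LevelDoneTWEvent:
				fmt.Printf("level done: %d chunks buffered, %d children found\n",
					evt.TWEventDetails.ChunksBuffered, evt.TWEventDetails.ChildrenFound)
			case StartUploadTableFile:
				fmt.Printf("uploading table file %d of %d (%d bytes)\n",
					evt.TFEventDetails.TableFilesUploaded+1, evt.TFEventDetails.TableFileCount, evt.TFEventDetails.CurrentFileSize)
			}
		}
	}()
	return done
}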