github.com/hasnat/dolt/go@v0.0.0-20210628190320-9eb5d843fbb7/store/datas/pull.go

// Copyright 2019 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// This file incorporates work covered by the following copyright and
// permission notice:
//
// Copyright 2016 Attic Labs, Inc. All rights reserved.
// Licensed under the Apache License, version 2.0:
// http://www.apache.org/licenses/LICENSE-2.0

package datas

import (
	"context"
	"errors"
	"fmt"
	"io"
	"math"
	"math/rand"
	"sync"

	"github.com/cenkalti/backoff"
	"github.com/golang/snappy"
	"golang.org/x/sync/errgroup"
	"golang.org/x/sync/semaphore"

	"github.com/dolthub/dolt/go/store/chunks"
	"github.com/dolthub/dolt/go/store/hash"
	"github.com/dolthub/dolt/go/store/nbs"
	"github.com/dolthub/dolt/go/store/types"
)

type PullProgress struct {
	DoneCount, KnownCount, ApproxWrittenBytes uint64
}

const (
	bytesWrittenSampleRate = .10
	defaultBatchSize       = 1 << 12 // 4096 chunks
)

var ErrNoData = errors.New("no data")

func makeProgTrack(progressCh chan PullProgress) func(moreDone, moreKnown, moreApproxBytesWritten uint64) {
	var doneCount, knownCount, approxBytesWritten uint64
	return func(moreDone, moreKnown, moreApproxBytesWritten uint64) {
		if progressCh == nil {
			return
		}
		doneCount, knownCount, approxBytesWritten = doneCount+moreDone, knownCount+moreKnown, approxBytesWritten+moreApproxBytesWritten
		progressCh <- PullProgress{doneCount, knownCount, approxBytesWritten}
	}
}

func Clone(ctx context.Context, srcDB, sinkDB Database, eventCh chan<- TableFileEvent) error {

	srcCS := srcDB.chunkStore().(interface{})
	sinkCS := sinkDB.chunkStore().(interface{})

	srcTS, srcOK := srcCS.(nbs.TableFileStore)

	if !srcOK {
		return errors.New("src db is not a Table File Store")
	}

	size, err := srcTS.Size(ctx)

	if err != nil {
		return err
	}

	if size == 0 {
		return ErrNoData
	}

	sinkTS, sinkOK := sinkCS.(nbs.TableFileStore)

	if !sinkOK {
		return errors.New("sink db is not a Table File Store")
	}

	return clone(ctx, srcTS, sinkTS, eventCh)
}

type CloneTableFileEvent int

const (
	Listed = iota
	DownloadStart
	DownloadSuccess
	DownloadFailed
)

type TableFileEvent struct {
	EventType  CloneTableFileEvent
	TableFiles []nbs.TableFile
}

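// Clone streams whole table files from a source to a sink and reports
// TableFileEvents as it goes. A minimal usage sketch, assuming the caller
// already has srcDB and sinkDB Database values and a ctx (the variable names
// here are hypothetical, not part of this package):
//
//	eventCh := make(chan TableFileEvent)
//	go func() {
//		for evt := range eventCh {
//			// e.g. update a progress display per evt.EventType
//			_ = evt
//		}
//	}()
//	err := Clone(ctx, srcDB, sinkDB, eventCh)
//	close(eventCh)
//	if errors.Is(err, ErrNoData) {
//		// the source has no chunks to clone
//	}
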
// mapTableFiles returns the list of all fileIDs for the table files, and a map from fileID to nbs.TableFile.
func mapTableFiles(tblFiles []nbs.TableFile) ([]string, map[string]nbs.TableFile) {
	fileIds := make([]string, len(tblFiles))
	fileIDtoTblFile := make(map[string]nbs.TableFile)

	for i, tblFile := range tblFiles {
		fileIDtoTblFile[tblFile.FileID()] = tblFile
		fileIds[i] = tblFile.FileID()
	}

	return fileIds, fileIDtoTblFile
}

func CloseWithErr(c io.Closer, err *error) {
	e := c.Close()

	if *err == nil && e != nil {
		*err = e
	}
}

const concurrentTableFileDownloads = 3

func clone(ctx context.Context, srcTS, sinkTS nbs.TableFileStore, eventCh chan<- TableFileEvent) error {
	root, sourceFiles, appendixFiles, err := srcTS.Sources(ctx)
	if err != nil {
		return err
	}

	tblFiles := filterAppendicesFromSourceFiles(appendixFiles, sourceFiles)
	report := func(e TableFileEvent) {
		if eventCh != nil {
			eventCh <- e
		}
	}

	// Initializes the list of fileIDs we are going to download, and the map of fileIDToTF. If this clone takes a long
	// time, some of the URLs within the nbs.TableFiles will expire and fail to download. At that point we retrieve
	// the sources again and update the fileIDToTF map with the refreshed info, but we do not change the set of files
	// we are downloading.
	desiredFiles, fileIDToTF := mapTableFiles(tblFiles)
	completed := make([]bool, len(desiredFiles))

	report(TableFileEvent{Listed, tblFiles})

	download := func(ctx context.Context) error {
		sem := semaphore.NewWeighted(concurrentTableFileDownloads)
		eg, ctx := errgroup.WithContext(ctx)
		for i := 0; i < len(desiredFiles); i++ {
			if completed[i] {
				continue
			}
			if err := sem.Acquire(ctx, 1); err != nil {
				// The errgroup ctx has been canceled. We will
				// return the error from eg.Wait() below.
				break
			}
			idx := i
			eg.Go(func() (err error) {
				defer sem.Release(1)

				fileID := desiredFiles[idx]
				tblFile, ok := fileIDToTF[fileID]
				if !ok {
					// conjoin happened during clone
					return backoff.Permanent(errors.New("table file not found. please try again"))
				}

				var rd io.ReadCloser
				if rd, err = tblFile.Open(ctx); err != nil {
					return err
				}
				defer CloseWithErr(rd, &err)

				report(TableFileEvent{DownloadStart, []nbs.TableFile{tblFile}})
				err = sinkTS.WriteTableFile(ctx, tblFile.FileID(), tblFile.NumChunks(), rd, 0, nil)
				if err != nil {
					report(TableFileEvent{DownloadFailed, []nbs.TableFile{tblFile}})
					return err
				}

				report(TableFileEvent{DownloadSuccess, []nbs.TableFile{tblFile}})
				completed[idx] = true
				return nil
			})
		}

		return eg.Wait()
	}

	const maxAttempts = 3
	previousCompletedCnt := 0
	failureCount := 0

	madeProgress := func() bool {
		currentCompletedCnt := 0
		for _, b := range completed {
			if b {
				currentCompletedCnt++
			}
		}
		if currentCompletedCnt == previousCompletedCnt {
			return false
		} else {
			previousCompletedCnt = currentCompletedCnt
			return true
		}
	}

	// Keep going as long as progress is being made. If no progress is made, retry up to maxAttempts times.
	for {
		err = download(ctx)
		if err == nil {
			break
		}
		if permanent, ok := err.(*backoff.PermanentError); ok {
			return permanent.Err
		} else if madeProgress() {
			failureCount = 0
		} else {
			failureCount++
		}
		if failureCount >= maxAttempts {
			return err
		}
		if _, sourceFiles, appendixFiles, err = srcTS.Sources(ctx); err != nil {
			return err
		} else {
			tblFiles = filterAppendicesFromSourceFiles(appendixFiles, sourceFiles)
			_, fileIDToTF = mapTableFiles(tblFiles)
		}
	}

	return sinkTS.SetRootChunk(ctx, root, hash.Hash{})
}

func filterAppendicesFromSourceFiles(appendixFiles []nbs.TableFile, sourceFiles []nbs.TableFile) []nbs.TableFile {
	if len(appendixFiles) == 0 {
		return sourceFiles
	}
	tblFiles := make([]nbs.TableFile, 0)
	_, appendixMap := mapTableFiles(appendixFiles)
	for _, sf := range sourceFiles {
		if _, ok := appendixMap[sf.FileID()]; !ok {
			tblFiles = append(tblFiles, sf)
		}
	}
	return tblFiles
}

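// The download closure in clone bounds concurrency by pairing a weighted
// semaphore with an errgroup: each worker holds one semaphore unit, and a
// canceled errgroup context makes further Acquire calls fail so the dispatch
// loop stops early. A minimal sketch of the same pattern in isolation
// (items and processItem are hypothetical, not part of this package):
//
//	sem := semaphore.NewWeighted(3)
//	eg, ctx := errgroup.WithContext(ctx)
//	for _, item := range items {
//		if err := sem.Acquire(ctx, 1); err != nil {
//			break // ctx canceled; eg.Wait() reports the first worker error
//		}
//		item := item
//		eg.Go(func() error {
//			defer sem.Release(1)
//			return processItem(ctx, item)
//		})
//	}
//	err := eg.Wait()
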
// Pull objects that descend from sourceRef from srcDB to sinkDB.
func Pull(ctx context.Context, srcDB, sinkDB Database, sourceRef types.Ref, progressCh chan PullProgress) error {
	return pull(ctx, srcDB, sinkDB, sourceRef, progressCh, defaultBatchSize)
}

func pull(ctx context.Context, srcDB, sinkDB Database, sourceRef types.Ref, progressCh chan PullProgress, batchSize int) error {
	// Sanity Check
	exists, err := srcDB.chunkStore().Has(ctx, sourceRef.TargetHash())

	if err != nil {
		return err
	}

	if !exists {
		return errors.New("not found")
	}

	exists, err = sinkDB.chunkStore().Has(ctx, sourceRef.TargetHash())

	if err != nil {
		return err
	}

	if exists {
		return nil // already up to date
	}

	if srcDB.chunkStore().Version() != sinkDB.chunkStore().Version() {
		return fmt.Errorf("cannot pull from src to sink; src version is %v and sink version is %v", srcDB.chunkStore().Version(), sinkDB.chunkStore().Version())
	}

	var sampleSize, sampleCount uint64
	updateProgress := makeProgTrack(progressCh)

	// TODO: This batches based on limiting the _number_ of chunks processed at the same time. We really want to
	// batch based on the _amount_ of chunk data being processed simultaneously. We also want to consider the
	// chunks in a particular order, however, and the current GetMany() interface doesn't provide any ordering
	// guarantees. Once BUG 3750 is fixed, we should be able to revisit this and do a better job.
	absent := hash.HashSlice{sourceRef.TargetHash()}
	for absentCount := len(absent); absentCount != 0; absentCount = len(absent) {
		updateProgress(0, uint64(absentCount), 0)

		// For gathering up the hashes in the next level of the tree
		nextLevel := hash.HashSet{}
		uniqueOrdered := hash.HashSlice{}

		// Process all absent chunks in this level of the tree in quanta of at most |batchSize|
		for start, end := 0, batchSize; start < absentCount; start, end = end, end+batchSize {
			if end > absentCount {
				end = absentCount
			}
			batch := absent[start:end]

			neededChunks, err := getChunks(ctx, srcDB, batch, sampleSize, sampleCount, updateProgress)

			if err != nil {
				return err
			}

			uniqueOrdered, err = putChunks(ctx, sinkDB, batch, neededChunks, nextLevel, uniqueOrdered)

			if err != nil {
				return err
			}
		}

		absent, err = nextLevelMissingChunks(ctx, sinkDB, nextLevel, absent, uniqueOrdered)

		if err != nil {
			return err
		}
	}

	err = persistChunks(ctx, sinkDB.chunkStore())

	if err != nil {
		return err
	}

	return nil
}

func persistChunks(ctx context.Context, cs chunks.ChunkStore) error {
	// TODO: there is no call to rebase on an unsuccessful Commit(). Will this loop forever?
	var success bool
	for !success {
		r, err := cs.Root(ctx)

		if err != nil {
			return err
		}

		success, err = cs.Commit(ctx, r, r)

		if err != nil {
			return err
		}
	}

	return nil
}

// PullWithoutBatching effectively removes the batching of chunk retrieval done on each level of the tree. This
// means all chunks from one level of the tree are retrieved from the underlying chunk store in a single call,
// which pushes the optimization problem down to the chunk store, where smarter decisions can be made.
func PullWithoutBatching(ctx context.Context, srcDB, sinkDB Database, sourceRef types.Ref, progressCh chan PullProgress) error {
	// By increasing the batch size to MaxInt32 we effectively remove batching here.
	return pull(ctx, srcDB, sinkDB, sourceRef, progressCh, math.MaxInt32)
}

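// Pull and PullWithoutBatching report progress by sending PullProgress values
// on progressCh; unless that channel is buffered, the caller needs to drain it
// concurrently or the pull will block. A minimal usage sketch, assuming
// srcDB, sinkDB, ctx, and a types.Ref named headRef already exist (names here
// are hypothetical, not part of this package):
//
//	progressCh := make(chan PullProgress)
//	done := make(chan struct{})
//	go func() {
//		defer close(done)
//		for p := range progressCh {
//			fmt.Printf("pulled %d of ~%d chunks\n", p.DoneCount, p.KnownCount)
//		}
//	}()
//	err := Pull(ctx, srcDB, sinkDB, headRef, progressCh)
//	close(progressCh)
//	<-done
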
// getChunks concurrently pulls, from the source, all chunks in this batch that the sink is missing.
func getChunks(ctx context.Context, srcDB Database, batch hash.HashSlice, sampleSize uint64, sampleCount uint64, updateProgress func(moreDone uint64, moreKnown uint64, moreApproxBytesWritten uint64)) (map[hash.Hash]*chunks.Chunk, error) {
	mu := &sync.Mutex{}
	neededChunks := map[hash.Hash]*chunks.Chunk{}
	err := srcDB.chunkStore().GetMany(ctx, batch.HashSet(), func(c *chunks.Chunk) {
		mu.Lock()
		defer mu.Unlock()
		neededChunks[c.Hash()] = c

		// Randomly sample amount of data written
		if rand.Float64() < bytesWrittenSampleRate {
			sampleSize += uint64(len(snappy.Encode(nil, c.Data())))
			sampleCount++
		}
		updateProgress(1, 0, sampleSize/uint64(math.Max(1, float64(sampleCount))))
	})
	if err != nil {
		return nil, err
	}
	return neededChunks, nil
}

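// getChunks estimates write volume without compressing every chunk: roughly
// bytesWrittenSampleRate of the chunks are snappy-encoded, and the running
// mean compressed size (sampleSize/sampleCount) is added to the progress
// total once per chunk. A small worked example (the byte counts are made up
// for illustration):
//
//	// After sampling 3 chunks whose compressed sizes were 900, 1100, and
//	// 1000 bytes: sampleSize = 3000, sampleCount = 3, so each processed
//	// chunk adds 3000/3 = 1000 approximate bytes to
//	// PullProgress.ApproxWrittenBytes.
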
// putChunks puts the downloaded chunks into the sink IN ORDER. At the same time it gathers an ordered, de-duplicated
// list of all the chunks' children and adds them to the set of next-level tree chunks.
func putChunks(ctx context.Context, sinkDB Database, hashes hash.HashSlice, neededChunks map[hash.Hash]*chunks.Chunk, nextLevel hash.HashSet, uniqueOrdered hash.HashSlice) (hash.HashSlice, error) {
	for _, h := range hashes {
		c := neededChunks[h]
		err := sinkDB.chunkStore().Put(ctx, *c)

		if err != nil {
			return hash.HashSlice{}, err
		}

		err = types.WalkRefs(*c, sinkDB.Format(), func(r types.Ref) error {
			if !nextLevel.Has(r.TargetHash()) {
				uniqueOrdered = append(uniqueOrdered, r.TargetHash())
				nextLevel.Insert(r.TargetHash())
			}

			return nil
		})

		if err != nil {
			return hash.HashSlice{}, err
		}
	}

	return uniqueOrdered, nil
}

// nextLevelMissingChunks asks sinkDB which of the next level's hashes it doesn't have, and adds those to the absent
// list of chunks that still need to be retrieved.
func nextLevelMissingChunks(ctx context.Context, sinkDB Database, nextLevel hash.HashSet, absent hash.HashSlice, uniqueOrdered hash.HashSlice) (hash.HashSlice, error) {
	missingFromSink, err := sinkDB.chunkStore().HasMany(ctx, nextLevel)

	if err != nil {
		return hash.HashSlice{}, err
	}

	absent = absent[:0]
	for _, h := range uniqueOrdered {
		if missingFromSink.Has(h) {
			absent = append(absent, h)
		}
	}

	return absent, nil
}
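
// A small worked example of how putChunks and nextLevelMissingChunks interact
// (the hashes a, b, and c are made up for illustration): if two parent chunks
// in a batch both reference child c, putChunks inserts c into nextLevel once
// and appends it to uniqueOrdered once; if HasMany then reports that the sink
// already holds c but lacks a and b, nextLevelMissingChunks returns {a, b} in
// first-seen order, and c is never fetched from the source again.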