github.com/jbendotnet/noms@v0.0.0-20190904222105-c43e4293ea92/go/datas/pull.go (about) 1 // Copyright 2016 Attic Labs, Inc. All rights reserved. 2 // Licensed under the Apache License, version 2.0: 3 // http://www.apache.org/licenses/LICENSE-2.0 4 5 package datas 6 7 import ( 8 "math" 9 "math/rand" 10 11 "github.com/attic-labs/noms/go/chunks" 12 "github.com/attic-labs/noms/go/d" 13 "github.com/attic-labs/noms/go/hash" 14 "github.com/attic-labs/noms/go/types" 15 "github.com/golang/snappy" 16 ) 17 18 type PullProgress struct { 19 DoneCount, KnownCount, ApproxWrittenBytes uint64 20 } 21 22 const ( 23 bytesWrittenSampleRate = .10 24 batchSize = 1 << 12 // 4096 chunks 25 ) 26 27 // Pull objects that descend from sourceRef from srcDB to sinkDB. 28 func Pull(srcDB, sinkDB Database, sourceRef types.Ref, progressCh chan PullProgress) { 29 // Sanity Check 30 d.PanicIfFalse(srcDB.chunkStore().Has(sourceRef.TargetHash())) 31 32 if sinkDB.chunkStore().Has(sourceRef.TargetHash()) { 33 return // already up to date 34 } 35 36 var doneCount, knownCount, approxBytesWritten uint64 37 updateProgress := func(moreDone, moreKnown, moreApproxBytesWritten uint64) { 38 if progressCh == nil { 39 return 40 } 41 doneCount, knownCount, approxBytesWritten = doneCount+moreDone, knownCount+moreKnown, approxBytesWritten+moreApproxBytesWritten 42 progressCh <- PullProgress{doneCount, knownCount, approxBytesWritten} 43 } 44 var sampleSize, sampleCount uint64 45 46 // TODO: This batches based on limiting the _number_ of chunks processed at the same time. We really want to batch based on the _amount_ of chunk data being processed simultaneously. We also want to consider the chunks in a particular order, however, and the current GetMany() interface doesn't provide any ordering guarantees. Once BUG 3750 is fixed, we should be able to revisit this and do a better job. 47 absent := hash.HashSlice{sourceRef.TargetHash()} 48 for absentCount := len(absent); absentCount != 0; absentCount = len(absent) { 49 updateProgress(0, uint64(absentCount), 0) 50 51 // For gathering up the hashes in the next level of the tree 52 nextLevel := hash.HashSet{} 53 uniqueOrdered := hash.HashSlice{} 54 55 // Process all absent chunks in this level of the tree in quanta of at most |batchSize| 56 for start, end := 0, batchSize; start < absentCount; start, end = end, end+batchSize { 57 if end > absentCount { 58 end = absentCount 59 } 60 batch := absent[start:end] 61 62 // Concurrently pull all chunks from this batch that the sink is missing out of the source 63 neededChunks := map[hash.Hash]*chunks.Chunk{} 64 found := make(chan *chunks.Chunk) 65 go func() { defer close(found); srcDB.chunkStore().GetMany(batch.HashSet(), found) }() 66 for c := range found { 67 neededChunks[c.Hash()] = c 68 69 // Randomly sample amount of data written 70 if rand.Float64() < bytesWrittenSampleRate { 71 sampleSize += uint64(len(snappy.Encode(nil, c.Data()))) 72 sampleCount++ 73 } 74 updateProgress(1, 0, sampleSize/uint64(math.Max(1, float64(sampleCount)))) 75 } 76 77 // Now, put the absent chunks into the sink IN ORDER. 78 // At the same time, gather up an ordered, uniquified list of all the children of the chunks in |batch| and add them to those in previous batches. This list is what we'll use to descend to the next level of the tree. 79 for _, h := range batch { 80 c := neededChunks[h] 81 sinkDB.chunkStore().Put(*c) 82 types.WalkRefs(*c, func(r types.Ref) { 83 if !nextLevel.Has(r.TargetHash()) { 84 uniqueOrdered = append(uniqueOrdered, r.TargetHash()) 85 nextLevel.Insert(r.TargetHash()) 86 } 87 }) 88 } 89 } 90 91 // Ask sinkDB which of the next level's hashes it doesn't have. 92 absentSet := sinkDB.chunkStore().HasMany(nextLevel) 93 absent = absent[:0] 94 for _, h := range uniqueOrdered { 95 if absentSet.Has(h) { 96 absent = append(absent, h) 97 } 98 } 99 } 100 101 persistChunks(sinkDB.chunkStore()) 102 }