github.com/jbendotnet/noms@v0.0.0-20190904222105-c43e4293ea92/go/datas/pull.go

github.com/jbendotnet/noms@v0.0.0-20190904222105-c43e4293ea92/go/datas/pull.go (about)

     1  // Copyright 2016 Attic Labs, Inc. All rights reserved.
     2  // Licensed under the Apache License, version 2.0:
     3  // http://www.apache.org/licenses/LICENSE-2.0
     4  
     5  package datas
     6  
     7  import (
     8  	"math"
     9  	"math/rand"
    10  
    11  	"github.com/attic-labs/noms/go/chunks"
    12  	"github.com/attic-labs/noms/go/d"
    13  	"github.com/attic-labs/noms/go/hash"
    14  	"github.com/attic-labs/noms/go/types"
    15  	"github.com/golang/snappy"
    16  )
    17  
    18  type PullProgress struct {
    19  	DoneCount, KnownCount, ApproxWrittenBytes uint64
    20  }
    21  
    22  const (
    23  	bytesWrittenSampleRate = .10
    24  	batchSize              = 1 << 12 // 4096 chunks
    25  )
    26  
    27  // Pull objects that descend from sourceRef from srcDB to sinkDB.
    28  func Pull(srcDB, sinkDB Database, sourceRef types.Ref, progressCh chan PullProgress) {
    29  	// Sanity Check
    30  	d.PanicIfFalse(srcDB.chunkStore().Has(sourceRef.TargetHash()))
    31  
    32  	if sinkDB.chunkStore().Has(sourceRef.TargetHash()) {
    33  		return // already up to date
    34  	}
    35  
    36  	var doneCount, knownCount, approxBytesWritten uint64
    37  	updateProgress := func(moreDone, moreKnown, moreApproxBytesWritten uint64) {
    38  		if progressCh == nil {
    39  			return
    40  		}
    41  		doneCount, knownCount, approxBytesWritten = doneCount+moreDone, knownCount+moreKnown, approxBytesWritten+moreApproxBytesWritten
    42  		progressCh <- PullProgress{doneCount, knownCount, approxBytesWritten}
    43  	}
    44  	var sampleSize, sampleCount uint64
    45  
    46  	// TODO: This batches based on limiting the _number_ of chunks processed at the same time. We really want to batch based on the _amount_ of chunk data being processed simultaneously. We also want to consider the chunks in a particular order, however, and the current GetMany() interface doesn't provide any ordering guarantees. Once BUG 3750 is fixed, we should be able to revisit this and do a better job.
    47  	absent := hash.HashSlice{sourceRef.TargetHash()}
    48  	for absentCount := len(absent); absentCount != 0; absentCount = len(absent) {
    49  		updateProgress(0, uint64(absentCount), 0)
    50  
    51  		// For gathering up the hashes in the next level of the tree
    52  		nextLevel := hash.HashSet{}
    53  		uniqueOrdered := hash.HashSlice{}
    54  
    55  		// Process all absent chunks in this level of the tree in quanta of at most |batchSize|
    56  		for start, end := 0, batchSize; start < absentCount; start, end = end, end+batchSize {
    57  			if end > absentCount {
    58  				end = absentCount
    59  			}
    60  			batch := absent[start:end]
    61  
    62  			// Concurrently pull all chunks from this batch that the sink is missing out of the source
    63  			neededChunks := map[hash.Hash]*chunks.Chunk{}
    64  			found := make(chan *chunks.Chunk)
    65  			go func() { defer close(found); srcDB.chunkStore().GetMany(batch.HashSet(), found) }()
    66  			for c := range found {
    67  				neededChunks[c.Hash()] = c
    68  
    69  				// Randomly sample amount of data written
    70  				if rand.Float64() < bytesWrittenSampleRate {
    71  					sampleSize += uint64(len(snappy.Encode(nil, c.Data())))
    72  					sampleCount++
    73  				}
    74  				updateProgress(1, 0, sampleSize/uint64(math.Max(1, float64(sampleCount))))
    75  			}
    76  
    77  			// Now, put the absent chunks into the sink IN ORDER.
    78  			// At the same time, gather up an ordered, uniquified list of all the children of the chunks in |batch| and add them to those in previous batches. This list is what we'll use to descend to the next level of the tree.
    79  			for _, h := range batch {
    80  				c := neededChunks[h]
    81  				sinkDB.chunkStore().Put(*c)
    82  				types.WalkRefs(*c, func(r types.Ref) {
    83  					if !nextLevel.Has(r.TargetHash()) {
    84  						uniqueOrdered = append(uniqueOrdered, r.TargetHash())
    85  						nextLevel.Insert(r.TargetHash())
    86  					}
    87  				})
    88  			}
    89  		}
    90  
    91  		// Ask sinkDB which of the next level's hashes it doesn't have.
    92  		absentSet := sinkDB.chunkStore().HasMany(nextLevel)
    93  		absent = absent[:0]
    94  		for _, h := range uniqueOrdered {
    95  			if absentSet.Has(h) {
    96  				absent = append(absent, h)
    97  			}
    98  		}
    99  	}
   100  
   101  	persistChunks(sinkDB.chunkStore())
   102  }