github.com/hasnat/dolt/go@v0.0.0-20210628190320-9eb5d843fbb7/store/datas/pull.go

// Copyright 2019 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// This file incorporates work covered by the following copyright and
// permission notice:
//
// Copyright 2016 Attic Labs, Inc. All rights reserved.
// Licensed under the Apache License, version 2.0:
// http://www.apache.org/licenses/LICENSE-2.0

package datas

import (
	"context"
	"errors"
	"fmt"
	"io"
	"math"
	"math/rand"
	"sync"

	"github.com/cenkalti/backoff"
	"github.com/golang/snappy"
	"golang.org/x/sync/errgroup"
	"golang.org/x/sync/semaphore"

	"github.com/dolthub/dolt/go/store/chunks"
	"github.com/dolthub/dolt/go/store/hash"
	"github.com/dolthub/dolt/go/store/nbs"
	"github.com/dolthub/dolt/go/store/types"
)

// PullProgress describes the progress of a Pull: how many chunks have been
// written so far, how many are known to be needed, and an estimate of the
// bytes written (sampled, not exact).
type PullProgress struct {
	DoneCount, KnownCount, ApproxWrittenBytes uint64
}

const (
	bytesWrittenSampleRate = .10
	defaultBatchSize       = 1 << 12 // 4096 chunks
)

// ErrNoData is returned by Clone when the source has no table file data to copy.
var ErrNoData = errors.New("no data")

// makeProgTrack returns a closure that accumulates progress counters across
// calls and publishes the running totals to progressCh. If progressCh is nil
// the closure is a no-op.
func makeProgTrack(progressCh chan PullProgress) func(moreDone, moreKnown, moreApproxBytesWritten uint64) {
	var doneCount, knownCount, approxBytesWritten uint64
	return func(moreDone, moreKnown, moreApproxBytesWritten uint64) {
		if progressCh == nil {
			return
		}
		doneCount, knownCount, approxBytesWritten = doneCount+moreDone, knownCount+moreKnown, approxBytesWritten+moreApproxBytesWritten
		progressCh <- PullProgress{doneCount, knownCount, approxBytesWritten}
	}
}
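
// exampleProgTrack is an illustrative sketch, not part of the original file,
// showing how the closure returned by makeProgTrack accumulates totals across
// calls; the literal numbers below are arbitrary, and progressCh is assumed to
// be buffered or drained elsewhere.
func exampleProgTrack(progressCh chan PullProgress) {
	track := makeProgTrack(progressCh)
	track(0, 100, 0)  // 100 chunks known to be needed, none done yet
	track(25, 0, 512) // 25 chunks done, ~512 more bytes written
	// If progressCh is non-nil, it has now received {0, 100, 0} followed by {25, 100, 512}.
}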

// Clone copies srcDB's table files directly to sinkDB instead of pulling
// individual chunks. Both databases must be backed by an nbs.TableFileStore.
// Clone returns ErrNoData if the source store is empty, and reports progress
// on eventCh when it is non-nil.
func Clone(ctx context.Context, srcDB, sinkDB Database, eventCh chan<- TableFileEvent) error {
	srcTS, srcOK := srcDB.chunkStore().(nbs.TableFileStore)
	if !srcOK {
		return errors.New("src db is not a Table File Store")
	}

	size, err := srcTS.Size(ctx)
	if err != nil {
		return err
	}
	if size == 0 {
		return ErrNoData
	}

	sinkTS, sinkOK := sinkDB.chunkStore().(nbs.TableFileStore)
	if !sinkOK {
		return errors.New("sink db is not a Table File Store")
	}

	return clone(ctx, srcTS, sinkTS, eventCh)
}

// CloneTableFileEvent identifies the stage a table file has reached during a clone.
type CloneTableFileEvent int

const (
	Listed = iota
	DownloadStart
	DownloadSuccess
	DownloadFailed
)

// TableFileEvent is sent on the clone event channel to report that the listed
// table files have reached the given stage.
type TableFileEvent struct {
	EventType  CloneTableFileEvent
	TableFiles []nbs.TableFile
}
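
// exampleCloneWithEvents is a minimal usage sketch, not part of the original
// file: a hypothetical caller drains the event channel in a goroutine while
// Clone runs, logging each table file as it is listed and downloaded.
func exampleCloneWithEvents(ctx context.Context, srcDB, sinkDB Database) error {
	eventCh := make(chan TableFileEvent, 8)
	done := make(chan struct{})
	go func() {
		defer close(done)
		for evt := range eventCh {
			for _, tf := range evt.TableFiles {
				fmt.Printf("clone event %d for table file %s\n", evt.EventType, tf.FileID())
			}
		}
	}()
	err := Clone(ctx, srcDB, sinkDB, eventCh)
	close(eventCh) // Clone has returned, so no further sends can occur
	<-done
	return err
}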

// mapTableFiles returns the list of all fileIDs for the table files, and a map from fileID to nbs.TableFile.
func mapTableFiles(tblFiles []nbs.TableFile) ([]string, map[string]nbs.TableFile) {
	fileIds := make([]string, len(tblFiles))
	fileIDtoTblFile := make(map[string]nbs.TableFile)

	for i, tblFile := range tblFiles {
		fileIDtoTblFile[tblFile.FileID()] = tblFile
		fileIds[i] = tblFile.FileID()
	}

	return fileIds, fileIDtoTblFile
}

// CloseWithErr closes c and, if the caller has not already recorded an error,
// stores any error from Close in *err. It is intended to be deferred with a
// named error return so that Close failures are not silently dropped.
func CloseWithErr(c io.Closer, err *error) {
	e := c.Close()
	if *err == nil && e != nil {
		*err = e
	}
}
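
// exampleCloseWithErr is a brief sketch, not part of the original file, of the
// named-return pattern CloseWithErr supports: the deferred call records the
// Close error only when the body itself did not fail. The single read below is
// purely illustrative.
func exampleCloseWithErr(rd io.ReadCloser) (err error) {
	defer CloseWithErr(rd, &err)
	if _, err = rd.Read(make([]byte, 1)); err == io.EOF {
		err = nil // an empty reader is not an error for this sketch
	}
	return err
}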

const concurrentTableFileDownloads = 3

func clone(ctx context.Context, srcTS, sinkTS nbs.TableFileStore, eventCh chan<- TableFileEvent) error {
	root, sourceFiles, appendixFiles, err := srcTS.Sources(ctx)
	if err != nil {
		return err
	}

	tblFiles := filterAppendicesFromSourceFiles(appendixFiles, sourceFiles)
	report := func(e TableFileEvent) {
		if eventCh != nil {
			eventCh <- e
		}
	}

	// Initialize the list of fileIDs we are going to download, and the map of fileIDToTF. If this clone takes a long
	// time, some of the URLs within the nbs.TableFiles will expire and fail to download. At that point we retrieve
	// the sources again and update the fileIDToTF map with fresh info, but we do not change the set of files being
	// downloaded.
	desiredFiles, fileIDToTF := mapTableFiles(tblFiles)
	completed := make([]bool, len(desiredFiles))

	report(TableFileEvent{Listed, tblFiles})

	download := func(ctx context.Context) error {
		sem := semaphore.NewWeighted(concurrentTableFileDownloads)
		eg, ctx := errgroup.WithContext(ctx)
		for i := 0; i < len(desiredFiles); i++ {
			if completed[i] {
				continue
			}
			if err := sem.Acquire(ctx, 1); err != nil {
				// The errgroup ctx has been canceled. We will
				// return the error from eg.Wait() below.
				break
			}
			idx := i
			eg.Go(func() (err error) {
				defer sem.Release(1)

				fileID := desiredFiles[idx]
				tblFile, ok := fileIDToTF[fileID]
				if !ok {
					// A conjoin happened during the clone, so this file no longer exists at the source.
					return backoff.Permanent(errors.New("table file not found. please try again"))
				}

				var rd io.ReadCloser
				if rd, err = tblFile.Open(ctx); err != nil {
					return err
				}
				defer CloseWithErr(rd, &err)

				report(TableFileEvent{DownloadStart, []nbs.TableFile{tblFile}})
				err = sinkTS.WriteTableFile(ctx, tblFile.FileID(), tblFile.NumChunks(), rd, 0, nil)
				if err != nil {
					report(TableFileEvent{DownloadFailed, []nbs.TableFile{tblFile}})
					return err
				}

				report(TableFileEvent{DownloadSuccess, []nbs.TableFile{tblFile}})
				completed[idx] = true
				return nil
			})
		}

		return eg.Wait()
	}

	const maxAttempts = 3
	previousCompletedCnt := 0
	failureCount := 0

	madeProgress := func() bool {
		currentCompletedCnt := 0
		for _, b := range completed {
			if b {
				currentCompletedCnt++
			}
		}
		if currentCompletedCnt == previousCompletedCnt {
			return false
		}
		previousCompletedCnt = currentCompletedCnt
		return true
	}

	// Keep going as long as progress is being made. If no progress is made, retry up to maxAttempts times.
	for {
		err = download(ctx)
		if err == nil {
			break
		}
		if permanent, ok := err.(*backoff.PermanentError); ok {
			return permanent.Err
		} else if madeProgress() {
			failureCount = 0
		} else {
			failureCount++
		}
		if failureCount >= maxAttempts {
			return err
		}
		if _, sourceFiles, appendixFiles, err = srcTS.Sources(ctx); err != nil {
			return err
		} else {
			tblFiles = filterAppendicesFromSourceFiles(appendixFiles, sourceFiles)
			_, fileIDToTF = mapTableFiles(tblFiles)
		}
	}

	return sinkTS.SetRootChunk(ctx, root, hash.Hash{})
}
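
// exampleRetryWhileProgressing is a standalone sketch, not part of the original
// code, of the retry policy clone applies above: retry while each attempt
// completes at least one more item, and give up only after maxAttempts
// consecutive attempts with no progress. The attempt function is hypothetical;
// the real code also refreshes srcTS.Sources between attempts.
func exampleRetryWhileProgressing(attempt func() (newlyCompleted int, err error)) error {
	const maxAttempts = 3
	failures := 0
	for {
		n, err := attempt()
		if err == nil {
			return nil
		}
		if n > 0 {
			failures = 0 // progress resets the failure budget
		} else {
			failures++
		}
		if failures >= maxAttempts {
			return err
		}
	}
}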

// filterAppendicesFromSourceFiles returns the subset of sourceFiles whose
// fileIDs do not also appear in appendixFiles.
func filterAppendicesFromSourceFiles(appendixFiles []nbs.TableFile, sourceFiles []nbs.TableFile) []nbs.TableFile {
	if len(appendixFiles) == 0 {
		return sourceFiles
	}
	tblFiles := make([]nbs.TableFile, 0)
	_, appendixMap := mapTableFiles(appendixFiles)
	for _, sf := range sourceFiles {
		if _, ok := appendixMap[sf.FileID()]; !ok {
			tblFiles = append(tblFiles, sf)
		}
	}
	return tblFiles
}

// Pull pulls all objects that descend from sourceRef from srcDB to sinkDB,
// reporting progress on progressCh if it is non-nil.
func Pull(ctx context.Context, srcDB, sinkDB Database, sourceRef types.Ref, progressCh chan PullProgress) error {
	return pull(ctx, srcDB, sinkDB, sourceRef, progressCh, defaultBatchSize)
}
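
// examplePullWithProgress is a minimal usage sketch, not part of the original
// file: a hypothetical caller consumes the progress channel in a goroutine
// while Pull runs, then closes it once Pull returns.
func examplePullWithProgress(ctx context.Context, srcDB, sinkDB Database, sourceRef types.Ref) error {
	progressCh := make(chan PullProgress, 16)
	done := make(chan struct{})
	go func() {
		defer close(done)
		for p := range progressCh {
			fmt.Printf("pulled %d of %d chunks (~%d bytes written)\n", p.DoneCount, p.KnownCount, p.ApproxWrittenBytes)
		}
	}()
	err := Pull(ctx, srcDB, sinkDB, sourceRef, progressCh)
	close(progressCh) // Pull has returned, so no further sends can occur
	<-done
	return err
}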

func pull(ctx context.Context, srcDB, sinkDB Database, sourceRef types.Ref, progressCh chan PullProgress, batchSize int) error {
	// Sanity check: the source must actually have the chunk we were asked to pull.
	exists, err := srcDB.chunkStore().Has(ctx, sourceRef.TargetHash())

	if err != nil {
		return err
	}

	if !exists {
		return errors.New("not found")
	}

	exists, err = sinkDB.chunkStore().Has(ctx, sourceRef.TargetHash())

	if err != nil {
		return err
	}

	if exists {
		return nil // already up to date
	}

	if srcDB.chunkStore().Version() != sinkDB.chunkStore().Version() {
		return fmt.Errorf("cannot pull from src to sink; src version is %v and sink version is %v", srcDB.chunkStore().Version(), sinkDB.chunkStore().Version())
	}

	var sampleSize, sampleCount uint64
	updateProgress := makeProgTrack(progressCh)

	// TODO: This batches based on limiting the _number_ of chunks processed at the same time. We really want to
	// batch based on the _amount_ of chunk data being processed simultaneously. We also want to consider the
	// chunks in a particular order, however, and the current GetMany() interface doesn't provide any ordering
	// guarantees. Once BUG 3750 is fixed, we should be able to revisit this and do a better job.
	absent := hash.HashSlice{sourceRef.TargetHash()}
	for absentCount := len(absent); absentCount != 0; absentCount = len(absent) {
		updateProgress(0, uint64(absentCount), 0)

		// For gathering up the hashes in the next level of the tree
		nextLevel := hash.HashSet{}
		uniqueOrdered := hash.HashSlice{}

		// Process all absent chunks in this level of the tree in quanta of at most |batchSize|
		for start, end := 0, batchSize; start < absentCount; start, end = end, end+batchSize {
			if end > absentCount {
				end = absentCount
			}
			batch := absent[start:end]

			neededChunks, err := getChunks(ctx, srcDB, batch, sampleSize, sampleCount, updateProgress)

			if err != nil {
				return err
			}

			uniqueOrdered, err = putChunks(ctx, sinkDB, batch, neededChunks, nextLevel, uniqueOrdered)

			if err != nil {
				return err
			}
		}

		absent, err = nextLevelMissingChunks(ctx, sinkDB, nextLevel, absent, uniqueOrdered)

		if err != nil {
			return err
		}
	}

	err = persistChunks(ctx, sinkDB.chunkStore())

	if err != nil {
		return err
	}

	return nil
}
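
// exampleLevelOrder is a tiny sketch, not part of the original code, of the
// level-by-level traversal pull performs above: start from the root hash,
// process the current level, then descend to the de-duplicated union of its
// children. The children function is hypothetical and stands in for
// GetMany + WalkRefs; the real code also skips children the sink already has.
func exampleLevelOrder(root hash.Hash, children func(hash.Hash) []hash.Hash) []hash.Hash {
	visited := hash.HashSlice{}
	level := hash.HashSlice{root}
	for len(level) > 0 {
		next := hash.HashSet{}
		nextOrdered := hash.HashSlice{}
		for _, h := range level {
			visited = append(visited, h)
			for _, child := range children(h) {
				if !next.Has(child) {
					next.Insert(child)
					nextOrdered = append(nextOrdered, child)
				}
			}
		}
		level = nextOrdered
	}
	return visited
}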

// persistChunks commits cs against its current root (an effectively no-op root
// update), retrying until the Commit succeeds, so that any buffered chunks are
// persisted.
func persistChunks(ctx context.Context, cs chunks.ChunkStore) error {
	// TODO: there is no call to rebase on an unsuccessful Commit().
	// Will this loop forever?
	var success bool
	for !success {
		r, err := cs.Root(ctx)

		if err != nil {
			return err
		}

		success, err = cs.Commit(ctx, r, r)

		if err != nil {
			return err
		}
	}

	return nil
}

// PullWithoutBatching effectively removes the batching of chunk retrieval done on each level of the tree. This means
// all chunks from one level of the tree are retrieved from the underlying chunk store in one call, which pushes the
// optimization problem down to the chunk store, where smarter decisions can be made.
func PullWithoutBatching(ctx context.Context, srcDB, sinkDB Database, sourceRef types.Ref, progressCh chan PullProgress) error {
	// By increasing the batch size to MaxInt32 we effectively remove batching here.
	return pull(ctx, srcDB, sinkDB, sourceRef, progressCh, math.MaxInt32)
}

// getChunks concurrently pulls, from the source, all chunks in this batch that the sink is missing. It also randomly
// samples the snappy-compressed size of the chunks it sees to maintain the approximate-bytes-written estimate.
func getChunks(ctx context.Context, srcDB Database, batch hash.HashSlice, sampleSize uint64, sampleCount uint64, updateProgress func(moreDone uint64, moreKnown uint64, moreApproxBytesWritten uint64)) (map[hash.Hash]*chunks.Chunk, error) {
	mu := &sync.Mutex{}
	neededChunks := map[hash.Hash]*chunks.Chunk{}
	err := srcDB.chunkStore().GetMany(ctx, batch.HashSet(), func(c *chunks.Chunk) {
		mu.Lock()
		defer mu.Unlock()
		neededChunks[c.Hash()] = c

		// Randomly sample the amount of data written
		if rand.Float64() < bytesWrittenSampleRate {
			sampleSize += uint64(len(snappy.Encode(nil, c.Data())))
			sampleCount++
		}
		updateProgress(1, 0, sampleSize/uint64(math.Max(1, float64(sampleCount))))
	})
	if err != nil {
		return nil, err
	}
	return neededChunks, nil
}

// putChunks writes the downloaded chunks into the sink IN ORDER, while gathering an ordered, de-duplicated list of
// all the chunks' children and adding them to the set of chunks for the next level of the tree.
func putChunks(ctx context.Context, sinkDB Database, hashes hash.HashSlice, neededChunks map[hash.Hash]*chunks.Chunk, nextLevel hash.HashSet, uniqueOrdered hash.HashSlice) (hash.HashSlice, error) {
	for _, h := range hashes {
		c := neededChunks[h]
		err := sinkDB.chunkStore().Put(ctx, *c)

		if err != nil {
			return hash.HashSlice{}, err
		}

		err = types.WalkRefs(*c, sinkDB.Format(), func(r types.Ref) error {
			if !nextLevel.Has(r.TargetHash()) {
				uniqueOrdered = append(uniqueOrdered, r.TargetHash())
				nextLevel.Insert(r.TargetHash())
			}

			return nil
		})

		if err != nil {
			return hash.HashSlice{}, err
		}
	}

	return uniqueOrdered, nil
}

// nextLevelMissingChunks asks sinkDB which of the next level's hashes it does not have, and returns those hashes as
// the new absent list that will need to be retrieved.
func nextLevelMissingChunks(ctx context.Context, sinkDB Database, nextLevel hash.HashSet, absent hash.HashSlice, uniqueOrdered hash.HashSlice) (hash.HashSlice, error) {
	missingFromSink, err := sinkDB.chunkStore().HasMany(ctx, nextLevel)

	if err != nil {
		return hash.HashSlice{}, err
	}

	absent = absent[:0]
	for _, h := range uniqueOrdered {
		if missingFromSink.Has(h) {
			absent = append(absent, h)
		}
	}

	return absent, nil
}