github.com/hasnat/dolt/go@v0.0.0-20210628190320-9eb5d843fbb7/store/datas/puller.go

// Copyright 2019 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package datas

import (
	"context"
	"errors"
	"fmt"
	"os"
	"path/filepath"
	"sync"

	"github.com/dolthub/dolt/go/store/atomicerr"
	"github.com/dolthub/dolt/go/store/chunks"
	"github.com/dolthub/dolt/go/store/hash"
	"github.com/dolthub/dolt/go/store/nbs"
	"github.com/dolthub/dolt/go/store/types"
)

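// FileReaderWithSize wraps an *os.File with an explicit size so that consumers
// which need to know the total length of the data up front can query it
// without an additional Stat call.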
type FileReaderWithSize struct {
	*os.File
	size int64
}

func (rd FileReaderWithSize) Size() int64 {
	return rd.size
}

// ErrDBUpToDate is the error returned from NewPuller in the event that there is no work to do.
var ErrDBUpToDate = errors.New("the database does not need to be pulled as it's already up to date")

// ErrIncompatibleSourceChunkStore is the error returned from NewPuller in
// the event that the source ChunkStore does not implement `NBSCompressedChunkStore`.
var ErrIncompatibleSourceChunkStore = errors.New("the chunk store of the source database does not implement NBSCompressedChunkStore")

const (
	maxChunkWorkers = 2
)

// FilledWriters stores a CmpChunkTableWriter that has been filled and is ready to be flushed.  In the future we will
// likely add the md5 of the data to this structure to be used to verify table upload calls.
type FilledWriters struct {
	wr *nbs.CmpChunkTableWriter
}

// CmpChnkAndRefs holds a CompressedChunk and all of its references
type CmpChnkAndRefs struct {
	cmpChnk nbs.CompressedChunk
	refs    map[hash.Hash]int
}

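// NBSCompressedChunkStore is a ChunkStore that can also hand back chunks in
// their compressed on-disk representation, letting the Puller move chunks
// between stores without a decompress/recompress round trip.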
type NBSCompressedChunkStore interface {
	chunks.ChunkStore
	GetManyCompressed(context.Context, hash.HashSet, func(nbs.CompressedChunk)) error
}

// Puller is used to sync data between two Databases
type Puller struct {
	fmt *types.NomsBinFormat

	srcDB         Database
	srcChunkStore NBSCompressedChunkStore
	sinkDB        Database
	rootChunkHash hash.Hash
	downloaded    hash.HashSet

	wr          *nbs.CmpChunkTableWriter
	tempDir     string
	chunksPerTF int

	eventCh chan PullerEvent
}

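// PullerEventType identifies the kind of progress event a Puller reports on its event channel.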
type PullerEventType int

const (
	NewLevelTWEvent PullerEventType = iota
	DestDBHasTWEvent
	LevelUpdateTWEvent
	LevelDoneTWEvent
	StartUploadTableFile
	EndUpdateTableFile
)

type TreeWalkEventDetails struct {
	TreeLevel           int
	ChunksInLevel       int
	ChunksAlreadyHad    int
	ChunksBuffered      int
	ChildrenFound       int
	TableFilesGenerated int
}

type TableFileEventDetails struct {
	TableFileCount     int
	TableFilesUploaded int
	CurrentFileSize    int64
}

type PullerEvent struct {
	EventType      PullerEventType
	TWEventDetails TreeWalkEventDetails
	TFEventDetails TableFileEventDetails
}

func NewTWPullerEvent(et PullerEventType, details *TreeWalkEventDetails) PullerEvent {
	return PullerEvent{EventType: et, TWEventDetails: *details}
}

func NewTFPullerEvent(et PullerEventType, details *TableFileEventDetails) PullerEvent {
	return PullerEvent{EventType: et, TFEventDetails: *details}
}

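// reportEvents is a minimal consumer sketch, not part of the original API. It
// shows which detail struct carries the data for each event type: tree-walk
// events populate TWEventDetails, table-file events populate TFEventDetails.
// A real caller would render progress rather than print.
func reportEvents(eventCh <-chan PullerEvent) {
	for evt := range eventCh {
		switch evt.EventType {
		case NewLevelTWEvent, DestDBHasTWEvent, LevelUpdateTWEvent, LevelDoneTWEvent:
			d := evt.TWEventDetails
			fmt.Printf("tree level %d: buffered %d of %d chunks\n", d.TreeLevel, d.ChunksBuffered, d.ChunksInLevel)
		case StartUploadTableFile, EndUpdateTableFile:
			d := evt.TFEventDetails
			fmt.Printf("table files: uploaded %d of %d\n", d.TableFilesUploaded, d.TableFileCount)
		}
	}
}
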
// NewPuller creates a new Puller instance to do the syncing.  If the sink database already has the chunk for
// rootChunkHash, ErrDBUpToDate is returned and there is nothing to pull.
func NewPuller(ctx context.Context, tempDir string, chunksPerTF int, srcDB, sinkDB Database, rootChunkHash hash.Hash, eventCh chan PullerEvent) (*Puller, error) {
	if eventCh == nil {
		panic("eventCh is required")
	}

	// Sanity Check
	exists, err := srcDB.chunkStore().Has(ctx, rootChunkHash)

	if err != nil {
		return nil, err
	}

	if !exists {
		return nil, errors.New("root chunk not found in source database")
	}

	exists, err = sinkDB.chunkStore().Has(ctx, rootChunkHash)

	if err != nil {
		return nil, err
	}

	if exists {
		return nil, ErrDBUpToDate
	}

	if srcDB.chunkStore().Version() != sinkDB.chunkStore().Version() {
		return nil, fmt.Errorf("cannot pull from src to sink; src version is %v and sink version is %v", srcDB.chunkStore().Version(), sinkDB.chunkStore().Version())
	}

	srcChunkStore, ok := srcDB.chunkStore().(NBSCompressedChunkStore)
	if !ok {
		return nil, ErrIncompatibleSourceChunkStore
	}

	wr, err := nbs.NewCmpChunkTableWriter(tempDir)

	if err != nil {
		return nil, err
	}

	return &Puller{
		fmt:           srcDB.Format(),
		srcDB:         srcDB,
		srcChunkStore: srcChunkStore,
		sinkDB:        sinkDB,
		rootChunkHash: rootChunkHash,
		downloaded:    hash.HashSet{},
		tempDir:       tempDir,
		wr:            wr,
		chunksPerTF:   chunksPerTF,
		eventCh:       eventCh,
	}, nil
}

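// pullExample is a minimal usage sketch, not part of the original API. It shows
// the expected calling pattern: drain eventCh concurrently so Pull is never
// blocked sending progress events, and treat ErrDBUpToDate as success rather
// than failure. The chunksPerTF value of 256*1024 is an arbitrary illustrative
// choice.
func pullExample(ctx context.Context, tempDir string, srcDB, sinkDB Database, root hash.Hash) error {
	eventCh := make(chan PullerEvent, 128)
	done := make(chan struct{})
	go func() {
		defer close(done)
		for range eventCh { // a real caller would report progress here
		}
	}()

	plr, err := NewPuller(ctx, tempDir, 256*1024, srcDB, sinkDB, root, eventCh)
	if err == nil {
		err = plr.Pull(ctx)
	} else if err == ErrDBUpToDate {
		err = nil
	}

	// Pull never closes eventCh; the caller owns its lifecycle.
	close(eventCh)
	<-done
	return err
}
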
func (p *Puller) processCompletedTables(ctx context.Context, ae *atomicerr.AtomicError, completedTables <-chan FilledWriters) {
	type tempTblFile struct {
		id          string
		path        string
		numChunks   int
		contentLen  uint64
		contentHash []byte
	}

	var tblFiles []tempTblFile

	var err error
	for tblFile := range completedTables {
		if err != nil {
			continue // drain
		}

		var id string
		id, err = tblFile.wr.Finish()

		if ae.SetIfError(err) {
			continue
		}

		path := filepath.Join(p.tempDir, id)
		err = tblFile.wr.FlushToFile(path)

		if ae.SetIfError(err) {
			continue
		}

		tblFiles = append(tblFiles, tempTblFile{
			id:          id,
			path:        path,
			numChunks:   tblFile.wr.Size(),
			contentLen:  tblFile.wr.ContentLength(),
			contentHash: tblFile.wr.GetMD5(),
		})
	}

	if ae.IsSet() {
		return
	}

	details := &TableFileEventDetails{TableFileCount: len(tblFiles)}

	// Write tables in reverse order so that on a partial success, it will still be true that if a db has a chunk, it
	// also has all of that chunk's references.
	for i := len(tblFiles) - 1; i >= 0; i-- {
		tmpTblFile := tblFiles[i]

		fi, err := os.Stat(tmpTblFile.path)

		if ae.SetIfError(err) {
			return
		}

		f, err := os.Open(tmpTblFile.path)

		if ae.SetIfError(err) {
			return
		}

		details.CurrentFileSize = fi.Size()
		p.eventCh <- NewTFPullerEvent(StartUploadTableFile, details)

		fWithSize := FileReaderWithSize{f, fi.Size()}
		err = p.sinkDB.chunkStore().(nbs.TableFileStore).WriteTableFile(ctx, tmpTblFile.id, tmpTblFile.numChunks, fWithSize, tmpTblFile.contentLen, tmpTblFile.contentHash)

		go func() {
			_ = os.Remove(tmpTblFile.path)
		}()

		if ae.SetIfError(err) {
			return
		}

		details.TableFilesUploaded++
		p.eventCh <- NewTFPullerEvent(EndUpdateTableFile, details)
	}
}

// Pull executes the sync operation, walking the chunk tree level by level from the root, buffering the chunks the
// sink is missing into table files, and uploading those files to the sink as they fill.
func (p *Puller) Pull(ctx context.Context) error {
	twDetails := &TreeWalkEventDetails{TreeLevel: -1}

	leaves := make(hash.HashSet)
	absent := make(hash.HashSet)
	absent.Insert(p.rootChunkHash)

	ae := atomicerr.New()
	wg := &sync.WaitGroup{}
	completedTables := make(chan FilledWriters, 8)

	wg.Add(1)
	go func() {
		defer wg.Done()
		p.processCompletedTables(ctx, ae, completedTables)
	}()

	for len(absent) > 0 {
		limitToNewChunks(absent, p.downloaded)

		chunksInLevel := len(absent)
		twDetails.ChunksInLevel = chunksInLevel
		p.eventCh <- NewTWPullerEvent(NewLevelTWEvent, twDetails)

		var err error
		absent, err = p.sinkDB.chunkStore().HasMany(ctx, absent)

		if ae.SetIfError(err) {
			break
		}

		twDetails.ChunksAlreadyHad = chunksInLevel - len(absent)
		p.eventCh <- NewTWPullerEvent(DestDBHasTWEvent, twDetails)

		if len(absent) > 0 {
			leaves, absent, err = p.getCmp(ctx, twDetails, leaves, absent, completedTables)

			if ae.SetIfError(err) {
				break
			}
		}
	}

	// p.wr may be nil in the error case, so only flush the final writer when no error has been recorded.
	if !ae.IsSet() && p.wr.Size() > 0 {
		completedTables <- FilledWriters{p.wr}
	}

	close(completedTables)

	wg.Wait()
	return ae.Get()
}

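// limitToNewChunks removes from absent any hashes that have already been
// downloaded. It mutates absent in place and iterates over the smaller of the
// two sets to bound the work.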
func limitToNewChunks(absent hash.HashSet, downloaded hash.HashSet) {
	smaller := absent
	longer := downloaded
	if len(absent) > len(downloaded) {
		smaller = downloaded
		longer = absent
	}

	for k := range smaller {
		if longer.Has(k) {
			absent.Remove(k)
		}
	}
}

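// getCmp fetches the compressed chunks in batch from the source, buffers them
// into the current table file writer (rotating to a new writer whenever
// chunksPerTF is reached), and walks the refs of non-leaf chunks to produce the
// next level of the tree: it returns the set of leaf hashes and the set of all
// child hashes found.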
func (p *Puller) getCmp(ctx context.Context, twDetails *TreeWalkEventDetails, leaves, batch hash.HashSet, completedTables chan FilledWriters) (hash.HashSet, hash.HashSet, error) {
	found := make(chan nbs.CompressedChunk, 4096)
	processed := make(chan CmpChnkAndRefs, 4096)

	ae := atomicerr.New()
	go func() {
		defer close(found)
		err := p.srcChunkStore.GetManyCompressed(ctx, batch, func(c nbs.CompressedChunk) { found <- c })
		ae.SetIfError(err)
	}()

	batchSize := len(batch)
	numChunkWorkers := (batchSize / 1024) + 1
	if numChunkWorkers > maxChunkWorkers {
		numChunkWorkers = maxChunkWorkers
	}
	// NOTE: numChunkWorkers is computed but not currently used to fan out
	// additional chunk-processing workers.

	go func() {
		defer close(processed)
		for cmpChnk := range found {
			if ae.IsSet() {
				break
			}

			p.downloaded.Insert(cmpChnk.H)

			if leaves.Has(cmpChnk.H) {
				processed <- CmpChnkAndRefs{cmpChnk: cmpChnk}
			} else {
				chnk, err := cmpChnk.ToChunk()

				if ae.SetIfError(err) {
					return
				}

				refs := make(map[hash.Hash]int)
				if err := types.WalkRefs(chnk, p.fmt, func(r types.Ref) error {
					refs[r.TargetHash()] = int(r.Height())
					return nil
				}); ae.SetIfError(err) {
					return
				}

				processed <- CmpChnkAndRefs{cmpChnk: cmpChnk, refs: refs}
			}
		}
	}()

	var err error
	var maxHeight int
	nextLeaves := make(hash.HashSet, batchSize)
	nextLevel := make(hash.HashSet, batchSize)

	twDetails.ChunksBuffered = 0
	for cmpAndRef := range processed {
		if err != nil {
			// drain to prevent deadlock
			continue
		}

		twDetails.ChunksBuffered++

		if twDetails.ChunksBuffered%1000 == 0 {
			p.eventCh <- NewTWPullerEvent(LevelUpdateTWEvent, twDetails)
		}

		err = p.wr.AddCmpChunk(cmpAndRef.cmpChnk)

		if ae.SetIfError(err) {
			continue
		}

		if p.wr.Size() >= p.chunksPerTF {
			completedTables <- FilledWriters{p.wr}
			p.wr, err = nbs.NewCmpChunkTableWriter(p.tempDir)

			if ae.SetIfError(err) {
				continue
			}
		}

		for h, height := range cmpAndRef.refs {
			nextLevel.Insert(h)
			twDetails.ChildrenFound++

			if height == 1 {
				nextLeaves.Insert(h)
			}

			if height > maxHeight {
				maxHeight = height
			}
		}
	}

	if err := ae.Get(); err != nil {
		return nil, nil, err
	}

	if twDetails.ChunksBuffered != len(batch) {
		return nil, nil, errors.New("failed to get all chunks")
	}

	p.eventCh <- NewTWPullerEvent(LevelDoneTWEvent, twDetails)

	twDetails.TreeLevel = maxHeight
	return nextLeaves, nextLevel, nil
}