github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/datas/pull/clone.go (about)

     1  // Copyright 2019 Dolthub, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package pull
    16  
    17  import (
    18  	"context"
    19  	"errors"
    20  	"fmt"
    21  	"io"
    22  
    23  	"github.com/cenkalti/backoff/v4"
    24  	"golang.org/x/sync/errgroup"
    25  	"golang.org/x/sync/semaphore"
    26  
    27  	"github.com/dolthub/dolt/go/libraries/utils/iohelp"
    28  	"github.com/dolthub/dolt/go/store/chunks"
    29  	"github.com/dolthub/dolt/go/store/hash"
    30  )
    31  
// ErrNoData is returned by Clone when the source store reports a size of
// zero, i.e. there is nothing to clone.
var ErrNoData = errors.New("no data")

// ErrCloneUnsupported is returned by Clone when the source or sink chunk
// store does not implement chunks.TableFileStore.
var ErrCloneUnsupported = errors.New("clone unsupported")
    34  
    35  func Clone(ctx context.Context, srcCS, sinkCS chunks.ChunkStore, eventCh chan<- TableFileEvent) error {
    36  	srcTS, srcOK := srcCS.(chunks.TableFileStore)
    37  
    38  	if !srcOK {
    39  		return fmt.Errorf("%w: src db is not a Table File Store", ErrCloneUnsupported)
    40  	}
    41  
    42  	size, err := srcTS.Size(ctx)
    43  
    44  	if err != nil {
    45  		return err
    46  	}
    47  
    48  	if size == 0 {
    49  		return ErrNoData
    50  	}
    51  
    52  	sinkTS, sinkOK := sinkCS.(chunks.TableFileStore)
    53  
    54  	if !sinkOK {
    55  		return fmt.Errorf("%w: sink db is not a Table File Store", ErrCloneUnsupported)
    56  	}
    57  
    58  	return clone(ctx, srcTS, sinkTS, sinkCS, eventCh)
    59  }
    60  
    61  type CloneTableFileEvent int
    62  
    63  const (
    64  	Listed = iota
    65  	DownloadStart
    66  	DownloadStats
    67  	DownloadSuccess
    68  	DownloadFailed
    69  )
    70  
    71  type TableFileEvent struct {
    72  	EventType  CloneTableFileEvent
    73  	TableFiles []chunks.TableFile
    74  	Stats      []iohelp.ReadStats
    75  }
    76  
    77  // mapTableFiles returns the list of all fileIDs for the table files, and a map from fileID to chunks.TableFile
    78  func mapTableFiles(tblFiles []chunks.TableFile) ([]string, map[string]chunks.TableFile, map[string]int) {
    79  	fileIds := make([]string, len(tblFiles))
    80  	fileIDtoTblFile := make(map[string]chunks.TableFile)
    81  	fileIDtoNumChunks := make(map[string]int)
    82  
    83  	for i, tblFile := range tblFiles {
    84  		fileIDtoTblFile[tblFile.FileID()] = tblFile
    85  		fileIds[i] = tblFile.FileID()
    86  		fileIDtoNumChunks[tblFile.FileID()] = tblFile.NumChunks()
    87  	}
    88  
    89  	return fileIds, fileIDtoTblFile, fileIDtoNumChunks
    90  }
    91  
// concurrentTableFileDownloads bounds how many table files are downloaded
// from the source in parallel during a clone.
const concurrentTableFileDownloads = 3
    93  
// clone downloads every non-appendix table file from srcTS into sinkTS,
// records the downloaded files in the sink manifest, and adopts the source
// root chunk in the sink if the sink's root is still empty after the copy.
// Progress events are sent on eventCh when it is non-nil. Failed passes are
// retried: the retry budget (maxAttempts) only shrinks when a pass completes
// no additional files.
func clone(ctx context.Context, srcTS, sinkTS chunks.TableFileStore, sinkCS chunks.ChunkStore, eventCh chan<- TableFileEvent) error {
	root, sourceFiles, appendixFiles, err := srcTS.Sources(ctx)
	if err != nil {
		return err
	}

	// Drop source files that also appear in the appendix set; only the
	// remainder is downloaded.
	tblFiles := filterAppendicesFromSourceFiles(appendixFiles, sourceFiles)
	// report forwards an event to eventCh, or drops it when no channel was
	// provided.
	report := func(e TableFileEvent) {
		if eventCh != nil {
			eventCh <- e
		}
	}

	// Initializes the list of fileIDs we are going to download, and the map of fileIDToTF.  If this clone takes a long
	// time some of the urls within the chunks.TableFiles will expire and fail to download.  At that point we will retrieve
	// the sources again, and update the fileIDToTF map with updated info, but not change the files we are downloading.
	desiredFiles, fileIDToTF, fileIDToNumChunks := mapTableFiles(tblFiles)
	completed := make([]bool, len(desiredFiles))

	report(TableFileEvent{EventType: Listed, TableFiles: tblFiles})

	// download makes one pass over the files not yet marked completed,
	// writing up to concurrentTableFileDownloads of them to the sink at a
	// time. It returns the first error any download produced.
	download := func(ctx context.Context) error {
		sem := semaphore.NewWeighted(concurrentTableFileDownloads)
		eg, ctx := errgroup.WithContext(ctx)
		for i := 0; i < len(desiredFiles); i++ {
			if completed[i] {
				continue
			}
			if err := sem.Acquire(ctx, 1); err != nil {
				// The errgroup ctx has been canceled. We will
				// return the error from wg.Wait() below.
				break
			}
			idx := i
			eg.Go(func() (err error) {
				defer sem.Release(1)

				fileID := desiredFiles[idx]
				tblFile, ok := fileIDToTF[fileID]
				if !ok {
					// conjoin happened during clone
					return backoff.Permanent(errors.New("table file not found. please try again"))
				}

				report(TableFileEvent{EventType: DownloadStart, TableFiles: []chunks.TableFile{tblFile}})
				err = sinkTS.WriteTableFile(ctx, tblFile.FileID(), tblFile.NumChunks(), nil, func() (io.ReadCloser, uint64, error) {
					rd, contentLength, err := tblFile.Open(ctx)
					if err != nil {
						return nil, 0, err
					}
					// Wrap the reader so periodic read statistics can be
					// reported while the file streams to the sink.
					rdStats := iohelp.NewReaderWithStats(rd, int64(contentLength))

					rdStats.Start(func(s iohelp.ReadStats) {
						report(TableFileEvent{
							EventType:  DownloadStats,
							TableFiles: []chunks.TableFile{tblFile},
							Stats:      []iohelp.ReadStats{s},
						})
					})

					return rdStats, contentLength, nil
				})
				if err != nil {
					report(TableFileEvent{EventType: DownloadFailed, TableFiles: []chunks.TableFile{tblFile}})
					return err
				}

				report(TableFileEvent{EventType: DownloadSuccess, TableFiles: []chunks.TableFile{tblFile}})
				// Safe without a lock: each goroutine writes a distinct
				// index, and eg.Wait() orders these writes before the
				// reads in madeProgress below.
				completed[idx] = true
				return nil
			})
		}

		return eg.Wait()
	}

	const maxAttempts = 3
	previousCompletedCnt := 0
	failureCount := 0

	// madeProgress reports whether any additional files have completed since
	// the previous call, updating the remembered count as a side effect.
	madeProgress := func() bool {
		currentCompletedCnt := 0
		for _, b := range completed {
			if b {
				currentCompletedCnt++
			}
		}
		if currentCompletedCnt == previousCompletedCnt {
			return false
		} else {
			previousCompletedCnt = currentCompletedCnt
			return true
		}
	}

	// keep going as long as progress is being made.  If progress is not made retry up to maxAttempts times.
	for {
		err = download(ctx)
		if err == nil {
			break
		}
		if permanent, ok := err.(*backoff.PermanentError); ok {
			return permanent.Err
		} else if madeProgress() {
			failureCount = 0
		} else {
			failureCount++
		}
		if failureCount >= maxAttempts {
			return err
		}
		// Re-fetch the sources to refresh fileIDToTF (download URLs may have
		// expired); the set of desiredFiles itself is not changed.
		if _, sourceFiles, appendixFiles, err = srcTS.Sources(ctx); err != nil {
			return err
		} else {
			tblFiles = filterAppendicesFromSourceFiles(appendixFiles, sourceFiles)
			_, fileIDToTF, _ = mapTableFiles(tblFiles)
		}
	}

	// Register every downloaded table file (with its chunk count) in the
	// sink's manifest.
	err = sinkTS.AddTableFilesToManifest(ctx, fileIDToNumChunks)
	if err != nil {
		return err
	}

	// AddTableFilesToManifest can set the root chunk if there is a chunk
	// journal which we downloaded in the clone. If that happened, the
	// chunk journal is actually more accurate on what the current root is
	// than the result of |Sources| up above. We choose not to touch
	// anything in that case.
	err = sinkCS.Rebase(ctx)
	if err != nil {
		return err
	}
	sinkRoot, err := sinkCS.Root(ctx)
	if err != nil {
		return err
	}
	if !sinkRoot.IsEmpty() {
		return nil
	}

	// The sink root is still empty; adopt the source root (expected previous
	// root is the empty hash).
	return sinkTS.SetRootChunk(ctx, root, hash.Hash{})
}
   237  
   238  func filterAppendicesFromSourceFiles(appendixFiles []chunks.TableFile, sourceFiles []chunks.TableFile) []chunks.TableFile {
   239  	if len(appendixFiles) == 0 {
   240  		return sourceFiles
   241  	}
   242  	tblFiles := make([]chunks.TableFile, 0)
   243  	_, appendixMap, _ := mapTableFiles(appendixFiles)
   244  	for _, sf := range sourceFiles {
   245  		if _, ok := appendixMap[sf.FileID()]; !ok {
   246  			tblFiles = append(tblFiles, sf)
   247  		}
   248  	}
   249  	return tblFiles
   250  }