github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/datas/pull/clone.go (about) 1 // Copyright 2019 Dolthub, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package pull 16 17 import ( 18 "context" 19 "errors" 20 "fmt" 21 "io" 22 23 "github.com/cenkalti/backoff/v4" 24 "golang.org/x/sync/errgroup" 25 "golang.org/x/sync/semaphore" 26 27 "github.com/dolthub/dolt/go/libraries/utils/iohelp" 28 "github.com/dolthub/dolt/go/store/chunks" 29 "github.com/dolthub/dolt/go/store/hash" 30 ) 31 32 var ErrNoData = errors.New("no data") 33 var ErrCloneUnsupported = errors.New("clone unsupported") 34 35 func Clone(ctx context.Context, srcCS, sinkCS chunks.ChunkStore, eventCh chan<- TableFileEvent) error { 36 srcTS, srcOK := srcCS.(chunks.TableFileStore) 37 38 if !srcOK { 39 return fmt.Errorf("%w: src db is not a Table File Store", ErrCloneUnsupported) 40 } 41 42 size, err := srcTS.Size(ctx) 43 44 if err != nil { 45 return err 46 } 47 48 if size == 0 { 49 return ErrNoData 50 } 51 52 sinkTS, sinkOK := sinkCS.(chunks.TableFileStore) 53 54 if !sinkOK { 55 return fmt.Errorf("%w: sink db is not a Table File Store", ErrCloneUnsupported) 56 } 57 58 return clone(ctx, srcTS, sinkTS, sinkCS, eventCh) 59 } 60 61 type CloneTableFileEvent int 62 63 const ( 64 Listed = iota 65 DownloadStart 66 DownloadStats 67 DownloadSuccess 68 DownloadFailed 69 ) 70 71 type TableFileEvent struct { 72 EventType CloneTableFileEvent 73 TableFiles []chunks.TableFile 74 Stats []iohelp.ReadStats 75 } 76 77 // mapTableFiles returns the list of all fileIDs for the table files, and a map from fileID to chunks.TableFile 78 func mapTableFiles(tblFiles []chunks.TableFile) ([]string, map[string]chunks.TableFile, map[string]int) { 79 fileIds := make([]string, len(tblFiles)) 80 fileIDtoTblFile := make(map[string]chunks.TableFile) 81 fileIDtoNumChunks := make(map[string]int) 82 83 for i, tblFile := range tblFiles { 84 fileIDtoTblFile[tblFile.FileID()] = tblFile 85 fileIds[i] = tblFile.FileID() 86 fileIDtoNumChunks[tblFile.FileID()] = tblFile.NumChunks() 87 } 88 89 return fileIds, fileIDtoTblFile, fileIDtoNumChunks 90 } 91 92 const concurrentTableFileDownloads = 3 93 94 func clone(ctx context.Context, srcTS, sinkTS chunks.TableFileStore, sinkCS chunks.ChunkStore, eventCh chan<- TableFileEvent) error { 95 root, sourceFiles, appendixFiles, err := srcTS.Sources(ctx) 96 if err != nil { 97 return err 98 } 99 100 tblFiles := filterAppendicesFromSourceFiles(appendixFiles, sourceFiles) 101 report := func(e TableFileEvent) { 102 if eventCh != nil { 103 eventCh <- e 104 } 105 } 106 107 // Initializes the list of fileIDs we are going to download, and the map of fileIDToTF. If this clone takes a long 108 // time some of the urls within the chunks.TableFiles will expire and fail to download. At that point we will retrieve 109 // the sources again, and update the fileIDToTF map with updated info, but not change the files we are downloading. 110 desiredFiles, fileIDToTF, fileIDToNumChunks := mapTableFiles(tblFiles) 111 completed := make([]bool, len(desiredFiles)) 112 113 report(TableFileEvent{EventType: Listed, TableFiles: tblFiles}) 114 115 download := func(ctx context.Context) error { 116 sem := semaphore.NewWeighted(concurrentTableFileDownloads) 117 eg, ctx := errgroup.WithContext(ctx) 118 for i := 0; i < len(desiredFiles); i++ { 119 if completed[i] { 120 continue 121 } 122 if err := sem.Acquire(ctx, 1); err != nil { 123 // The errgroup ctx has been canceled. We will 124 // return the error from wg.Wait() below. 125 break 126 } 127 idx := i 128 eg.Go(func() (err error) { 129 defer sem.Release(1) 130 131 fileID := desiredFiles[idx] 132 tblFile, ok := fileIDToTF[fileID] 133 if !ok { 134 // conjoin happened during clone 135 return backoff.Permanent(errors.New("table file not found. please try again")) 136 } 137 138 report(TableFileEvent{EventType: DownloadStart, TableFiles: []chunks.TableFile{tblFile}}) 139 err = sinkTS.WriteTableFile(ctx, tblFile.FileID(), tblFile.NumChunks(), nil, func() (io.ReadCloser, uint64, error) { 140 rd, contentLength, err := tblFile.Open(ctx) 141 if err != nil { 142 return nil, 0, err 143 } 144 rdStats := iohelp.NewReaderWithStats(rd, int64(contentLength)) 145 146 rdStats.Start(func(s iohelp.ReadStats) { 147 report(TableFileEvent{ 148 EventType: DownloadStats, 149 TableFiles: []chunks.TableFile{tblFile}, 150 Stats: []iohelp.ReadStats{s}, 151 }) 152 }) 153 154 return rdStats, contentLength, nil 155 }) 156 if err != nil { 157 report(TableFileEvent{EventType: DownloadFailed, TableFiles: []chunks.TableFile{tblFile}}) 158 return err 159 } 160 161 report(TableFileEvent{EventType: DownloadSuccess, TableFiles: []chunks.TableFile{tblFile}}) 162 completed[idx] = true 163 return nil 164 }) 165 } 166 167 return eg.Wait() 168 } 169 170 const maxAttempts = 3 171 previousCompletedCnt := 0 172 failureCount := 0 173 174 madeProgress := func() bool { 175 currentCompletedCnt := 0 176 for _, b := range completed { 177 if b { 178 currentCompletedCnt++ 179 } 180 } 181 if currentCompletedCnt == previousCompletedCnt { 182 return false 183 } else { 184 previousCompletedCnt = currentCompletedCnt 185 return true 186 } 187 } 188 189 // keep going as long as progress is being made. If progress is not made retry up to maxAttempts times. 190 for { 191 err = download(ctx) 192 if err == nil { 193 break 194 } 195 if permanent, ok := err.(*backoff.PermanentError); ok { 196 return permanent.Err 197 } else if madeProgress() { 198 failureCount = 0 199 } else { 200 failureCount++ 201 } 202 if failureCount >= maxAttempts { 203 return err 204 } 205 if _, sourceFiles, appendixFiles, err = srcTS.Sources(ctx); err != nil { 206 return err 207 } else { 208 tblFiles = filterAppendicesFromSourceFiles(appendixFiles, sourceFiles) 209 _, fileIDToTF, _ = mapTableFiles(tblFiles) 210 } 211 } 212 213 err = sinkTS.AddTableFilesToManifest(ctx, fileIDToNumChunks) 214 if err != nil { 215 return err 216 } 217 218 // AddTableFilesToManifest can set the root chunk if there is a chunk 219 // journal which we downloaded in the clone. If that happened, the 220 // chunk journal is actually more accurate on what the current root is 221 // than the result of |Sources| up above. We choose not to touch 222 // anything in that case. 223 err = sinkCS.Rebase(ctx) 224 if err != nil { 225 return err 226 } 227 sinkRoot, err := sinkCS.Root(ctx) 228 if err != nil { 229 return err 230 } 231 if !sinkRoot.IsEmpty() { 232 return nil 233 } 234 235 return sinkTS.SetRootChunk(ctx, root, hash.Hash{}) 236 } 237 238 func filterAppendicesFromSourceFiles(appendixFiles []chunks.TableFile, sourceFiles []chunks.TableFile) []chunks.TableFile { 239 if len(appendixFiles) == 0 { 240 return sourceFiles 241 } 242 tblFiles := make([]chunks.TableFile, 0) 243 _, appendixMap, _ := mapTableFiles(appendixFiles) 244 for _, sf := range sourceFiles { 245 if _, ok := appendixMap[sf.FileID()]; !ok { 246 tblFiles = append(tblFiles, sf) 247 } 248 } 249 return tblFiles 250 }