github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/libraries/doltcore/remotestorage/chunk_fetcher.go (about) 1 // Copyright 2024 Dolthub, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package remotestorage 16 17 import ( 18 "context" 19 "errors" 20 "io" 21 "sync/atomic" 22 "time" 23 24 "golang.org/x/sync/errgroup" 25 "google.golang.org/grpc" 26 27 remotesapi "github.com/dolthub/dolt/go/gen/proto/dolt/services/remotesapi/v1alpha1" 28 "github.com/dolthub/dolt/go/libraries/doltcore/remotestorage/internal/pool" 29 "github.com/dolthub/dolt/go/libraries/doltcore/remotestorage/internal/ranges" 30 "github.com/dolthub/dolt/go/libraries/doltcore/remotestorage/internal/reliable" 31 "github.com/dolthub/dolt/go/store/hash" 32 "github.com/dolthub/dolt/go/store/nbs" 33 ) 34 35 // A remotestorage.ChunkFetcher is a pipelined chunk fetcher for fetching a 36 // large number of chunks where the downloads may benefit from range 37 // coallescing, hedging, automatic retries, pipelining of download location 38 // retrieval with the fetching of the actual chunk bytes, etc. 39 // 40 // It is expected that one goroutine will be calling `Get()` with batches of 41 // addresses to retrieve, and another goroutine will be calling `Recv()`, 42 // reading fetched chunks. 43 // 44 // When all addresses have been delivered, calling `CloseSend()` will 45 // eventually cause the `Recv()` thread to read an `io.EOF` error, which 46 // indicates that all requested chunks have been delivered. 47 type ChunkFetcher struct { 48 eg *errgroup.Group 49 egCtx context.Context 50 51 toGetCh chan hash.HashSet 52 resCh chan nbs.CompressedChunk 53 54 abortCh chan struct{} 55 stats StatsRecorder 56 } 57 58 const ( 59 getLocsBatchSize = 512 60 61 reliableCallReadRequestTimeout = 15 * time.Second 62 reliableCallDeliverRespTimeout = 15 * time.Second 63 ) 64 65 func NewChunkFetcher(ctx context.Context, dcs *DoltChunkStore) *ChunkFetcher { 66 eg, ctx := errgroup.WithContext(ctx) 67 ret := &ChunkFetcher{ 68 eg: eg, 69 egCtx: ctx, 70 71 toGetCh: make(chan hash.HashSet), 72 resCh: make(chan nbs.CompressedChunk), 73 74 abortCh: make(chan struct{}), 75 stats: StatsFactory(), 76 } 77 78 locsReqCh := make(chan *remotesapi.GetDownloadLocsRequest) 79 downloadLocCh := make(chan []*remotesapi.DownloadLoc) 80 locDoneCh := make(chan struct{}) 81 fetchReqCh := make(chan fetchReq) 82 83 eg.Go(func() error { 84 return fetcherHashSetToGetDlLocsReqsThread(ctx, ret.toGetCh, ret.abortCh, locsReqCh, getLocsBatchSize, dcs.repoPath, dcs.getRepoId) 85 }) 86 eg.Go(func() error { 87 return fetcherRPCDownloadLocsThread(ctx, locsReqCh, downloadLocCh, dcs.csClient, func(s string) { dcs.repoToken.Store(s) }, ret.resCh, dcs.host) 88 }) 89 eg.Go(func() error { 90 return fetcherDownloadRangesThread(ctx, downloadLocCh, fetchReqCh, locDoneCh) 91 }) 92 eg.Go(func() error { 93 return fetcherDownloadURLThreads(ctx, fetchReqCh, locDoneCh, ret.resCh, dcs.csClient, ret.stats, dcs.httpFetcher, dcs.params) 94 }) 95 96 return ret 97 } 98 99 // Implements nbs.ChunkFetcher. Request the contents the chunks the given 100 // |hashes|. They will be delivered through |Recv|. Returns an error if this 101 // ChunkFetcher is terminally failed or if the supplied |ctx| is |Done|. 102 func (f *ChunkFetcher) Get(ctx context.Context, hashes hash.HashSet) error { 103 select { 104 case <-ctx.Done(): 105 return context.Cause(ctx) 106 case <-f.egCtx.Done(): 107 return context.Cause(f.egCtx) 108 case f.toGetCh <- hashes: 109 return nil 110 } 111 } 112 113 // Imeplements nbs.ChunkFetcher. Indicates that no further hashes will be 114 // requested through |Get|. |Recv| will only return |io.EOF| after this is 115 // called. 116 func (f *ChunkFetcher) CloseSend() error { 117 close(f.toGetCh) 118 return nil 119 } 120 121 // Implements nbs.ChunkFetcher. Returns the next available 122 // |nbs.CompressedChunk| whose contents have been fetched after being requested 123 // by |Get|. Returns |io.EOF| after |CloseSend| is called and all requested 124 // chunks have been successfully received. Returns an error if this 125 // |ChunkFetcher| is terminally failed or if the supplied |ctx| is |Done|. 126 func (f *ChunkFetcher) Recv(ctx context.Context) (nbs.CompressedChunk, error) { 127 select { 128 case <-ctx.Done(): 129 return nbs.CompressedChunk{}, context.Cause(ctx) 130 case <-f.egCtx.Done(): 131 return nbs.CompressedChunk{}, context.Cause(f.egCtx) 132 case cc, ok := <-f.resCh: 133 if !ok { 134 return nbs.CompressedChunk{}, io.EOF 135 } 136 return cc, nil 137 } 138 } 139 140 // Implements nbs.ChunkFetcher. Makes sure all resources associated with this 141 // ChunkFetcher are released, and returns any errors encountered while fetching 142 // requested chunks. This may return a non-|nil| error if the |ChunkFetcher| is 143 // |Close|d before it has delivered |io.EOF| on a |Recv| call, but that is not 144 // guaranteed. The correct way to guarantee everything has been received 145 // without error is to read |Recv| until it returns |io.EOF|, and then to 146 // |Close| the |ChunkFetcher|. 147 func (f *ChunkFetcher) Close() error { 148 defer StatsFlusher(f.stats) 149 close(f.abortCh) 150 return f.eg.Wait() 151 } 152 153 // Reads HashSets from reqCh and batches all the received addresses 154 // into |GetDownloadLocsRequest| messages with up to |batchSize| chunk hashes 155 // in them. It delivers the batched messages to |resCh|. 156 func fetcherHashSetToGetDlLocsReqsThread(ctx context.Context, reqCh chan hash.HashSet, abortCh chan struct{}, resCh chan *remotesapi.GetDownloadLocsRequest, batchSize int, repoPath string, idFunc func() (*remotesapi.RepoId, string)) error { 157 // This is the buffer of received that we haven't sent to |resCh| yet. 158 var addrs [][]byte 159 // This is the current slice we're trying to send in a 160 // |GetDownloadLocsRequest|. After we send it successfully, we will 161 // need to allocate a new one for the next message, but we can reuse 162 // its memory when we fail to send on |resCh| to form the next download 163 // request we try to send. 164 var outbound [][]byte 165 for { 166 if reqCh == nil && len(addrs) == 0 { 167 close(resCh) 168 return nil 169 } 170 171 var thisResCh chan *remotesapi.GetDownloadLocsRequest 172 var thisRes *remotesapi.GetDownloadLocsRequest 173 174 // Each time through the loop, we build a new 175 // |GetDownloadLocsRequest| to send. It contains up to 176 // |batchSize| hashes from the end of |addrs|. If we 177 // successfully send it, then we will drop those addresses from 178 // the end of |addrs|. 179 if len(addrs) > 0 { 180 end := len(addrs) 181 st := end - batchSize 182 if st < 0 { 183 st = 0 184 } 185 if outbound == nil { 186 outbound = make([][]byte, end-st) 187 } 188 outbound = append(outbound[:0], addrs[st:end]...) 189 id, token := idFunc() 190 thisRes = &remotesapi.GetDownloadLocsRequest{RepoId: id, RepoPath: repoPath, RepoToken: token, ChunkHashes: outbound[:]} 191 thisResCh = resCh 192 } 193 194 select { 195 case hs, ok := <-reqCh: 196 if !ok { 197 reqCh = nil 198 break 199 } 200 for h := range hs { 201 h := h 202 addrs = append(addrs, h[:]) 203 } 204 case thisResCh <- thisRes: 205 addrs = addrs[:len(addrs)-len(thisRes.ChunkHashes)] 206 outbound = nil 207 case <-ctx.Done(): 208 return context.Cause(ctx) 209 case <-abortCh: 210 return errors.New("early shutdown before all chunks fetched") 211 } 212 } 213 } 214 215 // Reads request messages off |reqCh| and sends them to a streaming RPC to turn 216 // them into download locations, which it delivers to |resCh|. 217 // 218 // On success, exactly one slice will be delivered on |resCh| for every message 219 // delivered in |reqCh|, and they will be delivered in order. 220 // 221 // This function handles backoff and retries for the underlying streaming RPC. 222 func fetcherRPCDownloadLocsThread(ctx context.Context, reqCh chan *remotesapi.GetDownloadLocsRequest, resCh chan []*remotesapi.DownloadLoc, client remotesapi.ChunkStoreServiceClient, storeRepoToken func(string), missingChunkCh chan nbs.CompressedChunk, host string) error { 223 stream, err := reliable.MakeCall[*remotesapi.GetDownloadLocsRequest, *remotesapi.GetDownloadLocsResponse]( 224 ctx, 225 reliable.CallOptions[*remotesapi.GetDownloadLocsRequest, *remotesapi.GetDownloadLocsResponse]{ 226 Open: func(ctx context.Context, opts ...grpc.CallOption) (reliable.ClientStream[*remotesapi.GetDownloadLocsRequest, *remotesapi.GetDownloadLocsResponse], error) { 227 return client.StreamDownloadLocations(ctx, opts...) 228 }, 229 ErrF: processGrpcErr, 230 BackOffF: grpcBackOff, 231 ReadRequestTimeout: reliableCallReadRequestTimeout, 232 DeliverRespTimeout: reliableCallDeliverRespTimeout, 233 }, 234 ) 235 if err != nil { 236 return err 237 } 238 239 eg, ctx := errgroup.WithContext(ctx) 240 eg.Go(func() error { 241 for { 242 select { 243 case req, ok := <-reqCh: 244 if !ok { 245 return stream.CloseSend() 246 } 247 err := stream.Send(req) 248 if err != nil { 249 return NewRpcError(err, "StreamDownloadLocations", host, req) 250 } 251 case <-ctx.Done(): 252 return context.Cause(ctx) 253 } 254 } 255 }) 256 eg.Go(func() error { 257 for { 258 resp, err := stream.Recv() 259 if err == io.EOF { 260 close(resCh) 261 return nil 262 } 263 if err != nil { 264 return NewRpcError(err, "StreamDownloadLocations", host, stream.AssociatedReq()) 265 } 266 if resp.RepoToken != "" { 267 storeRepoToken(resp.RepoToken) 268 } 269 270 // Compute this before we pass resp.Locs along, since the next thread will own resp.Locs after we send it. 271 if missingChunkCh != nil { 272 req := stream.AssociatedReq() 273 missing, err := getMissingChunks(req, resp) 274 if err != nil { 275 return err 276 } 277 for h := range missing { 278 select { 279 case missingChunkCh <- nbs.CompressedChunk{H: h}: 280 case <-ctx.Done(): 281 return context.Cause(ctx) 282 } 283 } 284 } 285 286 select { 287 case resCh <- resp.Locs: 288 case <-ctx.Done(): 289 return context.Cause(ctx) 290 } 291 } 292 }) 293 return eg.Wait() 294 } 295 296 func getMissingChunks(req *remotesapi.GetDownloadLocsRequest, resp *remotesapi.GetDownloadLocsResponse) (hash.HashSet, error) { 297 numRequested := len(req.ChunkHashes) 298 numResponded := 0 299 for _, loc := range resp.Locs { 300 hgr := loc.Location.(*remotesapi.DownloadLoc_HttpGetRange).HttpGetRange 301 numResponded += len(hgr.Ranges) 302 } 303 if numResponded > numRequested { 304 return nil, errors.New("possible internal error: server responded with more chunks than we asked for in StreamDownloadLocations") 305 } 306 if numResponded == numRequested { 307 // XXX: We assume it's the same chunks and that the server is well behaved. 308 return nil, nil 309 } 310 requested := make(hash.HashSet, numRequested) 311 for _, ch := range req.ChunkHashes { 312 var h hash.Hash 313 copy(h[:], ch) 314 requested.Insert(h) 315 } 316 for _, loc := range resp.Locs { 317 hgr := loc.Location.(*remotesapi.DownloadLoc_HttpGetRange).HttpGetRange 318 for _, rc := range hgr.Ranges { 319 var h hash.Hash 320 copy(h[:], rc.Hash) 321 requested.Remove(h) 322 } 323 } 324 return requested, nil 325 } 326 327 type fetchResp struct { 328 get *GetRange 329 refresh func(ctx context.Context, err error, client remotesapi.ChunkStoreServiceClient) (string, error) 330 } 331 332 type fetchReq struct { 333 respCh chan fetchResp 334 cancelCh chan struct{} 335 } 336 337 // A simple structure to keep track of *GetRange requests along with 338 // |locationRefreshes| for the URL paths we have seen. 339 type downloads struct { 340 ranges *ranges.Tree 341 refreshes map[string]*locationRefresh 342 } 343 344 func newDownloads() downloads { 345 return downloads{ 346 ranges: ranges.NewTree(chunkAggDistance), 347 refreshes: make(map[string]*locationRefresh), 348 } 349 } 350 351 func (d downloads) Add(resp *remotesapi.DownloadLoc) { 352 gr := (*GetRange)(resp.Location.(*remotesapi.DownloadLoc_HttpGetRange).HttpGetRange) 353 path := gr.ResourcePath() 354 if v, ok := d.refreshes[path]; ok { 355 v.Add(resp) 356 } else { 357 refresh := new(locationRefresh) 358 refresh.Add(resp) 359 d.refreshes[path] = refresh 360 } 361 for _, r := range gr.Ranges { 362 d.ranges.Insert(gr.Url, r.Hash, r.Offset, r.Length) 363 } 364 } 365 366 func toGetRange(rs []*ranges.GetRange) *GetRange { 367 ret := new(GetRange) 368 for _, r := range rs { 369 ret.Url = r.Url 370 ret.Ranges = append(ret.Ranges, &remotesapi.RangeChunk{ 371 Hash: r.Hash, 372 Offset: r.Offset, 373 Length: r.Length, 374 }) 375 } 376 return ret 377 } 378 379 // Reads off |locCh| and assembles DownloadLocs into download ranges. 380 func fetcherDownloadRangesThread(ctx context.Context, locCh chan []*remotesapi.DownloadLoc, fetchReqCh chan fetchReq, doneCh chan struct{}) error { 381 downloads := newDownloads() 382 pending := make([]fetchReq, 0) 383 var toSend *GetRange 384 for { 385 // pending is our slice of request threads that showed up 386 // asking for a download. We range through it and try to send 387 // them any work we have available. 388 for j := range pending { 389 // |toSend| could have come from a previous iteration 390 // of this loop or the outer loop. If it's |nil|, we 391 // can get the next range to download from 392 // |downlaods.ranges|. 393 if toSend == nil { 394 max := downloads.ranges.DeleteMaxRegion() 395 if len(max) == 0 { 396 break 397 } 398 toSend = toGetRange(max) 399 } 400 path := toSend.ResourcePath() 401 refresh := downloads.refreshes[path] 402 403 resp := fetchResp{ 404 get: toSend, 405 refresh: func(ctx context.Context, err error, client remotesapi.ChunkStoreServiceClient) (string, error) { 406 return refresh.GetURL(ctx, err, client) 407 }, 408 } 409 410 select { 411 case pending[j].respCh <- resp: 412 toSend = nil 413 case <-pending[j].cancelCh: 414 // Because of dynamic thread pool sizing, a 415 // request thread could have been canceled and 416 // it has now gone away. If this happens, its 417 // respCh will be set to |nil| below and we 418 // will remove it from our |pending| set. But 419 // we need to hold onto |toSend| so that we do 420 // send it to a request thread eventually. 421 case <-ctx.Done(): 422 return context.Cause(ctx) 423 } 424 425 pending[j].respCh = nil 426 } 427 428 // Remove anything from |pending| that was actually delivered 429 // to. We use |respCh == nil| to indicate that the above loop 430 // delivered to the download thread. 431 newpending := make([]fetchReq, 0) 432 for i := range pending { 433 if pending[i].respCh != nil { 434 newpending = append(newpending, pending[i]) 435 } 436 } 437 pending = newpending 438 439 // Once |locCh| closes, we set |locCh| to nil. If |locCh| is 440 // nil and our ranges Tree is empty, then we have delivered 441 // every download we will ever see to a download thread. We can 442 // close |doneCh| and return nil. 443 if locCh == nil && downloads.ranges.Len() == 0 { 444 close(doneCh) 445 return nil 446 } 447 448 select { 449 case req, ok := <-locCh: 450 if !ok { 451 locCh = nil 452 } else { 453 for _, loc := range req { 454 downloads.Add(loc) 455 } 456 } 457 case req := <-fetchReqCh: 458 pending = append(pending, req) 459 case <-ctx.Done(): 460 return context.Cause(ctx) 461 } 462 } 463 } 464 465 type ConcurrencyControl struct { 466 MaxConcurrency int 467 468 failures atomic.Int64 469 successes atomic.Int64 470 } 471 472 func (cc *ConcurrencyControl) RecordSuccess() { 473 cc.successes.Add(1) 474 } 475 476 func (cc *ConcurrencyControl) RecordFailure() { 477 cc.failures.Add(1) 478 } 479 480 type SizeSetter interface { 481 SetSize(int) 482 } 483 484 // This does additive increase, multiplicative decrease on calls to |SetSize|, 485 // reading successes and failures from calls to |RecordSuccess| and 486 // |RecordFailure|. If there have been any faliures in the last update window, 487 // it will call |SetSize| with a new size that's 1/2 the current size. If there 488 // have been no faliures in the last update window, but there has been at least 489 // one success, it will call |SetSize| with a size 1 greater than the current 490 // size. Will not scale size greater than |MaxConcurrency|. 491 func (cc *ConcurrencyControl) Run(ctx context.Context, done <-chan struct{}, ss SizeSetter, sz int) error { 492 var justDecreased bool 493 const ( 494 defaultConcurrencyAdjustmentDuration = 500 * time.Millisecond 495 backoffConcurrentAdjustmentDuration = 5 * time.Second 496 ) 497 next := defaultConcurrencyAdjustmentDuration 498 var lastS int64 499 for { 500 select { 501 case <-time.After(next): 502 f := cc.failures.Load() 503 if f > 0 && !justDecreased { 504 sz = (sz + 1) / 2 505 ss.SetSize(sz) 506 justDecreased = true 507 next = backoffConcurrentAdjustmentDuration 508 } else { 509 next = defaultConcurrencyAdjustmentDuration 510 s := cc.successes.Load() 511 if f == 0 && s > lastS && sz < cc.MaxConcurrency { 512 sz += 1 513 ss.SetSize(sz) 514 lastS = s 515 } 516 cc.failures.Store(0) 517 justDecreased = false 518 } 519 case <-done: 520 return nil 521 case <-ctx.Done(): 522 return nil 523 } 524 } 525 } 526 527 func fetcherDownloadURLThreads(ctx context.Context, fetchReqCh chan fetchReq, doneCh chan struct{}, chunkCh chan nbs.CompressedChunk, client remotesapi.ChunkStoreServiceClient, stats StatsRecorder, fetcher HTTPFetcher, params NetworkRequestParams) error { 528 eg, ctx := errgroup.WithContext(ctx) 529 cc := &ConcurrencyControl{ 530 MaxConcurrency: params.MaximumConcurrentDownloads, 531 } 532 f := func(ctx context.Context, shutdownCh <-chan struct{}) error { 533 return fetcherDownloadURLThread(ctx, fetchReqCh, shutdownCh, chunkCh, client, stats, cc, fetcher, params) 534 } 535 threads := pool.NewDynamic(ctx, f, params.StartingConcurrentDownloads) 536 eg.Go(func() error { 537 return threads.Run() 538 }) 539 eg.Go(func() error { 540 select { 541 case <-doneCh: 542 threads.Close() 543 case <-ctx.Done(): 544 threads.Close() 545 } 546 return nil 547 }) 548 eg.Go(func() error { 549 return cc.Run(ctx, doneCh, threads, params.StartingConcurrentDownloads) 550 }) 551 err := eg.Wait() 552 if err != nil { 553 return err 554 } 555 close(chunkCh) 556 return nil 557 } 558 559 func fetcherDownloadURLThread(ctx context.Context, fetchReqCh chan fetchReq, doneCh <-chan struct{}, chunkCh chan nbs.CompressedChunk, client remotesapi.ChunkStoreServiceClient, stats StatsRecorder, health reliable.HealthRecorder, fetcher HTTPFetcher, params NetworkRequestParams) error { 560 respCh := make(chan fetchResp) 561 cancelCh := make(chan struct{}) 562 for { 563 select { 564 case <-ctx.Done(): 565 return context.Cause(ctx) 566 case <-doneCh: 567 return nil 568 case fetchReqCh <- fetchReq{respCh: respCh, cancelCh: cancelCh}: 569 select { 570 case <-doneCh: 571 close(cancelCh) 572 return nil 573 case <-ctx.Done(): 574 return context.Cause(ctx) 575 case fetchResp := <-respCh: 576 f := fetchResp.get.GetDownloadFunc(ctx, stats, health, fetcher, params, chunkCh, func(ctx context.Context, lastError error, resourcePath string) (string, error) { 577 return fetchResp.refresh(ctx, lastError, client) 578 }) 579 err := f() 580 if err != nil { 581 return err 582 } 583 } 584 } 585 } 586 }