// Copyright 2019 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package remotestorage

import (
	"bytes"
	"context"
	"crypto/md5"
	"encoding/base64"
	"errors"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"sort"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/cenkalti/backoff"
	"github.com/opentracing/opentracing-go"
	"golang.org/x/sync/errgroup"

	remotesapi "github.com/dolthub/dolt/go/gen/proto/dolt/services/remotesapi/v1alpha1"
	"github.com/dolthub/dolt/go/libraries/utils/iohelp"
	"github.com/dolthub/dolt/go/libraries/utils/tracing"
	"github.com/dolthub/dolt/go/store/atomicerr"
	"github.com/dolthub/dolt/go/store/chunks"
	"github.com/dolthub/dolt/go/store/datas"
	"github.com/dolthub/dolt/go/store/hash"
	"github.com/dolthub/dolt/go/store/nbs"
	"github.com/dolthub/dolt/go/store/types"
)

var DownloadHedger *Hedger

func init() {
	// TODO: This does not necessarily respond well to changes in network
	// conditions during the program's runtime.
	DownloadHedger = NewHedger(
		8,
		NewMinStrategy(
			1*time.Second,
			NewPercentileStrategy(0, 1*time.Hour, 95.0),
		),
	)
}

var ErrUploadFailed = errors.New("upload failed")
var ErrInvalidDoltSpecPath = errors.New("invalid dolt spec path")

var globalHttpFetcher HTTPFetcher = &http.Client{}

var _ nbs.TableFileStore = (*DoltChunkStore)(nil)
var _ datas.NBSCompressedChunkStore = (*DoltChunkStore)(nil)
var _ chunks.ChunkStore = (*DoltChunkStore)(nil)

// We may need this to be configurable for users with really bad internet
var downThroughputCheck = iohelp.MinThroughputCheckParams{
	MinBytesPerSec: 1024,
	CheckInterval:  1 * time.Second,
	NumIntervals:   5,
}

const (
	downRetryCount   = 5
	uploadRetryCount = 5
)

var uploadRetryParams = backoff.NewExponentialBackOff()
var downRetryParams = backoff.NewExponentialBackOff()

func init() {
	uploadRetryParams.MaxInterval = 5 * time.Second

	downRetryParams.MaxInterval = 5 * time.Second
}
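// A sketch of the retry cadence these parameters produce. The defaults for
// backoff.NewExponentialBackOff (an assumption about the cenkalti/backoff
// library's documented defaults: ~500ms initial interval, 1.5x multiplier,
// randomization) combined with the 5s MaxInterval set above mean roughly:
//
//	// attempt 1 fails -> wait ~500ms
//	// attempt 2 fails -> wait ~750ms
//	// attempt 3 fails -> wait ~1.1s
//	// ... intervals grow until capped at ~5s ...
//	// retries stop after downRetryCount / uploadRetryCount attempts
//	//   (enforced by backoff.WithMaxRetries at the call sites below)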
// Only hedge downloads of ranges < 4MB in length for now.
const HedgeDownloadSizeLimit = 4 * 1024 * 1024

type HTTPFetcher interface {
	Do(req *http.Request) (*http.Response, error)
}

type ConcurrencyParams struct {
	ConcurrentSmallFetches int
	ConcurrentLargeFetches int
	LargeFetchSize         int
}

type DoltChunkStore struct {
	org         string
	repoName    string
	host        string
	csClient    remotesapi.ChunkStoreServiceClient
	cache       ChunkCache
	metadata    *remotesapi.GetRepoMetadataResponse
	nbf         *types.NomsBinFormat
	httpFetcher HTTPFetcher
	concurrency ConcurrencyParams
	stats       cacheStats
}

func NewDoltChunkStoreFromPath(ctx context.Context, nbf *types.NomsBinFormat, path, host string, csClient remotesapi.ChunkStoreServiceClient) (*DoltChunkStore, error) {
	tokens := strings.Split(strings.Trim(path, "/"), "/")
	if len(tokens) != 2 {
		return nil, ErrInvalidDoltSpecPath
	}

	// TODO: this may just be a dolthub thing. Need to revisit how we do this.
	org := tokens[0]
	repoName := tokens[1]

	return NewDoltChunkStore(ctx, nbf, org, repoName, host, csClient)
}

func NewDoltChunkStore(ctx context.Context, nbf *types.NomsBinFormat, org, repoName, host string, csClient remotesapi.ChunkStoreServiceClient) (*DoltChunkStore, error) {
	metadata, err := csClient.GetRepoMetadata(ctx, &remotesapi.GetRepoMetadataRequest{
		RepoId: &remotesapi.RepoId{
			Org:      org,
			RepoName: repoName,
		},
		ClientRepoFormat: &remotesapi.ClientRepoFormat{
			NbfVersion: nbf.VersionString(),
			NbsVersion: nbs.StorageVersion,
		},
	})

	if err != nil {
		return nil, err
	}

	return &DoltChunkStore{org, repoName, host, csClient, newMapChunkCache(), metadata, nbf, globalHttpFetcher, defaultConcurrency, cacheStats{}}, nil
}

func (dcs *DoltChunkStore) WithHTTPFetcher(fetcher HTTPFetcher) *DoltChunkStore {
	return &DoltChunkStore{dcs.org, dcs.repoName, dcs.host, dcs.csClient, dcs.cache, dcs.metadata, dcs.nbf, fetcher, dcs.concurrency, dcs.stats}
}

func (dcs *DoltChunkStore) WithNoopChunkCache() *DoltChunkStore {
	return &DoltChunkStore{dcs.org, dcs.repoName, dcs.host, dcs.csClient, noopChunkCache, dcs.metadata, dcs.nbf, dcs.httpFetcher, dcs.concurrency, dcs.stats}
}

func (dcs *DoltChunkStore) WithChunkCache(cache ChunkCache) *DoltChunkStore {
	return &DoltChunkStore{dcs.org, dcs.repoName, dcs.host, dcs.csClient, cache, dcs.metadata, dcs.nbf, dcs.httpFetcher, dcs.concurrency, dcs.stats}
}

func (dcs *DoltChunkStore) WithDownloadConcurrency(concurrency ConcurrencyParams) *DoltChunkStore {
	return &DoltChunkStore{dcs.org, dcs.repoName, dcs.host, dcs.csClient, dcs.cache, dcs.metadata, dcs.nbf, dcs.httpFetcher, concurrency, dcs.stats}
}

func (dcs *DoltChunkStore) getRepoId() *remotesapi.RepoId {
	return &remotesapi.RepoId{
		Org:      dcs.org,
		RepoName: dcs.repoName,
	}
}

type cacheStats struct {
	Hits uint32
}

func (s cacheStats) CacheHits() uint32 {
	return s.Hits
}

type CacheStats interface {
	CacheHits() uint32
}
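// Illustrative construction sketch. The gRPC client constructor below is the
// conventional protoc-generated one and |conn| is an assumed, already-dialed
// *grpc.ClientConn; neither is defined in this file:
//
//	client := remotesapi.NewChunkStoreServiceClient(conn)
//	cs, err := NewDoltChunkStoreFromPath(ctx, nbf, "/org/repo", "dolthub.com", client)
//	if err != nil {
//		// handle error
//	}
//	// e.g. for a one-shot clone where caching chunks would only cost memory:
//	cs = cs.WithNoopChunkCache()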
// Get returns the Chunk for the value of the hash |h| in the store. If the hash is absent from the store,
// chunks.EmptyChunk is returned.
func (dcs *DoltChunkStore) Get(ctx context.Context, h hash.Hash) (chunks.Chunk, error) {
	hashes := hash.HashSet{h: struct{}{}}
	var found *chunks.Chunk
	err := dcs.GetMany(ctx, hashes, func(c *chunks.Chunk) { found = c })
	if err != nil {
		return chunks.EmptyChunk, err
	}
	if found != nil {
		return *found, nil
	} else {
		return chunks.EmptyChunk, nil
	}
}

func (dcs *DoltChunkStore) GetMany(ctx context.Context, hashes hash.HashSet, found func(*chunks.Chunk)) error {
	ae := atomicerr.New()
	decompressedSize := uint64(0)
	err := dcs.GetManyCompressed(ctx, hashes, func(cc nbs.CompressedChunk) {
		if ae.IsSet() {
			return
		}
		c, err := cc.ToChunk()
		if ae.SetIfErrAndCheck(err) {
			return
		}
		atomic.AddUint64(&decompressedSize, uint64(len(c.Data())))
		found(&c)
	})
	if span := opentracing.SpanFromContext(ctx); span != nil {
		span.LogKV("decompressed_bytes", decompressedSize)
	}
	if err != nil {
		return err
	}
	if err = ae.Get(); err != nil {
		return err
	}
	return nil
}

// GetManyCompressed gets the compressed Chunks with |hashes| from the store. |found| is called once for each chunk
// as it is retrieved. Any hashes not present in the store are silently ignored.
func (dcs *DoltChunkStore) GetManyCompressed(ctx context.Context, hashes hash.HashSet, found func(nbs.CompressedChunk)) error {
	span, ctx := tracing.StartSpan(ctx, "remotestorage.GetManyCompressed")
	defer span.Finish()

	hashToChunk := dcs.cache.Get(hashes)

	span.LogKV("num_hashes", len(hashes), "cache_hits", len(hashToChunk))
	atomic.AddUint32(&dcs.stats.Hits, uint32(len(hashToChunk)))

	notCached := make([]hash.Hash, 0, len(hashes))
	for h := range hashes {
		c := hashToChunk[h]

		if c.IsEmpty() {
			notCached = append(notCached, h)
		} else {
			found(c)
		}
	}

	if len(notCached) > 0 {
		err := dcs.readChunksAndCache(ctx, hashes, notCached, found)

		if err != nil {
			return err
		}
	}

	return nil
}

const (
	getLocsBatchSize = 256
)

type GetRange remotesapi.HttpGetRange

func (gr *GetRange) ResourcePath() string {
	u, _ := url.Parse(gr.Url)
	return fmt.Sprintf("%s://%s%s", u.Scheme, u.Host, u.Path)
}
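// For ResourcePath above: a presigned URL such as (illustrative value only)
//
//	https://bucket.s3.amazonaws.com/tablefile?X-Amz-Signature=abc123
//
// yields the resource path
//
//	https://bucket.s3.amazonaws.com/tablefile
//
// so ranges against the same object are grouped together no matter which
// signed query string each download location carried.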
func (gr *GetRange) Append(other *GetRange) {
	gr.Url = other.Url
	gr.Ranges = append(gr.Ranges, other.Ranges...)
}

func (gr *GetRange) Sort() {
	sort.Slice(gr.Ranges, func(i, j int) bool {
		return gr.Ranges[i].Offset < gr.Ranges[j].Offset
	})
}

func (gr *GetRange) ChunkStartOffset(i int) uint64 {
	return gr.Ranges[i].Offset
}

func (gr *GetRange) ChunkEndOffset(i int) uint64 {
	return gr.Ranges[i].Offset + uint64(gr.Ranges[i].Length)
}

func (gr *GetRange) GapBetween(i, j int) uint64 {
	return gr.ChunkStartOffset(j) - gr.ChunkEndOffset(i)
}

func (gr *GetRange) SplitAtGaps(maxGapBytes uint64) []*GetRange {
	gr.Sort()
	res := make([]*GetRange, 0)
	i := 0
	for i < len(gr.Ranges) {
		j := i + 1
		for j < len(gr.Ranges) {
			if gr.GapBetween(j-1, j) > maxGapBytes {
				break
			}
			j++
		}
		res = append(res, &GetRange{Url: gr.Url, Ranges: gr.Ranges[i:j]})
		i = j
	}
	return res
}

func (gr *GetRange) NumChunks() int {
	return len(gr.Ranges)
}

func (gr *GetRange) RangeLen() uint64 {
	return gr.ChunkEndOffset(gr.NumChunks()-1) - gr.ChunkStartOffset(0)
}

func (gr *GetRange) NumBytesInRanges() uint64 {
	res := uint64(0)
	for i := 0; i < len(gr.Ranges); i++ {
		start, end := gr.ChunkByteRange(i)
		res += end - start
	}
	return res
}

func (gr *GetRange) ChunkByteRange(i int) (uint64, uint64) {
	start := gr.ChunkStartOffset(i) - gr.ChunkStartOffset(0)
	end := gr.ChunkEndOffset(i) - gr.ChunkStartOffset(0)
	return start, end
}

// sortRangesBySize sorts |ranges| from largest RangeLen to smallest.
func sortRangesBySize(ranges []*GetRange) {
	sort.Slice(ranges, func(i, j int) bool {
		return ranges[j].RangeLen() < ranges[i].RangeLen()
	})
}

type resourcePathToUrlFunc func(ctx context.Context, lastError error, resourcePath string) (url string, err error)
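// A worked sketch of the range math above, with hypothetical offsets and
// lengths: three chunks at offsets 0, 150 and 10_000, each 100 bytes long,
// split with maxGapBytes = 4096, produce two GetRanges. The first keeps the
// chunks at 0 and 150 (the 50-byte gap is under the limit); the chunk at
// 10_000 is split off (9_750-byte gap). For the first GetRange,
// RangeLen() == 250 (bytes requested, gap included) while
// NumBytesInRanges() == 200 (chunk bytes actually needed).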
func (gr *GetRange) GetDownloadFunc(ctx context.Context, stats StatsRecorder, fetcher HTTPFetcher, chunkChan chan nbs.CompressedChunk, pathToUrl resourcePathToUrlFunc) func() error {
	if len(gr.Ranges) == 0 {
		return func() error { return nil }
	}
	return func() error {
		urlF := func(lastError error) (string, error) {
			url, err := pathToUrl(ctx, lastError, gr.ResourcePath())
			if err != nil {
				return "", err
			}
			if url == "" {
				url = gr.Url
			}
			return url, nil
		}
		var comprData []byte
		var err error
		rangeLen := gr.RangeLen()
		if rangeLen > HedgeDownloadSizeLimit {
			comprData, err = rangeDownloadWithRetries(ctx, stats, fetcher, gr.ChunkStartOffset(0), rangeLen, 1, urlF)
		} else {
			comprData, err = hedgedRangeDownloadWithRetries(ctx, stats, fetcher, gr.ChunkStartOffset(0), rangeLen, urlF)
		}
		if err != nil {
			return err
		}
		// Send the chunk for each range included in GetRange.
		for i := 0; i < len(gr.Ranges); i++ {
			s, e := gr.ChunkByteRange(i)
			cmpChnk, err := nbs.NewCompressedChunk(hash.New(gr.Ranges[i].Hash), comprData[s:e])
			if err != nil {
				return err
			}
			select {
			case chunkChan <- cmpChnk:
			case <-ctx.Done():
				return ctx.Err()
			}
		}
		return nil
	}
}

type locationRefresh struct {
	RefreshAfter   time.Time
	RefreshRequest *remotesapi.RefreshTableFileUrlRequest
	URL            string
	lastRefresh    time.Time
	mu             *sync.Mutex
}

func (r *locationRefresh) Add(resp *remotesapi.DownloadLoc) {
	if r.URL == "" {
		r.URL = resp.Location.(*remotesapi.DownloadLoc_HttpGetRange).HttpGetRange.Url
	}
	if resp.RefreshAfter == nil {
		return
	}
	respTime := resp.RefreshAfter.AsTime()
	if (r.RefreshAfter == time.Time{}) || respTime.After(r.RefreshAfter) {
		r.RefreshAfter = resp.RefreshAfter.AsTime()
		r.RefreshRequest = resp.RefreshRequest
		r.URL = resp.Location.(*remotesapi.DownloadLoc_HttpGetRange).HttpGetRange.Url
	}
}

var refreshTableFileURLRetryDuration = 5 * time.Second

func (r *locationRefresh) GetURL(ctx context.Context, lastError error, client remotesapi.ChunkStoreServiceClient) (string, error) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.RefreshRequest != nil {
		now := time.Now()
		wantsRefresh := now.After(r.RefreshAfter) || errors.Is(lastError, HttpError)
		canRefresh := time.Since(r.lastRefresh) > refreshTableFileURLRetryDuration
		if wantsRefresh && canRefresh {
			resp, err := client.RefreshTableFileUrl(ctx, r.RefreshRequest)
			if err != nil {
				return r.URL, err
			}
			r.RefreshAfter = resp.RefreshAfter.AsTime()
			r.URL = resp.Url
			r.lastRefresh = now
		}
	}
	return r.URL, nil
}

type dlLocations struct {
	ranges    map[string]*GetRange
	refreshes map[string]*locationRefresh
}

func newDlLocations() dlLocations {
	return dlLocations{
		ranges:    make(map[string]*GetRange),
		refreshes: make(map[string]*locationRefresh),
	}
}

func (l *dlLocations) Add(resp *remotesapi.DownloadLoc) {
	gr := (*GetRange)(resp.Location.(*remotesapi.DownloadLoc_HttpGetRange).HttpGetRange)
	path := gr.ResourcePath()
	if v, ok := l.ranges[path]; ok {
		v.Append(gr)
		l.refreshes[path].Add(resp)
	} else {
		l.ranges[path] = gr
		refresh := &locationRefresh{mu: new(sync.Mutex)}
		refresh.Add(resp)
		l.refreshes[path] = refresh
	}
}
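// Sketch of the grouping behavior above: two DownloadLocs whose signed URLs
// differ only in query string share a ResourcePath, so their byte ranges are
// appended into a single *GetRange guarded by one locationRefresh
// (|locA| and |locB| are hypothetical responses):
//
//	locs := newDlLocations()
//	locs.Add(locA) // https://host/file?sig=1, one range
//	locs.Add(locB) // https://host/file?sig=2, one range
//	// len(locs.ranges) == 1
//	// locs.ranges["https://host/file"].NumChunks() == 2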
func (dcs *DoltChunkStore) getDLLocs(ctx context.Context, hashes []hash.Hash) (dlLocations, error) {
	span, ctx := tracing.StartSpan(ctx, "remotestorage.getDLLocs")
	span.LogKV("num_hashes", len(hashes))
	defer span.Finish()

	res := newDlLocations()

	// channel for receiving results from goroutines making gRPC calls to get download locations for chunks
	resCh := make(chan []*remotesapi.DownloadLoc)

	eg, ctx := errgroup.WithContext(ctx)

	// goroutine that receives the results of the gRPC calls and aggregates them into |res|
	eg.Go(func() error {
		for {
			select {
			case locs, ok := <-resCh:
				if !ok {
					return nil
				}
				for _, loc := range locs {
					res.Add(loc)
				}
			case <-ctx.Done():
				return ctx.Err()
			}
		}
	})

	// goroutine for batching the get location requests, streaming the requests and streaming the responses.
	eg.Go(func() error {
		var reqs []*remotesapi.GetDownloadLocsRequest
		hashesBytes := HashesToSlices(hashes)
		batchItr(len(hashesBytes), getLocsBatchSize, func(st, end int) (stop bool) {
			batch := hashesBytes[st:end]
			req := &remotesapi.GetDownloadLocsRequest{RepoId: dcs.getRepoId(), ChunkHashes: batch}
			reqs = append(reqs, req)
			return false
		})
		op := func() error {
			seg, ctx := errgroup.WithContext(ctx)
			stream, err := dcs.csClient.StreamDownloadLocations(ctx)
			if err != nil {
				return NewRpcError(err, "StreamDownloadLocations", dcs.host, nil)
			}
			completedReqs := 0
			// Write requests
			seg.Go(func() error {
				for i := range reqs {
					if err := stream.Send(reqs[i]); err != nil {
						return NewRpcError(err, "StreamDownloadLocations", dcs.host, reqs[i])
					}
				}
				return stream.CloseSend()
			})
			// Read responses
			seg.Go(func() error {
				for {
					resp, err := stream.Recv()
					if err != nil {
						if err == io.EOF {
							return nil
						}
						return NewRpcError(err, "StreamDownloadLocations", dcs.host, reqs[completedReqs])
					}
					select {
					case resCh <- resp.Locs:
						completedReqs += 1
					case <-ctx.Done():
						return ctx.Err()
					}
				}
			})
			err = seg.Wait()
			reqs = reqs[completedReqs:]
			if len(reqs) == 0 {
				close(resCh)
			}
			return processGrpcErr(err)
		}
		return backoff.Retry(op, backoff.WithMaxRetries(csRetryParams, csClientRetries))
	})

	if err := eg.Wait(); err != nil {
		return dlLocations{}, err
	}
	return res, nil
}

func (dcs *DoltChunkStore) readChunksAndCache(ctx context.Context, hashes hash.HashSet, notCached []hash.Hash, found func(nbs.CompressedChunk)) error {
	// get the locations where the chunks can be downloaded from
	dlLocs, err := dcs.getDLLocs(ctx, notCached)
	if err != nil {
		return err
	}

	var wg sync.WaitGroup

	// channel to receive chunks on
	chunkChan := make(chan nbs.CompressedChunk, 128)

	toSend := make(map[hash.Hash]struct{}, len(notCached))
	for _, h := range notCached {
		toSend[h] = struct{}{}
	}

	// start a goroutine to receive and cache the downloaded chunks
	wg.Add(1)
	go func() {
		defer wg.Done()
		for chunk := range chunkChan {
			dcs.cache.PutChunk(chunk)

			h := chunk.Hash()

			if _, send := toSend[h]; send {
				found(chunk)
			}
		}
	}()

	// download the chunks and close the channel after
	func() {
		defer close(chunkChan)
		err = dcs.downloadChunks(ctx, dlLocs, chunkChan)
	}()

	// wait for all the results to finish processing
	wg.Wait()

	if err != nil {
		return err
	}

	return nil
}

// Has returns true iff the value at the address |h| is contained in the store.
func (dcs *DoltChunkStore) Has(ctx context.Context, h hash.Hash) (bool, error) {
	hashes := hash.HashSet{h: struct{}{}}
	absent, err := dcs.HasMany(ctx, hashes)

	if err != nil {
		return false, err
	}

	return len(absent) == 0, nil
}

const maxHasManyBatchSize = 16 * 1024
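// Sketch of the batching arithmetic used by HasMany below (batchItr is
// defined elsewhere in this package; the counts are illustrative): with
// 40,000 not-yet-cached hashes and maxHasManyBatchSize of 16*1024, three
// HasChunks RPCs are issued, covering index ranges [0, 16384), [16384, 32768)
// and [32768, 40000).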
// HasMany returns a new HashSet containing any members of |hashes| that are
// absent from the store.
func (dcs *DoltChunkStore) HasMany(ctx context.Context, hashes hash.HashSet) (hash.HashSet, error) {
	// get the set of hashes that isn't already in the cache
	notCached := dcs.cache.Has(hashes)

	if len(notCached) == 0 {
		return notCached, nil
	}

	// convert the set to a slice of hashes and a corresponding slice of the byte encoding for those hashes
	hashSl, byteSl := HashSetToSlices(notCached)

	absent := make(hash.HashSet)
	var found []nbs.CompressedChunk
	var err error

	batchItr(len(hashSl), maxHasManyBatchSize, func(st, end int) (stop bool) {
		// slice the slices into a batch of hashes
		currHashSl := hashSl[st:end]
		currByteSl := byteSl[st:end]

		// send a request to the remote api to determine which chunks the remote api already has
		req := &remotesapi.HasChunksRequest{RepoId: dcs.getRepoId(), Hashes: currByteSl}
		resp, rpcErr := dcs.csClient.HasChunks(ctx, req)

		if rpcErr != nil {
			// assign to the captured |err| (declaring a new err here would shadow it and lose the error)
			err = NewRpcError(rpcErr, "HasMany", dcs.host, req)
			return true
		}

		numAbsent := len(resp.Absent)
		sort.Slice(resp.Absent, func(i, j int) bool {
			return resp.Absent[i] < resp.Absent[j]
		})

		// loop over every hash in the current batch, and if they are absent from the remote host add them to the
		// absent set, otherwise append them to the found slice
		for i, j := 0, 0; i < len(currHashSl); i++ {
			currHash := currHashSl[i]

			nextAbsent := -1
			if j < numAbsent {
				nextAbsent = int(resp.Absent[j])
			}

			if i == nextAbsent {
				absent[currHash] = struct{}{}
				j++
			} else {
				c := nbs.ChunkToCompressedChunk(chunks.NewChunkWithHash(currHash, []byte{}))
				found = append(found, c)
			}
		}

		return false
	})

	if err != nil {
		return nil, err
	}

	if len(found)+len(absent) != len(notCached) {
		panic("not all chunks were accounted for")
	}

	if len(found) > 0 {
		dcs.cache.Put(found)
	}

	return absent, nil
}

// Put caches c. Upon return, c must be visible to
// subsequent Get and Has calls, but must not be persistent until a call
// to Flush(). Put may be called concurrently with other calls to Put(),
// Get(), GetMany(), Has() and HasMany().
func (dcs *DoltChunkStore) Put(ctx context.Context, c chunks.Chunk) error {
	cc := nbs.ChunkToCompressedChunk(c)
	dcs.cache.Put([]nbs.CompressedChunk{cc})
	return nil
}

// Version returns the NomsVersion with which this ChunkStore is compatible.
func (dcs *DoltChunkStore) Version() string {
	return dcs.metadata.NbfVersion
}
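// Note the write path implied by Put above: chunks are only staged in the
// local cache and do not become durable until Commit (below) uploads them via
// uploadChunks. An illustrative caller-side sketch (|data|, |cur| and |last|
// are assumptions):
//
//	c := chunks.NewChunk(data)
//	_ = dcs.Put(ctx, c)                   // staged in memory only
//	ok, err := dcs.Commit(ctx, cur, last) // uploads staged chunks, then commits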
// Rebase brings this ChunkStore into sync with the persistent storage's
// current root.
func (dcs *DoltChunkStore) Rebase(ctx context.Context) error {
	req := &remotesapi.RebaseRequest{RepoId: dcs.getRepoId()}
	_, err := dcs.csClient.Rebase(ctx, req)

	if err != nil {
		return NewRpcError(err, "Rebase", dcs.host, req)
	}

	return dcs.refreshRepoMetadata(ctx)
}

func (dcs *DoltChunkStore) refreshRepoMetadata(ctx context.Context) error {
	mdReq := &remotesapi.GetRepoMetadataRequest{
		RepoId: &remotesapi.RepoId{
			Org:      dcs.org,
			RepoName: dcs.repoName,
		},
		ClientRepoFormat: &remotesapi.ClientRepoFormat{
			NbfVersion: dcs.nbf.VersionString(),
			NbsVersion: nbs.StorageVersion,
		},
	}
	metadata, err := dcs.csClient.GetRepoMetadata(ctx, mdReq)
	if err != nil {
		return NewRpcError(err, "GetRepoMetadata", dcs.host, mdReq)
	}
	dcs.metadata = metadata
	return nil
}

// Root returns the root of the database as of the time the ChunkStore
// was opened or the most recent call to Rebase.
func (dcs *DoltChunkStore) Root(ctx context.Context) (hash.Hash, error) {
	req := &remotesapi.RootRequest{RepoId: dcs.getRepoId()}
	resp, err := dcs.csClient.Root(ctx, req)

	if err != nil {
		return hash.Hash{}, NewRpcError(err, "Root", dcs.host, req)
	}

	return hash.New(resp.RootHash), nil
}

// Commit atomically attempts to persist all novel Chunks and update the
// persisted root hash from last to current (or keeps it the same).
// If last doesn't match the root in persistent storage, returns false.
func (dcs *DoltChunkStore) Commit(ctx context.Context, current, last hash.Hash) (bool, error) {
	hashToChunkCount, err := dcs.uploadChunks(ctx)

	if err != nil {
		return false, err
	}

	chnkTblInfo := make([]*remotesapi.ChunkTableInfo, 0, len(hashToChunkCount))
	for h, cnt := range hashToChunkCount {
		chnkTblInfo = append(chnkTblInfo, &remotesapi.ChunkTableInfo{Hash: h[:], ChunkCount: uint32(cnt)})
	}

	req := &remotesapi.CommitRequest{
		RepoId:         dcs.getRepoId(),
		Current:        current[:],
		Last:           last[:],
		ChunkTableInfo: chnkTblInfo,
		ClientRepoFormat: &remotesapi.ClientRepoFormat{
			NbfVersion: dcs.nbf.VersionString(),
			NbsVersion: nbs.StorageVersion,
		},
	}
	resp, err := dcs.csClient.Commit(ctx, req)
	if err != nil {
		return false, NewRpcError(err, "Commit", dcs.host, req)
	}

	return resp.Success, dcs.refreshRepoMetadata(ctx)
}

// Stats may return some kind of struct that reports statistics about the
// ChunkStore instance. The type is implementation-dependent, and impls
// may return nil
func (dcs *DoltChunkStore) Stats() interface{} {
	return cacheStats{atomic.LoadUint32(&dcs.stats.Hits)}
}

// StatsSummary may return a string containing summarized statistics for
// this ChunkStore. It must return "Unsupported" if this operation is not
// supported.
func (dcs *DoltChunkStore) StatsSummary() string {
	return fmt.Sprintf("CacheHits: %v", dcs.Stats().(CacheStats).CacheHits())
}
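// A caller-side sketch of the optimistic-concurrency loop that Root, Commit
// and Rebase above support (illustrative only; updateRoot is a hypothetical
// caller function that builds the new root value):
//
//	for {
//		last, err := dcs.Root(ctx)
//		if err != nil {
//			return err
//		}
//		current := updateRoot(last)
//		ok, err := dcs.Commit(ctx, current, last)
//		if err != nil {
//			return err
//		}
//		if ok {
//			break
//		}
//		// a concurrent writer moved the root; re-sync and try again
//		if err := dcs.Rebase(ctx); err != nil {
//			return err
//		}
//	}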
// Close tears down any resources in use by the implementation. After
// Close(), the ChunkStore may not be used again. It is NOT SAFE to call
// Close() concurrently with any other ChunkStore method; behavior is
// undefined and probably crashy.
func (dcs *DoltChunkStore) Close() error {
	return nil
}

// getting this working using the simplest approach first
func (dcs *DoltChunkStore) uploadChunks(ctx context.Context) (map[hash.Hash]int, error) {
	hashToChunk := dcs.cache.GetAndClearChunksToFlush()

	if len(hashToChunk) == 0 {
		return map[hash.Hash]int{}, nil
	}

	chnks := make([]chunks.Chunk, 0, len(hashToChunk))
	for _, chable := range hashToChunk {
		ch, err := chable.ToChunk()

		if err != nil {
			return nil, err
		}

		chnks = append(chnks, ch)
	}

	hashToCount := make(map[hash.Hash]int)
	hashToData := make(map[hash.Hash][]byte)
	hashToDetails := make(map[hash.Hash]*remotesapi.TableFileDetails)

	// structuring so this can be done as multiple files in the future.
	{
		name, data, err := nbs.WriteChunks(chnks)

		if err != nil {
			return map[hash.Hash]int{}, err
		}

		h := hash.Parse(name)
		hashToData[h] = data
		hashToCount[h] = len(chnks)

		md5Bytes := md5.Sum(data)
		hashToDetails[h] = &remotesapi.TableFileDetails{
			Id:            h[:],
			ContentLength: uint64(len(data)),
			ContentHash:   md5Bytes[:],
		}
	}

	tfds := make([]*remotesapi.TableFileDetails, 0, len(hashToDetails))
	for _, v := range hashToDetails {
		tfds = append(tfds, v)
	}

	req := &remotesapi.GetUploadLocsRequest{RepoId: dcs.getRepoId(), TableFileDetails: tfds}
	resp, err := dcs.csClient.GetUploadLocations(ctx, req)

	if err != nil {
		return map[hash.Hash]int{}, err
	}

	for _, loc := range resp.Locs {
		var err error
		h := hash.New(loc.TableFileHash)
		data := hashToData[h]
		details := hashToDetails[h]
		switch typedLoc := loc.Location.(type) {
		case *remotesapi.UploadLoc_HttpPost:
			err = dcs.httpPostUpload(ctx, loc.TableFileHash, typedLoc.HttpPost, bytes.NewBuffer(data), details.ContentHash)
		default:
			break
		}

		if err != nil {
			return map[hash.Hash]int{}, err
		}
	}

	return hashToCount, nil
}

type Sizer interface {
	Size() int64
}

func (dcs *DoltChunkStore) httpPostUpload(ctx context.Context, hashBytes []byte, post *remotesapi.HttpPostTableFile, rd io.Reader, contentHash []byte) error {
	return HttpPostUpload(ctx, dcs.httpFetcher, post, rd, contentHash)
}

func HttpPostUpload(ctx context.Context, httpFetcher HTTPFetcher, post *remotesapi.HttpPostTableFile, rd io.Reader, contentHash []byte) error {
	// Despite the name, the upload is issued as an HTTP PUT against the presigned URL.
	req, err := http.NewRequest(http.MethodPut, post.Url, rd)
	if err != nil {
		return err
	}

	if sizer, ok := rd.(Sizer); ok {
		req.ContentLength = sizer.Size()
	}

	if len(contentHash) > 0 {
		md5s := base64.StdEncoding.EncodeToString(contentHash)
		req.Header.Set("Content-MD5", md5s)
	}

	fetcher := globalHttpFetcher
	if httpFetcher != nil {
		fetcher = httpFetcher
	}

	var resp *http.Response
	op := func() error {
		var err error
		resp, err = fetcher.Do(req.WithContext(ctx))

		if err == nil {
			defer func() {
				_ = resp.Body.Close()
			}()
		}

		return processHttpResp(resp, err)
	}

	err = backoff.Retry(op, backoff.WithMaxRetries(uploadRetryParams, uploadRetryCount))

	if err != nil {
		return err
	}

	return nil
}
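// The Content-MD5 header set above is the base64 encoding of the raw 16-byte
// MD5 of the payload, matching the md5.Sum computed in uploadChunks.
// Illustrative sketch (|data| is an assumption):
//
//	sum := md5.Sum(data)                             // [16]byte digest
//	hdr := base64.StdEncoding.EncodeToString(sum[:]) // value for Content-MD5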
// aggregateDownloads looks for byte ranges that need to be downloaded, and tries to aggregate them into a smaller
// number of larger downloads. It does this by sorting the ranges of bytes that are needed, and then comparing how
// close together neighboring ranges are. If they are within the threshold the two ranges will be aggregated into a
// single request for the entire range of data.
func aggregateDownloads(aggDistance uint64, resourceGets map[string]*GetRange) []*GetRange {
	var res []*GetRange
	for _, resourceGet := range resourceGets {
		res = append(res, resourceGet.SplitAtGaps(aggDistance)...)
	}
	return res
}

const (
	chunkAggDistance = 8 * 1024
)

var defaultConcurrency ConcurrencyParams = ConcurrencyParams{
	ConcurrentSmallFetches: 64,
	ConcurrentLargeFetches: 2,
	LargeFetchSize:         2 * 1024 * 1024,
}

func logDownloadStats(span opentracing.Span, originalGets map[string]*GetRange, computedGets []*GetRange) {
	chunkCount := 0
	originalBytes := uint64(0)
	for _, r := range originalGets {
		chunkCount += r.NumChunks()
		originalBytes += r.NumBytesInRanges()
	}
	downloadBytes := uint64(0)
	for _, r := range computedGets {
		downloadBytes += r.RangeLen()
	}
	span.LogKV("num_files", len(originalGets), "num_chunks", chunkCount, "num_batches", len(computedGets), "original_bytes", originalBytes, "download_bytes", downloadBytes)
}

// downloadChunks creates work functions for each download and executes them in parallel. The work functions write
// downloaded chunks to chunkChan.
func (dcs *DoltChunkStore) downloadChunks(ctx context.Context, dlLocs dlLocations, chunkChan chan nbs.CompressedChunk) error {
	span, ctx := tracing.StartSpan(ctx, "remotestorage.downloadChunks")
	defer span.Finish()

	resourceGets := dlLocs.ranges

	gets := aggregateDownloads(chunkAggDistance, resourceGets)
	logDownloadStats(span, resourceGets, gets)

	sortRangesBySize(gets)

	toUrl := func(ctx context.Context, lastError error, resourcePath string) (string, error) {
		return dlLocs.refreshes[resourcePath].GetURL(ctx, lastError, dcs.csClient)
	}

	stats := StatsFactory()

	eg, ctx := errgroup.WithContext(ctx)

	// loop over all the gets that need to be downloaded and create a work function for each
	work := make([]func() error, len(gets))
	largeCutoff := -1
	for i, get := range gets {
		work[i] = get.GetDownloadFunc(ctx, stats, dcs.httpFetcher, chunkChan, toUrl)
		if get.RangeLen() >= uint64(dcs.concurrency.LargeFetchSize) {
			largeCutoff = i
		}
	}

	// execute the work
	eg.Go(func() error {
		return concurrentExec(work[0:largeCutoff+1], dcs.concurrency.ConcurrentLargeFetches)
	})
	eg.Go(func() error {
		return concurrentExec(work[largeCutoff+1:len(work)], dcs.concurrency.ConcurrentSmallFetches)
	})

	defer func() {
		StatsFlusher(stats)
	}()
	return eg.Wait()
}

type urlFactoryFunc func(error) (string, error)

func hedgedRangeDownloadWithRetries(ctx context.Context, stats StatsRecorder, fetcher HTTPFetcher, offset, length uint64, urlStrF urlFactoryFunc) ([]byte, error) {
	res, err := DownloadHedger.Do(ctx, Work{
		Work: func(ctx context.Context, n int) (interface{}, error) {
			return rangeDownloadWithRetries(ctx, stats, fetcher, offset, length, n, urlStrF)
		},
		Size: int(length),
	})
	if err != nil {
		return nil, err
	}
	return res.([]byte), nil
}
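// Note on the Range arithmetic used below: HTTP byte ranges are inclusive on
// both ends, so a request for offset = 100 and length = 50 sends
//
//	Range: bytes=100-149
//
// which is exactly what fmt.Sprintf("bytes=%d-%d", currOffset,
// currOffset+currLength-1) constructs in rangeDownloadWithRetries.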
// rangeDownloadWithRetries executes an http get with the 'Range' header to get a range of bytes from a file. The
// request is executed with retries, and if progress was made, downloads are resumed from where they left off on
// subsequent attempts.
func rangeDownloadWithRetries(ctx context.Context, stats StatsRecorder, fetcher HTTPFetcher, offset, length uint64, hedgeN int, urlStrF urlFactoryFunc) ([]byte, error) {
	// state used for resuming downloads
	var allBufs [][]byte
	currOffset := offset
	currLength := length

	var lastError error
	var retryCnt int

	// execute the request
	op := func() (rerr error) {
		defer func() {
			lastError = rerr
			retryCnt += 1
		}()
		urlStr, err := urlStrF(lastError)
		if err != nil {
			return err
		}

		req, err := http.NewRequest(http.MethodGet, urlStr, nil)
		if err != nil {
			return err
		}

		rangeVal := fmt.Sprintf("bytes=%d-%d", currOffset, currOffset+currLength-1)
		req.Header.Set("Range", rangeVal)

		stats.RecordDownloadAttemptStart(hedgeN, retryCnt, currOffset-offset, length)
		start := time.Now()
		resp, err := fetcher.Do(req.WithContext(ctx))
		if err == nil {
			defer func() {
				_ = resp.Body.Close()
			}()
		}

		respErr := processHttpResp(resp, err)
		if respErr != nil {
			return respErr
		}
		stats.RecordTimeToFirstByte(hedgeN, retryCnt, length, time.Since(start))

		// read the results
		comprData, err := iohelp.ReadWithMinThroughput(resp.Body, int64(currLength), downThroughputCheck)

		dataRead := len(comprData)
		if dataRead > 0 {
			allBufs = append(allBufs, comprData)
			currLength -= uint64(dataRead)
			currOffset += uint64(dataRead)
		}
		return err
	}

	dstart := time.Now()
	err := backoff.Retry(op, backoff.WithMaxRetries(downRetryParams, downRetryCount))
	if err != nil {
		return nil, err
	}
	stats.RecordDownloadComplete(hedgeN, retryCnt, length, time.Since(dstart))

	return collapseBuffers(allBufs, length), nil
}

func collapseBuffers(bufs [][]byte, length uint64) []byte {
	if len(bufs) == 1 {
		return bufs[0]
	}
	res := make([]byte, 0, length)
	for _, buf := range bufs {
		res = append(res, buf...)
	}
	return res
}

func (dcs *DoltChunkStore) SupportedOperations() nbs.TableFileStoreOps {
	return nbs.TableFileStoreOps{
		CanRead:  true,
		CanWrite: true,
		CanPrune: false,
		CanGC:    false,
	}
}
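// An illustrative caller-side sketch of WriteTableFile below. The path,
// chunk count and length are assumptions; contentHash is the raw MD5 of the
// file bytes, as with uploads above:
//
//	f, err := os.Open(path) // io.Reader over the table file
//	if err != nil {
//		return err
//	}
//	defer f.Close()
//	return dcs.WriteTableFile(ctx, fileId, numChunks, f, contentLength, md5Bytes[:])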
// WriteTableFile reads a table file from the provided reader and writes it to the chunk store.
func (dcs *DoltChunkStore) WriteTableFile(ctx context.Context, fileId string, numChunks int, rd io.Reader, contentLength uint64, contentHash []byte) error {
	fileIdBytes := hash.Parse(fileId)
	tfd := &remotesapi.TableFileDetails{
		Id:            fileIdBytes[:],
		ContentLength: contentLength,
		ContentHash:   contentHash,
	}
	req := &remotesapi.GetUploadLocsRequest{
		RepoId:           dcs.getRepoId(),
		TableFileDetails: []*remotesapi.TableFileDetails{tfd},

		// redundant and deprecated. Still setting for compatibility, but will remove promptly.
		TableFileHashes: [][]byte{fileIdBytes[:]},
	}
	resp, err := dcs.csClient.GetUploadLocations(ctx, req)

	if err != nil {
		return err
	}

	if len(resp.Locs) != 1 {
		return errors.New("unexpected upload location count")
	}

	loc := resp.Locs[0]
	switch typedLoc := loc.Location.(type) {
	case *remotesapi.UploadLoc_HttpPost:
		err = dcs.httpPostUpload(ctx, loc.TableFileHash, typedLoc.HttpPost, rd, contentHash)

		if err != nil {
			return err
		}

	default:
		return errors.New("unsupported upload location")
	}

	chnkTblInfo := []*remotesapi.ChunkTableInfo{
		{Hash: fileIdBytes[:], ChunkCount: uint32(numChunks)},
	}

	atReq := &remotesapi.AddTableFilesRequest{
		RepoId:         dcs.getRepoId(),
		ChunkTableInfo: chnkTblInfo,
		ClientRepoFormat: &remotesapi.ClientRepoFormat{
			NbfVersion: dcs.nbf.VersionString(),
			NbsVersion: nbs.StorageVersion,
		},
	}

	atResp, err := dcs.csClient.AddTableFiles(ctx, atReq)

	if err != nil {
		return NewRpcError(err, "AddTableFiles", dcs.host, atReq)
	}

	if !atResp.Success {
		return errors.New("update table files failed")
	}

	return nil
}

// PruneTableFiles deletes old table files that are no longer referenced in the manifest.
func (dcs *DoltChunkStore) PruneTableFiles(ctx context.Context) error {
	return chunks.ErrUnsupportedOperation
}

// Sources retrieves the current root hash, a list of all the table files (which may include appendix table files),
// and a list of only the appendix table files.
func (dcs *DoltChunkStore) Sources(ctx context.Context) (hash.Hash, []nbs.TableFile, []nbs.TableFile, error) {
	req := &remotesapi.ListTableFilesRequest{RepoId: dcs.getRepoId()}
	resp, err := dcs.csClient.ListTableFiles(ctx, req)
	if err != nil {
		return hash.Hash{}, nil, nil, err
	}
	sourceFiles := getTableFiles(dcs, resp.TableFileInfo)
	appendixFiles := getTableFiles(dcs, resp.AppendixTableFileInfo)
	return hash.New(resp.RootHash), sourceFiles, appendixFiles, nil
}

func getTableFiles(dcs *DoltChunkStore, infoList []*remotesapi.TableFileInfo) []nbs.TableFile {
	tableFiles := make([]nbs.TableFile, 0)
	for _, nfo := range infoList {
		tableFiles = append(tableFiles, DoltRemoteTableFile{dcs, nfo})
	}
	return tableFiles
}

func (dcs *DoltChunkStore) Size(ctx context.Context) (uint64, error) {
	return dcs.metadata.StorageSize, nil
}
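// Illustrative sketch of consuming Sources above; each returned nbs.TableFile
// is a DoltRemoteTableFile (defined below) whose Open streams the file over
// HTTP:
//
//	root, files, appendixFiles, err := dcs.Sources(ctx)
//	if err != nil {
//		return err
//	}
//	for _, tf := range files {
//		rd, err := tf.Open(ctx)
//		// read and close rd ...
//	}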
// SetRootChunk changes the root chunk hash from the previous value to the new root.
func (dcs *DoltChunkStore) SetRootChunk(ctx context.Context, root, previous hash.Hash) error {
	panic("Not Implemented")
}

// DoltRemoteTableFile is an implementation of a TableFile that lives in a DoltChunkStore
type DoltRemoteTableFile struct {
	dcs  *DoltChunkStore
	info *remotesapi.TableFileInfo
}

// FileID gets the id of the file
func (drtf DoltRemoteTableFile) FileID() string {
	return drtf.info.FileId
}

// NumChunks returns the number of chunks in a table file
func (drtf DoltRemoteTableFile) NumChunks() int {
	return int(drtf.info.NumChunks)
}

var ErrRemoteTableFileGet = errors.New("HTTP GET for remote table file failed")

// sanitizeSignedUrl truncates the signature query parameter of a presigned URL, keeping only a short prefix of its
// value, so that full credentials do not end up in logs or error messages.
func sanitizeSignedUrl(url string) string {
	si := strings.Index(url, "Signature=")
	if si == -1 {
		return url
	}
	ei := strings.Index(url[si:], "&")
	if ei == -1 {
		return url[:si+15] + "..."
	} else {
		return url[:si+15] + "..." + url[si:][ei:]
	}
}

// Open returns an io.ReadCloser which can be used to read the bytes of a table file.
func (drtf DoltRemoteTableFile) Open(ctx context.Context) (io.ReadCloser, error) {
	if drtf.info.RefreshAfter != nil && drtf.info.RefreshAfter.AsTime().After(time.Now()) {
		resp, err := drtf.dcs.csClient.RefreshTableFileUrl(ctx, drtf.info.RefreshRequest)
		if err == nil {
			drtf.info.Url = resp.Url
			drtf.info.RefreshAfter = resp.RefreshAfter
		}
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, drtf.info.Url, nil)
	if err != nil {
		return nil, err
	}

	resp, err := drtf.dcs.httpFetcher.Do(req)
	if err != nil {
		return nil, err
	}

	if resp.StatusCode/100 != 2 {
		defer resp.Body.Close()
		body := make([]byte, 4096)
		n, _ := io.ReadFull(resp.Body, body)
		return nil, fmt.Errorf("%w: status code: %d;\nurl: %s\n\nbody:\n\n%s\n", ErrRemoteTableFileGet, resp.StatusCode, sanitizeSignedUrl(drtf.info.Url), string(body[0:n]))
	}

	return resp.Body, nil
}
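// Illustrative sketch of streaming a remote table file to disk with Open
// above (the destination path and the os/io usage at the call site are
// assumptions):
//
//	rd, err := drtf.Open(ctx)
//	if err != nil {
//		return err
//	}
//	defer rd.Close()
//	f, err := os.Create(filepath.Join(dir, drtf.FileID()))
//	if err != nil {
//		return err
//	}
//	defer f.Close()
//	_, err = io.Copy(f, rd)
//	return err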