// github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/libraries/doltcore/remotestorage/chunk_store.go

// Copyright 2019 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package remotestorage

import (
	"bytes"
	"context"
	"crypto/md5"
	"encoding/base64"
	"errors"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"sort"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/cenkalti/backoff/v4"
	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/trace"
	"golang.org/x/sync/errgroup"

	remotesapi "github.com/dolthub/dolt/go/gen/proto/dolt/services/remotesapi/v1alpha1"
	"github.com/dolthub/dolt/go/libraries/doltcore/remotestorage/internal/reliable"
	"github.com/dolthub/dolt/go/store/atomicerr"
	"github.com/dolthub/dolt/go/store/chunks"
	"github.com/dolthub/dolt/go/store/hash"
	"github.com/dolthub/dolt/go/store/nbs"
	"github.com/dolthub/dolt/go/store/types"
)

var ErrCacheCapacityExceeded = errors.New("too much data: the cache capacity has been reached")

var ErrUploadFailed = errors.New("upload failed")

var globalHttpFetcher HTTPFetcher = &http.Client{}

var _ chunks.TableFileStore = (*DoltChunkStore)(nil)
var _ nbs.NBSCompressedChunkStore = (*DoltChunkStore)(nil)
var _ chunks.ChunkStore = (*DoltChunkStore)(nil)
var _ chunks.LoggingChunkStore = (*DoltChunkStore)(nil)

var tracer = otel.Tracer("github.com/dolthub/dolt/go/libraries/doltcore/remotestorage")

func uploadBackOff(ctx context.Context, max int) backoff.BackOff {
	ret := backoff.NewExponentialBackOff()
	ret.MaxInterval = 5 * time.Second
	return backoff.WithContext(backoff.WithMaxRetries(ret, uint64(max)), ctx)
}

func downloadBackOff(ctx context.Context, max int) backoff.BackOff {
	ret := backoff.NewExponentialBackOff()
	ret.MaxInterval = 5 * time.Second
	return backoff.WithContext(backoff.WithMaxRetries(ret, uint64(max)), ctx)
}

type HTTPFetcher interface {
	Do(req *http.Request) (*http.Response, error)
}

type NetworkRequestParams struct {
	StartingConcurrentDownloads    int
	MaximumConcurrentDownloads     int
	UploadRetryCount               int
	DownloadRetryCount             int
	ThroughputMinimumCheckInterval time.Duration
	ThroughputMinimumBytesPerCheck int
	ThroughputMinimumNumIntervals  int
	RespHeadersTimeout             time.Duration
}

var defaultRequestParams = NetworkRequestParams{
	StartingConcurrentDownloads:    64,
	MaximumConcurrentDownloads:     64,
	UploadRetryCount:               5,
	DownloadRetryCount:             5,
	ThroughputMinimumCheckInterval: time.Second,
	ThroughputMinimumBytesPerCheck: 1024,
	ThroughputMinimumNumIntervals:  5,
	RespHeadersTimeout:             15 * time.Second,
}
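// retryNetOp is an illustrative sketch, not part of the store's API, of how
// the backoff policies above are meant to be driven: backoff.Retry re-invokes
// |op| with capped exponential delays (never more than 5s apart) until it
// succeeds, the retry budget is exhausted, or |ctx| is canceled. |op| here
// stands in for any hypothetical network operation.
func retryNetOp(ctx context.Context, op func() error) error {
	return backoff.Retry(op, downloadBackOff(ctx, defaultRequestParams.DownloadRetryCount))
}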
type DoltChunkStore struct {
	repoId      *remotesapi.RepoId
	repoPath    string
	repoToken   *atomic.Value // string
	host        string
	root        hash.Hash
	csClient    remotesapi.ChunkStoreServiceClient
	finalizer   func() error
	cache       ChunkCache
	metadata    *remotesapi.GetRepoMetadataResponse
	nbf         *types.NomsBinFormat
	httpFetcher HTTPFetcher
	params      NetworkRequestParams
	stats       cacheStats
	logger      chunks.DebugLogger
	wsValidate  bool
}

func NewDoltChunkStoreFromPath(ctx context.Context, nbf *types.NomsBinFormat, path, host string, wsval bool, csClient remotesapi.ChunkStoreServiceClient) (*DoltChunkStore, error) {
	var repoId *remotesapi.RepoId

	path = strings.Trim(path, "/")
	tokens := strings.Split(path, "/")
	if len(tokens) == 2 {
		org := tokens[0]
		repoName := tokens[1]
		repoId = &remotesapi.RepoId{
			Org:      org,
			RepoName: repoName,
		}
	}

	metadata, err := csClient.GetRepoMetadata(ctx, &remotesapi.GetRepoMetadataRequest{
		RepoId:   repoId,
		RepoPath: path,
		ClientRepoFormat: &remotesapi.ClientRepoFormat{
			NbfVersion: nbf.VersionString(),
			NbsVersion: nbs.StorageVersion,
		},
	})
	if err != nil {
		return nil, err
	}

	repoToken := new(atomic.Value)
	if metadata.RepoToken != "" {
		repoToken.Store(metadata.RepoToken)
	}

	cs := &DoltChunkStore{
		repoId:      repoId,
		repoPath:    path,
		repoToken:   repoToken,
		host:        host,
		csClient:    csClient,
		finalizer:   func() error { return nil },
		cache:       newMapChunkCache(),
		metadata:    metadata,
		nbf:         nbf,
		httpFetcher: globalHttpFetcher,
		params:      defaultRequestParams,
		wsValidate:  wsval,
	}
	err = cs.loadRoot(ctx)
	if err != nil {
		return nil, err
	}
	return cs, nil
}

func (dcs *DoltChunkStore) WithHTTPFetcher(fetcher HTTPFetcher) *DoltChunkStore {
	return &DoltChunkStore{
		repoId:      dcs.repoId,
		repoPath:    dcs.repoPath,
		repoToken:   new(atomic.Value),
		host:        dcs.host,
		root:        dcs.root,
		csClient:    dcs.csClient,
		finalizer:   dcs.finalizer,
		cache:       dcs.cache,
		metadata:    dcs.metadata,
		nbf:         dcs.nbf,
		httpFetcher: fetcher,
		params:      dcs.params,
		stats:       dcs.stats,
		logger:      dcs.logger,
	}
}

func (dcs *DoltChunkStore) WithNoopChunkCache() *DoltChunkStore {
	return &DoltChunkStore{
		repoId:      dcs.repoId,
		repoPath:    dcs.repoPath,
		repoToken:   new(atomic.Value),
		host:        dcs.host,
		root:        dcs.root,
		csClient:    dcs.csClient,
		finalizer:   dcs.finalizer,
		cache:       noopChunkCache,
		metadata:    dcs.metadata,
		nbf:         dcs.nbf,
		httpFetcher: dcs.httpFetcher,
		params:      dcs.params,
		stats:       dcs.stats,
		logger:      dcs.logger,
	}
}

func (dcs *DoltChunkStore) WithChunkCache(cache ChunkCache) *DoltChunkStore {
	return &DoltChunkStore{
		repoId:      dcs.repoId,
		repoPath:    dcs.repoPath,
		repoToken:   new(atomic.Value),
		host:        dcs.host,
		root:        dcs.root,
		csClient:    dcs.csClient,
		finalizer:   dcs.finalizer,
		cache:       cache,
		metadata:    dcs.metadata,
		nbf:         dcs.nbf,
		httpFetcher: dcs.httpFetcher,
		params:      dcs.params,
		stats:       dcs.stats,
		logger:      dcs.logger,
	}
}

func (dcs *DoltChunkStore) WithNetworkRequestParams(params NetworkRequestParams) *DoltChunkStore {
	return &DoltChunkStore{
		repoId:      dcs.repoId,
		repoPath:    dcs.repoPath,
		repoToken:   new(atomic.Value),
		host:        dcs.host,
		root:        dcs.root,
		csClient:    dcs.csClient,
		finalizer:   dcs.finalizer,
		cache:       dcs.cache,
		metadata:    dcs.metadata,
		nbf:         dcs.nbf,
		httpFetcher: dcs.httpFetcher,
		params:      params,
		stats:       dcs.stats,
		logger:      dcs.logger,
	}
}
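// newUncachedChunkStore is an illustrative sketch, not part of the store's
// API, of how the With* builders above compose: each one returns a shallow
// copy, so callers can layer overrides without mutating a shared store. Here
// a store is built from a path and its chunk cache is disabled, e.g. for a
// bulk scan where caching would only add overhead. The arguments mirror
// NewDoltChunkStoreFromPath; nothing here is new API.
func newUncachedChunkStore(ctx context.Context, nbf *types.NomsBinFormat, path, host string, client remotesapi.ChunkStoreServiceClient) (*DoltChunkStore, error) {
	cs, err := NewDoltChunkStoreFromPath(ctx, nbf, path, host, false, client)
	if err != nil {
		return nil, err
	}
	return cs.WithNoopChunkCache(), nil
}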
func (dcs *DoltChunkStore) SetLogger(logger chunks.DebugLogger) {
	dcs.logger = logger
}

func (dcs *DoltChunkStore) SetFinalizer(f func() error) {
	dcs.finalizer = f
}

func (dcs *DoltChunkStore) logf(fmt string, args ...interface{}) {
	if dcs.logger != nil {
		dcs.logger.Logf(fmt, args...)
	}
}

func (dcs *DoltChunkStore) getRepoId() (*remotesapi.RepoId, string) {
	var token string
	curToken := dcs.repoToken.Load()
	if curToken != nil {
		token = curToken.(string)
	}
	return dcs.repoId, token
}

type cacheStats struct {
	Hits uint32
}

func (s cacheStats) CacheHits() uint32 {
	return s.Hits
}

type CacheStats interface {
	CacheHits() uint32
}

func (dcs *DoltChunkStore) ChunkFetcher(ctx context.Context) nbs.ChunkFetcher {
	return NewChunkFetcher(ctx, dcs)
}

// Get returns the Chunk for the value of the hash in the store. If the hash
// is absent from the store, EmptyChunk is returned.
func (dcs *DoltChunkStore) Get(ctx context.Context, h hash.Hash) (chunks.Chunk, error) {
	hashes := hash.HashSet{h: struct{}{}}
	var found *chunks.Chunk
	err := dcs.GetMany(ctx, hashes, func(_ context.Context, c *chunks.Chunk) { found = c })
	if err != nil {
		return chunks.EmptyChunk, err
	}
	if found != nil {
		return *found, nil
	} else {
		return chunks.EmptyChunk, nil
	}
}

func (dcs *DoltChunkStore) GetMany(ctx context.Context, hashes hash.HashSet, found func(context.Context, *chunks.Chunk)) error {
	ae := atomicerr.New()
	decompressedSize := uint64(0)
	err := dcs.GetManyCompressed(ctx, hashes, func(ctx context.Context, cc nbs.CompressedChunk) {
		if ae.IsSet() {
			return
		}
		c, err := cc.ToChunk()
		if ae.SetIfErrAndCheck(err) {
			return
		}
		atomic.AddUint64(&decompressedSize, uint64(len(c.Data())))
		found(ctx, &c)
	})
	trace.SpanFromContext(ctx).SetAttributes(attribute.Int64("decompressed_bytes", int64(decompressedSize)))
	if err != nil {
		return err
	}
	if err = ae.Get(); err != nil {
		return err
	}
	return nil
}

// GetManyCompressed gets the compressed chunks with |hashes| from the store.
// On return, |found| will have been called for every requested chunk that was
// found. Any absent chunks are silently ignored.
func (dcs *DoltChunkStore) GetManyCompressed(ctx context.Context, hashes hash.HashSet, found func(context.Context, nbs.CompressedChunk)) error {
	ctx, span := tracer.Start(ctx, "remotestorage.GetManyCompressed")
	defer span.End()

	hashToChunk := dcs.cache.Get(hashes)

	span.SetAttributes(attribute.Int("num_hashes", len(hashes)), attribute.Int("cache_hits", len(hashToChunk)))
	atomic.AddUint32(&dcs.stats.Hits, uint32(len(hashToChunk)))

	notCached := make([]hash.Hash, 0, len(hashes))
	for h := range hashes {
		c := hashToChunk[h]

		if c.IsEmpty() {
			notCached = append(notCached, h)
		} else {
			found(ctx, c)
		}
	}

	if len(notCached) > 0 {
		err := dcs.readChunksAndCache(ctx, notCached, found)
		if err != nil {
			return err
		}
	}

	return nil
}
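// fetchAllCompressed is an illustrative sketch, not part of the store's API,
// of driving GetManyCompressed: it collects every chunk found for |hashes|
// into a slice. The callback may be invoked from multiple goroutines, so the
// slice is guarded by a mutex.
func fetchAllCompressed(ctx context.Context, dcs *DoltChunkStore, hashes hash.HashSet) ([]nbs.CompressedChunk, error) {
	var mu sync.Mutex
	var ccs []nbs.CompressedChunk
	err := dcs.GetManyCompressed(ctx, hashes, func(_ context.Context, cc nbs.CompressedChunk) {
		mu.Lock()
		defer mu.Unlock()
		ccs = append(ccs, cc)
	})
	if err != nil {
		return nil, err
	}
	return ccs, nil
}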
type GetRange remotesapi.HttpGetRange

func (gr *GetRange) ResourcePath() string {
	u, _ := url.Parse(gr.Url)
	return fmt.Sprintf("%s://%s%s", u.Scheme, u.Host, u.Path)
}

func (gr *GetRange) Append(other *GetRange) {
	gr.Url = other.Url
	gr.Ranges = append(gr.Ranges, other.Ranges...)
}

func (gr *GetRange) Sort() {
	sort.Slice(gr.Ranges, func(i, j int) bool {
		return gr.Ranges[i].Offset < gr.Ranges[j].Offset
	})
}

func (gr *GetRange) ChunkStartOffset(i int) uint64 {
	return gr.Ranges[i].Offset
}

func (gr *GetRange) ChunkEndOffset(i int) uint64 {
	return gr.Ranges[i].Offset + uint64(gr.Ranges[i].Length)
}

func (gr *GetRange) GapBetween(i, j int) uint64 {
	return gr.ChunkStartOffset(j) - gr.ChunkEndOffset(i)
}

func (gr *GetRange) NumChunks() int {
	return len(gr.Ranges)
}

func (gr *GetRange) RangeLen() uint64 {
	return gr.ChunkEndOffset(gr.NumChunks()-1) - gr.ChunkStartOffset(0)
}

func (gr *GetRange) NumBytesInRanges() uint64 {
	res := uint64(0)
	for i := 0; i < len(gr.Ranges); i++ {
		start, end := gr.ChunkByteRange(i)
		res += end - start
	}
	return res
}

func (gr *GetRange) ChunkByteRange(i int) (uint64, uint64) {
	start := gr.ChunkStartOffset(i) - gr.ChunkStartOffset(0)
	end := gr.ChunkEndOffset(i) - gr.ChunkStartOffset(0)
	return start, end
}

// sortRangesBySize sorts |ranges| in descending order of range length, so the
// largest downloads are scheduled first.
func sortRangesBySize(ranges []*GetRange) {
	sort.Slice(ranges, func(i, j int) bool {
		return ranges[j].RangeLen() < ranges[i].RangeLen()
	})
}

type resourcePathToUrlFunc func(ctx context.Context, lastError error, resourcePath string) (url string, err error)

func (gr *GetRange) GetDownloadFunc(ctx context.Context, stats StatsRecorder, health reliable.HealthRecorder, fetcher HTTPFetcher, params NetworkRequestParams, chunkChan chan nbs.CompressedChunk, pathToUrl resourcePathToUrlFunc) func() error {
	if len(gr.Ranges) == 0 {
		return func() error { return nil }
	}
	return func() error {
		urlF := func(lastError error) (string, error) {
			url, err := pathToUrl(ctx, lastError, gr.ResourcePath())
			if err != nil {
				return "", err
			}
			if url == "" {
				url = gr.Url
			}
			return url, nil
		}
		rangeLen := gr.RangeLen()
		resp := reliable.StreamingRangeDownload(ctx, reliable.StreamingRangeRequest{
			Fetcher: fetcher,
			Offset:  gr.ChunkStartOffset(0),
			Length:  rangeLen,
			UrlFact: urlF,
			Stats:   stats,
			Health:  health,
			BackOffFact: func(ctx context.Context) backoff.BackOff {
				return downloadBackOff(ctx, params.DownloadRetryCount)
			},
			Throughput: reliable.MinimumThroughputCheck{
				CheckInterval: params.ThroughputMinimumCheckInterval,
				BytesPerCheck: params.ThroughputMinimumBytesPerCheck,
				NumIntervals:  params.ThroughputMinimumNumIntervals,
			},
			RespHeadersTimeout: params.RespHeadersTimeout,
		})
		defer resp.Close()
		reader := &RangeChunkReader{GetRange: gr, Reader: resp.Body}
		for {
			cc, err := reader.ReadChunk()
			if errors.Is(err, io.EOF) {
				return nil
			}
			if err != nil {
				return err
			}
			select {
			case chunkChan <- cc:
			case <-ctx.Done():
				return context.Cause(ctx)
			}
		}
	}
}
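// The offset arithmetic above is easiest to see with numbers. For a GetRange
// holding two chunk ranges at offsets 100 (length 50) and 175 (length 25):
// RangeLen is 200-100 = 100 bytes, GapBetween(0, 1) is 175-150 = 25 bytes,
// NumBytesInRanges is 50+25 = 75 bytes, and ChunkByteRange yields (0, 50) and
// (75, 100) relative to the first offset. rangeLayout is an illustrative
// sketch, not part of the store's API, that renders that layout.
func rangeLayout(gr *GetRange) string {
	var sb strings.Builder
	for i := 0; i < gr.NumChunks(); i++ {
		// Per-chunk byte positions within the single ranged GET body.
		start, end := gr.ChunkByteRange(i)
		fmt.Fprintf(&sb, "chunk %d: bytes [%d, %d)\n", i, start, end)
	}
	return sb.String()
}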
type RangeChunkReader struct {
	GetRange *GetRange
	Reader   io.Reader
	i        int
	skip     int
}

func (r *RangeChunkReader) ReadChunk() (nbs.CompressedChunk, error) {
	if r.skip > 0 {
		_, err := io.CopyN(io.Discard, r.Reader, int64(r.skip))
		if err != nil {
			return nbs.CompressedChunk{}, err
		}
		r.skip = 0
	}
	if r.i >= len(r.GetRange.Ranges) {
		return nbs.CompressedChunk{}, io.EOF
	}
	if r.i < len(r.GetRange.Ranges)-1 {
		r.skip = int(r.GetRange.GapBetween(r.i, r.i+1))
	}
	l := r.GetRange.Ranges[r.i].Length
	h := hash.New(r.GetRange.Ranges[r.i].Hash)
	r.i += 1
	buf := make([]byte, l)
	_, err := io.ReadFull(r.Reader, buf)
	if err != nil {
		return nbs.CompressedChunk{}, err
	} else {
		return nbs.NewCompressedChunk(h, buf)
	}
}

type locationRefresh struct {
	RefreshAfter   time.Time
	RefreshRequest *remotesapi.RefreshTableFileUrlRequest
	URL            string
	lastRefresh    time.Time
	mu             sync.Mutex
}

func (r *locationRefresh) Add(resp *remotesapi.DownloadLoc) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.URL == "" {
		r.URL = resp.Location.(*remotesapi.DownloadLoc_HttpGetRange).HttpGetRange.Url
	}
	if resp.RefreshAfter == nil {
		return
	}
	respTime := resp.RefreshAfter.AsTime()
	if (r.RefreshAfter == time.Time{}) || respTime.After(r.RefreshAfter) {
		r.RefreshAfter = resp.RefreshAfter.AsTime()
		r.RefreshRequest = resp.RefreshRequest
		r.URL = resp.Location.(*remotesapi.DownloadLoc_HttpGetRange).HttpGetRange.Url
	}
}

// TODO: These should be configurable in NetworkRequestParams or something.
var refreshTableFileURLRetryDuration = 5 * time.Second
var refreshTableFileURLTimeout = 15 * time.Second

func (r *locationRefresh) GetURL(ctx context.Context, lastError error, client remotesapi.ChunkStoreServiceClient) (string, error) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.RefreshRequest != nil {
		now := time.Now()
		wantsRefresh := now.After(r.RefreshAfter) || errors.Is(lastError, HttpError)
		canRefresh := time.Since(r.lastRefresh) > refreshTableFileURLRetryDuration
		if wantsRefresh && canRefresh {
			ctx, cancel := context.WithTimeout(ctx, refreshTableFileURLTimeout)
			resp, err := client.RefreshTableFileUrl(ctx, r.RefreshRequest)
			cancel()
			if err != nil {
				return r.URL, err
			}
			r.RefreshAfter = resp.RefreshAfter.AsTime()
			r.URL = resp.Url
			r.lastRefresh = now
		}
	}
	return r.URL, nil
}

type RepoRequest interface {
	SetRepoId(*remotesapi.RepoId)
	SetRepoToken(string)
	SetRepoPath(string)
}
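// drainRangeReader is an illustrative sketch, not part of the store's API,
// of the RangeChunkReader contract above: given a reader positioned at the
// first byte of the range (as the streaming download in GetDownloadFunc
// arranges), it yields each chunk in offset order, skipping the gaps between
// chunks, until the reader reports io.EOF.
func drainRangeReader(gr *GetRange, body io.Reader, found func(nbs.CompressedChunk)) error {
	rdr := &RangeChunkReader{GetRange: gr, Reader: body}
	for {
		cc, err := rdr.ReadChunk()
		if errors.Is(err, io.EOF) {
			return nil
		}
		if err != nil {
			return err
		}
		found(cc)
	}
}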
func (dcs *DoltChunkStore) readChunksAndCache(ctx context.Context, hashes []hash.Hash, found func(context.Context, nbs.CompressedChunk)) (err error) {
	toSend := hash.NewHashSet(hashes...)

	fetcher := dcs.ChunkFetcher(ctx)
	defer func() {
		cerr := fetcher.Close()
		if err == nil {
			err = cerr
		}
	}()

	eg, egCtx := errgroup.WithContext(ctx)
	eg.Go(func() error {
		err := fetcher.Get(egCtx, toSend)
		if err != nil {
			return err
		}
		return fetcher.CloseSend()
	})
	eg.Go(func() error {
		for {
			cc, err := fetcher.Recv(egCtx)
			if errors.Is(err, io.EOF) {
				return nil
			}
			if err != nil {
				return err
			}
			// Don't forward on empty/not found chunks.
			if len(cc.CompressedData) > 0 {
				if dcs.cache.PutChunk(cc) {
					return ErrCacheCapacityExceeded
				}
				found(egCtx, cc)
			}
		}
	})

	return eg.Wait()
}

// Has returns true iff the value at the address |h| is contained in the
// store.
func (dcs *DoltChunkStore) Has(ctx context.Context, h hash.Hash) (bool, error) {
	hashes := hash.HashSet{h: struct{}{}}
	absent, err := dcs.HasMany(ctx, hashes)
	if err != nil {
		return false, err
	}
	return len(absent) == 0, nil
}

const maxHasManyBatchSize = 16 * 1024

// HasMany returns a new HashSet containing any members of |hashes| that are
// absent from the store.
func (dcs *DoltChunkStore) HasMany(ctx context.Context, hashes hash.HashSet) (hash.HashSet, error) {
	// get the set of hashes that aren't already in the cache
	notCached := dcs.cache.Has(hashes)

	if len(notCached) == 0 {
		return notCached, nil
	}

	// convert the set to a slice of hashes and a corresponding slice of the byte encoding for those hashes
	hashSl, byteSl := HashSetToSlices(notCached)

	absent := make(hash.HashSet)
	var found []nbs.CompressedChunk
	var err error

	batchItr(len(hashSl), maxHasManyBatchSize, func(st, end int) (stop bool) {
		// slice the slices into a batch of hashes
		currHashSl := hashSl[st:end]
		currByteSl := byteSl[st:end]

		// ask the remote api which chunks it already has
		id, token := dcs.getRepoId()
		req := &remotesapi.HasChunksRequest{RepoId: id, RepoToken: token, Hashes: currByteSl, RepoPath: dcs.repoPath}
		var resp *remotesapi.HasChunksResponse
		resp, err = dcs.csClient.HasChunks(ctx, req)
		if err != nil {
			err = NewRpcError(err, "HasChunks", dcs.host, req)
			return true
		}

		if resp.RepoToken != "" {
			dcs.repoToken.Store(resp.RepoToken)
		}

		numAbsent := len(resp.Absent)
		sort.Slice(resp.Absent, func(i, j int) bool {
			return resp.Absent[i] < resp.Absent[j]
		})

		// loop over every hash in the current batch; if it is absent from the
		// remote host, add it to the absent set, otherwise append it to the
		// found slice
		for i, j := 0, 0; i < len(currHashSl); i++ {
			currHash := currHashSl[i]

			nextAbsent := -1
			if j < numAbsent {
				nextAbsent = int(resp.Absent[j])
			}

			if i == nextAbsent {
				absent[currHash] = struct{}{}
				j++
			} else {
				c := nbs.ChunkToCompressedChunk(chunks.NewChunkWithHash(currHash, []byte{}))
				found = append(found, c)
			}
		}

		return false
	})

	if err != nil {
		return nil, err
	}

	if len(found)+len(absent) != len(notCached) {
		panic("not all chunks were accounted for")
	}

	if len(found) > 0 {
		if dcs.cache.Put(found) {
			return hash.HashSet{}, ErrCacheCapacityExceeded
		}
	}

	return absent, nil
}

func (dcs *DoltChunkStore) errorIfDangling(ctx context.Context, addrs hash.HashSet) error {
	absent, err := dcs.HasMany(ctx, addrs)
	if err != nil {
		return err
	}
	if len(absent) != 0 {
		s := absent.String()
		return fmt.Errorf("Found dangling references to %s", s)
	}
	return nil
}
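// HasMany above walks its input in index windows of at most
// maxHasManyBatchSize via batchItr, a helper defined elsewhere in this
// package. forEachBatch is an illustrative sketch, not the helper itself, of
// the equivalent loop for readers unfamiliar with its calling convention:
// |f| receives half-open index ranges [st, end) and may return true to stop
// early.
func forEachBatch(n, batchSize int, f func(st, end int) (stop bool)) {
	for st := 0; st < n; st += batchSize {
		end := st + batchSize
		if end > n {
			end = n
		}
		if f(st, end) {
			return
		}
	}
}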
// Put caches c. Upon return, c must be visible to subsequent Get and Has
// calls, but must not be persistent until a call to Flush(). Put may be
// called concurrently with other calls to Put(), Get(), GetMany(), Has()
// and HasMany().
func (dcs *DoltChunkStore) Put(ctx context.Context, c chunks.Chunk, getAddrs chunks.GetAddrsCurry) error {
	addrs := hash.NewHashSet()
	err := getAddrs(c)(ctx, addrs, func(h hash.Hash) bool { return false })
	if err != nil {
		return err
	}
	err = dcs.errorIfDangling(ctx, addrs)
	if err != nil {
		return err
	}

	cc := nbs.ChunkToCompressedChunk(c)
	if dcs.cache.Put([]nbs.CompressedChunk{cc}) {
		return ErrCacheCapacityExceeded
	}
	return nil
}

// Version returns the NomsBinFormat version string with which this ChunkStore
// is compatible.
func (dcs *DoltChunkStore) Version() string {
	return dcs.metadata.NbfVersion
}

func (dcs *DoltChunkStore) AccessMode() chunks.ExclusiveAccessMode {
	return chunks.ExclusiveAccessMode_Shared
}

// Rebase brings this ChunkStore into sync with the persistent storage's
// current root.
func (dcs *DoltChunkStore) Rebase(ctx context.Context) error {
	err := dcs.loadRoot(ctx)
	if err != nil {
		return err
	}
	return dcs.refreshRepoMetadata(ctx)
}

func (dcs *DoltChunkStore) refreshRepoMetadata(ctx context.Context) error {
	mdReq := &remotesapi.GetRepoMetadataRequest{
		RepoId:   dcs.repoId,
		RepoPath: dcs.repoPath,
		ClientRepoFormat: &remotesapi.ClientRepoFormat{
			NbfVersion: dcs.nbf.VersionString(),
			NbsVersion: nbs.StorageVersion,
		},
	}
	metadata, err := dcs.csClient.GetRepoMetadata(ctx, mdReq)
	if err != nil {
		return NewRpcError(err, "GetRepoMetadata", dcs.host, mdReq)
	}
	if metadata.RepoToken != "" {
		dcs.repoToken.Store(metadata.RepoToken)
	}
	dcs.metadata = metadata
	return nil
}

// Root returns the root of the database as of the time the ChunkStore
// was opened or the most recent call to Rebase.
func (dcs *DoltChunkStore) Root(ctx context.Context) (hash.Hash, error) {
	return dcs.root, nil
}

func (dcs *DoltChunkStore) PushConcurrencyControl() chunks.PushConcurrencyControl {
	if dcs.metadata.PushConcurrencyControl == remotesapi.PushConcurrencyControl_PUSH_CONCURRENCY_CONTROL_ASSERT_WORKING_SET {
		return chunks.PushConcurrencyControl_AssertWorkingSet
	}

	if dcs.metadata.PushConcurrencyControl == remotesapi.PushConcurrencyControl_PUSH_CONCURRENCY_CONTROL_UNSPECIFIED {
		if dcs.wsValidate {
			return chunks.PushConcurrencyControl_AssertWorkingSet
		}
	}

	return chunks.PushConcurrencyControl_IgnoreWorkingSet
}

func (dcs *DoltChunkStore) loadRoot(ctx context.Context) error {
	id, token := dcs.getRepoId()
	req := &remotesapi.RootRequest{RepoId: id, RepoToken: token, RepoPath: dcs.repoPath}
	resp, err := dcs.csClient.Root(ctx, req)
	if err != nil {
		return NewRpcError(err, "Root", dcs.host, req)
	}
	if resp.RepoToken != "" {
		dcs.repoToken.Store(resp.RepoToken)
	}
	dcs.root = hash.New(resp.RootHash)
	return nil
}
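// commitWithRetry is an illustrative sketch, not part of the store's API, of
// the optimistic-concurrency contract shared by Root, Rebase and Commit
// (below): read the current root, attempt a compare-and-set commit against
// it, and rebase and retry if another writer moved the root first.
// |updateRoot| is a hypothetical callback that derives the new root from the
// current one.
func commitWithRetry(ctx context.Context, dcs *DoltChunkStore, updateRoot func(current hash.Hash) (hash.Hash, error)) error {
	for {
		last, err := dcs.Root(ctx)
		if err != nil {
			return err
		}
		next, err := updateRoot(last)
		if err != nil {
			return err
		}
		ok, err := dcs.Commit(ctx, next, last)
		if err != nil {
			return err
		}
		if ok {
			return nil
		}
		// Someone else won the race; resync with the remote root and retry.
		if err := dcs.Rebase(ctx); err != nil {
			return err
		}
	}
}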
// Commit atomically attempts to persist all novel Chunks and update the
// persisted root hash from last to current (or keeps it the same).
// If last doesn't match the root in persistent storage, returns false.
func (dcs *DoltChunkStore) Commit(ctx context.Context, current, last hash.Hash) (bool, error) {
	hashToChunkCount, err := dcs.uploadChunks(ctx)
	if err != nil {
		return false, err
	}

	chnkTblInfo := make([]*remotesapi.ChunkTableInfo, 0, len(hashToChunkCount))
	for h, cnt := range hashToChunkCount {
		chnkTblInfo = append(chnkTblInfo, &remotesapi.ChunkTableInfo{Hash: h[:], ChunkCount: uint32(cnt)})
	}

	id, _ := dcs.getRepoId()
	req := &remotesapi.CommitRequest{
		RepoId:         id,
		RepoPath:       dcs.repoPath,
		Current:        current[:],
		Last:           last[:],
		ChunkTableInfo: chnkTblInfo,
		ClientRepoFormat: &remotesapi.ClientRepoFormat{
			NbfVersion: dcs.nbf.VersionString(),
			NbsVersion: nbs.StorageVersion,
		},
	}
	resp, err := dcs.csClient.Commit(ctx, req)
	if err != nil {
		return false, NewRpcError(err, "Commit", dcs.host, req)
	}
	err = dcs.loadRoot(ctx)
	if err != nil {
		return false, NewRpcError(err, "Commit", dcs.host, req)
	}

	return resp.Success, dcs.refreshRepoMetadata(ctx)
}

// Stats may return some kind of struct that reports statistics about the
// ChunkStore instance. The type is implementation-dependent, and impls
// may return nil.
func (dcs *DoltChunkStore) Stats() interface{} {
	return cacheStats{atomic.LoadUint32(&dcs.stats.Hits)}
}

// StatsSummary may return a string containing summarized statistics for
// this ChunkStore. It must return "Unsupported" if this operation is not
// supported.
func (dcs *DoltChunkStore) StatsSummary() string {
	return fmt.Sprintf("CacheHits: %v", dcs.Stats().(CacheStats).CacheHits())
}

func (dcs *DoltChunkStore) PersistGhostHashes(ctx context.Context, refs hash.HashSet) error {
	panic("runtime error: PersistGhostHashes should never be called on a remote chunk store")
}

// Close tears down any resources in use by the implementation. After
// Close(), the ChunkStore may not be used again. It is NOT SAFE to call
// Close() concurrently with any other ChunkStore method; behavior is
// undefined and probably crashy.
func (dcs *DoltChunkStore) Close() error {
	return dcs.finalizer()
}
// uploadChunks writes all chunks pending flush in the cache into a single
// table file and uploads it. Getting this working using the simplest
// approach first.
func (dcs *DoltChunkStore) uploadChunks(ctx context.Context) (map[hash.Hash]int, error) {
	hashToChunk := dcs.cache.GetAndClearChunksToFlush()

	if len(hashToChunk) == 0 {
		return map[hash.Hash]int{}, nil
	}

	chnks := make([]chunks.Chunk, 0, len(hashToChunk))
	for _, chable := range hashToChunk {
		ch, err := chable.ToChunk()
		if err != nil {
			return nil, err
		}
		chnks = append(chnks, ch)
	}

	hashToCount := make(map[hash.Hash]int)
	hashToData := make(map[hash.Hash][]byte)
	hashToContentHash := make(map[hash.Hash][]byte)

	// structured so this can be done as multiple files in the future
	{
		name, data, err := nbs.WriteChunks(chnks)
		if err != nil {
			return map[hash.Hash]int{}, err
		}

		h := hash.Parse(name)
		hashToData[h] = data
		hashToCount[h] = len(chnks)

		md5Bytes := md5.Sum(data)
		hashToContentHash[h] = md5Bytes[:]
	}

	for h, contentHash := range hashToContentHash {
		// can parallelize this in the future if needed
		err := dcs.uploadTableFileWithRetries(ctx, h, uint64(hashToCount[h]), contentHash, func() (io.ReadCloser, uint64, error) {
			data := hashToData[h]
			return io.NopCloser(bytes.NewReader(data)), uint64(len(data)), nil
		})
		if err != nil {
			return map[hash.Hash]int{}, err
		}
	}

	return hashToCount, nil
}
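// makeByteGetter is an illustrative sketch, not part of the store's API, of
// the getContent contract used by uploadTableFileWithRetries below: every
// retry attempt needs a fresh reader over the same bytes, because an HTTP
// request body consumed by a failed attempt cannot be rewound and reused.
func makeByteGetter(data []byte) func() (io.ReadCloser, uint64, error) {
	return func() (io.ReadCloser, uint64, error) {
		// A new reader per call; the length is reported alongside it.
		return io.NopCloser(bytes.NewReader(data)), uint64(len(data)), nil
	}
}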
func (dcs *DoltChunkStore) uploadTableFileWithRetries(ctx context.Context, tableFileId hash.Hash, numChunks uint64, tableFileContentHash []byte, getContent func() (io.ReadCloser, uint64, error)) error {
	op := func() error {
		body, contentLength, err := getContent()
		if err != nil {
			return err
		}

		tbfd := &remotesapi.TableFileDetails{
			Id:            tableFileId[:],
			ContentLength: contentLength,
			ContentHash:   tableFileContentHash,
			NumChunks:     numChunks,
		}

		dcs.logf("getting upload location for file %s", tableFileId.String())
		id, token := dcs.getRepoId()
		req := &remotesapi.GetUploadLocsRequest{RepoId: id, RepoToken: token, RepoPath: dcs.repoPath, TableFileDetails: []*remotesapi.TableFileDetails{tbfd}}
		resp, err := dcs.csClient.GetUploadLocations(ctx, req)
		if err != nil {
			err := NewRpcError(err, "GetUploadLocations", dcs.host, req)
			if err.IsPermanent() {
				return backoff.Permanent(err)
			}
			return err
		}

		if resp.RepoToken != "" {
			dcs.repoToken.Store(resp.RepoToken)
		}

		if len(resp.Locs) != 1 {
			return NewRpcError(errors.New("unexpected upload location count"), "GetUploadLocations", dcs.host, req)
		}
		loc := resp.Locs[0]

		switch typedLoc := loc.Location.(type) {
		case *remotesapi.UploadLoc_HttpPost:
			// Strip off the query parameters, as they clutter the logs. We
			// only really care about being able to verify the table files
			// are being uploaded to the correct places on S3.
			urlStr := typedLoc.HttpPost.Url
			qmIdx := strings.IndexRune(urlStr, '?')
			if qmIdx != -1 {
				urlStr = urlStr[:qmIdx]
			}

			dcs.logf("uploading file %s to %s", tableFileId.String(), urlStr)
			err = dcs.httpPostUpload(ctx, typedLoc.HttpPost, tableFileContentHash, int64(contentLength), body)
			if err != nil {
				dcs.logf("failed to upload file %s to %s, err: %v", tableFileId.String(), urlStr, err)
				return err
			}
			dcs.logf("successfully uploaded file %s to %s", tableFileId.String(), urlStr)
		default:
			break
		}

		return nil
	}

	return backoff.Retry(op, uploadBackOff(ctx, dcs.params.UploadRetryCount))
}

type Sizer interface {
	Size() int64
}

func (dcs *DoltChunkStore) httpPostUpload(ctx context.Context, post *remotesapi.HttpPostTableFile, contentHash []byte, contentLength int64, body io.ReadCloser) error {
	return HttpPostUpload(ctx, dcs.httpFetcher, post, contentHash, contentLength, body)
}

func HttpPostUpload(ctx context.Context, httpFetcher HTTPFetcher, post *remotesapi.HttpPostTableFile, contentHash []byte, contentLength int64, body io.ReadCloser) error {
	fetcher := globalHttpFetcher
	if httpFetcher != nil {
		fetcher = httpFetcher
	}

	// Despite the name, the upload itself is an HTTP PUT to the provided URL.
	req, err := http.NewRequest(http.MethodPut, post.Url, body)
	if err != nil {
		return err
	}

	req.ContentLength = contentLength

	if len(contentHash) > 0 {
		md5s := base64.StdEncoding.EncodeToString(contentHash)
		req.Header.Set("Content-MD5", md5s)
	}

	resp, err := fetcher.Do(req.WithContext(ctx))
	if err == nil {
		defer func() {
			_ = resp.Body.Close()
		}()
	}

	return processHttpResp(resp, err)
}

const (
	chunkAggDistance = 8 * 1024
)

func (dcs *DoltChunkStore) SupportedOperations() chunks.TableFileStoreOps {
	return chunks.TableFileStoreOps{
		CanRead:  true,
		CanWrite: true,
		CanPrune: false,
		CanGC:    false,
	}
}
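// contentMD5Header is an illustrative sketch, not part of the store's API,
// of the integrity check HttpPostUpload performs above: S3-style endpoints
// compare the base64-encoded MD5 in the Content-MD5 header against the body
// they receive and reject mismatches, turning silent corruption in transit
// into a retryable upload error.
func contentMD5Header(data []byte) string {
	sum := md5.Sum(data)
	return base64.StdEncoding.EncodeToString(sum[:])
}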
// WriteTableFile reads a table file from the provided reader and writes it to the chunk store.
func (dcs *DoltChunkStore) WriteTableFile(ctx context.Context, fileId string, numChunks int, contentHash []byte, getRd func() (io.ReadCloser, uint64, error)) error {
	fileIdBytes := hash.Parse(fileId)
	return dcs.uploadTableFileWithRetries(ctx, fileIdBytes, uint64(numChunks), contentHash, getRd)
}

// AddTableFilesToManifest adds table files to the manifest.
func (dcs *DoltChunkStore) AddTableFilesToManifest(ctx context.Context, fileIdToNumChunks map[string]int) error {
	chnkTblInfo := make([]*remotesapi.ChunkTableInfo, 0, len(fileIdToNumChunks))

	debugStr := ""
	for fileId, numChunks := range fileIdToNumChunks {
		debugStr += fmt.Sprintln(fileId, ":", numChunks)
		fileIdBytes := hash.Parse(fileId)
		chnkTblInfo = append(chnkTblInfo, &remotesapi.ChunkTableInfo{Hash: fileIdBytes[:], ChunkCount: uint32(numChunks)})
	}

	id, token := dcs.getRepoId()
	dcs.logf("Adding Table files to repo: %s -\n%s", dcs.repoPath, debugStr)
	atReq := &remotesapi.AddTableFilesRequest{
		RepoId:         id,
		RepoToken:      token,
		RepoPath:       dcs.repoPath,
		ChunkTableInfo: chnkTblInfo,
		ClientRepoFormat: &remotesapi.ClientRepoFormat{
			NbfVersion: dcs.nbf.VersionString(),
			NbsVersion: nbs.StorageVersion,
		},
	}

	atResp, err := dcs.csClient.AddTableFiles(ctx, atReq)
	if err != nil {
		return NewRpcError(err, "AddTableFiles", dcs.host, atReq)
	}

	if atResp.RepoToken != "" {
		dcs.repoToken.Store(atResp.RepoToken)
	}

	if !atResp.Success {
		return errors.New("update table files failed")
	}

	return nil
}

// PruneTableFiles deletes old table files that are no longer referenced in the manifest.
func (dcs *DoltChunkStore) PruneTableFiles(ctx context.Context) error {
	return chunks.ErrUnsupportedOperation
}

// Sources retrieves the current root hash, a list of all the table files
// (which may include appendix table files), and a list of only the appendix
// table files.
func (dcs *DoltChunkStore) Sources(ctx context.Context) (hash.Hash, []chunks.TableFile, []chunks.TableFile, error) {
	id, token := dcs.getRepoId()
	req := &remotesapi.ListTableFilesRequest{RepoId: id, RepoPath: dcs.repoPath, RepoToken: token}
	resp, err := dcs.csClient.ListTableFiles(ctx, req)
	if err != nil {
		return hash.Hash{}, nil, nil, NewRpcError(err, "ListTableFiles", dcs.host, req)
	}
	if resp.RepoToken != "" {
		dcs.repoToken.Store(resp.RepoToken)
	}
	sourceFiles := getTableFiles(dcs, resp.TableFileInfo)
	appendixFiles := getTableFiles(dcs, resp.AppendixTableFileInfo)
	return hash.New(resp.RootHash), sourceFiles, appendixFiles, nil
}

func getTableFiles(dcs *DoltChunkStore, infoList []*remotesapi.TableFileInfo) []chunks.TableFile {
	tableFiles := make([]chunks.TableFile, 0)
	for _, nfo := range infoList {
		tableFiles = append(tableFiles, DoltRemoteTableFile{dcs, nfo})
	}
	return tableFiles
}

func (dcs *DoltChunkStore) Size(ctx context.Context) (uint64, error) {
	return dcs.metadata.StorageSize, nil
}
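// pushTableFile is an illustrative sketch, not part of the store's API, of
// the two-phase push implied by the TableFileStore methods above: first
// upload the file bytes with WriteTableFile, then make them visible by
// registering the file in the manifest with AddTableFilesToManifest. The
// arguments mirror WriteTableFile; nothing here is new API.
func pushTableFile(ctx context.Context, dcs *DoltChunkStore, fileId string, numChunks int, contentHash []byte, getRd func() (io.ReadCloser, uint64, error)) error {
	if err := dcs.WriteTableFile(ctx, fileId, numChunks, contentHash, getRd); err != nil {
		return err
	}
	return dcs.AddTableFilesToManifest(ctx, map[string]int{fileId: numChunks})
}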
// SetRootChunk changes the root chunk hash from the previous value to the new root.
func (dcs *DoltChunkStore) SetRootChunk(ctx context.Context, root, previous hash.Hash) error {
	panic("Not Implemented")
}

// DoltRemoteTableFile is an implementation of a TableFile that lives in a DoltChunkStore.
type DoltRemoteTableFile struct {
	dcs  *DoltChunkStore
	info *remotesapi.TableFileInfo
}

// LocationPrefix returns the empty string; remote table files have no location prefix.
func (drtf DoltRemoteTableFile) LocationPrefix() string {
	return ""
}

// FileID gets the id of the file.
func (drtf DoltRemoteTableFile) FileID() string {
	id := drtf.info.FileId

	// Early versions of |dolt| could return GenerationalChunkStore
	// TableFile implementations where FileID included an `oldgen/` prefix.
	// If we are communicating with a remotesrv from one of those versions,
	// we may see this prefix. It is not relevant to how we address the
	// table file locally, so we prune it here.
	id = strings.TrimPrefix(id, "oldgen/")

	return id
}

// NumChunks returns the number of chunks in a table file.
func (drtf DoltRemoteTableFile) NumChunks() int {
	return int(drtf.info.NumChunks)
}

var ErrRemoteTableFileGet = errors.New("HTTP GET for remote table file failed")

// sanitizeSignedUrl redacts most of the signature in a presigned URL so the
// URL can appear in logs and error messages without leaking the credential.
func sanitizeSignedUrl(url string) string {
	si := strings.Index(url, "Signature=")
	if si == -1 {
		return url
	}
	ei := strings.Index(url[si:], "&")
	if ei == -1 {
		return url[:si+15] + "..."
	} else {
		return url[:si+15] + "..." + url[si:][ei:]
	}
}

// Open returns an io.ReadCloser which can be used to read the bytes of a table file.
func (drtf DoltRemoteTableFile) Open(ctx context.Context) (io.ReadCloser, uint64, error) {
	if drtf.info.RefreshAfter != nil && drtf.info.RefreshAfter.AsTime().After(time.Now()) {
		resp, err := drtf.dcs.csClient.RefreshTableFileUrl(ctx, drtf.info.RefreshRequest)
		if err == nil {
			drtf.info.Url = resp.Url
			drtf.info.RefreshAfter = resp.RefreshAfter
		}
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, drtf.info.Url, nil)
	if err != nil {
		return nil, 0, err
	}

	resp, err := drtf.dcs.httpFetcher.Do(req)
	if err != nil {
		return nil, 0, err
	}

	if resp.StatusCode/100 != 2 {
		defer resp.Body.Close()
		body := make([]byte, 4096)
		n, _ := io.ReadFull(resp.Body, body)
		return nil, 0, fmt.Errorf("%w: status code: %d;\nurl: %s\n\nbody:\n\n%s\n", ErrRemoteTableFileGet, resp.StatusCode, sanitizeSignedUrl(drtf.info.Url), string(body[0:n]))
	}

	return resp.Body, uint64(resp.ContentLength), nil
}
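// copyTableFile is an illustrative sketch, not part of the store's API, of
// consuming a table file returned by Sources via Open above: it streams the
// remote bytes into |w| and closes the response body. A real caller (for
// example, a clone) would write into a local table file instead; the reported
// size comes from the server's Content-Length.
func copyTableFile(ctx context.Context, tf chunks.TableFile, w io.Writer) (uint64, error) {
	rd, n, err := tf.Open(ctx)
	if err != nil {
		return 0, err
	}
	defer rd.Close()
	if _, err := io.Copy(w, rd); err != nil {
		return 0, err
	}
	return n, nil
}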