github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/nbs/aws_table_persister.go

// Copyright 2019 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// This file incorporates work covered by the following copyright and
// permission notice:
//
// Copyright 2016 Attic Labs, Inc. All rights reserved.
// Licensed under the Apache License, version 2.0:
// http://www.apache.org/licenses/LICENSE-2.0

package nbs

import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"io"
	"net/url"
	"sort"
	"sync"
	"time"

	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/service/s3"
	"github.com/aws/aws-sdk-go/service/s3/s3iface"
	"github.com/aws/aws-sdk-go/service/s3/s3manager"

	"github.com/dolthub/dolt/go/store/atomicerr"
	"github.com/dolthub/dolt/go/store/chunks"
	"github.com/dolthub/dolt/go/store/hash"
	"github.com/dolthub/dolt/go/store/util/verbose"
)

const (
	minS3PartSize = 5 * 1 << 20  // 5MiB
	maxS3PartSize = 64 * 1 << 20 // 64MiB
	maxS3Parts    = 10000

	defaultS3PartSize = minS3PartSize // the smallest part size S3 permits; using it allows for the most throughput
)

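// awsTablePersister persists and conjoins table files as objects in an S3
// bucket, optionally under the key-space prefix |ns|. Large writes are
// performed as S3 multipart uploads. |rl|, if non-nil, is a semaphore channel
// used to rate-limit concurrent S3 requests.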
type awsTablePersister struct {
	s3     s3iface.S3API
	bucket string
	rl     chan struct{}
	limits awsLimits
	ns     string
	q      MemoryQuotaProvider
}

var _ tablePersister = awsTablePersister{}
var _ tableFilePersister = awsTablePersister{}

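// awsLimits holds the part-size limits used when assembling S3 multipart uploads.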
type awsLimits struct {
	partTarget, partMin, partMax uint64
}

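// Open returns a chunkSource backed by the S3 object for the table file |name|,
// which is expected to contain |chunkCount| chunks.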
func (s3p awsTablePersister) Open(ctx context.Context, name hash.Hash, chunkCount uint32, stats *Stats) (chunkSource, error) {
	return newAWSChunkSource(
		ctx,
		&s3ObjectReader{s3: s3p.s3, bucket: s3p.bucket, readRl: s3p.rl, ns: s3p.ns},
		s3p.limits,
		name,
		chunkCount,
		s3p.q,
		stats,
	)
}

func (s3p awsTablePersister) Exists(ctx context.Context, name hash.Hash, chunkCount uint32, stats *Stats) (bool, error) {
	return tableExistsInChunkSource(
		ctx,
		&s3ObjectReader{s3: s3p.s3, bucket: s3p.bucket, readRl: s3p.rl, ns: s3p.ns},
		s3p.limits,
		name,
		chunkCount,
		s3p.q,
		stats,
	)
}

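// CopyTableFile uploads the table file read from |r| (of size |fileSz|) to S3
// under |fileId| using a multipart upload.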
func (s3p awsTablePersister) CopyTableFile(ctx context.Context, r io.Reader, fileId string, fileSz uint64, chunkCount uint32) error {
	return s3p.multipartUpload(ctx, r, fileSz, fileId)
}

func (s3p awsTablePersister) Path() string {
	return s3p.bucket
}

func (s3p awsTablePersister) AccessMode() chunks.ExclusiveAccessMode {
	return chunks.ExclusiveAccessMode_Shared
}

type s3UploadedPart struct {
	idx  int64
	etag string
}

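// key prefixes |k| with the persister's namespace, if one is configured.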
func (s3p awsTablePersister) key(k string) string {
	if s3p.ns != "" {
		return s3p.ns + "/" + k
	}
	return k
}

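// Persist writes the novel chunks in |mt| (those not already present in |haver|)
// to a new table file in S3 and returns a chunkSource for reading it. If |mt|
// contributes no novel chunks, an emptyChunkSource is returned.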
func (s3p awsTablePersister) Persist(ctx context.Context, mt *memTable, haver chunkReader, stats *Stats) (chunkSource, error) {
	name, data, chunkCount, err := mt.write(haver, stats)

	if err != nil {
		return emptyChunkSource{}, err
	}

	if chunkCount == 0 {
		return emptyChunkSource{}, nil
	}

	err = s3p.multipartUpload(ctx, bytes.NewReader(data), uint64(len(data)), name.String())

	if err != nil {
		return emptyChunkSource{}, err
	}

	tra := &s3TableReaderAt{&s3ObjectReader{s3: s3p.s3, bucket: s3p.bucket, readRl: s3p.rl, ns: s3p.ns}, name}
	return newReaderFromIndexData(ctx, s3p.q, data, name, tra, s3BlockSize)
}

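// multipartUpload streams |r| to S3 under |key| via the s3manager uploader,
// using the persister's configured target part size.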
func (s3p awsTablePersister) multipartUpload(ctx context.Context, r io.Reader, sz uint64, key string) error {
	uploader := s3manager.NewUploaderWithClient(s3p.s3, func(u *s3manager.Uploader) {
		u.PartSize = int64(s3p.limits.partTarget)
	})
	_, err := uploader.Upload(&s3manager.UploadInput{
		Bucket: aws.String(s3p.bucket),
		Key:    aws.String(s3p.key(key)),
		Body:   r,
	})
	return err
}

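// startMultipartUpload begins a multipart upload for |key| and returns the
// upload ID assigned by S3.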
func (s3p awsTablePersister) startMultipartUpload(ctx context.Context, key string) (string, error) {
	result, err := s3p.s3.CreateMultipartUploadWithContext(ctx, &s3.CreateMultipartUploadInput{
		Bucket: aws.String(s3p.bucket),
		Key:    aws.String(s3p.key(key)),
	})

	if err != nil {
		return "", err
	}

	return *result.UploadId, nil
}

func (s3p awsTablePersister) abortMultipartUpload(ctx context.Context, key, uploadID string) error {
	_, abrtErr := s3p.s3.AbortMultipartUploadWithContext(ctx, &s3.AbortMultipartUploadInput{
		Bucket:   aws.String(s3p.bucket),
		Key:      aws.String(s3p.key(key)),
		UploadId: aws.String(uploadID),
	})

	return abrtErr
}

func (s3p awsTablePersister) completeMultipartUpload(ctx context.Context, key, uploadID string, mpu *s3.CompletedMultipartUpload) error {
	_, err := s3p.s3.CompleteMultipartUploadWithContext(ctx, &s3.CompleteMultipartUploadInput{
		Bucket:          aws.String(s3p.bucket),
		Key:             aws.String(s3p.key(key)),
		MultipartUpload: mpu,
		UploadId:        aws.String(uploadID),
	})

	return err
}

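// getNumParts returns how many upload parts are needed to send |dataLen| bytes
// in parts of at least |minPartSize| bytes, with a minimum of one part;
// e.g. getNumParts(12<<20, 5<<20) == 2.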
func getNumParts(dataLen, minPartSize uint64) uint64 {
	numParts := dataLen / minPartSize
	if numParts == 0 {
		numParts = 1
	}
	return numParts
}

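// partsByPartNum sorts completed parts by ascending part number, as required
// by S3's CompleteMultipartUpload.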
type partsByPartNum []*s3.CompletedPart

func (s partsByPartNum) Len() int {
	return len(s)
}

func (s partsByPartNum) Less(i, j int) bool {
	return *s[i].PartNumber < *s[j].PartNumber
}

func (s partsByPartNum) Swap(i, j int) {
	s[i], s[j] = s[j], s[i]
}

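// ConjoinAll combines the table files in |sources| into a single new table
// file in S3 via a multipart upload and returns a chunkSource for the
// conjoined table.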
func (s3p awsTablePersister) ConjoinAll(ctx context.Context, sources chunkSources, stats *Stats) (chunkSource, cleanupFunc, error) {
	plan, err := planRangeCopyConjoin(sources, stats)
	if err != nil {
		return nil, nil, err
	}

	if plan.chunkCount == 0 {
		return emptyChunkSource{}, nil, nil
	}
	t1 := time.Now()
	name := nameFromSuffixes(plan.suffixes())
	err = s3p.executeCompactionPlan(ctx, plan, name.String())

	if err != nil {
		return nil, nil, err
	}

	verbose.Logger(ctx).Sugar().Debugf("Compacted table of %d KiB in %s", plan.totalCompressedData/1024, time.Since(t1))

	tra := &s3TableReaderAt{&s3ObjectReader{s3: s3p.s3, bucket: s3p.bucket, readRl: s3p.rl, ns: s3p.ns}, name}
	cs, err := newReaderFromIndexData(ctx, s3p.q, plan.mergedIndex, name, tra, s3BlockSize)
	return cs, func() {}, err
}

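// executeCompactionPlan runs |plan| as an S3 multipart upload under |key|,
// aborting the upload if assembly fails.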
func (s3p awsTablePersister) executeCompactionPlan(ctx context.Context, plan compactionPlan, key string) error {
	uploadID, err := s3p.startMultipartUpload(ctx, key)

	if err != nil {
		return err
	}

	multipartUpload, err := s3p.assembleTable(ctx, plan, key, uploadID)
	if err != nil {
		_ = s3p.abortMultipartUpload(ctx, key, uploadID)
		return err
	}

	return s3p.completeMultipartUpload(ctx, key, uploadID, multipartUpload)
}

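// assembleTable uploads the parts of the conjoined table described by |plan|:
// sources large enough to be their own parts are copied server-side with
// UploadPartCopy, while the remaining data and the merged index are uploaded
// from a locally assembled buffer. It returns the completed part list needed
// to finish the multipart upload.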
func (s3p awsTablePersister) assembleTable(ctx context.Context, plan compactionPlan, key, uploadID string) (*s3.CompletedMultipartUpload, error) {
	if len(plan.sources.sws) > maxS3Parts {
		return nil, errors.New("exceeded maximum parts")
	}

	// Separate plan.sources by amount of chunkData. Tables with >5MB of chunk data
	// (copies) can be added to the new table using S3's multipart upload copy feature.
	// Smaller tables with <5MB of chunk data (manuals) must be read, assembled into
	// |buff|, and then re-uploaded in parts that are larger than 5MB.
	copies, manuals, buff, err := dividePlan(ctx, plan, s3p.limits.partMin, s3p.limits.partMax)

	if err != nil {
		return nil, err
	}

	ae := atomicerr.New()
	// Concurrently read data from small tables into |buff|
	var readWg sync.WaitGroup
	for _, man := range manuals {
		readWg.Add(1)
		go func(m manualPart) {
			defer readWg.Done()
			err := m.run(ctx, buff)
			if err != nil {
				ae.SetIfError(fmt.Errorf("failed to read conjoin table data: %w", err))
			}
		}(man)
	}
	readWg.Wait()

	if err := ae.Get(); err != nil {
		return nil, err
	}

	// sendPart calls |doUpload| to send part |partNum|, forwarding errors over |failed|
	// or success over |sent|. Closing (or sending) on |done| will cancel all in-progress
	// calls to sendPart.
	sent, failed, done := make(chan s3UploadedPart), make(chan error), make(chan struct{})
	var uploadWg sync.WaitGroup
	type uploadFn func() (etag string, err error)
	sendPart := func(partNum int64, doUpload uploadFn) {
		if s3p.rl != nil {
			s3p.rl <- struct{}{}
			defer func() { <-s3p.rl }()
		}
		defer uploadWg.Done()

		// Check if upload has been terminated
		select {
		case <-done:
			return
		default:
		}

		etag, err := doUpload()
		if err != nil {
			failed <- err
			return
		}
		// Try to send along part info. In the case that the upload was aborted, reading
		// from done allows this worker to exit correctly.
		select {
		case sent <- s3UploadedPart{partNum, etag}:
		case <-done:
			return
		}
	}

	// Concurrently begin sending all parts using sendPart().
	// First, kick off sending all the copyable parts.
	partNum := int64(1) // Part numbers are 1-indexed
	for _, cp := range copies {
		uploadWg.Add(1)
		go func(cp copyPart, partNum int64) {
			sendPart(partNum, func() (etag string, err error) {
				return s3p.uploadPartCopy(ctx, cp.name, cp.srcOffset, cp.srcLen, key, uploadID, partNum)
			})
		}(cp, partNum)
		partNum++
	}

	// Then, split buff (data from |manuals| and index) into parts and upload those concurrently.
	numManualParts := getNumParts(uint64(len(buff)), s3p.limits.partTarget) // TODO: What if this is too big?
	for i := uint64(0); i < numManualParts; i++ {
		start, end := i*s3p.limits.partTarget, (i+1)*s3p.limits.partTarget
		if i+1 == numManualParts { // If this is the last part, make sure it includes any overflow
			end = uint64(len(buff))
		}
		uploadWg.Add(1)
		go func(data []byte, partNum int64) {
			sendPart(partNum, func() (etag string, err error) {
				return s3p.uploadPart(ctx, data, key, uploadID, partNum)
			})
		}(buff[start:end], partNum)
		partNum++
	}

	// When all the uploads started above are done, close |sent| and |failed| so that the
	// code below will correctly detect that we're done sending parts and move forward.
	go func() {
		uploadWg.Wait()
		close(sent)
		close(failed)
	}()

	// Watch |sent| and |failed| for the results of part uploads. If ever one fails, close
	// |done| to stop all the in-progress or pending sendPart() calls and then bail.
	multipartUpload := &s3.CompletedMultipartUpload{}
	var firstFailure error
	for cont := true; cont; {
		select {
		case sentPart, open := <-sent:
			if open {
				multipartUpload.Parts = append(multipartUpload.Parts, &s3.CompletedPart{
					ETag:       aws.String(sentPart.etag),
					PartNumber: aws.Int64(sentPart.idx),
				})
			}
			cont = open

		case err := <-failed:
			if err != nil && firstFailure == nil { // nil err may happen when failed gets closed
				firstFailure = err
				close(done)
			}
		}
	}

	// If there was any failure detected above, |done| is already closed
	if firstFailure == nil {
		close(done)
	}
	sort.Sort(partsByPartNum(multipartUpload.Parts)) // S3 requires that these be in part-order
	return multipartUpload, firstFailure
}

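// copyPart identifies a byte range of an existing table file object in S3 that
// can be copied directly into the conjoined upload with UploadPartCopy.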
type copyPart struct {
	name              string
	srcOffset, srcLen int64
}

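// manualPart identifies a source whose chunk data must be read into
// buff[start:end] before being re-uploaded.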
type manualPart struct {
	src        chunkSource
	start, end int64
}

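// run reads the source's chunk data into buff[mp.start:mp.end].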
func (mp manualPart) run(ctx context.Context, buff []byte) error {
	reader, _, err := mp.src.reader(ctx)
	if err != nil {
		return err
	}
	defer reader.Close()
	_, err = io.ReadFull(reader, buff[mp.start:mp.end])
	return err
}

// dividePlan assumes that plan.sources (which is of type chunkSourcesByDescendingDataSize)
// is correctly sorted by descending data size.
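// It returns the byte ranges to copy server-side (|copies|), the tables too
// small to be copied whose data must be read and re-uploaded (|manuals|), and
// |buff|, which is sized to hold the manual tables' data followed by the
// merged index.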
func dividePlan(ctx context.Context, plan compactionPlan, minPartSize, maxPartSize uint64) (copies []copyPart, manuals []manualPart, buff []byte, err error) {
	// NB: if maxPartSize < 2*minPartSize, splitting large copies apart isn't solvable.
	// S3's limits are plenty far enough apart that this isn't a problem in production,
	// but we could violate this in tests.
	if maxPartSize < 2*minPartSize {
		return nil, nil, nil, errors.New("failed to split large copies apart")
	}

	buffSize := uint64(len(plan.mergedIndex))
	i := 0
	for ; i < len(plan.sources.sws); i++ {
		sws := plan.sources.sws[i]
		if sws.dataLen < minPartSize {
			// since plan.sources is sorted in descending chunk-data-length order, we know
			// that sws and all members after it are too small to copy.
			break
		}
		if sws.dataLen <= maxPartSize {
			h := sws.source.hash()
			copies = append(copies, copyPart{h.String(), 0, int64(sws.dataLen)})
			continue
		}

		// Now, we need to break the data into some number of parts such that for all parts
		// minPartSize <= size(part) <= maxPartSize. This code tries to split the part evenly,
		// such that all new parts satisfy the previous inequality. This gets tricky around
		// edge cases. Consider min = 5b and max = 10b and a data length of 101b. You need to
		// send 11 parts, but you can't just send 10 parts of 10 bytes and 1 part of 1 byte --
		// the last is too small. You also can't send 10 parts of 9 bytes each and 1 part of
		// 11 bytes, because the last is too big. You have to distribute the extra bytes
		// across all the parts so that all of them fall into the proper size range.
		lens := splitOnMaxSize(sws.dataLen, maxPartSize)

		var srcStart int64
		for _, length := range lens {
			h := sws.source.hash()
			copies = append(copies, copyPart{h.String(), srcStart, length})
			srcStart += length
		}
	}
	var offset int64
	for ; i < len(plan.sources.sws); i++ {
		sws := plan.sources.sws[i]
		manuals = append(manuals, manualPart{sws.source, offset, offset + int64(sws.dataLen)})
		offset += int64(sws.dataLen)
		buffSize += sws.dataLen
	}
	buff = make([]byte, buffSize)
	copy(buff[buffSize-uint64(len(plan.mergedIndex)):], plan.mergedIndex)
	return
}

// splitOnMaxSize splits |dataLen| into the fewest roughly-equal part sizes such that each is <= maxPartSize.
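// For example, splitOnMaxSize(101, 10) returns eleven parts: two of 10 bytes and nine of 9 bytes.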
func splitOnMaxSize(dataLen, maxPartSize uint64) []int64 {
	numParts := dataLen / maxPartSize
	if dataLen%maxPartSize > 0 {
		numParts++
	}
	baseSize := int64(dataLen / numParts)
	extraBytes := dataLen % numParts
	sizes := make([]int64, numParts)
	for i := range sizes {
		sizes[i] = baseSize
		if extraBytes > 0 {
			sizes[i]++
			extraBytes--
		}
	}
	return sizes
}

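// uploadPartCopy server-side copies a byte range of the existing S3 object |src|
// into part |partNum| of the multipart upload |uploadID|, returning the new
// part's ETag.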
func (s3p awsTablePersister) uploadPartCopy(ctx context.Context, src string, srcStart, srcEnd int64, key, uploadID string, partNum int64) (etag string, err error) {
	res, err := s3p.s3.UploadPartCopyWithContext(ctx, &s3.UploadPartCopyInput{
		CopySource:      aws.String(url.PathEscape(s3p.bucket + "/" + s3p.key(src))),
		CopySourceRange: aws.String(s3RangeHeader(srcStart, srcEnd)),
		Bucket:          aws.String(s3p.bucket),
		Key:             aws.String(s3p.key(key)),
		PartNumber:      aws.Int64(partNum),
		UploadId:        aws.String(uploadID),
	})
	if err == nil {
		etag = *res.CopyPartResult.ETag
	}
	return
}

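// uploadPart uploads |data| as part |partNum| of the multipart upload
// |uploadID|, returning the new part's ETag.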
func (s3p awsTablePersister) uploadPart(ctx context.Context, data []byte, key, uploadID string, partNum int64) (etag string, err error) {
	res, err := s3p.s3.UploadPartWithContext(ctx, &s3.UploadPartInput{
		Bucket:     aws.String(s3p.bucket),
		Key:        aws.String(s3p.key(key)),
		PartNumber: aws.Int64(partNum),
		UploadId:   aws.String(uploadID),
		Body:       bytes.NewReader(data),
	})
	if err == nil {
		etag = *res.ETag
	}
	return
}

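// PruneTableFiles is not supported for S3-backed table files.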
func (s3p awsTablePersister) PruneTableFiles(ctx context.Context, keeper func() []hash.Hash, t time.Time) error {
	return chunks.ErrUnsupportedOperation
}

func (s3p awsTablePersister) Close() error {
	return nil
}