github.com/grailbio/base@v0.0.11/s3util/s3copy.go

package s3util

import (
	"context"
	"fmt"
	"net/url"
	"strings"
	"time"

	"github.com/grailbio/base/errors"
	"github.com/grailbio/base/retry"
	"github.com/grailbio/base/traverse"

	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/service/s3"
	"github.com/aws/aws-sdk-go/service/s3/s3iface"
)

const (
	// DefaultS3ObjectCopySizeLimit is the max size of an object for a single PUT Object Copy request.
	// As per AWS: https://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectCOPY.html
	// the max size allowed is 5GB, but we use a smaller size here to speed up large file copies.
	DefaultS3ObjectCopySizeLimit = 256 << 20 // 256MiB

	// DefaultS3MultipartCopyPartSize is the max size of each part when doing a multi-part copy.
	// Note: Though we can do parts of size up to DefaultS3ObjectCopySizeLimit, for large files
	// using smaller parts (copied concurrently) is much faster.
	DefaultS3MultipartCopyPartSize = 128 << 20 // 128MiB

	// s3MultipartCopyConcurrencyLimit is the number of concurrent parts to copy during a multi-part copy.
	s3MultipartCopyConcurrencyLimit = 100

	defaultMaxRetries = 3
)
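
// As an illustrative example (not from the original source): with the defaults above,
// a 1GiB (1024MiB) source object exceeds DefaultS3ObjectCopySizeLimit (256MiB), so it
// is copied as a multi-part copy of ceil(1024/128) = 8 parts of at most 128MiB each,
// with up to s3MultipartCopyConcurrencyLimit parts copied concurrently.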

var (
	// DefaultRetryPolicy is the default retry policy: exponential backoff starting at 1s
	// (doubling, capped at 1 minute) with 25% jitter, for at most defaultMaxRetries retries.
	DefaultRetryPolicy = retry.MaxRetries(retry.Jitter(retry.Backoff(1*time.Second, time.Minute, 2), 0.25), defaultMaxRetries)
)

// Debugger is the interface used by Copier to emit debug logs; callers may provide
// their own implementation via NewCopierWithParams.
type Debugger interface {
	Debugf(format string, args ...interface{})
}

// noOpDebugger is a Debugger that discards all messages; it is used when no Debugger is provided.
type noOpDebugger struct{}

func (d noOpDebugger) Debugf(format string, args ...interface{}) {}
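
// An illustrative Debugger implementation (a sketch, not part of this package) that
// forwards debug messages to the standard library logger:
//
//	type logDebugger struct{}
//
//	func (logDebugger) Debugf(format string, args ...interface{}) {
//		log.Printf(format, args...)
//	}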

// Copier supports operations to copy S3 objects (within or across buckets)
// by using S3 APIs that support such copies directly (i.e., without having to
// stream the data by reading and writing).
//
// Since AWS doesn't allow copying large files in a single operation,
// this will do a multi-part copy in those cases.
// However, this behavior can also be controlled by setting appropriate values
// for S3ObjectCopySizeLimit and S3MultipartCopyPartSize.
type Copier struct {
	client  s3iface.S3API
	retrier retry.Policy

	// S3ObjectCopySizeLimit is the max size of an object for a single PUT Object Copy request.
	S3ObjectCopySizeLimit int64
	// S3MultipartCopyPartSize is the max size of each part when doing a multi-part copy.
	S3MultipartCopyPartSize int64

	Debugger
}

// NewCopier returns a Copier that uses DefaultRetryPolicy, DefaultS3ObjectCopySizeLimit
// and DefaultS3MultipartCopyPartSize, with debug logging disabled.
func NewCopier(client s3iface.S3API) *Copier {
	return NewCopierWithParams(client, DefaultRetryPolicy, DefaultS3ObjectCopySizeLimit, DefaultS3MultipartCopyPartSize, nil)
}

// NewCopierWithParams returns a Copier with the given retry policy and size limits.
// If debugger is nil, debug logging is disabled.
func NewCopierWithParams(client s3iface.S3API, retrier retry.Policy, s3ObjectCopySizeLimit int64, s3MultipartCopyPartSize int64, debugger Debugger) *Copier {
	if debugger == nil {
		debugger = noOpDebugger{}
	}
	return &Copier{
		client:                  client,
		retrier:                 retrier,
		S3ObjectCopySizeLimit:   s3ObjectCopySizeLimit,
		S3MultipartCopyPartSize: s3MultipartCopyPartSize,
		Debugger:                debugger,
	}
}
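
// Example usage (an illustrative sketch; the session setup and bucket/key values are
// hypothetical, and error handling is elided):
//
//	sess := session.Must(session.NewSession())
//	copier := NewCopier(s3.New(sess))
//	// srcSize is the size in bytes of the source object, e.g. from a prior HeadObject call.
//	err := copier.Copy(ctx, "s3://src-bucket/path/key", "s3://dst-bucket/path/key", srcSize, nil)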

// Copy copies the S3 object from srcUrl to dstUrl (both expected to be full S3 URLs).
// The size of the source object (srcSize) determines the behavior (whether the copy is
// done as a single or multi-part copy).
//
// dstMetadata must be set if the caller wishes to set the metadata on the dstUrl object.
// While the AWS API copies the metadata over when CopyObject is used, it does NOT do so
// for a multi-part copy, so this method requires that dstMetadata always be provided to
// remove ambiguity. That is, if metadata is desired on the dstUrl object, *it must
// always be provided*.
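//
// For example (an illustrative sketch; the metadata key/value below is hypothetical):
//
//	meta := map[string]*string{"species": aws.String("homo-sapiens")}
//	err := copier.Copy(ctx, srcUrl, dstUrl, srcSize, meta)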
func (c *Copier) Copy(ctx context.Context, srcUrl, dstUrl string, srcSize int64, dstMetadata map[string]*string) error {
	copySrc := strings.TrimPrefix(srcUrl, "s3://")
	dstBucket, dstKey, err := bucketKey(dstUrl)
	if err != nil {
		return err
	}
	if srcSize <= c.S3ObjectCopySizeLimit {
		// Do a single copy
		input := &s3.CopyObjectInput{
			Bucket:     aws.String(dstBucket),
			Key:        aws.String(dstKey),
			CopySource: aws.String(copySrc),
			Metadata:   dstMetadata,
		}
		for retries := 0; ; retries++ {
			_, err = c.client.CopyObjectWithContext(ctx, input)
			err = CtxErr(ctx, err)
			if err == nil {
				break
			}
			severity := Severity(err)
			if severity != errors.Temporary && severity != errors.Retriable {
				break
			}
			c.Debugf("s3copy.Copy: attempt (%d): %s -> %s\n%v\n", retries, srcUrl, dstUrl, err)
			if err = retry.Wait(ctx, c.retrier, retries); err != nil {
				break
			}
		}
		if err == nil {
			c.Debugf("s3copy.Copy: done: %s -> %s", srcUrl, dstUrl)
		}
		return err
	}
	// Do a multi-part copy
	numParts := (srcSize + c.S3MultipartCopyPartSize - 1) / c.S3MultipartCopyPartSize
	input := &s3.CreateMultipartUploadInput{
		Bucket:   aws.String(dstBucket),
		Key:      aws.String(dstKey),
		Metadata: dstMetadata,
	}
	createOut, err := c.client.CreateMultipartUploadWithContext(ctx, input)
	if err != nil {
		return errors.E(fmt.Sprintf("CreateMultipartUpload: %s -> %s", srcUrl, dstUrl), err)
	}
	completedParts := make([]*s3.CompletedPart, numParts)
	err = traverse.Limit(s3MultipartCopyConcurrencyLimit).Each(int(numParts), func(ti int) error {
		i := int64(ti)
		firstByte := i * c.S3MultipartCopyPartSize
		lastByte := firstByte + c.S3MultipartCopyPartSize - 1
		if lastByte >= srcSize {
			lastByte = srcSize - 1
		}
		var partErr error
		var uploadOut *s3.UploadPartCopyOutput
		for retries := 0; ; retries++ {
			uploadOut, partErr = c.client.UploadPartCopyWithContext(ctx, &s3.UploadPartCopyInput{
				Bucket:          aws.String(dstBucket),
				Key:             aws.String(dstKey),
				CopySource:      aws.String(copySrc),
				UploadId:        createOut.UploadId,
				PartNumber:      aws.Int64(i + 1),
				CopySourceRange: aws.String(fmt.Sprintf("bytes=%d-%d", firstByte, lastByte)),
			})
			partErr = CtxErr(ctx, partErr)
			if partErr == nil {
				break
			}
			severity := Severity(partErr)
			if severity != errors.Temporary && severity != errors.Retriable {
				break
			}
			c.Debugf("s3copy.Copy: attempt (%d) (part %d/%d): %s -> %s\n%v\n", retries, i, numParts, srcUrl, dstUrl, partErr)
			if partErr = retry.Wait(ctx, c.retrier, retries); partErr != nil {
				break
			}
		}
		if partErr == nil {
			completedParts[i] = &s3.CompletedPart{ETag: uploadOut.CopyPartResult.ETag, PartNumber: aws.Int64(i + 1)}
			c.Debugf("s3copy.Copy: done (part %d/%d): %s -> %s", i, numParts, srcUrl, dstUrl)
			return nil
		}
		return errors.E(fmt.Sprintf("upload part copy (part %d/%d) %s -> %s", i, numParts, srcUrl, dstUrl), partErr)
	})
	if err == nil {
		// Complete the multi-part copy
		for retries := 0; ; retries++ {
			_, err = c.client.CompleteMultipartUploadWithContext(ctx, &s3.CompleteMultipartUploadInput{
				Bucket:          aws.String(dstBucket),
				Key:             aws.String(dstKey),
				UploadId:        createOut.UploadId,
				MultipartUpload: &s3.CompletedMultipartUpload{Parts: completedParts},
			})
			if err == nil || Severity(err) != errors.Temporary {
				break
			}
			c.Debugf("s3copy.Copy complete upload: attempt (%d): %s -> %s\n%v\n", retries, srcUrl, dstUrl, err)
			if err = retry.Wait(ctx, c.retrier, retries); err != nil {
				break
			}
		}
		if err == nil {
			c.Debugf("s3copy.Copy: done (all %d parts): %s -> %s", numParts, srcUrl, dstUrl)
			return nil
		}
		err = errors.E(fmt.Sprintf("complete multipart upload %s -> %s", srcUrl, dstUrl), Severity(err), err)
	}
	// Abort the multi-part copy
	if _, er := c.client.AbortMultipartUploadWithContext(ctx, &s3.AbortMultipartUploadInput{
		Bucket:   aws.String(dstBucket),
		Key:      aws.String(dstKey),
		UploadId: createOut.UploadId,
	}); er != nil {
		err = fmt.Errorf("abort multipart copy %v (aborting due to original error: %v)", er, err)
	}
	return err
}

// bucketKey returns the bucket and key for the given S3 object URL, and an error (if any);
// e.g. "s3://bucket/path/to/key" yields ("bucket", "path/to/key").
func bucketKey(rawurl string) (string, string, error) {
	u, err := url.Parse(rawurl)
	if err != nil {
		return "", "", errors.E(errors.Invalid, errors.Fatal, fmt.Sprintf("cannot determine bucket and key from rawurl %s", rawurl), err)
	}
	bucket := u.Host
	return bucket, strings.TrimPrefix(rawurl, "s3://"+bucket+"/"), nil
}