github.com/grailbio/base@v0.0.11/s3util/s3copy.go

package s3util

import (
	"context"
	"fmt"
	"net/url"
	"strings"
	"time"

	"github.com/grailbio/base/errors"
	"github.com/grailbio/base/retry"
	"github.com/grailbio/base/traverse"

	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/service/s3"
	"github.com/aws/aws-sdk-go/service/s3/s3iface"
)

const (
	// DefaultS3ObjectCopySizeLimit is the max size of object for a single PUT Object Copy request.
	// As per AWS: https://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectCOPY.html
	// the max size allowed is 5GB, but we use a smaller size here to speed up large file copies.
	DefaultS3ObjectCopySizeLimit = 256 << 20 // 256MiB

	// DefaultS3MultipartCopyPartSize is the max size of each part when doing a multi-part copy.
	// Note: Though we can do parts of size up to DefaultS3ObjectCopySizeLimit, for large files
	// using smaller size parts (concurrently) is much faster.
	DefaultS3MultipartCopyPartSize = 128 << 20 // 128MiB

	// s3MultipartCopyConcurrencyLimit is the number of parts copied concurrently during a multi-part copy.
	s3MultipartCopyConcurrencyLimit = 100

	defaultMaxRetries = 3
)

var (
	// DefaultRetryPolicy is the default retry policy.
	DefaultRetryPolicy = retry.MaxRetries(retry.Jitter(retry.Backoff(1*time.Second, time.Minute, 2), 0.25), defaultMaxRetries)
)

// Debugger receives debug logging from Copier operations.
type Debugger interface {
	Debugf(format string, args ...interface{})
}

type noOpDebugger struct{}

func (d noOpDebugger) Debugf(format string, args ...interface{}) {}

// Copier supports operations to copy S3 objects (within or across buckets)
// by using S3 APIs that support the same (i.e., without having to stream the data by reading and writing).
//
// Since AWS doesn't allow copying large files in a single operation,
// this will do a multi-part copy object in those cases.
// However, this behavior can also be controlled by setting appropriate values
// for S3ObjectCopySizeLimit and S3MultipartCopyPartSize.
type Copier struct {
	client  s3iface.S3API
	retrier retry.Policy

	// S3ObjectCopySizeLimit is the max size of object for a single PUT Object Copy request.
	S3ObjectCopySizeLimit int64
	// S3MultipartCopyPartSize is the max size of each part when doing a multi-part copy.
	S3MultipartCopyPartSize int64

	Debugger
}

// NewCopier returns a Copier that uses the default retry policy and size limits.
func NewCopier(client s3iface.S3API) *Copier {
	return NewCopierWithParams(client, DefaultRetryPolicy, DefaultS3ObjectCopySizeLimit, DefaultS3MultipartCopyPartSize, nil)
}

// NewCopierWithParams returns a Copier that uses the given retry policy, size limits, and (optional) debugger.
func NewCopierWithParams(client s3iface.S3API, retrier retry.Policy, s3ObjectCopySizeLimit int64, s3MultipartCopyPartSize int64, debugger Debugger) *Copier {
	if debugger == nil {
		debugger = noOpDebugger{}
	}
	return &Copier{
		client:                  client,
		retrier:                 retrier,
		S3ObjectCopySizeLimit:   s3ObjectCopySizeLimit,
		S3MultipartCopyPartSize: s3MultipartCopyPartSize,
		Debugger:                debugger,
	}
}
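
// copyExample is an illustrative sketch (not part of the original file) of how a caller
// might use Copier. The bucket names, keys, and size below are hypothetical, and the
// client is assumed to be a configured S3 client (e.g. s3.New on an AWS session).
// The source object's size would typically be obtained beforehand, e.g. via HeadObject.
func copyExample(ctx context.Context, client s3iface.S3API) error {
	copier := NewCopier(client)
	const srcSize = 300 << 20 // hypothetical: a 300MiB source object
	// With the default limits, 300MiB exceeds DefaultS3ObjectCopySizeLimit (256MiB),
	// so Copy performs a multi-part copy using 128MiB parts.
	// Passing nil dstMetadata means no metadata is set on the destination object.
	return copier.Copy(ctx, "s3://src-bucket/path/to/object", "s3://dst-bucket/path/to/object", srcSize, nil)
}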

// Copy copies the S3 object from srcUrl to dstUrl (both expected to be full S3 URLs).
// The size of the source object (srcSize) determines whether the copy is done as a single or multi-part copy.
//
// dstMetadata must be set if the caller wishes to set the metadata on the dstUrl object.
// While the AWS API copies the metadata over when CopyObject is used, it does NOT do so when a multi-part copy is done,
// so to remove ambiguity this method requires that dstMetadata always be provided.
// That is, if metadata is desired on the dstUrl object, *it must always be provided*.
func (c *Copier) Copy(ctx context.Context, srcUrl, dstUrl string, srcSize int64, dstMetadata map[string]*string) error {
	copySrc := strings.TrimPrefix(srcUrl, "s3://")
	dstBucket, dstKey, err := bucketKey(dstUrl)
	if err != nil {
		return err
	}
	if srcSize <= c.S3ObjectCopySizeLimit {
		// Do a single copy.
		input := &s3.CopyObjectInput{
			Bucket:     aws.String(dstBucket),
			Key:        aws.String(dstKey),
			CopySource: aws.String(copySrc),
			Metadata:   dstMetadata,
		}
		for retries := 0; ; retries++ {
			_, err = c.client.CopyObjectWithContext(ctx, input)
			err = CtxErr(ctx, err)
			if err == nil {
				break
			}
			severity := Severity(err)
			if severity != errors.Temporary && severity != errors.Retriable {
				break
			}
			c.Debugf("s3copy.Copy: attempt (%d): %s -> %s\n%v\n", retries, srcUrl, dstUrl, err)
			if err = retry.Wait(ctx, c.retrier, retries); err != nil {
				break
			}
		}
		if err == nil {
			c.Debugf("s3copy.Copy: done: %s -> %s", srcUrl, dstUrl)
		}
		return err
	}
	// Do a multi-part copy.
	numParts := (srcSize + c.S3MultipartCopyPartSize - 1) / c.S3MultipartCopyPartSize
	input := &s3.CreateMultipartUploadInput{
		Bucket:   aws.String(dstBucket),
		Key:      aws.String(dstKey),
		Metadata: dstMetadata,
	}
	createOut, err := c.client.CreateMultipartUploadWithContext(ctx, input)
	if err != nil {
		return errors.E(fmt.Sprintf("CreateMultipartUpload: %s -> %s", srcUrl, dstUrl), err)
	}
	completedParts := make([]*s3.CompletedPart, numParts)
	err = traverse.Limit(s3MultipartCopyConcurrencyLimit).Each(int(numParts), func(ti int) error {
		i := int64(ti)
		firstByte := i * c.S3MultipartCopyPartSize
		lastByte := firstByte + c.S3MultipartCopyPartSize - 1
		if lastByte >= srcSize {
			lastByte = srcSize - 1
		}
		var partErr error
		var uploadOut *s3.UploadPartCopyOutput
		for retries := 0; ; retries++ {
			uploadOut, partErr = c.client.UploadPartCopyWithContext(ctx, &s3.UploadPartCopyInput{
				Bucket:          aws.String(dstBucket),
				Key:             aws.String(dstKey),
				CopySource:      aws.String(copySrc),
				UploadId:        createOut.UploadId,
				PartNumber:      aws.Int64(i + 1),
				CopySourceRange: aws.String(fmt.Sprintf("bytes=%d-%d", firstByte, lastByte)),
			})
			partErr = CtxErr(ctx, partErr)
			if partErr == nil {
				break
			}
			severity := Severity(partErr)
			if severity != errors.Temporary && severity != errors.Retriable {
				break
			}
			c.Debugf("s3copy.Copy: attempt (%d) (part %d/%d): %s -> %s\n%v\n", retries, i, numParts, srcUrl, dstUrl, partErr)
			if partErr = retry.Wait(ctx, c.retrier, retries); partErr != nil {
				break
			}
		}
		if partErr == nil {
			completedParts[i] = &s3.CompletedPart{ETag: uploadOut.CopyPartResult.ETag, PartNumber: aws.Int64(i + 1)}
			c.Debugf("s3copy.Copy: done (part %d/%d): %s -> %s", i, numParts, srcUrl, dstUrl)
			return nil
		}
		return errors.E(fmt.Sprintf("upload part copy (part %d/%d) %s -> %s", i, numParts, srcUrl, dstUrl), partErr)
	})
	if err == nil {
		// Complete the multi-part copy.
		for retries := 0; ; retries++ {
			_, err = c.client.CompleteMultipartUploadWithContext(ctx, &s3.CompleteMultipartUploadInput{
				Bucket:          aws.String(dstBucket),
				Key:             aws.String(dstKey),
				UploadId:        createOut.UploadId,
				MultipartUpload: &s3.CompletedMultipartUpload{Parts: completedParts},
			})
			if err == nil || Severity(err) != errors.Temporary {
				break
			}
			c.Debugf("s3copy.Copy complete upload: attempt (%d): %s -> %s\n%v\n", retries, srcUrl, dstUrl, err)
			if err = retry.Wait(ctx, c.retrier, retries); err != nil {
				break
			}
		}
		if err == nil {
			c.Debugf("s3copy.Copy: done (all %d parts): %s -> %s", numParts, srcUrl, dstUrl)
			return nil
		}
		err = errors.E(fmt.Sprintf("complete multipart upload %s -> %s", srcUrl, dstUrl), Severity(err), err)
	}
	// Abort the multi-part copy.
	if _, er := c.client.AbortMultipartUploadWithContext(ctx, &s3.AbortMultipartUploadInput{
		Bucket:   aws.String(dstBucket),
		Key:      aws.String(dstKey),
		UploadId: createOut.UploadId,
	}); er != nil {
		err = fmt.Errorf("abort multipart copy %v (aborting due to original error: %v)", er, err)
	}
	return err
}
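
// partRangesExample is an illustrative sketch (not part of the original file) of the part
// math Copy uses for a multi-part copy: the number of parts is the ceiling of
// srcSize/partSize, and part i copies the byte range
// [i*partSize, min((i+1)*partSize, srcSize)-1]. For example, a 300MiB object with the
// default 128MiB part size yields 3 parts:
// bytes=0-134217727, bytes=134217728-268435455, bytes=268435456-314572799.
func partRangesExample(srcSize, partSize int64) []string {
	numParts := (srcSize + partSize - 1) / partSize
	ranges := make([]string, numParts)
	for i := int64(0); i < numParts; i++ {
		firstByte := i * partSize
		lastByte := firstByte + partSize - 1
		if lastByte >= srcSize {
			// The final part is truncated to the end of the source object.
			lastByte = srcSize - 1
		}
		ranges[i] = fmt.Sprintf("bytes=%d-%d", firstByte, lastByte)
	}
	return ranges
}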

// bucketKey returns the bucket and key for the given S3 object URL, and an error (if any).
func bucketKey(rawurl string) (string, string, error) {
	u, err := url.Parse(rawurl)
	if err != nil {
		return "", "", errors.E(errors.Invalid, errors.Fatal, fmt.Sprintf("cannot determine bucket and key from rawurl %s", rawurl), err)
	}
	bucket := u.Host
	return bucket, strings.TrimPrefix(rawurl, "s3://"+bucket+"/"), nil
}
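
// bucketKeyExample is an illustrative sketch (not part of the original file) of bucketKey's
// behavior on a hypothetical URL: for "s3://my-bucket/path/to/object" it returns
// bucket "my-bucket", key "path/to/object", and a nil error.
func bucketKeyExample() (string, string, error) {
	return bucketKey("s3://my-bucket/path/to/object")
}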