github.com/hasnat/dolt/go@v0.0.0-20210628190320-9eb5d843fbb7/store/nbs/s3_table_reader.go (about) 1 // Copyright 2019-2021 Dolthub, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 // 15 // This file incorporates work covered by the following copyright and 16 // permission notice: 17 // 18 // Copyright 2016 Attic Labs, Inc. All rights reserved. 19 // Licensed under the Apache License, version 2.0: 20 // http://www.apache.org/licenses/LICENSE-2.0 21 22 package nbs 23 24 import ( 25 "context" 26 "fmt" 27 "io" 28 "net" 29 "os" 30 "strconv" 31 "strings" 32 "sync/atomic" 33 "syscall" 34 "time" 35 36 "github.com/aws/aws-sdk-go/aws" 37 "github.com/aws/aws-sdk-go/aws/request" 38 "github.com/aws/aws-sdk-go/service/s3" 39 "github.com/jpillora/backoff" 40 "golang.org/x/sync/errgroup" 41 ) 42 43 const ( 44 s3RangePrefix = "bytes" 45 s3BlockSize = (1 << 10) * 512 // 512K 46 ) 47 48 type s3TableReaderAt struct { 49 s3 *s3ObjectReader 50 h addr 51 } 52 53 type s3svc interface { 54 AbortMultipartUploadWithContext(ctx aws.Context, input *s3.AbortMultipartUploadInput, opts ...request.Option) (*s3.AbortMultipartUploadOutput, error) 55 CreateMultipartUploadWithContext(ctx aws.Context, input *s3.CreateMultipartUploadInput, opts ...request.Option) (*s3.CreateMultipartUploadOutput, error) 56 UploadPartWithContext(ctx aws.Context, input *s3.UploadPartInput, opts ...request.Option) (*s3.UploadPartOutput, error) 57 UploadPartCopyWithContext(ctx aws.Context, input *s3.UploadPartCopyInput, opts ...request.Option) (*s3.UploadPartCopyOutput, error) 58 CompleteMultipartUploadWithContext(ctx aws.Context, input *s3.CompleteMultipartUploadInput, opts ...request.Option) (*s3.CompleteMultipartUploadOutput, error) 59 GetObjectWithContext(ctx aws.Context, input *s3.GetObjectInput, opts ...request.Option) (*s3.GetObjectOutput, error) 60 PutObjectWithContext(ctx aws.Context, input *s3.PutObjectInput, opts ...request.Option) (*s3.PutObjectOutput, error) 61 } 62 63 func (s3tra *s3TableReaderAt) ReadAtWithStats(ctx context.Context, p []byte, off int64, stats *Stats) (n int, err error) { 64 return s3tra.s3.ReadAt(ctx, s3tra.h, p, off, stats) 65 } 66 67 // TODO: Bring all the multipart upload and remote-conjoin stuff over here and make this a better analogue to ddbTableStore 68 type s3ObjectReader struct { 69 s3 s3svc 70 bucket string 71 readRl chan struct{} 72 tc tableCache 73 ns string 74 } 75 76 func (s3or *s3ObjectReader) key(k string) string { 77 if s3or.ns != "" { 78 return s3or.ns + "/" + k 79 } 80 return k 81 } 82 83 func (s3or *s3ObjectReader) ReadAt(ctx context.Context, name addr, p []byte, off int64, stats *Stats) (n int, err error) { 84 t1 := time.Now() 85 86 if s3or.tc != nil { 87 var r io.ReaderAt 88 r, err = s3or.tc.checkout(name) 89 90 if err != nil { 91 return 0, err 92 } 93 94 if r != nil { 95 defer func() { 96 stats.FileBytesPerRead.Sample(uint64(len(p))) 97 stats.FileReadLatency.SampleTimeSince(t1) 98 }() 99 100 defer func() { 101 checkinErr := s3or.tc.checkin(name) 102 103 if err == nil { 104 err = checkinErr 105 } 106 }() 107 108 n, err = r.ReadAt(p, off) 109 return 110 } 111 } 112 113 defer func() { 114 stats.S3BytesPerRead.Sample(uint64(len(p))) 115 stats.S3ReadLatency.SampleTimeSince(t1) 116 }() 117 118 n, _, err = s3or.readRange(ctx, name, p, s3RangeHeader(off, int64(len(p)))) 119 return 120 } 121 122 func s3RangeHeader(off, length int64) string { 123 lastByte := off + length - 1 // insanely, the HTTP range header specifies ranges inclusively. 124 return fmt.Sprintf("%s=%d-%d", s3RangePrefix, off, lastByte) 125 } 126 127 const maxS3ReadFromEndReqSize = 256 * 1024 * 1024 // 256MB 128 const preferredS3ReadFromEndReqSize = 128 * 1024 * 1024 // 128MB 129 130 func (s3or *s3ObjectReader) ReadFromEnd(ctx context.Context, name addr, p []byte, stats *Stats) (n int, sz uint64, err error) { 131 // TODO: enable this to use the tableCache. The wrinkle is the tableCache currently just returns a ReaderAt, which doesn't give you the length of the object that backs it, so you can't calculate an offset if all you know is that you want the last N bytes. 132 defer func(t1 time.Time) { 133 stats.S3BytesPerRead.Sample(uint64(len(p))) 134 stats.S3ReadLatency.SampleTimeSince(t1) 135 }(time.Now()) 136 totalN := uint64(0) 137 if len(p) > maxS3ReadFromEndReqSize { 138 // If we're bigger than 256MB, parallelize the read... 139 // Read the footer first and capture the size of the entire table file. 140 n, sz, err := s3or.readRange(ctx, name, p[len(p)-footerSize:], fmt.Sprintf("%s=-%d", s3RangePrefix, footerSize)) 141 if err != nil { 142 return n, sz, err 143 } 144 totalN += uint64(n) 145 eg, egctx := errgroup.WithContext(ctx) 146 start := 0 147 for start < len(p)-footerSize { 148 // Make parallel read requests of up to 128MB. 149 end := start + preferredS3ReadFromEndReqSize 150 if end > len(p)-footerSize { 151 end = len(p) - footerSize 152 } 153 bs := p[start:end] 154 rangeStart := sz - uint64(len(p)) + uint64(start) 155 rangeEnd := sz - uint64(len(p)) + uint64(end) - 1 156 eg.Go(func() error { 157 n, _, err := s3or.readRange(egctx, name, bs, fmt.Sprintf("%s=%d-%d", s3RangePrefix, rangeStart, rangeEnd)) 158 if err != nil { 159 return err 160 } 161 atomic.AddUint64(&totalN, uint64(n)) 162 return nil 163 }) 164 start = end 165 } 166 err = eg.Wait() 167 if err != nil { 168 return 0, 0, err 169 } 170 return int(totalN), sz, nil 171 } 172 return s3or.readRange(ctx, name, p, fmt.Sprintf("%s=-%d", s3RangePrefix, len(p))) 173 } 174 175 func (s3or *s3ObjectReader) readRange(ctx context.Context, name addr, p []byte, rangeHeader string) (n int, sz uint64, err error) { 176 read := func() (int, uint64, error) { 177 if s3or.readRl != nil { 178 s3or.readRl <- struct{}{} 179 defer func() { 180 <-s3or.readRl 181 }() 182 } 183 184 input := &s3.GetObjectInput{ 185 Bucket: aws.String(s3or.bucket), 186 Key: aws.String(s3or.key(name.String())), 187 Range: aws.String(rangeHeader), 188 } 189 190 result, err := s3or.s3.GetObjectWithContext(ctx, input) 191 if err != nil { 192 return 0, 0, err 193 } 194 defer result.Body.Close() 195 196 if *result.ContentLength != int64(len(p)) { 197 return 0, 0, fmt.Errorf("failed to read entire range, key: %v, len(p): %d, rangeHeader: %s, ContentLength: %d", s3or.key(name.String()), len(p), rangeHeader, *result.ContentLength) 198 } 199 200 sz := uint64(0) 201 if result.ContentRange != nil { 202 i := strings.Index(*result.ContentRange, "/") 203 if i != -1 { 204 sz, err = strconv.ParseUint((*result.ContentRange)[i+1:], 10, 64) 205 if err != nil { 206 return 0, 0, err 207 } 208 } 209 } 210 211 n, err = io.ReadFull(result.Body, p) 212 return n, sz, err 213 } 214 215 n, sz, err = read() 216 // We hit the point of diminishing returns investigating #3255, so add retries. In conversations with AWS people, it's not surprising to get transient failures when talking to S3, though SDKs are intended to have their own retrying. The issue may be that, in Go, making the S3 request and reading the data are separate operations, and the SDK kind of can't do its own retrying to handle failures in the latter. 217 if isConnReset(err) { 218 // We are backing off here because its possible and likely that the rate of requests to S3 is the underlying issue. 219 b := &backoff.Backoff{ 220 Min: 128 * time.Microsecond, 221 Max: 1024 * time.Millisecond, 222 Factor: 2, 223 Jitter: true, 224 } 225 for ; isConnReset(err); n, sz, err = read() { 226 dur := b.Duration() 227 time.Sleep(dur) 228 } 229 } 230 231 return n, sz, err 232 } 233 234 func isConnReset(err error) bool { 235 nErr, ok := err.(*net.OpError) 236 if !ok { 237 return false 238 } 239 scErr, ok := nErr.Err.(*os.SyscallError) 240 return ok && scErr.Err == syscall.ECONNRESET 241 }