github.com/hasnat/dolt/go@v0.0.0-20210628190320-9eb5d843fbb7/store/nbs/s3_table_reader.go

// Copyright 2019-2021 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// This file incorporates work covered by the following copyright and
// permission notice:
//
// Copyright 2016 Attic Labs, Inc. All rights reserved.
// Licensed under the Apache License, version 2.0:
// http://www.apache.org/licenses/LICENSE-2.0

package nbs

import (
	"context"
	"fmt"
	"io"
	"net"
	"os"
	"strconv"
	"strings"
	"sync/atomic"
	"syscall"
	"time"

	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/aws/request"
	"github.com/aws/aws-sdk-go/service/s3"
	"github.com/jpillora/backoff"
	"golang.org/x/sync/errgroup"
)

const (
	s3RangePrefix = "bytes"
	s3BlockSize   = (1 << 10) * 512 // 512K
)

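// s3TableReaderAt reads a single table file, identified by its addr h, by
// delegating ranged reads to a shared s3ObjectReader.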
type s3TableReaderAt struct {
	s3 *s3ObjectReader
	h  addr
}

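// s3svc is the narrow slice of the AWS S3 API used by this package. It is
// satisfied by the SDK's *s3.S3 client; keeping it as an interface here also
// makes it possible to substitute a test double for a real client.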
type s3svc interface {
	AbortMultipartUploadWithContext(ctx aws.Context, input *s3.AbortMultipartUploadInput, opts ...request.Option) (*s3.AbortMultipartUploadOutput, error)
	CreateMultipartUploadWithContext(ctx aws.Context, input *s3.CreateMultipartUploadInput, opts ...request.Option) (*s3.CreateMultipartUploadOutput, error)
	UploadPartWithContext(ctx aws.Context, input *s3.UploadPartInput, opts ...request.Option) (*s3.UploadPartOutput, error)
	UploadPartCopyWithContext(ctx aws.Context, input *s3.UploadPartCopyInput, opts ...request.Option) (*s3.UploadPartCopyOutput, error)
	CompleteMultipartUploadWithContext(ctx aws.Context, input *s3.CompleteMultipartUploadInput, opts ...request.Option) (*s3.CompleteMultipartUploadOutput, error)
	GetObjectWithContext(ctx aws.Context, input *s3.GetObjectInput, opts ...request.Option) (*s3.GetObjectOutput, error)
	PutObjectWithContext(ctx aws.Context, input *s3.PutObjectInput, opts ...request.Option) (*s3.PutObjectOutput, error)
}

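// ReadAtWithStats reads len(p) bytes from the table file identified by
// s3tra.h, starting at offset off, and records read size and latency in stats.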
func (s3tra *s3TableReaderAt) ReadAtWithStats(ctx context.Context, p []byte, off int64, stats *Stats) (n int, err error) {
	return s3tra.s3.ReadAt(ctx, s3tra.h, p, off, stats)
}

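// s3ObjectReader reads NBS table files stored as objects in an S3 bucket,
// under an optional namespace prefix (ns). Reads consult the local tableCache
// (tc) first when one is configured, and concurrent S3 requests are bounded by
// the readRl semaphore channel when it is non-nil.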
// TODO: Bring all the multipart upload and remote-conjoin stuff over here and make this a better analogue to ddbTableStore
type s3ObjectReader struct {
	s3     s3svc
	bucket string
	readRl chan struct{}
	tc     tableCache
	ns     string
}

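// key returns the full S3 object key for k, prefixing it with the reader's
// namespace when one is set.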
func (s3or *s3ObjectReader) key(k string) string {
	if s3or.ns != "" {
		return s3or.ns + "/" + k
	}
	return k
}

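// ReadAt reads len(p) bytes at offset off from the table file addressed by
// name. If a tableCache is configured and has the file available, the read is
// served locally and recorded under the File* stats; otherwise a ranged GET is
// issued against S3 and recorded under the S3* stats.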
func (s3or *s3ObjectReader) ReadAt(ctx context.Context, name addr, p []byte, off int64, stats *Stats) (n int, err error) {
	t1 := time.Now()

	if s3or.tc != nil {
		var r io.ReaderAt
		r, err = s3or.tc.checkout(name)

		if err != nil {
			return 0, err
		}

		if r != nil {
			defer func() {
				stats.FileBytesPerRead.Sample(uint64(len(p)))
				stats.FileReadLatency.SampleTimeSince(t1)
			}()

			defer func() {
				checkinErr := s3or.tc.checkin(name)

				if err == nil {
					err = checkinErr
				}
			}()

			n, err = r.ReadAt(p, off)
			return
		}
	}

	defer func() {
		stats.S3BytesPerRead.Sample(uint64(len(p)))
		stats.S3ReadLatency.SampleTimeSince(t1)
	}()

	n, _, err = s3or.readRange(ctx, name, p, s3RangeHeader(off, int64(len(p))))
	return
}

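// s3RangeHeader formats an HTTP Range header value covering length bytes
// starting at off. Both endpoints are inclusive, so, for example,
// s3RangeHeader(0, 512) yields "bytes=0-511".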
func s3RangeHeader(off, length int64) string {
	lastByte := off + length - 1 // insanely, the HTTP range header specifies ranges inclusively.
	return fmt.Sprintf("%s=%d-%d", s3RangePrefix, off, lastByte)
}

const maxS3ReadFromEndReqSize = 256 * 1024 * 1024       // 256MB
const preferredS3ReadFromEndReqSize = 128 * 1024 * 1024 // 128MB

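// ReadFromEnd reads the last len(p) bytes of the table file addressed by name
// using S3 suffix-range requests, returning the bytes read and the total
// object size. Reads larger than maxS3ReadFromEndReqSize fetch the footer
// first to learn the object size, then issue parallel range reads of up to
// preferredS3ReadFromEndReqSize bytes each.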
func (s3or *s3ObjectReader) ReadFromEnd(ctx context.Context, name addr, p []byte, stats *Stats) (n int, sz uint64, err error) {
	// TODO: enable this to use the tableCache. The wrinkle is the tableCache
	// currently just returns a ReaderAt, which doesn't give you the length of the
	// object that backs it, so you can't calculate an offset if all you know is
	// that you want the last N bytes.
	defer func(t1 time.Time) {
		stats.S3BytesPerRead.Sample(uint64(len(p)))
		stats.S3ReadLatency.SampleTimeSince(t1)
	}(time.Now())
	totalN := uint64(0)
	if len(p) > maxS3ReadFromEndReqSize {
		// If we're bigger than 256MB, parallelize the read...
		// Read the footer first and capture the size of the entire table file.
		n, sz, err := s3or.readRange(ctx, name, p[len(p)-footerSize:], fmt.Sprintf("%s=-%d", s3RangePrefix, footerSize))
		if err != nil {
			return n, sz, err
		}
		totalN += uint64(n)
		eg, egctx := errgroup.WithContext(ctx)
		start := 0
		for start < len(p)-footerSize {
			// Make parallel read requests of up to 128MB.
			end := start + preferredS3ReadFromEndReqSize
			if end > len(p)-footerSize {
				end = len(p) - footerSize
			}
			bs := p[start:end]
			rangeStart := sz - uint64(len(p)) + uint64(start)
			rangeEnd := sz - uint64(len(p)) + uint64(end) - 1
			eg.Go(func() error {
				n, _, err := s3or.readRange(egctx, name, bs, fmt.Sprintf("%s=%d-%d", s3RangePrefix, rangeStart, rangeEnd))
				if err != nil {
					return err
				}
				atomic.AddUint64(&totalN, uint64(n))
				return nil
			})
			start = end
		}
		err = eg.Wait()
		if err != nil {
			return 0, 0, err
		}
		return int(totalN), sz, nil
	}
	return s3or.readRange(ctx, name, p, fmt.Sprintf("%s=-%d", s3RangePrefix, len(p)))
}

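// readRange issues a single ranged GetObject for the table file addressed by
// name and fills p from the response body. It returns the bytes read and the
// total object size parsed from the Content-Range header (zero when that
// header is absent). Transient connection resets are retried with jittered
// exponential backoff.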
func (s3or *s3ObjectReader) readRange(ctx context.Context, name addr, p []byte, rangeHeader string) (n int, sz uint64, err error) {
	read := func() (int, uint64, error) {
		if s3or.readRl != nil {
			s3or.readRl <- struct{}{}
			defer func() {
				<-s3or.readRl
			}()
		}

		input := &s3.GetObjectInput{
			Bucket: aws.String(s3or.bucket),
			Key:    aws.String(s3or.key(name.String())),
			Range:  aws.String(rangeHeader),
		}

		result, err := s3or.s3.GetObjectWithContext(ctx, input)
		if err != nil {
			return 0, 0, err
		}
		defer result.Body.Close()

		if *result.ContentLength != int64(len(p)) {
			return 0, 0, fmt.Errorf("failed to read entire range, key: %v, len(p): %d, rangeHeader: %s, ContentLength: %d", s3or.key(name.String()), len(p), rangeHeader, *result.ContentLength)
		}

		sz := uint64(0)
		if result.ContentRange != nil {
			i := strings.Index(*result.ContentRange, "/")
			if i != -1 {
				sz, err = strconv.ParseUint((*result.ContentRange)[i+1:], 10, 64)
				if err != nil {
					return 0, 0, err
				}
			}
		}

		n, err = io.ReadFull(result.Body, p)
		return n, sz, err
	}

	n, sz, err = read()
	// We hit the point of diminishing returns investigating #3255, so add retries. In
	// conversations with AWS people, it's not surprising to get transient failures when
	// talking to S3, though the SDKs are intended to do their own retrying. The issue may
	// be that, in Go, making the S3 request and reading the response body are separate
	// operations, and the SDK can't retry failures in the latter on our behalf.
	if isConnReset(err) {
		// We back off here because it's possible, and likely, that the rate of requests
		// to S3 is the underlying issue.
		b := &backoff.Backoff{
			Min:    128 * time.Microsecond,
			Max:    1024 * time.Millisecond,
			Factor: 2,
			Jitter: true,
		}
		for ; isConnReset(err); n, sz, err = read() {
			dur := b.Duration()
			time.Sleep(dur)
		}
	}

	return n, sz, err
}

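// isConnReset reports whether err is a network-level ECONNRESET, the transient
// failure that readRange retries.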
func isConnReset(err error) bool {
	nErr, ok := err.(*net.OpError)
	if !ok {
		return false
	}
	scErr, ok := nErr.Err.(*os.SyscallError)
	return ok && scErr.Err == syscall.ECONNRESET
}