github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/nbs/s3_table_reader.go

// Copyright 2019-2021 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// This file incorporates work covered by the following copyright and
// permission notice:
//
// Copyright 2016 Attic Labs, Inc. All rights reserved.
// Licensed under the Apache License, version 2.0:
// http://www.apache.org/licenses/LICENSE-2.0

package nbs

import (
	"context"
	"errors"
	"fmt"
	"io"
	"net"
	"os"
	"strconv"
	"strings"
	"sync/atomic"
	"syscall"
	"time"

	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/service/s3"
	"github.com/aws/aws-sdk-go/service/s3/s3iface"
	"github.com/jpillora/backoff"
	"golang.org/x/sync/errgroup"

	"github.com/dolthub/dolt/go/store/hash"
)

const (
	s3RangePrefix = "bytes"
	s3BlockSize   = (1 << 10) * 512 // 512K
)

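// s3TableReaderAt is a tableReaderAt that reads the bytes of a table file
// directly from the S3 object named by the table file's hash.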
type s3TableReaderAt struct {
	s3 *s3ObjectReader
	h  hash.Hash
}

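// Close is a no-op; s3TableReaderAt holds no resources of its own.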
func (s3tra *s3TableReaderAt) Close() error {
	return nil
}

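// clone returns the receiver itself, since it carries no mutable state.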
func (s3tra *s3TableReaderAt) clone() (tableReaderAt, error) {
	return s3tra, nil
}

func (s3tra *s3TableReaderAt) Reader(ctx context.Context) (io.ReadCloser, error) {
	return s3tra.s3.Reader(ctx, s3tra.h)
}

func (s3tra *s3TableReaderAt) ReadAtWithStats(ctx context.Context, p []byte, off int64, stats *Stats) (n int, err error) {
	return s3tra.s3.ReadAt(ctx, s3tra.h, p, off, stats)
}

// TODO: Bring all the multipart upload and remote-conjoin stuff over here and make this a better analogue to ddbTableStore
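// s3ObjectReader reads (ranges of) named objects out of an S3 bucket. readRl,
// if non-nil, is used as a semaphore to bound the number of concurrent
// GetObject requests, and ns, if non-empty, namespaces object keys.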
type s3ObjectReader struct {
	s3     s3iface.S3API
	bucket string
	readRl chan struct{}
	ns     string
}

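// key returns the S3 object key for k, prefixed with the reader's namespace
// if one is set.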
func (s3or *s3ObjectReader) key(k string) string {
	if s3or.ns != "" {
		return s3or.ns + "/" + k
	}
	return k
}

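// Reader returns an io.ReadCloser over the entire object named by name.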
func (s3or *s3ObjectReader) Reader(ctx context.Context, name hash.Hash) (io.ReadCloser, error) {
	return s3or.reader(ctx, name)
}

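// ReadAt reads len(p) bytes into p from the object named by name, starting at
// byte offset off, and records the read size and latency in stats.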
func (s3or *s3ObjectReader) ReadAt(ctx context.Context, name hash.Hash, p []byte, off int64, stats *Stats) (n int, err error) {
	t1 := time.Now()

	defer func() {
		stats.S3BytesPerRead.Sample(uint64(len(p)))
		stats.S3ReadLatency.SampleTimeSince(t1)
	}()

	n, _, err = s3or.readRange(ctx, name, p, s3RangeHeader(off, int64(len(p))))
	return
}

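// s3RangeHeader returns an HTTP Range header value covering length bytes
// starting at offset off.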
func s3RangeHeader(off, length int64) string {
	lastByte := off + length - 1 // insanely, the HTTP range header specifies ranges inclusively.
	return fmt.Sprintf("%s=%d-%d", s3RangePrefix, off, lastByte)
}

const maxS3ReadFromEndReqSize = 256 * 1024 * 1024       // 256MB
const preferredS3ReadFromEndReqSize = 128 * 1024 * 1024 // 128MB

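// ReadFromEnd reads the last len(p) bytes of the object named by name into p.
// It returns the number of bytes read and the total size of the object, which
// it learns from the Content-Range response header. Reads larger than
// maxS3ReadFromEndReqSize are split into parallel ranged GETs of at most
// preferredS3ReadFromEndReqSize bytes each.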
func (s3or *s3ObjectReader) ReadFromEnd(ctx context.Context, name hash.Hash, p []byte, stats *Stats) (n int, sz uint64, err error) {
	defer func(t1 time.Time) {
		stats.S3BytesPerRead.Sample(uint64(len(p)))
		stats.S3ReadLatency.SampleTimeSince(t1)
	}(time.Now())
	totalN := uint64(0)
	if len(p) > maxS3ReadFromEndReqSize {
		// If we're bigger than 256MB, parallelize the read...
		// Read the footer first and capture the size of the entire table file.
		n, sz, err := s3or.readRange(ctx, name, p[len(p)-footerSize:], fmt.Sprintf("%s=-%d", s3RangePrefix, footerSize))
		if err != nil {
			return n, sz, err
		}
		totalN += uint64(n)
		eg, egctx := errgroup.WithContext(ctx)
		start := 0
		for start < len(p)-footerSize {
			// Make parallel read requests of up to 128MB.
			end := start + preferredS3ReadFromEndReqSize
			if end > len(p)-footerSize {
				end = len(p) - footerSize
			}
			bs := p[start:end]
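			// p holds the last len(p) bytes of the object, so index 0 of p
			// corresponds to object offset sz-len(p).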
			rangeStart := sz - uint64(len(p)) + uint64(start)
			rangeEnd := sz - uint64(len(p)) + uint64(end) - 1
			eg.Go(func() error {
				n, _, err := s3or.readRange(egctx, name, bs, fmt.Sprintf("%s=%d-%d", s3RangePrefix, rangeStart, rangeEnd))
				if err != nil {
					return err
				}
				atomic.AddUint64(&totalN, uint64(n))
				return nil
			})
			start = end
		}
		err = eg.Wait()
		if err != nil {
			return 0, 0, err
		}
		return int(totalN), sz, nil
	}
	return s3or.readRange(ctx, name, p, fmt.Sprintf("%s=-%d", s3RangePrefix, len(p)))
}

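// reader issues a GetObject request for the entire object named by name and
// returns the response body.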
func (s3or *s3ObjectReader) reader(ctx context.Context, name hash.Hash) (io.ReadCloser, error) {
	input := &s3.GetObjectInput{
		Bucket: aws.String(s3or.bucket),
		Key:    aws.String(s3or.key(name.String())),
	}
	result, err := s3or.s3.GetObjectWithContext(ctx, input)
	if err != nil {
		return nil, err
	}
	return result.Body, nil
}

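// readRange reads the byte range described by rangeHeader from the object
// named by name into p. It returns the number of bytes read and, when S3
// returns a Content-Range header, the total size of the object. Reads that
// fail with a connection reset are retried with jittered exponential backoff.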
func (s3or *s3ObjectReader) readRange(ctx context.Context, name hash.Hash, p []byte, rangeHeader string) (n int, sz uint64, err error) {
	read := func() (int, uint64, error) {
		if s3or.readRl != nil {
			s3or.readRl <- struct{}{}
			defer func() {
				<-s3or.readRl
			}()
		}

		input := &s3.GetObjectInput{
			Bucket: aws.String(s3or.bucket),
			Key:    aws.String(s3or.key(name.String())),
			Range:  aws.String(rangeHeader),
		}

		result, err := s3or.s3.GetObjectWithContext(ctx, input)
		if err != nil {
			return 0, 0, err
		}
		defer result.Body.Close()

		if *result.ContentLength != int64(len(p)) {
			return 0, 0, fmt.Errorf("failed to read entire range, key: %v, len(p): %d, rangeHeader: %s, ContentLength: %d", s3or.key(name.String()), len(p), rangeHeader, *result.ContentLength)
		}

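		// Content-Range has the form "bytes <first>-<last>/<total>"; the
		// object's total size is the component after the '/'.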
		sz := uint64(0)
		if result.ContentRange != nil {
			i := strings.Index(*result.ContentRange, "/")
			if i != -1 {
				sz, err = strconv.ParseUint((*result.ContentRange)[i+1:], 10, 64)
				if err != nil {
					return 0, 0, err
				}
			}
		}
		n, err = io.ReadFull(result.Body, p)
		return n, sz, err
	}

	n, sz, err = read()
	// We hit the point of diminishing returns investigating #3255, so add
	// retries. In conversations with AWS people, transient failures when
	// talking to S3 are not surprising, even though the SDKs are intended to
	// do their own retrying. The issue may be that, in Go, making the S3
	// request and reading the response body are separate operations, so the
	// SDK cannot retry failures that occur while reading the body.
	if isConnReset(err) {
		// We back off here because it's possible, and likely, that the rate
		// of requests to S3 is the underlying issue.
		b := &backoff.Backoff{
			Min:    128 * time.Microsecond,
			Max:    1024 * time.Millisecond,
			Factor: 2,
			Jitter: true,
		}
		for ; isConnReset(err); n, sz, err = read() {
			dur := b.Duration()
			time.Sleep(dur)
		}
	}

	return n, sz, err
}

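// isConnReset reports whether err is (or wraps) a network error whose
// underlying cause is ECONNRESET.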
func isConnReset(err error) bool {
	var nErr *net.OpError
	if !errors.As(err, &nErr) {
		return false
	}
	var scErr *os.SyscallError
	return errors.As(nErr.Err, &scErr) && scErr.Err == syscall.ECONNRESET
}