github.com/pingcap/br@v5.3.0-alpha.0.20220125034240-ec59c7b6ce30+incompatible/pkg/storage/gcs.go

// Copyright 2020 PingCAP, Inc. Licensed under Apache-2.0.

package storage

import (
	"context"
	"io"
	"os"
	"path"
	"strings"

	"cloud.google.com/go/storage"
	"github.com/pingcap/errors"
	backuppb "github.com/pingcap/kvproto/pkg/backup"
	"github.com/pingcap/log"
	"github.com/spf13/pflag"
	"go.uber.org/zap"
	"golang.org/x/oauth2/google"
	"google.golang.org/api/iterator"
	"google.golang.org/api/option"

	berrors "github.com/pingcap/br/pkg/errors"
)

const (
	gcsEndpointOption     = "gcs.endpoint"
	gcsStorageClassOption = "gcs.storage-class"
	gcsPredefinedACL      = "gcs.predefined-acl"
	gcsCredentialsFile    = "gcs.credentials-file"
)

// GCSBackendOptions are options for configuring the GCS storage.
type GCSBackendOptions struct {
	Endpoint        string `json:"endpoint" toml:"endpoint"`
	StorageClass    string `json:"storage-class" toml:"storage-class"`
	PredefinedACL   string `json:"predefined-acl" toml:"predefined-acl"`
	CredentialsFile string `json:"credentials-file" toml:"credentials-file"`
}

func (options *GCSBackendOptions) apply(gcs *backuppb.GCS) error {
	gcs.Endpoint = options.Endpoint
	gcs.StorageClass = options.StorageClass
	gcs.PredefinedAcl = options.PredefinedACL

	if options.CredentialsFile != "" {
		b, err := os.ReadFile(options.CredentialsFile)
		if err != nil {
			return errors.Trace(err)
		}
		gcs.CredentialsBlob = string(b)
	}
	return nil
}

func defineGCSFlags(flags *pflag.FlagSet) {
	// TODO: remove the experimental tag once it's stable
	flags.String(gcsEndpointOption, "", "(experimental) Set the GCS endpoint URL")
	flags.String(gcsStorageClassOption, "", "(experimental) Specify the GCS storage class for objects")
	flags.String(gcsPredefinedACL, "", "(experimental) Specify the GCS predefined ACL for objects")
	flags.String(gcsCredentialsFile, "", "(experimental) Set the GCS credentials file path")
}

func (options *GCSBackendOptions) parseFromFlags(flags *pflag.FlagSet) error {
	var err error
	options.Endpoint, err = flags.GetString(gcsEndpointOption)
	if err != nil {
		return errors.Trace(err)
	}

	options.StorageClass, err = flags.GetString(gcsStorageClassOption)
	if err != nil {
		return errors.Trace(err)
	}

	options.PredefinedACL, err = flags.GetString(gcsPredefinedACL)
	if err != nil {
		return errors.Trace(err)
	}

	options.CredentialsFile, err = flags.GetString(gcsCredentialsFile)
	if err != nil {
		return errors.Trace(err)
	}
	return nil
}
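
// Example (illustrative, not part of the original file): the flag names
// registered in defineGCSFlags above are the ones accepted on the command
// line and read back by parseFromFlags. A hypothetical invocation could pass,
// for instance:
//
//	--gcs.endpoint="http://127.0.0.1:4443/storage/v1/"
//	--gcs.storage-class=COLDLINE
//	--gcs.predefined-acl=private
//	--gcs.credentials-file=/path/to/service-account.json
//
// The endpoint value is only an assumed local-emulator address (for example
// fake-gcs-server); the other values follow standard GCS settings.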

type gcsStorage struct {
	gcs    *backuppb.GCS
	bucket *storage.BucketHandle
}

func (s *gcsStorage) objectName(name string) string {
	return path.Join(s.gcs.Prefix, name)
}

// WriteFile writes data to a file in storage.
func (s *gcsStorage) WriteFile(ctx context.Context, name string, data []byte) error {
	object := s.objectName(name)
	wc := s.bucket.Object(object).NewWriter(ctx)
	wc.StorageClass = s.gcs.StorageClass
	wc.PredefinedACL = s.gcs.PredefinedAcl
	_, err := wc.Write(data)
	if err != nil {
		return errors.Trace(err)
	}
	return wc.Close()
}

// ReadFile reads the file from the storage and returns the contents.
func (s *gcsStorage) ReadFile(ctx context.Context, name string) ([]byte, error) {
	object := s.objectName(name)
	rc, err := s.bucket.Object(object).NewReader(ctx)
	if err != nil {
		return nil, errors.Annotatef(err,
			"failed to read gcs file, file info: input.bucket='%s', input.key='%s'",
			s.gcs.Bucket, object)
	}
	defer rc.Close()

	size := rc.Attrs.Size
	var b []byte
	if size < 0 {
		// a negative size happens when using fake-gcs-server in the integration tests
		b, err = io.ReadAll(rc)
	} else {
		b = make([]byte, size)
		_, err = io.ReadFull(rc, b)
	}
	return b, errors.Trace(err)
}

// FileExists returns true if the file exists.
func (s *gcsStorage) FileExists(ctx context.Context, name string) (bool, error) {
	object := s.objectName(name)
	_, err := s.bucket.Object(object).Attrs(ctx)
	if err != nil {
		if errors.Cause(err) == storage.ErrObjectNotExist { // nolint:errorlint
			return false, nil
		}
		return false, errors.Trace(err)
	}
	return true, nil
}
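
// Example (hypothetical sketch, not part of the original file): a simple
// round trip through the helpers above, given a *gcsStorage s built by
// newGCSStorage. The key "backupmeta.test" is only an illustrative object
// name under the configured prefix:
//
//	if err := s.WriteFile(ctx, "backupmeta.test", []byte("payload")); err != nil {
//		return err
//	}
//	exists, err := s.FileExists(ctx, "backupmeta.test")
//	if err != nil || !exists {
//		return errors.Errorf("object missing: %v", err)
//	}
//	data, err := s.ReadFile(ctx, "backupmeta.test")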

// Open opens a Reader by the given file path.
func (s *gcsStorage) Open(ctx context.Context, path string) (ExternalFileReader, error) {
	object := s.objectName(path)
	handle := s.bucket.Object(object)

	rc, err := handle.NewRangeReader(ctx, 0, -1)
	if err != nil {
		return nil, errors.Annotatef(err,
			"failed to read gcs file, file info: input.bucket='%s', input.key='%s'",
			s.gcs.Bucket, path)
	}

	return &gcsObjectReader{
		storage:   s,
		name:      path,
		objHandle: handle,
		reader:    rc,
		ctx:       ctx,
	}, nil
}

// WalkDir traverses all the files in a dir.
//
// fn is the function called for each regular file visited by WalkDir.
// The first argument is the file path, which can be passed to `Open`;
// the second argument is the size in bytes of the file.
func (s *gcsStorage) WalkDir(ctx context.Context, opt *WalkOption, fn func(string, int64) error) error {
	if opt == nil {
		opt = &WalkOption{}
	}

	prefix := path.Join(s.gcs.Prefix, opt.SubDir)
	if len(prefix) > 0 && !strings.HasSuffix(prefix, "/") {
		prefix += "/"
	}

	query := &storage.Query{Prefix: prefix}
	// only need each object's name and size
	query.SetAttrSelection([]string{"Name", "Size"})
	iter := s.bucket.Objects(ctx, query)
	for {
		attrs, err := iter.Next()
		if err == iterator.Done {
			break
		}
		if err != nil {
			return errors.Trace(err)
		}
		// When walking a specific directory, the returned names include the
		// storage Prefix, which cannot be reused directly by other APIs
		// (Open/Read), so trim the Prefix here to get a path that later
		// Open/Read calls accept.
		path := strings.TrimPrefix(attrs.Name, s.gcs.Prefix)
		if err = fn(path, attrs.Size); err != nil {
			return errors.Trace(err)
		}
	}
	return nil
}
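
// Example (hypothetical sketch, not part of the original file): because the
// storage Prefix is trimmed before fn is called, the names passed to fn can be
// fed straight back into Open. `dumpSizes` and the "data" subdirectory are
// illustrative only:
//
//	func dumpSizes(ctx context.Context, s *gcsStorage) error {
//		return s.WalkDir(ctx, &WalkOption{SubDir: "data"}, func(name string, size int64) error {
//			r, err := s.Open(ctx, name)
//			if err != nil {
//				return err
//			}
//			defer r.Close()
//			log.Info("walked object", zap.String("name", name), zap.Int64("size", size))
//			return nil
//		})
//	}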

func (s *gcsStorage) URI() string {
	return "gcs://" + s.gcs.Bucket + "/" + s.gcs.Prefix
}

// Create implements the ExternalStorage interface.
func (s *gcsStorage) Create(ctx context.Context, name string) (ExternalFileWriter, error) {
	object := s.objectName(name)
	wc := s.bucket.Object(object).NewWriter(ctx)
	wc.StorageClass = s.gcs.StorageClass
	wc.PredefinedACL = s.gcs.PredefinedAcl
	return newFlushStorageWriter(wc, &emptyFlusher{}, wc), nil
}

func newGCSStorage(ctx context.Context, gcs *backuppb.GCS, opts *ExternalStorageOptions) (*gcsStorage, error) {
	var clientOps []option.ClientOption
	if opts.NoCredentials {
		clientOps = append(clientOps, option.WithoutAuthentication())
	} else {
		if gcs.CredentialsBlob == "" {
			creds, err := google.FindDefaultCredentials(ctx, storage.ScopeReadWrite)
			if err != nil {
				return nil, errors.Annotatef(berrors.ErrStorageInvalidConfig, "%v Or you should provide '--gcs.credentials-file'", err)
			}
			if opts.SendCredentials {
				if len(creds.JSON) > 0 {
					gcs.CredentialsBlob = string(creds.JSON)
				} else {
					return nil, errors.Annotate(berrors.ErrStorageInvalidConfig,
						"You should provide '--gcs.credentials-file' when '--send-credentials-to-tikv' is true")
				}
			}
			if creds != nil {
				clientOps = append(clientOps, option.WithCredentials(creds))
			}
		} else {
			clientOps = append(clientOps, option.WithCredentialsJSON([]byte(gcs.GetCredentialsBlob())))
		}
	}

	if gcs.Endpoint != "" {
		clientOps = append(clientOps, option.WithEndpoint(gcs.Endpoint))
	}
	if opts.HTTPClient != nil {
		clientOps = append(clientOps, option.WithHTTPClient(opts.HTTPClient))
	}
	client, err := storage.NewClient(ctx, clientOps...)
	if err != nil {
		return nil, errors.Trace(err)
	}

	if !opts.SendCredentials {
		// Clear the credentials, if any, so that they will not be sent to TiKV.
		gcs.CredentialsBlob = ""
	}

	bucket := client.Bucket(gcs.Bucket)
	// Check whether the backup was hit by the bug fixed in #647 (case 2):
	// if the storage is set to gcs://bucket/prefix/,
	// the backupmeta is written correctly to gcs://bucket/prefix/backupmeta,
	// but the SSTs are wrongly written to gcs://bucket/prefix//*.sst (note the extra slash).
	// See the details of case 2 at https://github.com/pingcap/br/issues/675#issuecomment-753780742.
	sstInPrefix := hasSSTFiles(ctx, bucket, gcs.Prefix)
	sstInPrefixSlash := hasSSTFiles(ctx, bucket, gcs.Prefix+"//")
	if sstInPrefixSlash && !sstInPrefix {
		// This is an old bug, but we must stay compatible with it,
		// so look for SSTs in the double-slash directory.
		gcs.Prefix += "//"
	}
	// TODO: remove this after BR removes the skip-check-path config.
	if !opts.SkipCheckPath {
		// check that the bucket exists
		_, err = bucket.Attrs(ctx)
		if err != nil {
			return nil, errors.Annotatef(err, "gcs://%s/%s", gcs.Bucket, gcs.Prefix)
		}
	}
	return &gcsStorage{gcs: gcs, bucket: bucket}, nil
}
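
// Example (hypothetical sketch, not part of the original file): a minimal
// construction showing how the ExternalStorageOptions fields used above
// interact. With SendCredentials set to false, any credentials loaded into
// gcs.CredentialsBlob are cleared after the client is built, so they are not
// forwarded to TiKV; with NoCredentials set to true, the client would instead
// be built with option.WithoutAuthentication():
//
//	store, err := newGCSStorage(ctx, gcs, &ExternalStorageOptions{
//		SendCredentials: false,
//		SkipCheckPath:   false,
//	})
//	if err != nil {
//		return err
//	}
//	_ = store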

func hasSSTFiles(ctx context.Context, bucket *storage.BucketHandle, prefix string) bool {
	query := storage.Query{Prefix: prefix}
	_ = query.SetAttrSelection([]string{"Name"})
	it := bucket.Objects(ctx, &query)
	for {
		attrs, err := it.Next()
		if err == iterator.Done { // nolint:errorlint
			break
		}
		if err != nil {
			log.Warn("failed to list objects on gcs, will use default value for `prefix`", zap.Error(err))
			break
		}
		if strings.HasSuffix(attrs.Name, ".sst") {
			log.Info("sst file found in prefix slash", zap.String("file", attrs.Name))
			return true
		}
	}
	return false
}

// gcsObjectReader wraps storage.Reader and adds the `Seek` method.
type gcsObjectReader struct {
	storage   *gcsStorage
	name      string
	objHandle *storage.ObjectHandle
	reader    io.ReadCloser
	pos       int64
	// ctx is the reader context, kept so that `io.Seeker` can be implemented by
	// reopening a range reader. Currently, Lightning depends on the
	// `xitongsys/parquet-go` package to read parquet files, and it requires an `io.Seeker`.
	// See: https://github.com/xitongsys/parquet-go/blob/207a3cee75900b2b95213627409b7bac0f190bb3/source/source.go#L9-L10
	ctx context.Context
}

// Read implements the io.Reader interface.
func (r *gcsObjectReader) Read(p []byte) (n int, err error) {
	if r.reader == nil {
		rc, err := r.objHandle.NewRangeReader(r.ctx, r.pos, -1)
		if err != nil {
			return 0, errors.Annotatef(err,
				"failed to read gcs file, file info: input.bucket='%s', input.key='%s'",
				r.storage.gcs.Bucket, r.name)
		}
		r.reader = rc
	}
	n, err = r.reader.Read(p)
	r.pos += int64(n)
	return n, err
}

// Close implements the io.Closer interface.
func (r *gcsObjectReader) Close() error {
	if r.reader == nil {
		return nil
	}
	return r.reader.Close()
}

// Seek implements the io.Seeker interface.
//
// Currently, tidb-lightning depends on this method to read parquet files from GCS storage.
func (r *gcsObjectReader) Seek(offset int64, whence int) (int64, error) {
	var realOffset int64
	switch whence {
	case io.SeekStart:
		if offset < 0 {
			return 0, errors.Annotatef(berrors.ErrInvalidArgument, "Seek: offset '%v' out of range.", offset)
		}
		realOffset = offset
	case io.SeekCurrent:
		realOffset = r.pos + offset
		if r.pos < 0 && realOffset >= 0 {
			return 0, errors.Annotatef(berrors.ErrInvalidArgument, "Seek: offset '%v' out of range. current pos is '%v'.", offset, r.pos)
		}
	case io.SeekEnd:
		if offset >= 0 {
			return 0, errors.Annotatef(berrors.ErrInvalidArgument, "Seek: offset '%v' should be negative.", offset)
		}
		// GCS supports `NewRangeReader(ctx, -10, -1)`, which means reading the last 10 bytes.
		realOffset = offset
	default:
		return 0, errors.Annotatef(berrors.ErrStorageUnknown, "Seek: invalid whence '%d'", whence)
	}

	if realOffset == r.pos {
		return realOffset, nil
	}

	_ = r.reader.Close()
	r.pos = realOffset
	rc, err := r.objHandle.NewRangeReader(r.ctx, r.pos, -1)
	if err != nil {
		return 0, errors.Annotatef(err,
			"failed to read gcs file, file info: input.bucket='%s', input.key='%s'",
			r.storage.gcs.Bucket, r.name)
	}
	r.reader = rc

	return realOffset, nil
}
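
// Example (hypothetical sketch, not part of the original file): reading the
// tail of an object through the Seek implementation above. Seeking with a
// negative offset from io.SeekEnd reopens the range reader via
// NewRangeReader(ctx, -n, -1), so subsequent reads return the last n bytes.
// `readTail` is an illustrative helper, not an existing API; n is assumed > 0:
//
//	func readTail(r *gcsObjectReader, n int64) ([]byte, error) {
//		if _, err := r.Seek(-n, io.SeekEnd); err != nil {
//			return nil, err
//		}
//		return io.ReadAll(r)
//	}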