github.com/pingcap/br@v5.3.0-alpha.0.20220125034240-ec59c7b6ce30+incompatible/pkg/storage/gcs.go (about) 1 // Copyright 2020 PingCAP, Inc. Licensed under Apache-2.0. 2 3 package storage 4 5 import ( 6 "context" 7 "io" 8 "os" 9 "path" 10 "strings" 11 12 "cloud.google.com/go/storage" 13 "github.com/pingcap/errors" 14 backuppb "github.com/pingcap/kvproto/pkg/backup" 15 "github.com/pingcap/log" 16 "github.com/spf13/pflag" 17 "go.uber.org/zap" 18 "golang.org/x/oauth2/google" 19 "google.golang.org/api/iterator" 20 "google.golang.org/api/option" 21 22 berrors "github.com/pingcap/br/pkg/errors" 23 ) 24 25 const ( 26 gcsEndpointOption = "gcs.endpoint" 27 gcsStorageClassOption = "gcs.storage-class" 28 gcsPredefinedACL = "gcs.predefined-acl" 29 gcsCredentialsFile = "gcs.credentials-file" 30 ) 31 32 // GCSBackendOptions are options for configuration the GCS storage. 33 type GCSBackendOptions struct { 34 Endpoint string `json:"endpoint" toml:"endpoint"` 35 StorageClass string `json:"storage-class" toml:"storage-class"` 36 PredefinedACL string `json:"predefined-acl" toml:"predefined-acl"` 37 CredentialsFile string `json:"credentials-file" toml:"credentials-file"` 38 } 39 40 func (options *GCSBackendOptions) apply(gcs *backuppb.GCS) error { 41 gcs.Endpoint = options.Endpoint 42 gcs.StorageClass = options.StorageClass 43 gcs.PredefinedAcl = options.PredefinedACL 44 45 if options.CredentialsFile != "" { 46 b, err := os.ReadFile(options.CredentialsFile) 47 if err != nil { 48 return errors.Trace(err) 49 } 50 gcs.CredentialsBlob = string(b) 51 } 52 return nil 53 } 54 55 func defineGCSFlags(flags *pflag.FlagSet) { 56 // TODO: remove experimental tag if it's stable 57 flags.String(gcsEndpointOption, "", "(experimental) Set the GCS endpoint URL") 58 flags.String(gcsStorageClassOption, "", "(experimental) Specify the GCS storage class for objects") 59 flags.String(gcsPredefinedACL, "", "(experimental) Specify the GCS predefined acl for objects") 60 flags.String(gcsCredentialsFile, "", "(experimental) Set the GCS credentials file path") 61 } 62 63 func (options *GCSBackendOptions) parseFromFlags(flags *pflag.FlagSet) error { 64 var err error 65 options.Endpoint, err = flags.GetString(gcsEndpointOption) 66 if err != nil { 67 return errors.Trace(err) 68 } 69 70 options.StorageClass, err = flags.GetString(gcsStorageClassOption) 71 if err != nil { 72 return errors.Trace(err) 73 } 74 75 options.PredefinedACL, err = flags.GetString(gcsPredefinedACL) 76 if err != nil { 77 return errors.Trace(err) 78 } 79 80 options.CredentialsFile, err = flags.GetString(gcsCredentialsFile) 81 if err != nil { 82 return errors.Trace(err) 83 } 84 return nil 85 } 86 87 type gcsStorage struct { 88 gcs *backuppb.GCS 89 bucket *storage.BucketHandle 90 } 91 92 func (s *gcsStorage) objectName(name string) string { 93 return path.Join(s.gcs.Prefix, name) 94 } 95 96 // WriteFile writes data to a file to storage. 97 func (s *gcsStorage) WriteFile(ctx context.Context, name string, data []byte) error { 98 object := s.objectName(name) 99 wc := s.bucket.Object(object).NewWriter(ctx) 100 wc.StorageClass = s.gcs.StorageClass 101 wc.PredefinedACL = s.gcs.PredefinedAcl 102 _, err := wc.Write(data) 103 if err != nil { 104 return errors.Trace(err) 105 } 106 return wc.Close() 107 } 108 109 // ReadFile reads the file from the storage and returns the contents. 110 func (s *gcsStorage) ReadFile(ctx context.Context, name string) ([]byte, error) { 111 object := s.objectName(name) 112 rc, err := s.bucket.Object(object).NewReader(ctx) 113 if err != nil { 114 return nil, errors.Annotatef(err, 115 "failed to read gcs file, file info: input.bucket='%s', input.key='%s'", 116 s.gcs.Bucket, object) 117 } 118 defer rc.Close() 119 120 size := rc.Attrs.Size 121 var b []byte 122 if size < 0 { 123 // happened when using fake-gcs-server in integration test 124 b, err = io.ReadAll(rc) 125 } else { 126 b = make([]byte, size) 127 _, err = io.ReadFull(rc, b) 128 } 129 return b, errors.Trace(err) 130 } 131 132 // FileExists return true if file exists. 133 func (s *gcsStorage) FileExists(ctx context.Context, name string) (bool, error) { 134 object := s.objectName(name) 135 _, err := s.bucket.Object(object).Attrs(ctx) 136 if err != nil { 137 if errors.Cause(err) == storage.ErrObjectNotExist { // nolint:errorlint 138 return false, nil 139 } 140 return false, errors.Trace(err) 141 } 142 return true, nil 143 } 144 145 // Open a Reader by file path. 146 func (s *gcsStorage) Open(ctx context.Context, path string) (ExternalFileReader, error) { 147 object := s.objectName(path) 148 handle := s.bucket.Object(object) 149 150 rc, err := handle.NewRangeReader(ctx, 0, -1) 151 if err != nil { 152 return nil, errors.Annotatef(err, 153 "failed to read gcs file, file info: input.bucket='%s', input.key='%s'", 154 s.gcs.Bucket, path) 155 } 156 157 return &gcsObjectReader{ 158 storage: s, 159 name: path, 160 objHandle: handle, 161 reader: rc, 162 ctx: ctx, 163 }, nil 164 } 165 166 // WalkDir traverse all the files in a dir. 167 // 168 // fn is the function called for each regular file visited by WalkDir. 169 // The first argument is the file path that can be used in `Open` 170 // function; the second argument is the size in byte of the file determined 171 // by path. 172 func (s *gcsStorage) WalkDir(ctx context.Context, opt *WalkOption, fn func(string, int64) error) error { 173 if opt == nil { 174 opt = &WalkOption{} 175 } 176 177 prefix := path.Join(s.gcs.Prefix, opt.SubDir) 178 if len(prefix) > 0 && !strings.HasSuffix(prefix, "/") { 179 prefix += "/" 180 } 181 182 query := &storage.Query{Prefix: prefix} 183 // only need each object's name and size 184 query.SetAttrSelection([]string{"Name", "Size"}) 185 iter := s.bucket.Objects(ctx, query) 186 for { 187 attrs, err := iter.Next() 188 if err == iterator.Done { 189 break 190 } 191 if err != nil { 192 return errors.Trace(err) 193 } 194 // when walk on specify directory, the result include storage.Prefix, 195 // which can not be reuse in other API(Open/Read) directly. 196 // so we use TrimPrefix to filter Prefix for next Open/Read. 197 path := strings.TrimPrefix(attrs.Name, s.gcs.Prefix) 198 if err = fn(path, attrs.Size); err != nil { 199 return errors.Trace(err) 200 } 201 } 202 return nil 203 } 204 205 func (s *gcsStorage) URI() string { 206 return "gcs://" + s.gcs.Bucket + "/" + s.gcs.Prefix 207 } 208 209 // Create implements ExternalStorage interface. 210 func (s *gcsStorage) Create(ctx context.Context, name string) (ExternalFileWriter, error) { 211 object := s.objectName(name) 212 wc := s.bucket.Object(object).NewWriter(ctx) 213 wc.StorageClass = s.gcs.StorageClass 214 wc.PredefinedACL = s.gcs.PredefinedAcl 215 return newFlushStorageWriter(wc, &emptyFlusher{}, wc), nil 216 } 217 218 func newGCSStorage(ctx context.Context, gcs *backuppb.GCS, opts *ExternalStorageOptions) (*gcsStorage, error) { 219 var clientOps []option.ClientOption 220 if opts.NoCredentials { 221 clientOps = append(clientOps, option.WithoutAuthentication()) 222 } else { 223 if gcs.CredentialsBlob == "" { 224 creds, err := google.FindDefaultCredentials(ctx, storage.ScopeReadWrite) 225 if err != nil { 226 return nil, errors.Annotatef(berrors.ErrStorageInvalidConfig, "%v Or you should provide '--gcs.credentials_file'", err) 227 } 228 if opts.SendCredentials { 229 if len(creds.JSON) > 0 { 230 gcs.CredentialsBlob = string(creds.JSON) 231 } else { 232 return nil, errors.Annotate(berrors.ErrStorageInvalidConfig, 233 "You should provide '--gcs.credentials_file' when '--send-credentials-to-tikv' is true") 234 } 235 } 236 if creds != nil { 237 clientOps = append(clientOps, option.WithCredentials(creds)) 238 } 239 } else { 240 clientOps = append(clientOps, option.WithCredentialsJSON([]byte(gcs.GetCredentialsBlob()))) 241 } 242 } 243 244 if gcs.Endpoint != "" { 245 clientOps = append(clientOps, option.WithEndpoint(gcs.Endpoint)) 246 } 247 if opts.HTTPClient != nil { 248 clientOps = append(clientOps, option.WithHTTPClient(opts.HTTPClient)) 249 } 250 client, err := storage.NewClient(ctx, clientOps...) 251 if err != nil { 252 return nil, errors.Trace(err) 253 } 254 255 if !opts.SendCredentials { 256 // Clear the credentials if exists so that they will not be sent to TiKV 257 gcs.CredentialsBlob = "" 258 } 259 260 bucket := client.Bucket(gcs.Bucket) 261 // check whether it's a bug before #647, to solve case #2 262 // If the storage is set as gcs://bucket/prefix/, 263 // the backupmeta is written correctly to gcs://bucket/prefix/backupmeta, 264 // but the SSTs are written wrongly to gcs://bucket/prefix//*.sst (note the extra slash). 265 // see details about case 2 at https://github.com/pingcap/br/issues/675#issuecomment-753780742 266 sstInPrefix := hasSSTFiles(ctx, bucket, gcs.Prefix) 267 sstInPrefixSlash := hasSSTFiles(ctx, bucket, gcs.Prefix+"//") 268 if sstInPrefixSlash && !sstInPrefix { 269 // This is a old bug, but we must make it compatible. 270 // so we need find sst in slash directory 271 gcs.Prefix += "//" 272 } 273 // TODO remove it after BR remove cfg skip-check-path 274 if !opts.SkipCheckPath { 275 // check bucket exists 276 _, err = bucket.Attrs(ctx) 277 if err != nil { 278 return nil, errors.Annotatef(err, "gcs://%s/%s", gcs.Bucket, gcs.Prefix) 279 } 280 } 281 return &gcsStorage{gcs: gcs, bucket: bucket}, nil 282 } 283 284 func hasSSTFiles(ctx context.Context, bucket *storage.BucketHandle, prefix string) bool { 285 query := storage.Query{Prefix: prefix} 286 _ = query.SetAttrSelection([]string{"Name"}) 287 it := bucket.Objects(ctx, &query) 288 for { 289 attrs, err := it.Next() 290 if err == iterator.Done { // nolint:errorlint 291 break 292 } 293 if err != nil { 294 log.Warn("failed to list objects on gcs, will use default value for `prefix`", zap.Error(err)) 295 break 296 } 297 if strings.HasSuffix(attrs.Name, ".sst") { 298 log.Info("sst file found in prefix slash", zap.String("file", attrs.Name)) 299 return true 300 } 301 } 302 return false 303 } 304 305 // gcsObjectReader wrap storage.Reader and add the `Seek` method. 306 type gcsObjectReader struct { 307 storage *gcsStorage 308 name string 309 objHandle *storage.ObjectHandle 310 reader io.ReadCloser 311 pos int64 312 // reader context used for implement `io.Seek` 313 // currently, lightning depends on package `xitongsys/parquet-go` to read parquet file and it needs `io.Seeker` 314 // See: https://github.com/xitongsys/parquet-go/blob/207a3cee75900b2b95213627409b7bac0f190bb3/source/source.go#L9-L10 315 ctx context.Context 316 } 317 318 // Read implement the io.Reader interface. 319 func (r *gcsObjectReader) Read(p []byte) (n int, err error) { 320 if r.reader == nil { 321 rc, err := r.objHandle.NewRangeReader(r.ctx, r.pos, -1) 322 if err != nil { 323 return 0, errors.Annotatef(err, 324 "failed to read gcs file, file info: input.bucket='%s', input.key='%s'", 325 r.storage.gcs.Bucket, r.name) 326 } 327 r.reader = rc 328 } 329 n, err = r.reader.Read(p) 330 r.pos += int64(n) 331 return n, err 332 } 333 334 // Close implement the io.Closer interface. 335 func (r *gcsObjectReader) Close() error { 336 if r.reader == nil { 337 return nil 338 } 339 return r.reader.Close() 340 } 341 342 // Seek implement the io.Seeker interface. 343 // 344 // Currently, tidb-lightning depends on this method to read parquet file for gcs storage. 345 func (r *gcsObjectReader) Seek(offset int64, whence int) (int64, error) { 346 var realOffset int64 347 switch whence { 348 case io.SeekStart: 349 if offset < 0 { 350 return 0, errors.Annotatef(berrors.ErrInvalidArgument, "Seek: offset '%v' out of range.", offset) 351 } 352 realOffset = offset 353 case io.SeekCurrent: 354 realOffset = r.pos + offset 355 if r.pos < 0 && realOffset >= 0 { 356 return 0, errors.Annotatef(berrors.ErrInvalidArgument, "Seek: offset '%v' out of range. current pos is '%v'.", offset, r.pos) 357 } 358 case io.SeekEnd: 359 if offset >= 0 { 360 return 0, errors.Annotatef(berrors.ErrInvalidArgument, "Seek: offset '%v' should be negative.", offset) 361 } 362 // GCS supports `NewRangeReader(ctx, -10, -1)`, which means read the last 10 bytes. 363 realOffset = offset 364 default: 365 return 0, errors.Annotatef(berrors.ErrStorageUnknown, "Seek: invalid whence '%d'", whence) 366 } 367 368 if realOffset == r.pos { 369 return realOffset, nil 370 } 371 372 _ = r.reader.Close() 373 r.pos = realOffset 374 rc, err := r.objHandle.NewRangeReader(r.ctx, r.pos, -1) 375 if err != nil { 376 return 0, errors.Annotatef(err, 377 "failed to read gcs file, file info: input.bucket='%s', input.key='%s'", 378 r.storage.gcs.Bucket, r.name) 379 } 380 r.reader = rc 381 382 return realOffset, nil 383 }