github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/objstorage/objstorageprovider/provider.go (about)

     1  // Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package objstorageprovider
     6  
     7  import (
     8  	"context"
     9  	"io"
    10  	"os"
    11  	"sort"
    12  	"sync"
    13  
    14  	"github.com/cockroachdb/errors"
    15  	"github.com/cockroachdb/errors/oserror"
    16  	"github.com/cockroachdb/pebble/internal/base"
    17  	"github.com/cockroachdb/pebble/internal/invariants"
    18  	"github.com/cockroachdb/pebble/objstorage"
    19  	"github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing"
    20  	"github.com/cockroachdb/pebble/objstorage/objstorageprovider/remoteobjcat"
    21  	"github.com/cockroachdb/pebble/objstorage/objstorageprovider/sharedcache"
    22  	"github.com/cockroachdb/pebble/objstorage/remote"
    23  	"github.com/cockroachdb/pebble/vfs"
    24  )
    25  
// provider is the implementation of objstorage.Provider.
//
// Objects known to the provider are either local (served through the vfs.FS
// in st) or remote; methods dispatch on ObjectMetadata.IsRemote.
type provider struct {
	// st holds the settings the provider was opened with.
	st Settings

	// fsDir is the open handle for the directory st.FSDirName. It is closed and
	// set to nil by Close.
	fsDir vfs.File

	// tracer is populated by open only when objiotracing.Enabled is true.
	tracer *objiotracing.Tracer

	// remote holds the remote subsystem state (e.g. the cache whose metrics are
	// reported by Metrics).
	remote remoteSubsystem

	// mu protects the fields inside this struct.
	mu struct {
		sync.RWMutex

		remote struct {
			// catalogBatch accumulates remote object creations and deletions until
			// Sync is called.
			catalogBatch remoteobjcat.Batch

			// NOTE(review): presumably a cache of remote.Storage instances keyed by
			// locator; it is not used in this file - confirm against the remote
			// subsystem code.
			storageObjects map[remote.Locator]remote.Storage
		}

		// localObjectsChanged is set if non-remote objects were created or deleted
		// but Sync was not yet called.
		localObjectsChanged bool

		// knownObjects maintains information about objects that are known to the provider.
		// It is initialized with the list of files in the manifest when we open a DB.
		knownObjects map[base.DiskFileNum]objstorage.ObjectMetadata

		// protectedObjects are objects that cannot be unreferenced because they
		// have outstanding SharedObjectBackingHandles. The value is a count of
		// outstanding handles; entries are deleted when the count drops to zero.
		protectedObjects map[base.DiskFileNum]int
	}
}
    60  
// Compile-time assertion that *provider implements objstorage.Provider.
var _ objstorage.Provider = (*provider)(nil)
    62  
// Settings that must be specified when creating the provider.
type Settings struct {
	// Logger is used by the provider for reporting; for example, OpenForReading
	// calls Logger.Fatalf when an object that must exist is unknown.
	Logger base.Logger

	// Local filesystem configuration. FSDirName is the directory within FS that
	// is opened when the provider is created.
	FS        vfs.FS
	FSDirName string

	// FSDirInitialListing is a listing of FSDirName at the time of calling Open.
	//
	// This is an optional optimization to avoid double listing on Open when the
	// higher layer already has a listing. When nil, we obtain the listing on
	// Open.
	FSDirInitialListing []string

	// FSCleaner cleans obsolete files from the local filesystem.
	//
	// DefaultSettings sets this to base.DeleteCleaner{}.
	FSCleaner base.Cleaner

	// NoSyncOnClose decides whether the implementation will enforce a
	// close-time synchronization (e.g., fdatasync() or sync_file_range())
	// on files it writes to. Setting this to true removes the guarantee for a
	// sync on close. Some implementations can still issue a non-blocking sync.
	NoSyncOnClose bool

	// BytesPerSync enables periodic syncing of files in order to smooth out
	// writes to disk. This option does not provide any persistence guarantee, but
	// is used to avoid latency spikes if the OS automatically decides to write
	// out a large chunk of dirty filesystem buffers.
	BytesPerSync int

	// Fields here are set only if the provider is to support remote objects
	// (experimental).
	Remote struct {
		StorageFactory remote.StorageFactory

		// If CreateOnShared is not remote.CreateOnSharedNone, sstables are created
		// on remote storage using the CreateOnSharedLocator (when the
		// PreferSharedStorage create option is true).
		CreateOnShared        remote.CreateOnSharedStrategy
		CreateOnSharedLocator remote.Locator

		// CacheSizeBytes is the size of the on-disk block cache for objects
		// on remote storage. If it is 0, no cache is used.
		CacheSizeBytes int64

		// CacheBlockSize is the block size of the cache; if 0, the default of 32KB is used.
		CacheBlockSize int

		// ShardingBlockSize is the size of a shard block. The cache is split into contiguous
		// ShardingBlockSize units. The units are distributed across multiple independent shards
		// of the cache, via a hash(offset) modulo num shards operation. The cache replacement
		// policies operate at the level of shard, not whole cache. This is done to reduce lock
		// contention.
		//
		// If ShardingBlockSize is 0, the default of 1 MB is used.
		ShardingBlockSize int64

		// CacheShardCount is the number of independent shards the cache leverages.
		// Each shard is the same size, and a hash of filenum & offset maps a read to
		// a certain shard. If set to 0, 2*runtime.GOMAXPROCS is used as the shard
		// count.
		CacheShardCount int

		// TODO(radu): allow the cache to live on another FS/location (e.g. to use
		// instance-local SSD).
	}
}
   131  
   132  // DefaultSettings initializes default settings (with no remote storage),
   133  // suitable for tests and tools.
   134  func DefaultSettings(fs vfs.FS, dirName string) Settings {
   135  	return Settings{
   136  		Logger:        base.DefaultLogger,
   137  		FS:            fs,
   138  		FSDirName:     dirName,
   139  		FSCleaner:     base.DeleteCleaner{},
   140  		NoSyncOnClose: false,
   141  		BytesPerSync:  512 * 1024, // 512KB
   142  	}
   143  }
   144  
   145  // Open creates the provider.
   146  func Open(settings Settings) (objstorage.Provider, error) {
   147  	// Note: we can't just `return open(settings)` because in an error case we
   148  	// would return (*provider)(nil) which is not objstorage.Provider(nil).
   149  	p, err := open(settings)
   150  	if err != nil {
   151  		return nil, err
   152  	}
   153  	return p, nil
   154  }
   155  
   156  func open(settings Settings) (p *provider, _ error) {
   157  	fsDir, err := settings.FS.OpenDir(settings.FSDirName)
   158  	if err != nil {
   159  		return nil, err
   160  	}
   161  
   162  	defer func() {
   163  		if p == nil {
   164  			fsDir.Close()
   165  		}
   166  	}()
   167  
   168  	p = &provider{
   169  		st:    settings,
   170  		fsDir: fsDir,
   171  	}
   172  	p.mu.knownObjects = make(map[base.DiskFileNum]objstorage.ObjectMetadata)
   173  	p.mu.protectedObjects = make(map[base.DiskFileNum]int)
   174  
   175  	if objiotracing.Enabled {
   176  		p.tracer = objiotracing.Open(settings.FS, settings.FSDirName)
   177  	}
   178  
   179  	// Add local FS objects.
   180  	if err := p.vfsInit(); err != nil {
   181  		return nil, err
   182  	}
   183  
   184  	// Initialize remote subsystem (if configured) and add remote objects.
   185  	if err := p.remoteInit(); err != nil {
   186  		return nil, err
   187  	}
   188  
   189  	return p, nil
   190  }
   191  
   192  // Close is part of the objstorage.Provider interface.
   193  func (p *provider) Close() error {
   194  	err := p.sharedClose()
   195  	if p.fsDir != nil {
   196  		err = firstError(err, p.fsDir.Close())
   197  		p.fsDir = nil
   198  	}
   199  	if objiotracing.Enabled {
   200  		if p.tracer != nil {
   201  			p.tracer.Close()
   202  			p.tracer = nil
   203  		}
   204  	}
   205  	return err
   206  }
   207  
   208  // OpenForReading opens an existing object.
   209  func (p *provider) OpenForReading(
   210  	ctx context.Context,
   211  	fileType base.FileType,
   212  	fileNum base.DiskFileNum,
   213  	opts objstorage.OpenOptions,
   214  ) (objstorage.Readable, error) {
   215  	meta, err := p.Lookup(fileType, fileNum)
   216  	if err != nil {
   217  		if opts.MustExist {
   218  			p.st.Logger.Fatalf("%v", err)
   219  		}
   220  		return nil, err
   221  	}
   222  
   223  	var r objstorage.Readable
   224  	if !meta.IsRemote() {
   225  		r, err = p.vfsOpenForReading(ctx, fileType, fileNum, opts)
   226  	} else {
   227  		r, err = p.remoteOpenForReading(ctx, meta, opts)
   228  		if err != nil && p.isNotExistError(meta, err) {
   229  			// Wrap the error so that IsNotExistError functions properly.
   230  			err = errors.Mark(err, os.ErrNotExist)
   231  		}
   232  	}
   233  	if err != nil {
   234  		return nil, err
   235  	}
   236  	if objiotracing.Enabled {
   237  		r = p.tracer.WrapReadable(ctx, r, fileNum)
   238  	}
   239  	return r, nil
   240  }
   241  
   242  // Create creates a new object and opens it for writing.
   243  //
   244  // The object is not guaranteed to be durable (accessible in case of crashes)
   245  // until Sync is called.
   246  func (p *provider) Create(
   247  	ctx context.Context,
   248  	fileType base.FileType,
   249  	fileNum base.DiskFileNum,
   250  	opts objstorage.CreateOptions,
   251  ) (w objstorage.Writable, meta objstorage.ObjectMetadata, err error) {
   252  	if opts.PreferSharedStorage && p.st.Remote.CreateOnShared != remote.CreateOnSharedNone {
   253  		w, meta, err = p.sharedCreate(ctx, fileType, fileNum, p.st.Remote.CreateOnSharedLocator, opts)
   254  	} else {
   255  		w, meta, err = p.vfsCreate(ctx, fileType, fileNum)
   256  	}
   257  	if err != nil {
   258  		err = errors.Wrapf(err, "creating object %s", errors.Safe(fileNum))
   259  		return nil, objstorage.ObjectMetadata{}, err
   260  	}
   261  	p.addMetadata(meta)
   262  	if objiotracing.Enabled {
   263  		w = p.tracer.WrapWritable(ctx, w, fileNum)
   264  	}
   265  	return w, meta, nil
   266  }
   267  
   268  // Remove removes an object.
   269  //
   270  // Note that if the object is remote, the object is only (conceptually) removed
   271  // from this provider. If other providers have references on the remote object,
   272  // it will not be removed.
   273  //
   274  // The object is not guaranteed to be durably removed until Sync is called.
   275  func (p *provider) Remove(fileType base.FileType, fileNum base.DiskFileNum) error {
   276  	meta, err := p.Lookup(fileType, fileNum)
   277  	if err != nil {
   278  		return err
   279  	}
   280  
   281  	if !meta.IsRemote() {
   282  		err = p.vfsRemove(fileType, fileNum)
   283  	} else {
   284  		// TODO(radu): implement remote object removal (i.e. deref).
   285  		err = p.sharedUnref(meta)
   286  		if err != nil && p.isNotExistError(meta, err) {
   287  			// Wrap the error so that IsNotExistError functions properly.
   288  			err = errors.Mark(err, os.ErrNotExist)
   289  		}
   290  	}
   291  	if err != nil && !p.IsNotExistError(err) {
   292  		// We want to be able to retry a Remove, so we keep the object in our list.
   293  		// TODO(radu): we should mark the object as "zombie" and not allow any other
   294  		// operations.
   295  		return errors.Wrapf(err, "removing object %s", errors.Safe(fileNum))
   296  	}
   297  
   298  	p.removeMetadata(fileNum)
   299  	return err
   300  }
   301  
   302  func (p *provider) isNotExistError(meta objstorage.ObjectMetadata, err error) bool {
   303  	if meta.Remote.Storage != nil {
   304  		return meta.Remote.Storage.IsNotExistError(err)
   305  	}
   306  	return oserror.IsNotExist(err)
   307  }
   308  
   309  // IsNotExistError is part of the objstorage.Provider interface.
   310  func (p *provider) IsNotExistError(err error) bool {
   311  	// We use errors.Mark(err, os.ErrNotExist) for not-exist errors coming from
   312  	// remote.Storage.
   313  	return oserror.IsNotExist(err)
   314  }
   315  
   316  // Sync flushes the metadata from creation or removal of objects since the last Sync.
   317  func (p *provider) Sync() error {
   318  	if err := p.vfsSync(); err != nil {
   319  		return err
   320  	}
   321  	if err := p.sharedSync(); err != nil {
   322  		return err
   323  	}
   324  	return nil
   325  }
   326  
   327  // LinkOrCopyFromLocal creates a new object that is either a copy of a given
   328  // local file or a hard link (if the new object is created on the same FS, and
   329  // if the FS supports it).
   330  //
   331  // The object is not guaranteed to be durable (accessible in case of crashes)
   332  // until Sync is called.
   333  func (p *provider) LinkOrCopyFromLocal(
   334  	ctx context.Context,
   335  	srcFS vfs.FS,
   336  	srcFilePath string,
   337  	dstFileType base.FileType,
   338  	dstFileNum base.DiskFileNum,
   339  	opts objstorage.CreateOptions,
   340  ) (objstorage.ObjectMetadata, error) {
   341  	shared := opts.PreferSharedStorage && p.st.Remote.CreateOnShared != remote.CreateOnSharedNone
   342  	if !shared && srcFS == p.st.FS {
   343  		// Wrap the normal filesystem with one which wraps newly created files with
   344  		// vfs.NewSyncingFile.
   345  		fs := vfs.NewSyncingFS(p.st.FS, vfs.SyncingFileOptions{
   346  			NoSyncOnClose: p.st.NoSyncOnClose,
   347  			BytesPerSync:  p.st.BytesPerSync,
   348  		})
   349  		dstPath := p.vfsPath(dstFileType, dstFileNum)
   350  		if err := vfs.LinkOrCopy(fs, srcFilePath, dstPath); err != nil {
   351  			return objstorage.ObjectMetadata{}, err
   352  		}
   353  
   354  		meta := objstorage.ObjectMetadata{
   355  			DiskFileNum: dstFileNum,
   356  			FileType:    dstFileType,
   357  		}
   358  		p.addMetadata(meta)
   359  		return meta, nil
   360  	}
   361  	// Create the object and copy the data.
   362  	w, meta, err := p.Create(ctx, dstFileType, dstFileNum, opts)
   363  	if err != nil {
   364  		return objstorage.ObjectMetadata{}, err
   365  	}
   366  	f, err := srcFS.Open(srcFilePath, vfs.SequentialReadsOption)
   367  	if err != nil {
   368  		return objstorage.ObjectMetadata{}, err
   369  	}
   370  	defer f.Close()
   371  	buf := make([]byte, 64*1024)
   372  	for {
   373  		n, readErr := f.Read(buf)
   374  		if readErr != nil && readErr != io.EOF {
   375  			w.Abort()
   376  			return objstorage.ObjectMetadata{}, readErr
   377  		}
   378  
   379  		if n > 0 {
   380  			if err := w.Write(buf[:n]); err != nil {
   381  				w.Abort()
   382  				return objstorage.ObjectMetadata{}, err
   383  			}
   384  		}
   385  
   386  		if readErr == io.EOF {
   387  			break
   388  		}
   389  	}
   390  	if err := w.Finish(); err != nil {
   391  		return objstorage.ObjectMetadata{}, err
   392  	}
   393  	return meta, nil
   394  }
   395  
   396  // Lookup is part of the objstorage.Provider interface.
   397  func (p *provider) Lookup(
   398  	fileType base.FileType, fileNum base.DiskFileNum,
   399  ) (objstorage.ObjectMetadata, error) {
   400  	p.mu.RLock()
   401  	defer p.mu.RUnlock()
   402  	meta, ok := p.mu.knownObjects[fileNum]
   403  	if !ok {
   404  		return objstorage.ObjectMetadata{}, errors.Wrapf(
   405  			os.ErrNotExist,
   406  			"file %s (type %d) unknown to the objstorage provider",
   407  			errors.Safe(fileNum), errors.Safe(fileType),
   408  		)
   409  	}
   410  	if meta.FileType != fileType {
   411  		return objstorage.ObjectMetadata{}, errors.AssertionFailedf(
   412  			"file %s type mismatch (known type %d, expected type %d)",
   413  			errors.Safe(fileNum), errors.Safe(meta.FileType), errors.Safe(fileType),
   414  		)
   415  	}
   416  	return meta, nil
   417  }
   418  
   419  // Path is part of the objstorage.Provider interface.
   420  func (p *provider) Path(meta objstorage.ObjectMetadata) string {
   421  	if !meta.IsRemote() {
   422  		return p.vfsPath(meta.FileType, meta.DiskFileNum)
   423  	}
   424  	return p.remotePath(meta)
   425  }
   426  
   427  // Size returns the size of the object.
   428  func (p *provider) Size(meta objstorage.ObjectMetadata) (int64, error) {
   429  	if !meta.IsRemote() {
   430  		return p.vfsSize(meta.FileType, meta.DiskFileNum)
   431  	}
   432  	return p.remoteSize(meta)
   433  }
   434  
   435  // List is part of the objstorage.Provider interface.
   436  func (p *provider) List() []objstorage.ObjectMetadata {
   437  	p.mu.RLock()
   438  	defer p.mu.RUnlock()
   439  	res := make([]objstorage.ObjectMetadata, 0, len(p.mu.knownObjects))
   440  	for _, meta := range p.mu.knownObjects {
   441  		res = append(res, meta)
   442  	}
   443  	sort.Slice(res, func(i, j int) bool {
   444  		return res[i].DiskFileNum.FileNum() < res[j].DiskFileNum.FileNum()
   445  	})
   446  	return res
   447  }
   448  
   449  // Metrics is part of the objstorage.Provider interface.
   450  func (p *provider) Metrics() sharedcache.Metrics {
   451  	if p.remote.cache != nil {
   452  		return p.remote.cache.Metrics()
   453  	}
   454  	return sharedcache.Metrics{}
   455  }
   456  
   457  func (p *provider) addMetadata(meta objstorage.ObjectMetadata) {
   458  	if invariants.Enabled {
   459  		meta.AssertValid()
   460  	}
   461  	p.mu.Lock()
   462  	defer p.mu.Unlock()
   463  	p.mu.knownObjects[meta.DiskFileNum] = meta
   464  	if meta.IsRemote() {
   465  		p.mu.remote.catalogBatch.AddObject(remoteobjcat.RemoteObjectMetadata{
   466  			FileNum:        meta.DiskFileNum,
   467  			FileType:       meta.FileType,
   468  			CreatorID:      meta.Remote.CreatorID,
   469  			CreatorFileNum: meta.Remote.CreatorFileNum,
   470  			Locator:        meta.Remote.Locator,
   471  			CleanupMethod:  meta.Remote.CleanupMethod,
   472  		})
   473  	} else {
   474  		p.mu.localObjectsChanged = true
   475  	}
   476  }
   477  
   478  func (p *provider) removeMetadata(fileNum base.DiskFileNum) {
   479  	p.mu.Lock()
   480  	defer p.mu.Unlock()
   481  
   482  	meta, ok := p.mu.knownObjects[fileNum]
   483  	if !ok {
   484  		return
   485  	}
   486  	delete(p.mu.knownObjects, fileNum)
   487  	if meta.IsRemote() {
   488  		p.mu.remote.catalogBatch.DeleteObject(fileNum)
   489  	} else {
   490  		p.mu.localObjectsChanged = true
   491  	}
   492  }
   493  
   494  // protectObject prevents the unreferencing of a remote object until
   495  // unprotectObject is called.
   496  func (p *provider) protectObject(fileNum base.DiskFileNum) {
   497  	p.mu.Lock()
   498  	defer p.mu.Unlock()
   499  	p.mu.protectedObjects[fileNum] = p.mu.protectedObjects[fileNum] + 1
   500  }
   501  
   502  func (p *provider) unprotectObject(fileNum base.DiskFileNum) {
   503  	p.mu.Lock()
   504  	defer p.mu.Unlock()
   505  	v := p.mu.protectedObjects[fileNum]
   506  	if invariants.Enabled && v == 0 {
   507  		panic("invalid protection count")
   508  	}
   509  	if v > 1 {
   510  		p.mu.protectedObjects[fileNum] = v - 1
   511  	} else {
   512  		delete(p.mu.protectedObjects, fileNum)
   513  		// TODO(radu): check if the object is still in knownObject; if not, unref it
   514  		// now.
   515  	}
   516  }
   517  
   518  func (p *provider) isProtected(fileNum base.DiskFileNum) bool {
   519  	p.mu.Lock()
   520  	defer p.mu.Unlock()
   521  	return p.mu.protectedObjects[fileNum] > 0
   522  }