github.com/cockroachdb/pebble@v1.1.2/objstorage/objstorageprovider/provider.go

     1  // Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package objstorageprovider
     6  
     7  import (
     8  	"context"
     9  	"io"
    10  	"os"
    11  	"sort"
    12  	"sync"
    13  
    14  	"github.com/cockroachdb/errors"
    15  	"github.com/cockroachdb/errors/oserror"
    16  	"github.com/cockroachdb/pebble/internal/base"
    17  	"github.com/cockroachdb/pebble/internal/invariants"
    18  	"github.com/cockroachdb/pebble/objstorage"
    19  	"github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing"
    20  	"github.com/cockroachdb/pebble/objstorage/objstorageprovider/remoteobjcat"
    21  	"github.com/cockroachdb/pebble/objstorage/objstorageprovider/sharedcache"
    22  	"github.com/cockroachdb/pebble/objstorage/remote"
    23  	"github.com/cockroachdb/pebble/vfs"
    24  )
    25  
    26  // provider is the implementation of objstorage.Provider.
    27  type provider struct {
    28  	st Settings
    29  
    30  	fsDir vfs.File
    31  
    32  	tracer *objiotracing.Tracer
    33  
    34  	remote remoteSubsystem
    35  
    36  	mu struct {
    37  		sync.RWMutex
    38  
    39  		remote struct {
    40  			// catalogBatch accumulates remote object creations and deletions until
    41  			// Sync is called.
    42  			catalogBatch remoteobjcat.Batch
    43  
    44  			storageObjects map[remote.Locator]remote.Storage
    45  		}
    46  
    47  		// localObjectsChanged is set if non-remote objects were created or deleted
    48  		// but Sync was not yet called.
    49  		localObjectsChanged bool
    50  
    51  		// knownObjects maintains information about objects that are known to the provider.
    52  		// It is initialized with the list of files in the manifest when we open a DB.
    53  		knownObjects map[base.DiskFileNum]objstorage.ObjectMetadata
    54  
    55  		// protectedObjects are objects that cannot be unreferenced because they
     56  		// have outstanding SharedObjectBackingHandles. The value is a count of outstanding handles.
    57  		protectedObjects map[base.DiskFileNum]int
    58  	}
    59  }
    60  
    61  var _ objstorage.Provider = (*provider)(nil)
    62  
    63  // Settings that must be specified when creating the provider.
    64  type Settings struct {
    65  	Logger base.Logger
    66  
    67  	// Local filesystem configuration.
    68  	FS        vfs.FS
    69  	FSDirName string
    70  
    71  	// FSDirInitialListing is a listing of FSDirName at the time of calling Open.
    72  	//
    73  	// This is an optional optimization to avoid double listing on Open when the
    74  	// higher layer already has a listing. When nil, we obtain the listing on
    75  	// Open.
    76  	FSDirInitialListing []string
    77  
    78  	// Cleaner cleans obsolete files from the local filesystem.
    79  	//
    80  	// The default cleaner uses the DeleteCleaner.
    81  	FSCleaner base.Cleaner
    82  
    83  	// NoSyncOnClose decides whether the implementation will enforce a
    84  	// close-time synchronization (e.g., fdatasync() or sync_file_range())
    85  	// on files it writes to. Setting this to true removes the guarantee for a
    86  	// sync on close. Some implementations can still issue a non-blocking sync.
    87  	NoSyncOnClose bool
    88  
    89  	// BytesPerSync enables periodic syncing of files in order to smooth out
    90  	// writes to disk. This option does not provide any persistence guarantee, but
    91  	// is used to avoid latency spikes if the OS automatically decides to write
    92  	// out a large chunk of dirty filesystem buffers.
    93  	BytesPerSync int
    94  
    95  	// Local contains fields that are only relevant for files stored on the local
    96  	// filesystem.
    97  	Local struct {
    98  		// TODO(radu): move FSCleaner, NoSyncOnClose, BytesPerSync here.
    99  
   100  		// ReadaheadConfigFn is a function used to retrieve the current readahead
    101  		// mode. This function is run whenever a local object is opened for reading.
   102  		// If it is nil, DefaultReadaheadConfig is used.
   103  		ReadaheadConfigFn func() ReadaheadConfig
   104  	}
   105  
   106  	// Fields here are set only if the provider is to support remote objects
   107  	// (experimental).
   108  	Remote struct {
   109  		StorageFactory remote.StorageFactory
   110  
   111  		// If CreateOnShared is non-zero, sstables are created on remote storage using
   112  		// the CreateOnSharedLocator (when the PreferSharedStorage create option is
   113  		// true).
   114  		CreateOnShared        remote.CreateOnSharedStrategy
   115  		CreateOnSharedLocator remote.Locator
   116  
   117  		// CacheSizeBytes is the size of the on-disk block cache for objects
   118  		// on remote storage. If it is 0, no cache is used.
   119  		CacheSizeBytes int64
   120  
   121  		// CacheBlockSize is the block size of the cache; if 0, the default of 32KB is used.
   122  		CacheBlockSize int
   123  
   124  		// ShardingBlockSize is the size of a shard block. The cache is split into contiguous
   125  		// ShardingBlockSize units. The units are distributed across multiple independent shards
   126  		// of the cache, via a hash(offset) modulo num shards operation. The cache replacement
    127  		// policies operate at the level of a shard, not the whole cache. This is done to reduce lock
   128  		// contention.
   129  		//
   130  		// If ShardingBlockSize is 0, the default of 1 MB is used.
   131  		ShardingBlockSize int64
   132  
    133  		// CacheShardCount is the number of independent shards the cache is split into. Each shard is
    134  		// the same size, and a hash of filenum & offset maps a read to a certain shard. If set to 0,
   135  		// 2*runtime.GOMAXPROCS is used as the shard count.
   136  		CacheShardCount int
   137  
   138  		// TODO(radu): allow the cache to live on another FS/location (e.g. to use
   139  		// instance-local SSD).
   140  	}
   141  }
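
// exampleRemoteSettings is a minimal sketch of enabling the experimental
// remote-object support; the function name is illustrative and the use of
// remote.MakeSimpleFactory with remote.NewInMem (the in-memory Storage used
// by tests) is an assumption here. Production callers would plug in a real
// remote.StorageFactory instead. The "" locator is used as the default
// remote storage location.
func exampleRemoteSettings(fs vfs.FS, dirName string) Settings {
	s := DefaultSettings(fs, dirName)
	s.Remote.StorageFactory = remote.MakeSimpleFactory(map[remote.Locator]remote.Storage{
		"": remote.NewInMem(),
	})
	// Create new sstables on remote storage under the "" locator when the
	// caller prefers shared storage.
	s.Remote.CreateOnShared = remote.CreateOnSharedAll
	s.Remote.CreateOnSharedLocator = ""
	// Use a 64 MB on-disk cache for blocks read from remote objects.
	s.Remote.CacheSizeBytes = 64 << 20
	return s
}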
   142  
   143  // ReadaheadConfig controls the use of read-ahead.
   144  type ReadaheadConfig struct {
   145  	// Informed is the type of read-ahead for operations that are known to read a
   146  	// large consecutive chunk of a file.
   147  	Informed ReadaheadMode
   148  
   149  	// Speculative is the type of read-ahead used automatically, when consecutive
   150  	// reads are detected.
   151  	Speculative ReadaheadMode
   152  }
   153  
   154  // DefaultReadaheadConfig is the readahead config used when ReadaheadConfigFn is
   155  // not specified.
   156  var DefaultReadaheadConfig = ReadaheadConfig{
   157  	Informed:    FadviseSequential,
   158  	Speculative: FadviseSequential,
   159  }
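
// exampleReadaheadSettings is a minimal sketch (the function name is
// illustrative, not part of the package API) of overriding the readahead
// behavior through Settings.Local.ReadaheadConfigFn: keep FADV_SEQUENTIAL for
// informed reads such as compactions, but disable speculative readahead.
func exampleReadaheadSettings(fs vfs.FS, dirName string) Settings {
	s := DefaultSettings(fs, dirName)
	s.Local.ReadaheadConfigFn = func() ReadaheadConfig {
		return ReadaheadConfig{
			Informed:    FadviseSequential,
			Speculative: NoReadahead,
		}
	}
	return s
}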
   160  
   161  // ReadaheadMode indicates the type of read-ahead to use, either for informed
   162  // read-ahead (e.g. compactions) or speculative read-ahead.
   163  type ReadaheadMode uint8
   164  
   165  const (
   166  	// NoReadahead disables readahead altogether.
   167  	NoReadahead ReadaheadMode = iota
   168  
    169  	// SysReadahead enables the use of the SYS_READAHEAD call to prefetch data.
    170  	// The prefetch window grows dynamically as consecutive reads are detected.
   171  	SysReadahead
   172  
    173  	// FadviseSequential enables the use of FADV_SEQUENTIAL. For informed
    174  	// read-ahead, FADV_SEQUENTIAL is used from the beginning. For speculative
    175  	// read-ahead, SYS_READAHEAD is used first until the window reaches the maximum
    176  	// size, then we switch to FADV_SEQUENTIAL.
   177  	FadviseSequential
   178  )
   179  
   180  // DefaultSettings initializes default settings (with no remote storage),
   181  // suitable for tests and tools.
   182  func DefaultSettings(fs vfs.FS, dirName string) Settings {
   183  	return Settings{
   184  		Logger:        base.DefaultLogger,
   185  		FS:            fs,
   186  		FSDirName:     dirName,
   187  		FSCleaner:     base.DeleteCleaner{},
   188  		NoSyncOnClose: false,
   189  		BytesPerSync:  512 * 1024, // 512KB
   190  	}
   191  }
   192  
   193  // Open creates the provider.
   194  func Open(settings Settings) (objstorage.Provider, error) {
   195  	// Note: we can't just `return open(settings)` because in an error case we
   196  	// would return (*provider)(nil) which is not objstorage.Provider(nil).
   197  	p, err := open(settings)
   198  	if err != nil {
   199  		return nil, err
   200  	}
   201  	return p, nil
   202  }
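
// exampleOpenProvider is a minimal usage sketch (the function name is
// illustrative): build default settings over an in-memory filesystem and open
// a provider; the caller is responsible for calling Close. The "objstore"
// directory name is an arbitrary choice for the sketch.
func exampleOpenProvider() (objstorage.Provider, error) {
	fs := vfs.NewMem()
	const dirName = "objstore"
	// The provider expects FSDirName to already exist; it only opens it.
	if err := fs.MkdirAll(dirName, 0755); err != nil {
		return nil, err
	}
	return Open(DefaultSettings(fs, dirName))
}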
   203  
   204  func open(settings Settings) (p *provider, _ error) {
   205  	fsDir, err := settings.FS.OpenDir(settings.FSDirName)
   206  	if err != nil {
   207  		return nil, err
   208  	}
   209  
   210  	defer func() {
   211  		if p == nil {
   212  			fsDir.Close()
   213  		}
   214  	}()
   215  
   216  	p = &provider{
   217  		st:    settings,
   218  		fsDir: fsDir,
   219  	}
   220  	p.mu.knownObjects = make(map[base.DiskFileNum]objstorage.ObjectMetadata)
   221  	p.mu.protectedObjects = make(map[base.DiskFileNum]int)
   222  
   223  	if objiotracing.Enabled {
   224  		p.tracer = objiotracing.Open(settings.FS, settings.FSDirName)
   225  	}
   226  
   227  	// Add local FS objects.
   228  	if err := p.vfsInit(); err != nil {
   229  		return nil, err
   230  	}
   231  
   232  	// Initialize remote subsystem (if configured) and add remote objects.
   233  	if err := p.remoteInit(); err != nil {
   234  		return nil, err
   235  	}
   236  
   237  	return p, nil
   238  }
   239  
   240  // Close is part of the objstorage.Provider interface.
   241  func (p *provider) Close() error {
   242  	err := p.sharedClose()
   243  	if p.fsDir != nil {
   244  		err = firstError(err, p.fsDir.Close())
   245  		p.fsDir = nil
   246  	}
   247  	if objiotracing.Enabled {
   248  		if p.tracer != nil {
   249  			p.tracer.Close()
   250  			p.tracer = nil
   251  		}
   252  	}
   253  	return err
   254  }
   255  
   256  // OpenForReading opens an existing object.
   257  func (p *provider) OpenForReading(
   258  	ctx context.Context,
   259  	fileType base.FileType,
   260  	fileNum base.DiskFileNum,
   261  	opts objstorage.OpenOptions,
   262  ) (objstorage.Readable, error) {
   263  	meta, err := p.Lookup(fileType, fileNum)
   264  	if err != nil {
   265  		if opts.MustExist {
   266  			p.st.Logger.Fatalf("%v", err)
   267  		}
   268  		return nil, err
   269  	}
   270  
   271  	var r objstorage.Readable
   272  	if !meta.IsRemote() {
   273  		r, err = p.vfsOpenForReading(ctx, fileType, fileNum, opts)
   274  	} else {
   275  		r, err = p.remoteOpenForReading(ctx, meta, opts)
   276  		if err != nil && p.isNotExistError(meta, err) {
   277  			// Wrap the error so that IsNotExistError functions properly.
   278  			err = errors.Mark(err, os.ErrNotExist)
   279  		}
   280  	}
   281  	if err != nil {
   282  		return nil, err
   283  	}
   284  	if objiotracing.Enabled {
   285  		r = p.tracer.WrapReadable(ctx, r, fileNum)
   286  	}
   287  	return r, nil
   288  }
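
// exampleReadObject is a minimal sketch (illustrative name): read up to the
// first 512 bytes of an existing table object through the provider. The
// FileTypeTable file type is assumed for the sketch.
func exampleReadObject(
	ctx context.Context, p objstorage.Provider, fileNum base.DiskFileNum,
) ([]byte, error) {
	r, err := p.OpenForReading(ctx, base.FileTypeTable, fileNum, objstorage.OpenOptions{})
	if err != nil {
		return nil, err
	}
	defer r.Close()
	n := r.Size()
	if n > 512 {
		n = 512
	}
	buf := make([]byte, n)
	if err := r.ReadAt(ctx, buf, 0); err != nil {
		return nil, err
	}
	return buf, nil
}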
   289  
   290  // Create creates a new object and opens it for writing.
   291  //
   292  // The object is not guaranteed to be durable (accessible in case of crashes)
   293  // until Sync is called.
   294  func (p *provider) Create(
   295  	ctx context.Context,
   296  	fileType base.FileType,
   297  	fileNum base.DiskFileNum,
   298  	opts objstorage.CreateOptions,
   299  ) (w objstorage.Writable, meta objstorage.ObjectMetadata, err error) {
   300  	if opts.PreferSharedStorage && p.st.Remote.CreateOnShared != remote.CreateOnSharedNone {
   301  		w, meta, err = p.sharedCreate(ctx, fileType, fileNum, p.st.Remote.CreateOnSharedLocator, opts)
   302  	} else {
   303  		w, meta, err = p.vfsCreate(ctx, fileType, fileNum)
   304  	}
   305  	if err != nil {
   306  		err = errors.Wrapf(err, "creating object %s", errors.Safe(fileNum))
   307  		return nil, objstorage.ObjectMetadata{}, err
   308  	}
   309  	p.addMetadata(meta)
   310  	if objiotracing.Enabled {
   311  		w = p.tracer.WrapWritable(ctx, w, fileNum)
   312  	}
   313  	return w, meta, nil
   314  }
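
// exampleCreateObject is a minimal sketch (illustrative name): create a new
// table object, write a payload through the Writable, then Sync so the
// creation is reflected durably in the provider's metadata.
func exampleCreateObject(
	ctx context.Context, p objstorage.Provider, fileNum base.DiskFileNum, payload []byte,
) (objstorage.ObjectMetadata, error) {
	w, meta, err := p.Create(ctx, base.FileTypeTable, fileNum, objstorage.CreateOptions{})
	if err != nil {
		return objstorage.ObjectMetadata{}, err
	}
	if err := w.Write(payload); err != nil {
		w.Abort()
		return objstorage.ObjectMetadata{}, err
	}
	if err := w.Finish(); err != nil {
		return objstorage.ObjectMetadata{}, err
	}
	return meta, p.Sync()
}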
   315  
   316  // Remove removes an object.
   317  //
   318  // Note that if the object is remote, the object is only (conceptually) removed
    319  // from this provider. If other providers have references to the remote object,
   320  // it will not be removed.
   321  //
   322  // The object is not guaranteed to be durably removed until Sync is called.
   323  func (p *provider) Remove(fileType base.FileType, fileNum base.DiskFileNum) error {
   324  	meta, err := p.Lookup(fileType, fileNum)
   325  	if err != nil {
   326  		return err
   327  	}
   328  
   329  	if !meta.IsRemote() {
   330  		err = p.vfsRemove(fileType, fileNum)
   331  	} else {
   332  		// TODO(radu): implement remote object removal (i.e. deref).
   333  		err = p.sharedUnref(meta)
   334  		if err != nil && p.isNotExistError(meta, err) {
   335  			// Wrap the error so that IsNotExistError functions properly.
   336  			err = errors.Mark(err, os.ErrNotExist)
   337  		}
   338  	}
   339  	if err != nil && !p.IsNotExistError(err) {
   340  		// We want to be able to retry a Remove, so we keep the object in our list.
   341  		// TODO(radu): we should mark the object as "zombie" and not allow any other
   342  		// operations.
   343  		return errors.Wrapf(err, "removing object %s", errors.Safe(fileNum))
   344  	}
   345  
   346  	p.removeMetadata(fileNum)
   347  	return err
   348  }
   349  
   350  func (p *provider) isNotExistError(meta objstorage.ObjectMetadata, err error) bool {
   351  	if meta.Remote.Storage != nil {
   352  		return meta.Remote.Storage.IsNotExistError(err)
   353  	}
   354  	return oserror.IsNotExist(err)
   355  }
   356  
   357  // IsNotExistError is part of the objstorage.Provider interface.
   358  func (p *provider) IsNotExistError(err error) bool {
   359  	// We use errors.Mark(err, os.ErrNotExist) for not-exist errors coming from
   360  	// remote.Storage.
   361  	return oserror.IsNotExist(err)
   362  }
   363  
   364  // Sync flushes the metadata from creation or removal of objects since the last Sync.
   365  func (p *provider) Sync() error {
   366  	if err := p.vfsSync(); err != nil {
   367  		return err
   368  	}
   369  	if err := p.sharedSync(); err != nil {
   370  		return err
   371  	}
   372  	return nil
   373  }
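
// exampleRemoveObject is a minimal sketch (illustrative name): remove a table
// object, tolerating the case where it is already gone, then Sync so the
// removal is reflected durably in the provider's metadata.
func exampleRemoveObject(p objstorage.Provider, fileNum base.DiskFileNum) error {
	if err := p.Remove(base.FileTypeTable, fileNum); err != nil && !p.IsNotExistError(err) {
		return err
	}
	return p.Sync()
}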
   374  
   375  // LinkOrCopyFromLocal creates a new object that is either a copy of a given
   376  // local file or a hard link (if the new object is created on the same FS, and
   377  // if the FS supports it).
   378  //
   379  // The object is not guaranteed to be durable (accessible in case of crashes)
   380  // until Sync is called.
   381  func (p *provider) LinkOrCopyFromLocal(
   382  	ctx context.Context,
   383  	srcFS vfs.FS,
   384  	srcFilePath string,
   385  	dstFileType base.FileType,
   386  	dstFileNum base.DiskFileNum,
   387  	opts objstorage.CreateOptions,
   388  ) (objstorage.ObjectMetadata, error) {
   389  	shared := opts.PreferSharedStorage && p.st.Remote.CreateOnShared != remote.CreateOnSharedNone
   390  	if !shared && srcFS == p.st.FS {
   391  		// Wrap the normal filesystem with one which wraps newly created files with
   392  		// vfs.NewSyncingFile.
   393  		fs := vfs.NewSyncingFS(p.st.FS, vfs.SyncingFileOptions{
   394  			NoSyncOnClose: p.st.NoSyncOnClose,
   395  			BytesPerSync:  p.st.BytesPerSync,
   396  		})
   397  		dstPath := p.vfsPath(dstFileType, dstFileNum)
   398  		if err := vfs.LinkOrCopy(fs, srcFilePath, dstPath); err != nil {
   399  			return objstorage.ObjectMetadata{}, err
   400  		}
   401  
   402  		meta := objstorage.ObjectMetadata{
   403  			DiskFileNum: dstFileNum,
   404  			FileType:    dstFileType,
   405  		}
   406  		p.addMetadata(meta)
   407  		return meta, nil
   408  	}
   409  	// Create the object and copy the data.
   410  	w, meta, err := p.Create(ctx, dstFileType, dstFileNum, opts)
   411  	if err != nil {
   412  		return objstorage.ObjectMetadata{}, err
   413  	}
   414  	f, err := srcFS.Open(srcFilePath, vfs.SequentialReadsOption)
   415  	if err != nil {
   416  		return objstorage.ObjectMetadata{}, err
   417  	}
   418  	defer f.Close()
   419  	buf := make([]byte, 64*1024)
   420  	for {
   421  		n, readErr := f.Read(buf)
   422  		if readErr != nil && readErr != io.EOF {
   423  			w.Abort()
   424  			return objstorage.ObjectMetadata{}, readErr
   425  		}
   426  
   427  		if n > 0 {
   428  			if err := w.Write(buf[:n]); err != nil {
   429  				w.Abort()
   430  				return objstorage.ObjectMetadata{}, err
   431  			}
   432  		}
   433  
   434  		if readErr == io.EOF {
   435  			break
   436  		}
   437  	}
   438  	if err := w.Finish(); err != nil {
   439  		return objstorage.ObjectMetadata{}, err
   440  	}
   441  	return meta, nil
   442  }
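
// exampleIngestLocalFile is a minimal sketch (illustrative name): bring an
// sstable that already exists on a local filesystem under the provider's
// management, preferring shared storage when it is configured, and Sync the
// resulting metadata.
func exampleIngestLocalFile(
	ctx context.Context,
	p objstorage.Provider,
	srcFS vfs.FS,
	srcPath string,
	fileNum base.DiskFileNum,
) (objstorage.ObjectMetadata, error) {
	meta, err := p.LinkOrCopyFromLocal(
		ctx, srcFS, srcPath, base.FileTypeTable, fileNum,
		objstorage.CreateOptions{PreferSharedStorage: true},
	)
	if err != nil {
		return objstorage.ObjectMetadata{}, err
	}
	return meta, p.Sync()
}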
   443  
   444  // Lookup is part of the objstorage.Provider interface.
   445  func (p *provider) Lookup(
   446  	fileType base.FileType, fileNum base.DiskFileNum,
   447  ) (objstorage.ObjectMetadata, error) {
   448  	p.mu.RLock()
   449  	defer p.mu.RUnlock()
   450  	meta, ok := p.mu.knownObjects[fileNum]
   451  	if !ok {
   452  		return objstorage.ObjectMetadata{}, errors.Wrapf(
   453  			os.ErrNotExist,
   454  			"file %s (type %d) unknown to the objstorage provider",
   455  			errors.Safe(fileNum), errors.Safe(fileType),
   456  		)
   457  	}
   458  	if meta.FileType != fileType {
   459  		return objstorage.ObjectMetadata{}, errors.AssertionFailedf(
   460  			"file %s type mismatch (known type %d, expected type %d)",
   461  			errors.Safe(fileNum), errors.Safe(meta.FileType), errors.Safe(fileType),
   462  		)
   463  	}
   464  	return meta, nil
   465  }
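
// exampleObjectPath is a minimal sketch (illustrative name): resolve the path
// of a known table object, e.g. for logging or external tooling.
func exampleObjectPath(p objstorage.Provider, fileNum base.DiskFileNum) (string, error) {
	meta, err := p.Lookup(base.FileTypeTable, fileNum)
	if err != nil {
		return "", err
	}
	return p.Path(meta), nil
}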
   466  
   467  // Path is part of the objstorage.Provider interface.
   468  func (p *provider) Path(meta objstorage.ObjectMetadata) string {
   469  	if !meta.IsRemote() {
   470  		return p.vfsPath(meta.FileType, meta.DiskFileNum)
   471  	}
   472  	return p.remotePath(meta)
   473  }
   474  
   475  // Size returns the size of the object.
   476  func (p *provider) Size(meta objstorage.ObjectMetadata) (int64, error) {
   477  	if !meta.IsRemote() {
   478  		return p.vfsSize(meta.FileType, meta.DiskFileNum)
   479  	}
   480  	return p.remoteSize(meta)
   481  }
   482  
   483  // List is part of the objstorage.Provider interface.
   484  func (p *provider) List() []objstorage.ObjectMetadata {
   485  	p.mu.RLock()
   486  	defer p.mu.RUnlock()
   487  	res := make([]objstorage.ObjectMetadata, 0, len(p.mu.knownObjects))
   488  	for _, meta := range p.mu.knownObjects {
   489  		res = append(res, meta)
   490  	}
   491  	sort.Slice(res, func(i, j int) bool {
   492  		return res[i].DiskFileNum.FileNum() < res[j].DiskFileNum.FileNum()
   493  	})
   494  	return res
   495  }
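
// exampleTotalSize is a minimal sketch (illustrative name): walk the objects
// known to the provider and sum their sizes, split between local and remote.
func exampleTotalSize(p objstorage.Provider) (localBytes, remoteBytes int64, err error) {
	for _, meta := range p.List() {
		size, err := p.Size(meta)
		if err != nil {
			return 0, 0, err
		}
		if meta.IsRemote() {
			remoteBytes += size
		} else {
			localBytes += size
		}
	}
	return localBytes, remoteBytes, nil
}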
   496  
   497  // Metrics is part of the objstorage.Provider interface.
   498  func (p *provider) Metrics() sharedcache.Metrics {
   499  	if p.remote.cache != nil {
   500  		return p.remote.cache.Metrics()
   501  	}
   502  	return sharedcache.Metrics{}
   503  }
   504  
   505  func (p *provider) addMetadata(meta objstorage.ObjectMetadata) {
   506  	if invariants.Enabled {
   507  		meta.AssertValid()
   508  	}
   509  	p.mu.Lock()
   510  	defer p.mu.Unlock()
   511  	p.mu.knownObjects[meta.DiskFileNum] = meta
   512  	if meta.IsRemote() {
   513  		p.mu.remote.catalogBatch.AddObject(remoteobjcat.RemoteObjectMetadata{
   514  			FileNum:        meta.DiskFileNum,
   515  			FileType:       meta.FileType,
   516  			CreatorID:      meta.Remote.CreatorID,
   517  			CreatorFileNum: meta.Remote.CreatorFileNum,
   518  			Locator:        meta.Remote.Locator,
   519  			CleanupMethod:  meta.Remote.CleanupMethod,
   520  		})
   521  	} else {
   522  		p.mu.localObjectsChanged = true
   523  	}
   524  }
   525  
   526  func (p *provider) removeMetadata(fileNum base.DiskFileNum) {
   527  	p.mu.Lock()
   528  	defer p.mu.Unlock()
   529  
   530  	meta, ok := p.mu.knownObjects[fileNum]
   531  	if !ok {
   532  		return
   533  	}
   534  	delete(p.mu.knownObjects, fileNum)
   535  	if meta.IsRemote() {
   536  		p.mu.remote.catalogBatch.DeleteObject(fileNum)
   537  	} else {
   538  		p.mu.localObjectsChanged = true
   539  	}
   540  }
   541  
   542  // protectObject prevents the unreferencing of a remote object until
   543  // unprotectObject is called.
   544  func (p *provider) protectObject(fileNum base.DiskFileNum) {
   545  	p.mu.Lock()
   546  	defer p.mu.Unlock()
   547  	p.mu.protectedObjects[fileNum] = p.mu.protectedObjects[fileNum] + 1
   548  }
   549  
   550  func (p *provider) unprotectObject(fileNum base.DiskFileNum) {
   551  	p.mu.Lock()
   552  	defer p.mu.Unlock()
   553  	v := p.mu.protectedObjects[fileNum]
   554  	if invariants.Enabled && v == 0 {
   555  		panic("invalid protection count")
   556  	}
   557  	if v > 1 {
   558  		p.mu.protectedObjects[fileNum] = v - 1
   559  	} else {
   560  		delete(p.mu.protectedObjects, fileNum)
   561  		// TODO(radu): check if the object is still in knownObject; if not, unref it
   562  		// now.
   563  	}
   564  }
   565  
   566  func (p *provider) isProtected(fileNum base.DiskFileNum) bool {
   567  	p.mu.Lock()
   568  	defer p.mu.Unlock()
   569  	return p.mu.protectedObjects[fileNum] > 0
   570  }