github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/objstorage/objstorage.go (about)

     1  // Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package objstorage
     6  
     7  import (
     8  	"context"
     9  	"fmt"
    10  
    11  	"github.com/cockroachdb/errors"
    12  	"github.com/cockroachdb/pebble/internal/base"
    13  	"github.com/cockroachdb/pebble/objstorage/objstorageprovider/sharedcache"
    14  	"github.com/cockroachdb/pebble/objstorage/remote"
    15  	"github.com/cockroachdb/pebble/vfs"
    16  )
    17  
// Readable is the handle for an object that is open for reading.
//
// Reads can be issued either directly on the Readable (parallel calls are
// allowed) or through a ReadHandle obtained from NewReadHandle, which groups
// related reads so that the implementation can apply optimizations like
// read-ahead.
type Readable interface {
	// ReadAt reads len(p) bytes into p starting at offset off.
	//
	// Does not return partial results; if off + len(p) is past the end of the
	// object, an error is returned.
	//
	// Clients of ReadAt can execute parallel ReadAt calls on the
	// same Readable.
	ReadAt(ctx context.Context, p []byte, off int64) error

	// Close closes the Readable. Every ReadHandle created through
	// NewReadHandle must be closed before Close is called.
	Close() error

	// Size returns the size of the object.
	Size() int64

	// NewReadHandle creates a read handle for ReadAt requests that are related
	// and can benefit from optimizations like read-ahead.
	//
	// The ReadHandle must be closed before the Readable is closed.
	//
	// Multiple separate ReadHandles can be used.
	NewReadHandle(ctx context.Context) ReadHandle
}
    42  
// ReadHandle is used to perform reads that are related and might benefit from
// optimizations like read-ahead.
//
// Unlike Readable, a single ReadHandle does not allow parallel ReadAt calls.
type ReadHandle interface {
	// ReadAt reads len(p) bytes into p starting at offset off.
	//
	// Does not return partial results; if off + len(p) is past the end of the
	// object, an error is returned.
	//
	// Parallel ReadAt calls on the same ReadHandle are not allowed.
	ReadAt(ctx context.Context, p []byte, off int64) error

	// Close closes the ReadHandle. It must be called before the Readable that
	// created the handle is closed.
	Close() error

	// SetupForCompaction informs the implementation that the read handle will
	// be used to read data blocks for a compaction. The implementation can expect
	// sequential reads, and can decide to not retain data in any caches.
	SetupForCompaction()

	// RecordCacheHit informs the implementation that we were able to retrieve a
	// block from cache. This is useful for example when the implementation is
	// trying to detect a sequential reading pattern.
	//
	// NOTE(review): offset and size presumably describe the block that was
	// served from the cache, in the same byte coordinates as ReadAt — confirm
	// against the implementations.
	RecordCacheHit(ctx context.Context, offset, size int64)
}
    66  
// Writable is the handle for an object that is open for writing.
//
// Either Finish or Abort must be called; no further calls are allowed on the
// Writable after either of them.
type Writable interface {
	// Write writes len(p) bytes from p to the underlying object. The data is not
	// guaranteed to be durable until Finish is called.
	//
	// Note that Write *is* allowed to modify the slice passed in, whether
	// temporarily or permanently. Callers of Write need to take this into
	// account.
	Write(p []byte) error

	// Finish completes the object and makes the data durable.
	// No further calls are allowed after calling Finish.
	Finish() error

	// Abort gives up on finishing the object. There is no guarantee about whether
	// the object exists after calling Abort.
	// No further calls are allowed after calling Abort.
	Abort()
}
    87  
// ObjectMetadata contains the metadata required to be able to access an object.
//
// An object is considered remote when either Remote.CreatorID is set (see
// IsShared) or Remote.CustomObjectName is non-empty (see IsExternal). For any
// other object the Remote struct is expected to be entirely zero (see
// AssertValid).
type ObjectMetadata struct {
	// DiskFileNum and FileType identify the object.
	// NOTE(review): presumably the same (fileType, FileNum) pair that Provider
	// methods such as Lookup take — confirm.
	DiskFileNum base.DiskFileNum
	FileType    base.FileType

	// The fields below are only set if the object is on remote storage.
	Remote struct {
		// CreatorID identifies the DB instance that originally created the object.
		//
		// Only used when CustomObjectName is not set.
		CreatorID CreatorID
		// CreatorFileNum is the identifier for the object within the context of the
		// DB instance that originally created the object.
		//
		// Only used when CustomObjectName is not set.
		CreatorFileNum base.DiskFileNum
		// CustomObjectName (if it is set) overrides the object name that is normally
		// derived from the CreatorID and CreatorFileNum.
		CustomObjectName string
		// CleanupMethod indicates the method for cleaning up unused shared objects.
		CleanupMethod SharedCleanupMethod
		// Locator identifies the remote.Storage implementation for this object.
		Locator remote.Locator
		// Storage is the remote.Storage object corresponding to the Locator. Used
		// to avoid lookups in hot paths.
		Storage remote.Storage
	}
}
   116  
   117  // IsRemote returns true if the object is on remote storage.
   118  func (meta *ObjectMetadata) IsRemote() bool {
   119  	return meta.IsShared() || meta.IsExternal()
   120  }
   121  
   122  // IsExternal returns true if the object is on remote storage but is not owned
   123  // by any Pebble instances in the cluster.
   124  func (meta *ObjectMetadata) IsExternal() bool {
   125  	return meta.Remote.CustomObjectName != ""
   126  }
   127  
   128  // IsShared returns true if the object is on remote storage and is owned by a
   129  // Pebble instance in the cluster (potentially shared between multiple
   130  // instances).
   131  func (meta *ObjectMetadata) IsShared() bool {
   132  	return meta.Remote.CreatorID.IsSet()
   133  }
   134  
   135  // AssertValid checks that the metadata is sane.
   136  func (meta *ObjectMetadata) AssertValid() {
   137  	if !meta.IsRemote() {
   138  		// Verify all Remote fields are empty.
   139  		if meta.Remote != (ObjectMetadata{}).Remote {
   140  			panic(errors.AssertionFailedf("meta.Remote not empty: %#v", meta.Remote))
   141  		}
   142  	} else {
   143  		if meta.Remote.CustomObjectName != "" {
   144  			if meta.Remote.CreatorID == 0 {
   145  				panic(errors.AssertionFailedf("CreatorID not set"))
   146  			}
   147  			if meta.Remote.CreatorFileNum == base.FileNum(0).DiskFileNum() {
   148  				panic(errors.AssertionFailedf("CreatorFileNum not set"))
   149  			}
   150  		}
   151  		if meta.Remote.CleanupMethod != SharedNoCleanup && meta.Remote.CleanupMethod != SharedRefTracking {
   152  			panic(errors.AssertionFailedf("invalid CleanupMethod %d", meta.Remote.CleanupMethod))
   153  		}
   154  		if meta.Remote.Storage == nil {
   155  			panic(errors.AssertionFailedf("Storage not set"))
   156  		}
   157  	}
   158  }
   159  
   160  // CreatorID identifies the DB instance that originally created a shared object.
   161  // This ID is incorporated in backing object names.
   162  // Must be non-zero.
   163  type CreatorID uint64
   164  
   165  // IsSet returns true if the CreatorID is not zero.
   166  func (c CreatorID) IsSet() bool { return c != 0 }
   167  
   168  func (c CreatorID) String() string { return fmt.Sprintf("%d", c) }
   169  
   170  // SharedCleanupMethod indicates the method for cleaning up unused shared objects.
   171  type SharedCleanupMethod uint8
   172  
   173  const (
   174  	// SharedRefTracking is used for shared objects for which objstorage providers
   175  	// keep track of references via reference marker objects.
   176  	SharedRefTracking SharedCleanupMethod = iota
   177  
   178  	// SharedNoCleanup is used for remote objects that are managed externally; the
   179  	// objstorage provider never deletes such objects.
   180  	SharedNoCleanup
   181  )
   182  
// OpenOptions contains optional arguments for OpenForReading. The zero value
// leaves every option disabled.
type OpenOptions struct {
	// MustExist triggers a fatal error if the file does not exist. The fatal
	// error message contains extra information helpful for debugging.
	MustExist bool
}
   189  
// CreateOptions contains optional arguments for Create (and for
// LinkOrCopyFromLocal, which takes the same options).
type CreateOptions struct {
	// PreferSharedStorage causes the object to be created on shared storage if
	// the provider has shared storage configured.
	PreferSharedStorage bool

	// SharedCleanupMethod is used for the object when it is created on shared storage.
	// The default (zero) value is SharedRefTracking.
	SharedCleanupMethod SharedCleanupMethod
}
   200  
// Provider is a singleton object used to access and manage objects.
//
// An object is conceptually like a large immutable file. The main use of
// objects is for storing sstables; in the future it could also be used for blob
// storage.
//
// The Provider can only manage objects that it knows about - either objects
// created by the provider, or existing objects the Provider was informed about
// via AddObjects.
// NOTE(review): no AddObjects method is part of this interface;
// AttachRemoteObjects looks like the current way of registering existing
// objects — confirm and update the sentence above.
//
// Objects are currently backed by a vfs.File or a remote.Storage object.
type Provider interface {
	// OpenForReading opens an existing object.
	OpenForReading(
		ctx context.Context, fileType base.FileType, FileNum base.DiskFileNum, opts OpenOptions,
	) (Readable, error)

	// Create creates a new object and opens it for writing.
	//
	// The object is not guaranteed to be durable (accessible in case of crashes)
	// until Sync is called.
	Create(
		ctx context.Context, fileType base.FileType, FileNum base.DiskFileNum, opts CreateOptions,
	) (w Writable, meta ObjectMetadata, err error)

	// Remove removes an object.
	//
	// The object is not guaranteed to be durably removed until Sync is called.
	Remove(fileType base.FileType, FileNum base.DiskFileNum) error

	// Sync flushes the metadata from creation or removal of objects since the last Sync.
	// This includes objects that have been Created but for which
	// Writable.Finish() has not yet been called.
	Sync() error

	// LinkOrCopyFromLocal creates a new object that is either a copy of a given
	// local file or a hard link (if the new object is created on the same FS, and
	// if the FS supports it).
	//
	// The object is not guaranteed to be durable (accessible in case of crashes)
	// until Sync is called.
	LinkOrCopyFromLocal(
		ctx context.Context,
		srcFS vfs.FS,
		srcFilePath string,
		dstFileType base.FileType,
		dstFileNum base.DiskFileNum,
		opts CreateOptions,
	) (ObjectMetadata, error)

	// Lookup returns the metadata of an object that is already known to the Provider.
	// Does not perform any I/O.
	Lookup(fileType base.FileType, FileNum base.DiskFileNum) (ObjectMetadata, error)

	// Path returns an internal, implementation-dependent path for the object. It is
	// meant to be used for informational purposes (like logging).
	Path(meta ObjectMetadata) string

	// Size returns the size of the object.
	Size(meta ObjectMetadata) (int64, error)

	// List returns the objects currently known to the provider. Does not perform any I/O.
	List() []ObjectMetadata

	// SetCreatorID sets the CreatorID which is needed in order to use shared
	// objects. Remote object usage is disabled until this method is called the
	// first time. Once set, the Creator ID is persisted and cannot change.
	//
	// Cannot be called if shared storage is not configured for the provider.
	SetCreatorID(creatorID CreatorID) error

	// IsSharedForeign returns whether this object is owned by a different node.
	IsSharedForeign(meta ObjectMetadata) bool

	// RemoteObjectBacking encodes the remote object metadata for the given object.
	//
	// The result is returned inside a RemoteObjectBackingHandle, which keeps
	// the backing valid until the handle is closed.
	RemoteObjectBacking(meta *ObjectMetadata) (RemoteObjectBackingHandle, error)

	// CreateExternalObjectBacking creates a backing for an existing object with a
	// custom object name. The object is considered to be managed outside of
	// Pebble and will never be removed by Pebble.
	CreateExternalObjectBacking(locator remote.Locator, objName string) (RemoteObjectBacking, error)

	// AttachRemoteObjects registers existing remote objects with this provider.
	//
	// NOTE(review): the returned slice presumably holds one ObjectMetadata per
	// entry of objs, in the same order — confirm against the implementation.
	AttachRemoteObjects(objs []RemoteObjectToAttach) ([]ObjectMetadata, error)

	// Close closes the Provider. Backings obtained through a
	// RemoteObjectBackingHandle are not guaranteed to remain valid once the
	// Provider is closed.
	Close() error

	// IsNotExistError indicates whether the error is known to report that a file or
	// directory does not exist.
	IsNotExistError(err error) bool

	// Metrics returns metrics about objstorage. Currently, it only returns metrics
	// about the shared cache.
	Metrics() sharedcache.Metrics
}
   296  
// RemoteObjectBacking encodes the metadata necessary to incorporate a shared
// object into a different Pebble instance. The encoding is specific to a given
// Provider implementation, so the bytes should be treated as opaque and only
// handed back to a Provider that uses the same implementation.
type RemoteObjectBacking []byte
   301  
// RemoteObjectBackingHandle is a container for a RemoteObjectBacking which
// ensures that the backing stays valid. A backing can otherwise become invalid
// if this provider unrefs the shared object. The RemoteObjectBackingHandle
// delays any unref until Close.
type RemoteObjectBackingHandle interface {
	// Get returns the backing. The backing is only guaranteed to be valid until
	// Close is called (or until the Provider is closed). If Close was already
	// called, returns an error.
	Get() (RemoteObjectBacking, error)

	// Close releases the handle. Any unref of the shared object that was being
	// delayed by this handle is no longer held back, so a backing previously
	// returned by Get is not guaranteed to be valid afterwards.
	Close()
}
   313  
// RemoteObjectToAttach contains the arguments needed to attach an existing
// remote object. It is the element type of the slice accepted by
// Provider.AttachRemoteObjects.
type RemoteObjectToAttach struct {
	// FileNum is the file number that will be used to refer to this object (in
	// the context of this instance).
	FileNum  base.DiskFileNum
	// FileType is the type of the object being attached.
	FileType base.FileType
	// Backing contains the metadata for the remote object backing (normally
	// generated from a different instance, but using the same Provider
	// implementation).
	Backing RemoteObjectBacking
}