github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/objstorage/objstorage.go (about)

     1  // Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package objstorage
     6  
     7  import (
     8  	"context"
     9  	"fmt"
    10  
    11  	"github.com/cockroachdb/errors"
    12  	"github.com/cockroachdb/pebble/internal/base"
    13  	"github.com/cockroachdb/pebble/objstorage/objstorageprovider/sharedcache"
    14  	"github.com/cockroachdb/pebble/objstorage/remote"
    15  	"github.com/cockroachdb/pebble/vfs"
    16  	"github.com/cockroachdb/redact"
    17  )
    18  
// Readable is the handle for an object that is open for reading.
//
// Reads can be issued directly through ReadAt, or through a ReadHandle
// obtained from NewReadHandle when a group of reads is related and can benefit
// from optimizations like read-ahead.
type Readable interface {
	// ReadAt reads len(p) bytes into p starting at offset off.
	//
	// Does not return partial results; if off + len(p) is past the end of the
	// object, an error is returned.
	//
	// Clients of ReadAt can execute parallel ReadAt calls on the
	// same Readable.
	ReadAt(ctx context.Context, p []byte, off int64) error

	// Close closes the Readable. Every ReadHandle created through
	// NewReadHandle must be closed before Close is called.
	Close() error

	// Size returns the size of the object.
	Size() int64

	// NewReadHandle creates a read handle for ReadAt requests that are related
	// and can benefit from optimizations like read-ahead.
	//
	// The ReadHandle must be closed before the Readable is closed.
	//
	// Multiple separate ReadHandles can be used.
	NewReadHandle(ctx context.Context) ReadHandle
}
    43  
// ReadHandle is used to perform reads that are related and might benefit from
// optimizations like read-ahead.
//
// A ReadHandle is created with Readable.NewReadHandle. Unlike Readable, it does
// not allow parallel ReadAt calls.
type ReadHandle interface {
	// ReadAt reads len(p) bytes into p starting at offset off.
	//
	// Does not return partial results; if off + len(p) is past the end of the
	// object, an error is returned.
	//
	// Parallel ReadAt calls on the same ReadHandle are not allowed.
	ReadAt(ctx context.Context, p []byte, off int64) error

	// Close closes the ReadHandle. It must be called before the Readable that
	// created the handle is closed.
	Close() error

	// SetupForCompaction informs the implementation that the read handle will
	// be used to read data blocks for a compaction. The implementation can expect
	// sequential reads, and can decide to not retain data in any caches.
	SetupForCompaction()

	// RecordCacheHit informs the implementation that we were able to retrieve a
	// block from cache. This is useful for example when the implementation is
	// trying to detect a sequential reading pattern.
	//
	// offset and size describe the block that was served from the cache.
	RecordCacheHit(ctx context.Context, offset, size int64)
}
    67  
// Writable is the handle for an object that is open for writing.
//
// Either Finish or Abort must be called; no further calls are allowed on the
// Writable after either of them.
type Writable interface {
	// Write writes len(p) bytes from p to the underlying object. The data is not
	// guaranteed to be durable until Finish is called.
	//
	// Note that Write *is* allowed to modify the slice passed in, whether
	// temporarily or permanently. Callers of Write need to take this into
	// account.
	Write(p []byte) error

	// Finish completes the object and makes the data durable.
	// No further calls are allowed after calling Finish.
	Finish() error

	// Abort gives up on finishing the object. There is no guarantee about whether
	// the object exists after calling Abort.
	// No further calls are allowed after calling Abort.
	Abort()
}
    88  
// ObjectMetadata contains the metadata required to be able to access an object.
//
// The Remote sub-struct is compared against its zero value with != in
// AssertValid, so its fields need to remain comparable.
type ObjectMetadata struct {
	// DiskFileNum and FileType are the identifiers under which Provider methods
	// such as Lookup and OpenForReading refer to the object.
	DiskFileNum base.DiskFileNum
	FileType    base.FileType

	// The fields below are only set if the object is on remote storage.
	Remote struct {
		// CreatorID identifies the DB instance that originally created the object.
		// A set (non-zero) CreatorID is what makes IsShared return true.
		//
		// Only used when CustomObjectName is not set.
		CreatorID CreatorID
		// CreatorFileNum is the identifier for the object within the context of the
		// DB instance that originally created the object.
		//
		// Only used when CustomObjectName is not set.
		CreatorFileNum base.DiskFileNum
		// CustomObjectName (if it is set) overrides the object name that is normally
		// derived from the CreatorID and CreatorFileNum. A non-empty name is what
		// makes IsExternal return true.
		CustomObjectName string
		// CleanupMethod indicates the method for cleaning up unused shared objects.
		CleanupMethod SharedCleanupMethod
		// Locator identifies the remote.Storage implementation for this object.
		Locator remote.Locator
		// Storage is the remote.Storage object corresponding to the Locator. Used
		// to avoid lookups in hot paths.
		Storage remote.Storage
	}
}
   117  
   118  // IsRemote returns true if the object is on remote storage.
   119  func (meta *ObjectMetadata) IsRemote() bool {
   120  	return meta.IsShared() || meta.IsExternal()
   121  }
   122  
   123  // IsExternal returns true if the object is on remote storage but is not owned
   124  // by any Pebble instances in the cluster.
   125  func (meta *ObjectMetadata) IsExternal() bool {
   126  	return meta.Remote.CustomObjectName != ""
   127  }
   128  
   129  // IsShared returns true if the object is on remote storage and is owned by a
   130  // Pebble instance in the cluster (potentially shared between multiple
   131  // instances).
   132  func (meta *ObjectMetadata) IsShared() bool {
   133  	return meta.Remote.CreatorID.IsSet()
   134  }
   135  
   136  // AssertValid checks that the metadata is sane.
   137  func (meta *ObjectMetadata) AssertValid() {
   138  	if !meta.IsRemote() {
   139  		// Verify all Remote fields are empty.
   140  		if meta.Remote != (ObjectMetadata{}).Remote {
   141  			panic(errors.AssertionFailedf("meta.Remote not empty: %#v", meta.Remote))
   142  		}
   143  	} else {
   144  		if meta.Remote.CustomObjectName == "" {
   145  			if meta.Remote.CreatorID == 0 {
   146  				panic(errors.AssertionFailedf("CreatorID not set"))
   147  			}
   148  			if meta.Remote.CreatorFileNum == base.FileNum(0).DiskFileNum() {
   149  				panic(errors.AssertionFailedf("CreatorFileNum not set"))
   150  			}
   151  		}
   152  		if meta.Remote.CleanupMethod != SharedNoCleanup && meta.Remote.CleanupMethod != SharedRefTracking {
   153  			panic(errors.AssertionFailedf("invalid CleanupMethod %d", meta.Remote.CleanupMethod))
   154  		}
   155  		if meta.Remote.Storage == nil {
   156  			panic(errors.AssertionFailedf("Storage not set"))
   157  		}
   158  	}
   159  }
   160  
   161  // CreatorID identifies the DB instance that originally created a shared object.
   162  // This ID is incorporated in backing object names.
   163  // Must be non-zero.
   164  type CreatorID uint64
   165  
   166  // IsSet returns true if the CreatorID is not zero.
   167  func (c CreatorID) IsSet() bool { return c != 0 }
   168  
   169  func (c CreatorID) String() string { return fmt.Sprintf("%d", c) }
   170  
   171  // SafeFormat implements redact.SafeFormatter.
   172  func (c CreatorID) SafeFormat(w redact.SafePrinter, _ rune) {
   173  	w.Printf("%d", redact.SafeUint(c))
   174  }
   175  
   176  // SharedCleanupMethod indicates the method for cleaning up unused shared objects.
   177  type SharedCleanupMethod uint8
   178  
   179  const (
   180  	// SharedRefTracking is used for shared objects for which objstorage providers
   181  	// keep track of references via reference marker objects.
   182  	SharedRefTracking SharedCleanupMethod = iota
   183  
   184  	// SharedNoCleanup is used for remote objects that are managed externally; the
   185  	// objstorage provider never deletes such objects.
   186  	SharedNoCleanup
   187  )
   188  
// OpenOptions contains optional arguments for Provider.OpenForReading.
type OpenOptions struct {
	// MustExist triggers a fatal error if the file does not exist. The fatal
	// error message contains extra information helpful for debugging.
	MustExist bool
}
   195  
// CreateOptions contains optional arguments for the Provider methods that
// create objects (Create and LinkOrCopyFromLocal).
type CreateOptions struct {
	// PreferSharedStorage causes the object to be created on shared storage if
	// the provider has shared storage configured.
	PreferSharedStorage bool

	// SharedCleanupMethod is used for the object when it is created on shared storage.
	// The default (zero) value is SharedRefTracking.
	SharedCleanupMethod SharedCleanupMethod
}
   206  
// Provider is a singleton object used to access and manage objects.
//
// An object is conceptually like a large immutable file. The main use of
// objects is for storing sstables; in the future it could also be used for blob
// storage.
//
// The Provider can only manage objects that it knows about - either objects
// created by the provider, or existing objects the Provider was informed about
// via AttachRemoteObjects.
//
// Objects are currently backed by a vfs.File or a remote.Storage object.
type Provider interface {
	// OpenForReading opens an existing object.
	OpenForReading(
		ctx context.Context, fileType base.FileType, FileNum base.DiskFileNum, opts OpenOptions,
	) (Readable, error)

	// Create creates a new object and opens it for writing. It returns the
	// Writable together with the metadata of the new object.
	//
	// The object is not guaranteed to be durable (accessible in case of crashes)
	// until Sync is called.
	Create(
		ctx context.Context, fileType base.FileType, FileNum base.DiskFileNum, opts CreateOptions,
	) (w Writable, meta ObjectMetadata, err error)

	// Remove removes an object.
	//
	// The object is not guaranteed to be durably removed until Sync is called.
	Remove(fileType base.FileType, FileNum base.DiskFileNum) error

	// Sync flushes the metadata from creation or removal of objects since the last Sync.
	// This includes objects that have been Created but for which
	// Writable.Finish() has not yet been called.
	Sync() error

	// LinkOrCopyFromLocal creates a new object that is either a copy of a given
	// local file or a hard link (if the new object is created on the same FS, and
	// if the FS supports it). It returns the metadata of the new object.
	//
	// The object is not guaranteed to be durable (accessible in case of crashes)
	// until Sync is called.
	LinkOrCopyFromLocal(
		ctx context.Context,
		srcFS vfs.FS,
		srcFilePath string,
		dstFileType base.FileType,
		dstFileNum base.DiskFileNum,
		opts CreateOptions,
	) (ObjectMetadata, error)

	// Lookup returns the metadata of an object that is already known to the Provider.
	// Does not perform any I/O.
	Lookup(fileType base.FileType, FileNum base.DiskFileNum) (ObjectMetadata, error)

	// Path returns an internal, implementation-dependent path for the object. It is
	// meant to be used for informational purposes (like logging).
	Path(meta ObjectMetadata) string

	// Size returns the size of the object.
	Size(meta ObjectMetadata) (int64, error)

	// List returns the objects currently known to the provider. Does not perform any I/O.
	List() []ObjectMetadata

	// SetCreatorID sets the CreatorID which is needed in order to use shared
	// objects. Remote object usage is disabled until this method is called the
	// first time. Once set, the Creator ID is persisted and cannot change.
	//
	// Cannot be called if shared storage is not configured for the provider.
	SetCreatorID(creatorID CreatorID) error

	// IsSharedForeign returns whether this object is owned by a different node.
	IsSharedForeign(meta ObjectMetadata) bool

	// RemoteObjectBacking encodes the remote object metadata for the given object.
	// The result is wrapped in a RemoteObjectBackingHandle, which the caller
	// must Close once it is done with the backing.
	RemoteObjectBacking(meta *ObjectMetadata) (RemoteObjectBackingHandle, error)

	// CreateExternalObjectBacking creates a backing for an existing object with a
	// custom object name. The object is considered to be managed outside of
	// Pebble and will never be removed by Pebble.
	CreateExternalObjectBacking(locator remote.Locator, objName string) (RemoteObjectBacking, error)

	// AttachRemoteObjects registers existing remote objects with this provider.
	AttachRemoteObjects(objs []RemoteObjectToAttach) ([]ObjectMetadata, error)

	// Close closes the Provider. Backings obtained through a
	// RemoteObjectBackingHandle are no longer guaranteed to be valid afterwards.
	Close() error

	// IsNotExistError indicates whether the error is known to report that a file or
	// directory does not exist.
	IsNotExistError(err error) bool

	// Metrics returns metrics about objstorage. Currently, it only returns metrics
	// about the shared cache.
	Metrics() sharedcache.Metrics
}
   302  
// RemoteObjectBacking encodes the metadata necessary to incorporate a shared
// object into a different Pebble instance. The encoding is specific to a given
// Provider implementation.
//
// Backings are produced by Provider.RemoteObjectBacking (via a
// RemoteObjectBackingHandle) or Provider.CreateExternalObjectBacking, and are
// consumed through RemoteObjectToAttach.
type RemoteObjectBacking []byte
   307  
// RemoteObjectBackingHandle is a container for a RemoteObjectBacking which
// ensures that the backing stays valid. A backing can otherwise become invalid
// if this provider unrefs the shared object. The RemoteObjectBackingHandle
// delays any unref until Close.
type RemoteObjectBackingHandle interface {
	// Get returns the backing. The backing is only guaranteed to be valid until
	// Close is called (or until the Provider is closed). If Close was already
	// called, returns an error.
	Get() (RemoteObjectBacking, error)

	// Close releases the handle. The backing is no longer guaranteed to be
	// valid afterwards, since unrefs of the shared object are only delayed
	// until Close.
	Close()
}
   319  
// RemoteObjectToAttach contains the arguments needed to attach an existing
// remote object (see Provider.AttachRemoteObjects).
type RemoteObjectToAttach struct {
	// FileNum is the file number that will be used to refer to this object (in
	// the context of this instance).
	FileNum base.DiskFileNum
	// FileType is the file type under which the object is attached.
	FileType base.FileType
	// Backing contains the metadata for the remote object backing (normally
	// generated from a different instance, but using the same Provider
	// implementation).
	Backing RemoteObjectBacking
}
   330  }