github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/objstorage/objstorage.go (about) 1 // Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package objstorage 6 7 import ( 8 "context" 9 "fmt" 10 11 "github.com/cockroachdb/errors" 12 "github.com/cockroachdb/pebble/internal/base" 13 "github.com/cockroachdb/pebble/objstorage/objstorageprovider/sharedcache" 14 "github.com/cockroachdb/pebble/objstorage/remote" 15 "github.com/cockroachdb/pebble/vfs" 16 ) 17 18 // Readable is the handle for an object that is open for reading. 19 type Readable interface { 20 // ReadAt reads len(p) bytes into p starting at offset off. 21 // 22 // Does not return partial results; if off + len(p) is past the end of the 23 // object, an error is returned. 24 // 25 // Clients of ReadAt can execute parallel ReadAt calls on the 26 // same Readable. 27 ReadAt(ctx context.Context, p []byte, off int64) error 28 29 Close() error 30 31 // Size returns the size of the object. 32 Size() int64 33 34 // NewReadHandle creates a read handle for ReadAt requests that are related 35 // and can benefit from optimizations like read-ahead. 36 // 37 // The ReadHandle must be closed before the Readable is closed. 38 // 39 // Multiple separate ReadHandles can be used. 40 NewReadHandle(ctx context.Context) ReadHandle 41 } 42 43 // ReadHandle is used to perform reads that are related and might benefit from 44 // optimizations like read-ahead. 45 type ReadHandle interface { 46 // ReadAt reads len(p) bytes into p starting at offset off. 47 // 48 // Does not return partial results; if off + len(p) is past the end of the 49 // object, an error is returned. 50 // 51 // Parallel ReadAt calls on the same ReadHandle are not allowed. 52 ReadAt(ctx context.Context, p []byte, off int64) error 53 54 Close() error 55 56 // SetupForCompaction informs the implementation that the read handle will 57 // be used to read data blocks for a compaction. The implementation can expect 58 // sequential reads, and can decide to not retain data in any caches. 59 SetupForCompaction() 60 61 // RecordCacheHit informs the implementation that we were able to retrieve a 62 // block from cache. This is useful for example when the implementation is 63 // trying to detect a sequential reading pattern. 64 RecordCacheHit(ctx context.Context, offset, size int64) 65 } 66 67 // Writable is the handle for an object that is open for writing. 68 // Either Finish or Abort must be called. 69 type Writable interface { 70 // Write writes len(p) bytes from p to the underlying object. The data is not 71 // guaranteed to be durable until Finish is called. 72 // 73 // Note that Write *is* allowed to modify the slice passed in, whether 74 // temporarily or permanently. Callers of Write need to take this into 75 // account. 76 Write(p []byte) error 77 78 // Finish completes the object and makes the data durable. 79 // No further calls are allowed after calling Finish. 80 Finish() error 81 82 // Abort gives up on finishing the object. There is no guarantee about whether 83 // the object exists after calling Abort. 84 // No further calls are allowed after calling Abort. 85 Abort() 86 } 87 88 // ObjectMetadata contains the metadata required to be able to access an object. 89 type ObjectMetadata struct { 90 DiskFileNum base.DiskFileNum 91 FileType base.FileType 92 93 // The fields below are only set if the object is on remote storage. 94 Remote struct { 95 // CreatorID identifies the DB instance that originally created the object. 96 // 97 // Only used when CustomObjectName is not set. 98 CreatorID CreatorID 99 // CreatorFileNum is the identifier for the object within the context of the 100 // DB instance that originally created the object. 101 // 102 // Only used when CustomObjectName is not set. 103 CreatorFileNum base.DiskFileNum 104 // CustomObjectName (if it is set) overrides the object name that is normally 105 // derived from the CreatorID and CreatorFileNum. 106 CustomObjectName string 107 // CleanupMethod indicates the method for cleaning up unused shared objects. 108 CleanupMethod SharedCleanupMethod 109 // Locator identifies the remote.Storage implementation for this object. 110 Locator remote.Locator 111 // Storage is the remote.Storage object corresponding to the Locator. Used 112 // to avoid lookups in hot paths. 113 Storage remote.Storage 114 } 115 } 116 117 // IsRemote returns true if the object is on remote storage. 118 func (meta *ObjectMetadata) IsRemote() bool { 119 return meta.IsShared() || meta.IsExternal() 120 } 121 122 // IsExternal returns true if the object is on remote storage but is not owned 123 // by any Pebble instances in the cluster. 124 func (meta *ObjectMetadata) IsExternal() bool { 125 return meta.Remote.CustomObjectName != "" 126 } 127 128 // IsShared returns true if the object is on remote storage and is owned by a 129 // Pebble instance in the cluster (potentially shared between multiple 130 // instances). 131 func (meta *ObjectMetadata) IsShared() bool { 132 return meta.Remote.CreatorID.IsSet() 133 } 134 135 // AssertValid checks that the metadata is sane. 136 func (meta *ObjectMetadata) AssertValid() { 137 if !meta.IsRemote() { 138 // Verify all Remote fields are empty. 139 if meta.Remote != (ObjectMetadata{}).Remote { 140 panic(errors.AssertionFailedf("meta.Remote not empty: %#v", meta.Remote)) 141 } 142 } else { 143 if meta.Remote.CustomObjectName != "" { 144 if meta.Remote.CreatorID == 0 { 145 panic(errors.AssertionFailedf("CreatorID not set")) 146 } 147 if meta.Remote.CreatorFileNum == base.FileNum(0).DiskFileNum() { 148 panic(errors.AssertionFailedf("CreatorFileNum not set")) 149 } 150 } 151 if meta.Remote.CleanupMethod != SharedNoCleanup && meta.Remote.CleanupMethod != SharedRefTracking { 152 panic(errors.AssertionFailedf("invalid CleanupMethod %d", meta.Remote.CleanupMethod)) 153 } 154 if meta.Remote.Storage == nil { 155 panic(errors.AssertionFailedf("Storage not set")) 156 } 157 } 158 } 159 160 // CreatorID identifies the DB instance that originally created a shared object. 161 // This ID is incorporated in backing object names. 162 // Must be non-zero. 163 type CreatorID uint64 164 165 // IsSet returns true if the CreatorID is not zero. 166 func (c CreatorID) IsSet() bool { return c != 0 } 167 168 func (c CreatorID) String() string { return fmt.Sprintf("%d", c) } 169 170 // SharedCleanupMethod indicates the method for cleaning up unused shared objects. 171 type SharedCleanupMethod uint8 172 173 const ( 174 // SharedRefTracking is used for shared objects for which objstorage providers 175 // keep track of references via reference marker objects. 176 SharedRefTracking SharedCleanupMethod = iota 177 178 // SharedNoCleanup is used for remote objects that are managed externally; the 179 // objstorage provider never deletes such objects. 180 SharedNoCleanup 181 ) 182 183 // OpenOptions contains optional arguments for OpenForReading. 184 type OpenOptions struct { 185 // MustExist triggers a fatal error if the file does not exist. The fatal 186 // error message contains extra information helpful for debugging. 187 MustExist bool 188 } 189 190 // CreateOptions contains optional arguments for Create. 191 type CreateOptions struct { 192 // PreferSharedStorage causes the object to be created on shared storage if 193 // the provider has shared storage configured. 194 PreferSharedStorage bool 195 196 // SharedCleanupMethod is used for the object when it is created on shared storage. 197 // The default (zero) value is SharedRefTracking. 198 SharedCleanupMethod SharedCleanupMethod 199 } 200 201 // Provider is a singleton object used to access and manage objects. 202 // 203 // An object is conceptually like a large immutable file. The main use of 204 // objects is for storing sstables; in the future it could also be used for blob 205 // storage. 206 // 207 // The Provider can only manage objects that it knows about - either objects 208 // created by the provider, or existing objects the Provider was informed about 209 // via AddObjects. 210 // 211 // Objects are currently backed by a vfs.File or a remote.Storage object. 212 type Provider interface { 213 // OpenForReading opens an existing object. 214 OpenForReading( 215 ctx context.Context, fileType base.FileType, FileNum base.DiskFileNum, opts OpenOptions, 216 ) (Readable, error) 217 218 // Create creates a new object and opens it for writing. 219 // 220 // The object is not guaranteed to be durable (accessible in case of crashes) 221 // until Sync is called. 222 Create( 223 ctx context.Context, fileType base.FileType, FileNum base.DiskFileNum, opts CreateOptions, 224 ) (w Writable, meta ObjectMetadata, err error) 225 226 // Remove removes an object. 227 // 228 // The object is not guaranteed to be durably removed until Sync is called. 229 Remove(fileType base.FileType, FileNum base.DiskFileNum) error 230 231 // Sync flushes the metadata from creation or removal of objects since the last Sync. 232 // This includes objects that have been Created but for which 233 // Writable.Finish() has not yet been called. 234 Sync() error 235 236 // LinkOrCopyFromLocal creates a new object that is either a copy of a given 237 // local file or a hard link (if the new object is created on the same FS, and 238 // if the FS supports it). 239 // 240 // The object is not guaranteed to be durable (accessible in case of crashes) 241 // until Sync is called. 242 LinkOrCopyFromLocal( 243 ctx context.Context, 244 srcFS vfs.FS, 245 srcFilePath string, 246 dstFileType base.FileType, 247 dstFileNum base.DiskFileNum, 248 opts CreateOptions, 249 ) (ObjectMetadata, error) 250 251 // Lookup returns the metadata of an object that is already known to the Provider. 252 // Does not perform any I/O. 253 Lookup(fileType base.FileType, FileNum base.DiskFileNum) (ObjectMetadata, error) 254 255 // Path returns an internal, implementation-dependent path for the object. It is 256 // meant to be used for informational purposes (like logging). 257 Path(meta ObjectMetadata) string 258 259 // Size returns the size of the object. 260 Size(meta ObjectMetadata) (int64, error) 261 262 // List returns the objects currently known to the provider. Does not perform any I/O. 263 List() []ObjectMetadata 264 265 // SetCreatorID sets the CreatorID which is needed in order to use shared 266 // objects. Remote object usage is disabled until this method is called the 267 // first time. Once set, the Creator ID is persisted and cannot change. 268 // 269 // Cannot be called if shared storage is not configured for the provider. 270 SetCreatorID(creatorID CreatorID) error 271 272 // IsSharedForeign returns whether this object is owned by a different node. 273 IsSharedForeign(meta ObjectMetadata) bool 274 275 // RemoteObjectBacking encodes the remote object metadata for the given object. 276 RemoteObjectBacking(meta *ObjectMetadata) (RemoteObjectBackingHandle, error) 277 278 // CreateExternalObjectBacking creates a backing for an existing object with a 279 // custom object name. The object is considered to be managed outside of 280 // Pebble and will never be removed by Pebble. 281 CreateExternalObjectBacking(locator remote.Locator, objName string) (RemoteObjectBacking, error) 282 283 // AttachRemoteObjects registers existing remote objects with this provider. 284 AttachRemoteObjects(objs []RemoteObjectToAttach) ([]ObjectMetadata, error) 285 286 Close() error 287 288 // IsNotExistError indicates whether the error is known to report that a file or 289 // directory does not exist. 290 IsNotExistError(err error) bool 291 292 // Metrics returns metrics about objstorage. Currently, it only returns metrics 293 // about the shared cache. 294 Metrics() sharedcache.Metrics 295 } 296 297 // RemoteObjectBacking encodes the metadata necessary to incorporate a shared 298 // object into a different Pebble instance. The encoding is specific to a given 299 // Provider implementation. 300 type RemoteObjectBacking []byte 301 302 // RemoteObjectBackingHandle is a container for a RemoteObjectBacking which 303 // ensures that the backing stays valid. A backing can otherwise become invalid 304 // if this provider unrefs the shared object. The RemoteObjectBackingHandle 305 // delays any unref until Close. 306 type RemoteObjectBackingHandle interface { 307 // Get returns the backing. The backing is only guaranteed to be valid until 308 // Close is called (or until the Provider is closed). If Close was already 309 // called, returns an error. 310 Get() (RemoteObjectBacking, error) 311 Close() 312 } 313 314 // RemoteObjectToAttach contains the arguments needed to attach an existing remote object. 315 type RemoteObjectToAttach struct { 316 // FileNum is the file number that will be used to refer to this object (in 317 // the context of this instance). 318 FileNum base.DiskFileNum 319 FileType base.FileType 320 // Backing contains the metadata for the remote object backing (normally 321 // generated from a different instance, but using the same Provider 322 // implementation). 323 Backing RemoteObjectBacking 324 }