github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/objstorage/objstorage.go (about) 1 // Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package objstorage 6 7 import ( 8 "context" 9 "fmt" 10 11 "github.com/cockroachdb/errors" 12 "github.com/cockroachdb/pebble/internal/base" 13 "github.com/cockroachdb/pebble/objstorage/objstorageprovider/sharedcache" 14 "github.com/cockroachdb/pebble/objstorage/remote" 15 "github.com/cockroachdb/pebble/vfs" 16 "github.com/cockroachdb/redact" 17 ) 18 19 // Readable is the handle for an object that is open for reading. 20 type Readable interface { 21 // ReadAt reads len(p) bytes into p starting at offset off. 22 // 23 // Does not return partial results; if off + len(p) is past the end of the 24 // object, an error is returned. 25 // 26 // Clients of ReadAt can execute parallel ReadAt calls on the 27 // same Readable. 28 ReadAt(ctx context.Context, p []byte, off int64) error 29 30 Close() error 31 32 // Size returns the size of the object. 33 Size() int64 34 35 // NewReadHandle creates a read handle for ReadAt requests that are related 36 // and can benefit from optimizations like read-ahead. 37 // 38 // The ReadHandle must be closed before the Readable is closed. 39 // 40 // Multiple separate ReadHandles can be used. 41 NewReadHandle(ctx context.Context) ReadHandle 42 } 43 44 // ReadHandle is used to perform reads that are related and might benefit from 45 // optimizations like read-ahead. 46 type ReadHandle interface { 47 // ReadAt reads len(p) bytes into p starting at offset off. 48 // 49 // Does not return partial results; if off + len(p) is past the end of the 50 // object, an error is returned. 51 // 52 // Parallel ReadAt calls on the same ReadHandle are not allowed. 53 ReadAt(ctx context.Context, p []byte, off int64) error 54 55 Close() error 56 57 // SetupForCompaction informs the implementation that the read handle will 58 // be used to read data blocks for a compaction. The implementation can expect 59 // sequential reads, and can decide to not retain data in any caches. 60 SetupForCompaction() 61 62 // RecordCacheHit informs the implementation that we were able to retrieve a 63 // block from cache. This is useful for example when the implementation is 64 // trying to detect a sequential reading pattern. 65 RecordCacheHit(ctx context.Context, offset, size int64) 66 } 67 68 // Writable is the handle for an object that is open for writing. 69 // Either Finish or Abort must be called. 70 type Writable interface { 71 // Write writes len(p) bytes from p to the underlying object. The data is not 72 // guaranteed to be durable until Finish is called. 73 // 74 // Note that Write *is* allowed to modify the slice passed in, whether 75 // temporarily or permanently. Callers of Write need to take this into 76 // account. 77 Write(p []byte) error 78 79 // Finish completes the object and makes the data durable. 80 // No further calls are allowed after calling Finish. 81 Finish() error 82 83 // Abort gives up on finishing the object. There is no guarantee about whether 84 // the object exists after calling Abort. 85 // No further calls are allowed after calling Abort. 86 Abort() 87 } 88 89 // ObjectMetadata contains the metadata required to be able to access an object. 90 type ObjectMetadata struct { 91 DiskFileNum base.DiskFileNum 92 FileType base.FileType 93 94 // The fields below are only set if the object is on remote storage. 95 Remote struct { 96 // CreatorID identifies the DB instance that originally created the object. 97 // 98 // Only used when CustomObjectName is not set. 99 CreatorID CreatorID 100 // CreatorFileNum is the identifier for the object within the context of the 101 // DB instance that originally created the object. 102 // 103 // Only used when CustomObjectName is not set. 104 CreatorFileNum base.DiskFileNum 105 // CustomObjectName (if it is set) overrides the object name that is normally 106 // derived from the CreatorID and CreatorFileNum. 107 CustomObjectName string 108 // CleanupMethod indicates the method for cleaning up unused shared objects. 109 CleanupMethod SharedCleanupMethod 110 // Locator identifies the remote.Storage implementation for this object. 111 Locator remote.Locator 112 // Storage is the remote.Storage object corresponding to the Locator. Used 113 // to avoid lookups in hot paths. 114 Storage remote.Storage 115 } 116 } 117 118 // IsRemote returns true if the object is on remote storage. 119 func (meta *ObjectMetadata) IsRemote() bool { 120 return meta.IsShared() || meta.IsExternal() 121 } 122 123 // IsExternal returns true if the object is on remote storage but is not owned 124 // by any Pebble instances in the cluster. 125 func (meta *ObjectMetadata) IsExternal() bool { 126 return meta.Remote.CustomObjectName != "" 127 } 128 129 // IsShared returns true if the object is on remote storage and is owned by a 130 // Pebble instance in the cluster (potentially shared between multiple 131 // instances). 132 func (meta *ObjectMetadata) IsShared() bool { 133 return meta.Remote.CreatorID.IsSet() 134 } 135 136 // AssertValid checks that the metadata is sane. 137 func (meta *ObjectMetadata) AssertValid() { 138 if !meta.IsRemote() { 139 // Verify all Remote fields are empty. 140 if meta.Remote != (ObjectMetadata{}).Remote { 141 panic(errors.AssertionFailedf("meta.Remote not empty: %#v", meta.Remote)) 142 } 143 } else { 144 if meta.Remote.CustomObjectName == "" { 145 if meta.Remote.CreatorID == 0 { 146 panic(errors.AssertionFailedf("CreatorID not set")) 147 } 148 if meta.Remote.CreatorFileNum == base.FileNum(0).DiskFileNum() { 149 panic(errors.AssertionFailedf("CreatorFileNum not set")) 150 } 151 } 152 if meta.Remote.CleanupMethod != SharedNoCleanup && meta.Remote.CleanupMethod != SharedRefTracking { 153 panic(errors.AssertionFailedf("invalid CleanupMethod %d", meta.Remote.CleanupMethod)) 154 } 155 if meta.Remote.Storage == nil { 156 panic(errors.AssertionFailedf("Storage not set")) 157 } 158 } 159 } 160 161 // CreatorID identifies the DB instance that originally created a shared object. 162 // This ID is incorporated in backing object names. 163 // Must be non-zero. 164 type CreatorID uint64 165 166 // IsSet returns true if the CreatorID is not zero. 167 func (c CreatorID) IsSet() bool { return c != 0 } 168 169 func (c CreatorID) String() string { return fmt.Sprintf("%d", c) } 170 171 // SafeFormat implements redact.SafeFormatter. 172 func (c CreatorID) SafeFormat(w redact.SafePrinter, _ rune) { 173 w.Printf("%d", redact.SafeUint(c)) 174 } 175 176 // SharedCleanupMethod indicates the method for cleaning up unused shared objects. 177 type SharedCleanupMethod uint8 178 179 const ( 180 // SharedRefTracking is used for shared objects for which objstorage providers 181 // keep track of references via reference marker objects. 182 SharedRefTracking SharedCleanupMethod = iota 183 184 // SharedNoCleanup is used for remote objects that are managed externally; the 185 // objstorage provider never deletes such objects. 186 SharedNoCleanup 187 ) 188 189 // OpenOptions contains optional arguments for OpenForReading. 190 type OpenOptions struct { 191 // MustExist triggers a fatal error if the file does not exist. The fatal 192 // error message contains extra information helpful for debugging. 193 MustExist bool 194 } 195 196 // CreateOptions contains optional arguments for Create. 197 type CreateOptions struct { 198 // PreferSharedStorage causes the object to be created on shared storage if 199 // the provider has shared storage configured. 200 PreferSharedStorage bool 201 202 // SharedCleanupMethod is used for the object when it is created on shared storage. 203 // The default (zero) value is SharedRefTracking. 204 SharedCleanupMethod SharedCleanupMethod 205 } 206 207 // Provider is a singleton object used to access and manage objects. 208 // 209 // An object is conceptually like a large immutable file. The main use of 210 // objects is for storing sstables; in the future it could also be used for blob 211 // storage. 212 // 213 // The Provider can only manage objects that it knows about - either objects 214 // created by the provider, or existing objects the Provider was informed about 215 // via AddObjects. 216 // 217 // Objects are currently backed by a vfs.File or a remote.Storage object. 218 type Provider interface { 219 // OpenForReading opens an existing object. 220 OpenForReading( 221 ctx context.Context, fileType base.FileType, FileNum base.DiskFileNum, opts OpenOptions, 222 ) (Readable, error) 223 224 // Create creates a new object and opens it for writing. 225 // 226 // The object is not guaranteed to be durable (accessible in case of crashes) 227 // until Sync is called. 228 Create( 229 ctx context.Context, fileType base.FileType, FileNum base.DiskFileNum, opts CreateOptions, 230 ) (w Writable, meta ObjectMetadata, err error) 231 232 // Remove removes an object. 233 // 234 // The object is not guaranteed to be durably removed until Sync is called. 235 Remove(fileType base.FileType, FileNum base.DiskFileNum) error 236 237 // Sync flushes the metadata from creation or removal of objects since the last Sync. 238 // This includes objects that have been Created but for which 239 // Writable.Finish() has not yet been called. 240 Sync() error 241 242 // LinkOrCopyFromLocal creates a new object that is either a copy of a given 243 // local file or a hard link (if the new object is created on the same FS, and 244 // if the FS supports it). 245 // 246 // The object is not guaranteed to be durable (accessible in case of crashes) 247 // until Sync is called. 248 LinkOrCopyFromLocal( 249 ctx context.Context, 250 srcFS vfs.FS, 251 srcFilePath string, 252 dstFileType base.FileType, 253 dstFileNum base.DiskFileNum, 254 opts CreateOptions, 255 ) (ObjectMetadata, error) 256 257 // Lookup returns the metadata of an object that is already known to the Provider. 258 // Does not perform any I/O. 259 Lookup(fileType base.FileType, FileNum base.DiskFileNum) (ObjectMetadata, error) 260 261 // Path returns an internal, implementation-dependent path for the object. It is 262 // meant to be used for informational purposes (like logging). 263 Path(meta ObjectMetadata) string 264 265 // Size returns the size of the object. 266 Size(meta ObjectMetadata) (int64, error) 267 268 // List returns the objects currently known to the provider. Does not perform any I/O. 269 List() []ObjectMetadata 270 271 // SetCreatorID sets the CreatorID which is needed in order to use shared 272 // objects. Remote object usage is disabled until this method is called the 273 // first time. Once set, the Creator ID is persisted and cannot change. 274 // 275 // Cannot be called if shared storage is not configured for the provider. 276 SetCreatorID(creatorID CreatorID) error 277 278 // IsSharedForeign returns whether this object is owned by a different node. 279 IsSharedForeign(meta ObjectMetadata) bool 280 281 // RemoteObjectBacking encodes the remote object metadata for the given object. 282 RemoteObjectBacking(meta *ObjectMetadata) (RemoteObjectBackingHandle, error) 283 284 // CreateExternalObjectBacking creates a backing for an existing object with a 285 // custom object name. The object is considered to be managed outside of 286 // Pebble and will never be removed by Pebble. 287 CreateExternalObjectBacking(locator remote.Locator, objName string) (RemoteObjectBacking, error) 288 289 // AttachRemoteObjects registers existing remote objects with this provider. 290 AttachRemoteObjects(objs []RemoteObjectToAttach) ([]ObjectMetadata, error) 291 292 Close() error 293 294 // IsNotExistError indicates whether the error is known to report that a file or 295 // directory does not exist. 296 IsNotExistError(err error) bool 297 298 // Metrics returns metrics about objstorage. Currently, it only returns metrics 299 // about the shared cache. 300 Metrics() sharedcache.Metrics 301 } 302 303 // RemoteObjectBacking encodes the metadata necessary to incorporate a shared 304 // object into a different Pebble instance. The encoding is specific to a given 305 // Provider implementation. 306 type RemoteObjectBacking []byte 307 308 // RemoteObjectBackingHandle is a container for a RemoteObjectBacking which 309 // ensures that the backing stays valid. A backing can otherwise become invalid 310 // if this provider unrefs the shared object. The RemoteObjectBackingHandle 311 // delays any unref until Close. 312 type RemoteObjectBackingHandle interface { 313 // Get returns the backing. The backing is only guaranteed to be valid until 314 // Close is called (or until the Provider is closed). If Close was already 315 // called, returns an error. 316 Get() (RemoteObjectBacking, error) 317 Close() 318 } 319 320 // RemoteObjectToAttach contains the arguments needed to attach an existing remote object. 321 type RemoteObjectToAttach struct { 322 // FileNum is the file number that will be used to refer to this object (in 323 // the context of this instance). 324 FileNum base.DiskFileNum 325 FileType base.FileType 326 // Backing contains the metadata for the remote object backing (normally 327 // generated from a different instance, but using the same Provider 328 // implementation). 329 Backing RemoteObjectBacking 330 }