github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/objstorage/objstorageprovider/provider.go (about) 1 // Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package objstorageprovider 6 7 import ( 8 "cmp" 9 "context" 10 "io" 11 "os" 12 "slices" 13 "sync" 14 15 "github.com/cockroachdb/errors" 16 "github.com/cockroachdb/errors/oserror" 17 "github.com/cockroachdb/pebble/internal/base" 18 "github.com/cockroachdb/pebble/internal/invariants" 19 "github.com/cockroachdb/pebble/objstorage" 20 "github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing" 21 "github.com/cockroachdb/pebble/objstorage/objstorageprovider/remoteobjcat" 22 "github.com/cockroachdb/pebble/objstorage/objstorageprovider/sharedcache" 23 "github.com/cockroachdb/pebble/objstorage/remote" 24 "github.com/cockroachdb/pebble/vfs" 25 ) 26 27 // provider is the implementation of objstorage.Provider. 28 type provider struct { 29 st Settings 30 31 fsDir vfs.File 32 33 tracer *objiotracing.Tracer 34 35 remote remoteSubsystem 36 37 mu struct { 38 sync.RWMutex 39 40 remote struct { 41 // catalogBatch accumulates remote object creations and deletions until 42 // Sync is called. 43 catalogBatch remoteobjcat.Batch 44 45 storageObjects map[remote.Locator]remote.Storage 46 } 47 48 // localObjectsChanged is set if non-remote objects were created or deleted 49 // but Sync was not yet called. 50 localObjectsChanged bool 51 52 // knownObjects maintains information about objects that are known to the provider. 53 // It is initialized with the list of files in the manifest when we open a DB. 54 knownObjects map[base.DiskFileNum]objstorage.ObjectMetadata 55 56 // protectedObjects are objects that cannot be unreferenced because they 57 // have outstanding SharedObjectBackingHandles. The value is a count of outstanding handles 58 protectedObjects map[base.DiskFileNum]int 59 } 60 } 61 62 var _ objstorage.Provider = (*provider)(nil) 63 64 // Settings that must be specified when creating the provider. 65 type Settings struct { 66 Logger base.Logger 67 68 // Local filesystem configuration. 69 FS vfs.FS 70 FSDirName string 71 72 // FSDirInitialListing is a listing of FSDirName at the time of calling Open. 73 // 74 // This is an optional optimization to avoid double listing on Open when the 75 // higher layer already has a listing. When nil, we obtain the listing on 76 // Open. 77 FSDirInitialListing []string 78 79 // Cleaner cleans obsolete files from the local filesystem. 80 // 81 // The default cleaner uses the DeleteCleaner. 82 FSCleaner base.Cleaner 83 84 // NoSyncOnClose decides whether the implementation will enforce a 85 // close-time synchronization (e.g., fdatasync() or sync_file_range()) 86 // on files it writes to. Setting this to true removes the guarantee for a 87 // sync on close. Some implementations can still issue a non-blocking sync. 88 NoSyncOnClose bool 89 90 // BytesPerSync enables periodic syncing of files in order to smooth out 91 // writes to disk. This option does not provide any persistence guarantee, but 92 // is used to avoid latency spikes if the OS automatically decides to write 93 // out a large chunk of dirty filesystem buffers. 94 BytesPerSync int 95 96 // Fields here are set only if the provider is to support remote objects 97 // (experimental). 98 Remote struct { 99 StorageFactory remote.StorageFactory 100 101 // If CreateOnShared is non-zero, sstables are created on remote storage using 102 // the CreateOnSharedLocator (when the PreferSharedStorage create option is 103 // true). 104 CreateOnShared remote.CreateOnSharedStrategy 105 CreateOnSharedLocator remote.Locator 106 107 // CacheSizeBytes is the size of the on-disk block cache for objects 108 // on remote storage. If it is 0, no cache is used. 109 CacheSizeBytes int64 110 111 // CacheBlockSize is the block size of the cache; if 0, the default of 32KB is used. 112 CacheBlockSize int 113 114 // ShardingBlockSize is the size of a shard block. The cache is split into contiguous 115 // ShardingBlockSize units. The units are distributed across multiple independent shards 116 // of the cache, via a hash(offset) modulo num shards operation. The cache replacement 117 // policies operate at the level of shard, not whole cache. This is done to reduce lock 118 // contention. 119 // 120 // If ShardingBlockSize is 0, the default of 1 MB is used. 121 ShardingBlockSize int64 122 123 // The number of independent shards the cache leverages. Each shard is the same size, 124 // and a hash of filenum & offset map a read to a certain shard. If set to 0, 125 // 2*runtime.GOMAXPROCS is used as the shard count. 126 CacheShardCount int 127 128 // TODO(radu): allow the cache to live on another FS/location (e.g. to use 129 // instance-local SSD). 130 } 131 } 132 133 // DefaultSettings initializes default settings (with no remote storage), 134 // suitable for tests and tools. 135 func DefaultSettings(fs vfs.FS, dirName string) Settings { 136 return Settings{ 137 Logger: base.DefaultLogger, 138 FS: fs, 139 FSDirName: dirName, 140 FSCleaner: base.DeleteCleaner{}, 141 NoSyncOnClose: false, 142 BytesPerSync: 512 * 1024, // 512KB 143 } 144 } 145 146 // Open creates the provider. 147 func Open(settings Settings) (objstorage.Provider, error) { 148 // Note: we can't just `return open(settings)` because in an error case we 149 // would return (*provider)(nil) which is not objstorage.Provider(nil). 150 p, err := open(settings) 151 if err != nil { 152 return nil, err 153 } 154 return p, nil 155 } 156 157 func open(settings Settings) (p *provider, _ error) { 158 fsDir, err := settings.FS.OpenDir(settings.FSDirName) 159 if err != nil { 160 return nil, err 161 } 162 163 defer func() { 164 if p == nil { 165 fsDir.Close() 166 } 167 }() 168 169 p = &provider{ 170 st: settings, 171 fsDir: fsDir, 172 } 173 p.mu.knownObjects = make(map[base.DiskFileNum]objstorage.ObjectMetadata) 174 p.mu.protectedObjects = make(map[base.DiskFileNum]int) 175 176 if objiotracing.Enabled { 177 p.tracer = objiotracing.Open(settings.FS, settings.FSDirName) 178 } 179 180 // Add local FS objects. 181 if err := p.vfsInit(); err != nil { 182 return nil, err 183 } 184 185 // Initialize remote subsystem (if configured) and add remote objects. 186 if err := p.remoteInit(); err != nil { 187 return nil, err 188 } 189 190 return p, nil 191 } 192 193 // Close is part of the objstorage.Provider interface. 194 func (p *provider) Close() error { 195 err := p.sharedClose() 196 if p.fsDir != nil { 197 err = firstError(err, p.fsDir.Close()) 198 p.fsDir = nil 199 } 200 if objiotracing.Enabled { 201 if p.tracer != nil { 202 p.tracer.Close() 203 p.tracer = nil 204 } 205 } 206 return err 207 } 208 209 // OpenForReading opens an existing object. 210 func (p *provider) OpenForReading( 211 ctx context.Context, 212 fileType base.FileType, 213 fileNum base.DiskFileNum, 214 opts objstorage.OpenOptions, 215 ) (objstorage.Readable, error) { 216 meta, err := p.Lookup(fileType, fileNum) 217 if err != nil { 218 if opts.MustExist { 219 p.st.Logger.Fatalf("%v", err) 220 } 221 return nil, err 222 } 223 224 var r objstorage.Readable 225 if !meta.IsRemote() { 226 r, err = p.vfsOpenForReading(ctx, fileType, fileNum, opts) 227 } else { 228 r, err = p.remoteOpenForReading(ctx, meta, opts) 229 if err != nil && p.isNotExistError(meta, err) { 230 // Wrap the error so that IsNotExistError functions properly. 231 err = errors.Mark(err, os.ErrNotExist) 232 } 233 } 234 if err != nil { 235 return nil, err 236 } 237 if objiotracing.Enabled { 238 r = p.tracer.WrapReadable(ctx, r, fileNum) 239 } 240 return r, nil 241 } 242 243 // Create creates a new object and opens it for writing. 244 // 245 // The object is not guaranteed to be durable (accessible in case of crashes) 246 // until Sync is called. 247 func (p *provider) Create( 248 ctx context.Context, 249 fileType base.FileType, 250 fileNum base.DiskFileNum, 251 opts objstorage.CreateOptions, 252 ) (w objstorage.Writable, meta objstorage.ObjectMetadata, err error) { 253 if opts.PreferSharedStorage && p.st.Remote.CreateOnShared != remote.CreateOnSharedNone { 254 w, meta, err = p.sharedCreate(ctx, fileType, fileNum, p.st.Remote.CreateOnSharedLocator, opts) 255 } else { 256 w, meta, err = p.vfsCreate(ctx, fileType, fileNum) 257 } 258 if err != nil { 259 err = errors.Wrapf(err, "creating object %s", fileNum) 260 return nil, objstorage.ObjectMetadata{}, err 261 } 262 p.addMetadata(meta) 263 if objiotracing.Enabled { 264 w = p.tracer.WrapWritable(ctx, w, fileNum) 265 } 266 return w, meta, nil 267 } 268 269 // Remove removes an object. 270 // 271 // Note that if the object is remote, the object is only (conceptually) removed 272 // from this provider. If other providers have references on the remote object, 273 // it will not be removed. 274 // 275 // The object is not guaranteed to be durably removed until Sync is called. 276 func (p *provider) Remove(fileType base.FileType, fileNum base.DiskFileNum) error { 277 meta, err := p.Lookup(fileType, fileNum) 278 if err != nil { 279 return err 280 } 281 282 if !meta.IsRemote() { 283 err = p.vfsRemove(fileType, fileNum) 284 } else { 285 // TODO(radu): implement remote object removal (i.e. deref). 286 err = p.sharedUnref(meta) 287 if err != nil && p.isNotExistError(meta, err) { 288 // Wrap the error so that IsNotExistError functions properly. 289 err = errors.Mark(err, os.ErrNotExist) 290 } 291 } 292 if err != nil && !p.IsNotExistError(err) { 293 // We want to be able to retry a Remove, so we keep the object in our list. 294 // TODO(radu): we should mark the object as "zombie" and not allow any other 295 // operations. 296 return errors.Wrapf(err, "removing object %s", fileNum) 297 } 298 299 p.removeMetadata(fileNum) 300 return err 301 } 302 303 func (p *provider) isNotExistError(meta objstorage.ObjectMetadata, err error) bool { 304 if meta.Remote.Storage != nil { 305 return meta.Remote.Storage.IsNotExistError(err) 306 } 307 return oserror.IsNotExist(err) 308 } 309 310 // IsNotExistError is part of the objstorage.Provider interface. 311 func (p *provider) IsNotExistError(err error) bool { 312 // We use errors.Mark(err, os.ErrNotExist) for not-exist errors coming from 313 // remote.Storage. 314 return oserror.IsNotExist(err) 315 } 316 317 // Sync flushes the metadata from creation or removal of objects since the last Sync. 318 func (p *provider) Sync() error { 319 if err := p.vfsSync(); err != nil { 320 return err 321 } 322 if err := p.sharedSync(); err != nil { 323 return err 324 } 325 return nil 326 } 327 328 // LinkOrCopyFromLocal creates a new object that is either a copy of a given 329 // local file or a hard link (if the new object is created on the same FS, and 330 // if the FS supports it). 331 // 332 // The object is not guaranteed to be durable (accessible in case of crashes) 333 // until Sync is called. 334 func (p *provider) LinkOrCopyFromLocal( 335 ctx context.Context, 336 srcFS vfs.FS, 337 srcFilePath string, 338 dstFileType base.FileType, 339 dstFileNum base.DiskFileNum, 340 opts objstorage.CreateOptions, 341 ) (objstorage.ObjectMetadata, error) { 342 shared := opts.PreferSharedStorage && p.st.Remote.CreateOnShared != remote.CreateOnSharedNone 343 if !shared && srcFS == p.st.FS { 344 // Wrap the normal filesystem with one which wraps newly created files with 345 // vfs.NewSyncingFile. 346 fs := vfs.NewSyncingFS(p.st.FS, vfs.SyncingFileOptions{ 347 NoSyncOnClose: p.st.NoSyncOnClose, 348 BytesPerSync: p.st.BytesPerSync, 349 }) 350 dstPath := p.vfsPath(dstFileType, dstFileNum) 351 if err := vfs.LinkOrCopy(fs, srcFilePath, dstPath); err != nil { 352 return objstorage.ObjectMetadata{}, err 353 } 354 355 meta := objstorage.ObjectMetadata{ 356 DiskFileNum: dstFileNum, 357 FileType: dstFileType, 358 } 359 p.addMetadata(meta) 360 return meta, nil 361 } 362 // Create the object and copy the data. 363 w, meta, err := p.Create(ctx, dstFileType, dstFileNum, opts) 364 if err != nil { 365 return objstorage.ObjectMetadata{}, err 366 } 367 f, err := srcFS.Open(srcFilePath, vfs.SequentialReadsOption) 368 if err != nil { 369 return objstorage.ObjectMetadata{}, err 370 } 371 defer f.Close() 372 buf := make([]byte, 64*1024) 373 for { 374 n, readErr := f.Read(buf) 375 if readErr != nil && readErr != io.EOF { 376 w.Abort() 377 return objstorage.ObjectMetadata{}, readErr 378 } 379 380 if n > 0 { 381 if err := w.Write(buf[:n]); err != nil { 382 w.Abort() 383 return objstorage.ObjectMetadata{}, err 384 } 385 } 386 387 if readErr == io.EOF { 388 break 389 } 390 } 391 if err := w.Finish(); err != nil { 392 return objstorage.ObjectMetadata{}, err 393 } 394 return meta, nil 395 } 396 397 // Lookup is part of the objstorage.Provider interface. 398 func (p *provider) Lookup( 399 fileType base.FileType, fileNum base.DiskFileNum, 400 ) (objstorage.ObjectMetadata, error) { 401 p.mu.RLock() 402 defer p.mu.RUnlock() 403 meta, ok := p.mu.knownObjects[fileNum] 404 if !ok { 405 return objstorage.ObjectMetadata{}, errors.Wrapf( 406 os.ErrNotExist, 407 "file %s (type %d) unknown to the objstorage provider", 408 fileNum, errors.Safe(fileType), 409 ) 410 } 411 if meta.FileType != fileType { 412 return objstorage.ObjectMetadata{}, errors.AssertionFailedf( 413 "file %s type mismatch (known type %d, expected type %d)", 414 fileNum, errors.Safe(meta.FileType), errors.Safe(fileType), 415 ) 416 } 417 return meta, nil 418 } 419 420 // Path is part of the objstorage.Provider interface. 421 func (p *provider) Path(meta objstorage.ObjectMetadata) string { 422 if !meta.IsRemote() { 423 return p.vfsPath(meta.FileType, meta.DiskFileNum) 424 } 425 return p.remotePath(meta) 426 } 427 428 // Size returns the size of the object. 429 func (p *provider) Size(meta objstorage.ObjectMetadata) (int64, error) { 430 if !meta.IsRemote() { 431 return p.vfsSize(meta.FileType, meta.DiskFileNum) 432 } 433 return p.remoteSize(meta) 434 } 435 436 // List is part of the objstorage.Provider interface. 437 func (p *provider) List() []objstorage.ObjectMetadata { 438 p.mu.RLock() 439 defer p.mu.RUnlock() 440 res := make([]objstorage.ObjectMetadata, 0, len(p.mu.knownObjects)) 441 for _, meta := range p.mu.knownObjects { 442 res = append(res, meta) 443 } 444 slices.SortFunc(res, func(a, b objstorage.ObjectMetadata) int { 445 return cmp.Compare(a.DiskFileNum, b.DiskFileNum) 446 }) 447 return res 448 } 449 450 // Metrics is part of the objstorage.Provider interface. 451 func (p *provider) Metrics() sharedcache.Metrics { 452 if p.remote.cache != nil { 453 return p.remote.cache.Metrics() 454 } 455 return sharedcache.Metrics{} 456 } 457 458 func (p *provider) addMetadata(meta objstorage.ObjectMetadata) { 459 if invariants.Enabled { 460 meta.AssertValid() 461 } 462 p.mu.Lock() 463 defer p.mu.Unlock() 464 p.mu.knownObjects[meta.DiskFileNum] = meta 465 if meta.IsRemote() { 466 p.mu.remote.catalogBatch.AddObject(remoteobjcat.RemoteObjectMetadata{ 467 FileNum: meta.DiskFileNum, 468 FileType: meta.FileType, 469 CreatorID: meta.Remote.CreatorID, 470 CreatorFileNum: meta.Remote.CreatorFileNum, 471 Locator: meta.Remote.Locator, 472 CleanupMethod: meta.Remote.CleanupMethod, 473 CustomObjectName: meta.Remote.CustomObjectName, 474 }) 475 } else { 476 p.mu.localObjectsChanged = true 477 } 478 } 479 480 func (p *provider) removeMetadata(fileNum base.DiskFileNum) { 481 p.mu.Lock() 482 defer p.mu.Unlock() 483 484 meta, ok := p.mu.knownObjects[fileNum] 485 if !ok { 486 return 487 } 488 delete(p.mu.knownObjects, fileNum) 489 if meta.IsRemote() { 490 p.mu.remote.catalogBatch.DeleteObject(fileNum) 491 } else { 492 p.mu.localObjectsChanged = true 493 } 494 } 495 496 // protectObject prevents the unreferencing of a remote object until 497 // unprotectObject is called. 498 func (p *provider) protectObject(fileNum base.DiskFileNum) { 499 p.mu.Lock() 500 defer p.mu.Unlock() 501 p.mu.protectedObjects[fileNum] = p.mu.protectedObjects[fileNum] + 1 502 } 503 504 func (p *provider) unprotectObject(fileNum base.DiskFileNum) { 505 p.mu.Lock() 506 defer p.mu.Unlock() 507 v := p.mu.protectedObjects[fileNum] 508 if invariants.Enabled && v == 0 { 509 panic("invalid protection count") 510 } 511 if v > 1 { 512 p.mu.protectedObjects[fileNum] = v - 1 513 } else { 514 delete(p.mu.protectedObjects, fileNum) 515 // TODO(radu): check if the object is still in knownObject; if not, unref it 516 // now. 517 } 518 } 519 520 func (p *provider) isProtected(fileNum base.DiskFileNum) bool { 521 p.mu.Lock() 522 defer p.mu.Unlock() 523 return p.mu.protectedObjects[fileNum] > 0 524 }