// github.com/cockroachdb/pebble@v1.1.2/objstorage/objstorageprovider/provider.go

// Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package objstorageprovider

import (
	"context"
	"io"
	"os"
	"sort"
	"sync"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/errors/oserror"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/invariants"
	"github.com/cockroachdb/pebble/objstorage"
	"github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing"
	"github.com/cockroachdb/pebble/objstorage/objstorageprovider/remoteobjcat"
	"github.com/cockroachdb/pebble/objstorage/objstorageprovider/sharedcache"
	"github.com/cockroachdb/pebble/objstorage/remote"
	"github.com/cockroachdb/pebble/vfs"
)

// provider is the implementation of objstorage.Provider.
type provider struct {
	st Settings

	fsDir vfs.File

	tracer *objiotracing.Tracer

	remote remoteSubsystem

	mu struct {
		sync.RWMutex

		remote struct {
			// catalogBatch accumulates remote object creations and deletions until
			// Sync is called.
			catalogBatch remoteobjcat.Batch

			storageObjects map[remote.Locator]remote.Storage
		}

		// localObjectsChanged is set if non-remote objects were created or deleted
		// but Sync was not yet called.
		localObjectsChanged bool

		// knownObjects maintains information about objects that are known to the
		// provider. It is initialized with the list of files in the manifest when
		// we open a DB.
		knownObjects map[base.DiskFileNum]objstorage.ObjectMetadata

		// protectedObjects are objects that cannot be unreferenced because they
		// have outstanding SharedObjectBackingHandles. The value is a count of
		// outstanding handles.
		protectedObjects map[base.DiskFileNum]int
	}
}

var _ objstorage.Provider = (*provider)(nil)

// Settings that must be specified when creating the provider.
type Settings struct {
	Logger base.Logger

	// Local filesystem configuration.
	FS        vfs.FS
	FSDirName string

	// FSDirInitialListing is a listing of FSDirName at the time of calling Open.
	//
	// This is an optional optimization to avoid double listing on Open when the
	// higher layer already has a listing. When nil, we obtain the listing on
	// Open.
	FSDirInitialListing []string

	// FSCleaner cleans obsolete files from the local filesystem.
	//
	// The default cleaner uses the DeleteCleaner.
	FSCleaner base.Cleaner

	// NoSyncOnClose decides whether the implementation will enforce a
	// close-time synchronization (e.g., fdatasync() or sync_file_range())
	// on files it writes to. Setting this to true removes the guarantee for a
	// sync on close. Some implementations can still issue a non-blocking sync.
	NoSyncOnClose bool

	// BytesPerSync enables periodic syncing of files in order to smooth out
	// writes to disk. This option does not provide any persistence guarantee,
	// but is used to avoid latency spikes if the OS automatically decides to
	// write out a large chunk of dirty filesystem buffers.
	BytesPerSync int

	// Local contains fields that are only relevant for files stored on the local
	// filesystem.
	Local struct {
		// TODO(radu): move FSCleaner, NoSyncOnClose, BytesPerSync here.

		// ReadaheadConfigFn is a function used to retrieve the current readahead
		// mode. This function is run whenever a local object is opened for
		// reading. If it is nil, DefaultReadaheadConfig is used.
		ReadaheadConfigFn func() ReadaheadConfig
	}

	// Fields here are set only if the provider is to support remote objects
	// (experimental).
	Remote struct {
		StorageFactory remote.StorageFactory

		// If CreateOnShared is non-zero, sstables are created on remote storage
		// using the CreateOnSharedLocator (when the PreferSharedStorage create
		// option is true).
		CreateOnShared        remote.CreateOnSharedStrategy
		CreateOnSharedLocator remote.Locator

		// CacheSizeBytes is the size of the on-disk block cache for objects
		// on remote storage. If it is 0, no cache is used.
		CacheSizeBytes int64

		// CacheBlockSize is the block size of the cache; if 0, the default of
		// 32KB is used.
		CacheBlockSize int

		// ShardingBlockSize is the size of a shard block. The cache is split into
		// contiguous ShardingBlockSize units. The units are distributed across
		// multiple independent shards of the cache, via a hash(offset) modulo
		// num shards operation (see the sketch after this struct). The cache
		// replacement policies operate at the level of a shard, not the whole
		// cache. This is done to reduce lock contention.
		//
		// If ShardingBlockSize is 0, the default of 1 MB is used.
		ShardingBlockSize int64

		// CacheShardCount is the number of independent shards the cache
		// leverages. Each shard is the same size, and a hash of filenum & offset
		// maps a read to a certain shard. If set to 0, 2*runtime.GOMAXPROCS is
		// used as the shard count.
		CacheShardCount int

		// TODO(radu): allow the cache to live on another FS/location (e.g. to use
		// instance-local SSD).
	}
}
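
// The Remote settings above describe a sharded on-disk cache: the cache is
// split into contiguous ShardingBlockSize units, and a hash of the file number
// and offset assigns each unit to one of CacheShardCount shards. A minimal
// sketch of that mapping, with a placeholder hash function (the real logic
// lives in the sharedcache package and may differ):
//
//	// shardForRead is an illustrative sketch, not the sharedcache implementation.
//	func shardForRead(fileNum base.DiskFileNum, offset, shardingBlockSize int64, shardCount int) int {
//		block := offset / shardingBlockSize               // index of the ShardingBlockSize unit
//		h := uint64(fileNum.FileNum())*31 + uint64(block) // placeholder hash of (filenum, offset)
//		return int(h % uint64(shardCount))
//	}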

// ReadaheadConfig controls the use of read-ahead.
type ReadaheadConfig struct {
	// Informed is the type of read-ahead for operations that are known to read a
	// large consecutive chunk of a file.
	Informed ReadaheadMode

	// Speculative is the type of read-ahead used automatically, when consecutive
	// reads are detected.
	Speculative ReadaheadMode
}

// DefaultReadaheadConfig is the readahead config used when ReadaheadConfigFn is
// not specified.
var DefaultReadaheadConfig = ReadaheadConfig{
	Informed:    FadviseSequential,
	Speculative: FadviseSequential,
}

// ReadaheadMode indicates the type of read-ahead to use, either for informed
// read-ahead (e.g. compactions) or speculative read-ahead.
type ReadaheadMode uint8

const (
	// NoReadahead disables readahead altogether.
	NoReadahead ReadaheadMode = iota

	// SysReadahead enables the use of the SYS_READAHEAD call to prefetch data.
	// The prefetch window grows dynamically as consecutive reads are detected.
	SysReadahead

	// FadviseSequential enables the use of FADV_SEQUENTIAL. For informed
	// read-ahead, FADV_SEQUENTIAL is used from the beginning. For speculative
	// read-ahead, SYS_READAHEAD is used first until the window reaches the
	// maximum size, then we switch to FADV_SEQUENTIAL.
	FadviseSequential
)

// DefaultSettings initializes default settings (with no remote storage),
// suitable for tests and tools.
func DefaultSettings(fs vfs.FS, dirName string) Settings {
	return Settings{
		Logger:        base.DefaultLogger,
		FS:            fs,
		FSDirName:     dirName,
		FSCleaner:     base.DeleteCleaner{},
		NoSyncOnClose: false,
		BytesPerSync:  512 * 1024, // 512KB
	}
}

// Open creates the provider.
func Open(settings Settings) (objstorage.Provider, error) {
	// Note: we can't just `return open(settings)` because in an error case we
	// would return (*provider)(nil), which is not objstorage.Provider(nil).
	p, err := open(settings)
	if err != nil {
		return nil, err
	}
	return p, nil
}
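
// A minimal usage sketch for opening and closing a provider built from
// DefaultSettings; the directory path is hypothetical:
//
//	settings := DefaultSettings(vfs.Default, "/path/to/db")
//	p, err := Open(settings)
//	if err != nil {
//		return err
//	}
//	defer p.Close()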

func open(settings Settings) (p *provider, _ error) {
	fsDir, err := settings.FS.OpenDir(settings.FSDirName)
	if err != nil {
		return nil, err
	}

	defer func() {
		if p == nil {
			fsDir.Close()
		}
	}()

	p = &provider{
		st:    settings,
		fsDir: fsDir,
	}
	p.mu.knownObjects = make(map[base.DiskFileNum]objstorage.ObjectMetadata)
	p.mu.protectedObjects = make(map[base.DiskFileNum]int)

	if objiotracing.Enabled {
		p.tracer = objiotracing.Open(settings.FS, settings.FSDirName)
	}

	// Add local FS objects.
	if err := p.vfsInit(); err != nil {
		return nil, err
	}

	// Initialize remote subsystem (if configured) and add remote objects.
	if err := p.remoteInit(); err != nil {
		return nil, err
	}

	return p, nil
}

// Close is part of the objstorage.Provider interface.
func (p *provider) Close() error {
	err := p.sharedClose()
	if p.fsDir != nil {
		err = firstError(err, p.fsDir.Close())
		p.fsDir = nil
	}
	if objiotracing.Enabled {
		if p.tracer != nil {
			p.tracer.Close()
			p.tracer = nil
		}
	}
	return err
}

// OpenForReading opens an existing object.
func (p *provider) OpenForReading(
	ctx context.Context,
	fileType base.FileType,
	fileNum base.DiskFileNum,
	opts objstorage.OpenOptions,
) (objstorage.Readable, error) {
	meta, err := p.Lookup(fileType, fileNum)
	if err != nil {
		if opts.MustExist {
			p.st.Logger.Fatalf("%v", err)
		}
		return nil, err
	}

	var r objstorage.Readable
	if !meta.IsRemote() {
		r, err = p.vfsOpenForReading(ctx, fileType, fileNum, opts)
	} else {
		r, err = p.remoteOpenForReading(ctx, meta, opts)
		if err != nil && p.isNotExistError(meta, err) {
			// Wrap the error so that IsNotExistError functions properly.
			err = errors.Mark(err, os.ErrNotExist)
		}
	}
	if err != nil {
		return nil, err
	}
	if objiotracing.Enabled {
		r = p.tracer.WrapReadable(ctx, r, fileNum)
	}
	return r, nil
}

// Create creates a new object and opens it for writing.
//
// The object is not guaranteed to be durable (accessible in case of crashes)
// until Sync is called.
func (p *provider) Create(
	ctx context.Context,
	fileType base.FileType,
	fileNum base.DiskFileNum,
	opts objstorage.CreateOptions,
) (w objstorage.Writable, meta objstorage.ObjectMetadata, err error) {
	if opts.PreferSharedStorage && p.st.Remote.CreateOnShared != remote.CreateOnSharedNone {
		w, meta, err = p.sharedCreate(ctx, fileType, fileNum, p.st.Remote.CreateOnSharedLocator, opts)
	} else {
		w, meta, err = p.vfsCreate(ctx, fileType, fileNum)
	}
	if err != nil {
		err = errors.Wrapf(err, "creating object %s", errors.Safe(fileNum))
		return nil, objstorage.ObjectMetadata{}, err
	}
	p.addMetadata(meta)
	if objiotracing.Enabled {
		w = p.tracer.WrapWritable(ctx, w, fileNum)
	}
	return w, meta, nil
}
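
// A sketch of the create-then-persist flow using the Writable returned by
// Create; ctx, fileType, fileNum, and the payload are placeholders supplied by
// the caller:
//
//	w, _, err := p.Create(ctx, fileType, fileNum, objstorage.CreateOptions{})
//	if err != nil {
//		return err
//	}
//	if err := w.Write([]byte("payload")); err != nil {
//		w.Abort()
//		return err
//	}
//	if err := w.Finish(); err != nil {
//		return err
//	}
//	// The new object is durable only after a successful Sync.
//	return p.Sync()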

// Remove removes an object.
//
// Note that if the object is remote, the object is only (conceptually) removed
// from this provider. If other providers have references on the remote object,
// it will not be removed.
//
// The object is not guaranteed to be durably removed until Sync is called.
func (p *provider) Remove(fileType base.FileType, fileNum base.DiskFileNum) error {
	meta, err := p.Lookup(fileType, fileNum)
	if err != nil {
		return err
	}

	if !meta.IsRemote() {
		err = p.vfsRemove(fileType, fileNum)
	} else {
		// TODO(radu): implement remote object removal (i.e. deref).
		err = p.sharedUnref(meta)
		if err != nil && p.isNotExistError(meta, err) {
			// Wrap the error so that IsNotExistError functions properly.
			err = errors.Mark(err, os.ErrNotExist)
		}
	}
	if err != nil && !p.IsNotExistError(err) {
		// We want to be able to retry a Remove, so we keep the object in our list.
		// TODO(radu): we should mark the object as "zombie" and not allow any other
		// operations.
		return errors.Wrapf(err, "removing object %s", errors.Safe(fileNum))
	}

	p.removeMetadata(fileNum)
	return err
}

func (p *provider) isNotExistError(meta objstorage.ObjectMetadata, err error) bool {
	if meta.Remote.Storage != nil {
		return meta.Remote.Storage.IsNotExistError(err)
	}
	return oserror.IsNotExist(err)
}

// IsNotExistError is part of the objstorage.Provider interface.
func (p *provider) IsNotExistError(err error) bool {
	// We use errors.Mark(err, os.ErrNotExist) for not-exist errors coming from
	// remote.Storage.
	return oserror.IsNotExist(err)
}

// Sync flushes the metadata from creation or removal of objects since the last Sync.
func (p *provider) Sync() error {
	if err := p.vfsSync(); err != nil {
		return err
	}
	if err := p.sharedSync(); err != nil {
		return err
	}
	return nil
}
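
// Remove pairs with Sync in the same way; a sketch of tolerating an
// already-missing object (fileType and fileNum are placeholders):
//
//	if err := p.Remove(fileType, fileNum); err != nil && !p.IsNotExistError(err) {
//		return err
//	}
//	return p.Sync()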

// LinkOrCopyFromLocal creates a new object that is either a copy of a given
// local file or a hard link (if the new object is created on the same FS, and
// if the FS supports it).
//
// The object is not guaranteed to be durable (accessible in case of crashes)
// until Sync is called.
func (p *provider) LinkOrCopyFromLocal(
	ctx context.Context,
	srcFS vfs.FS,
	srcFilePath string,
	dstFileType base.FileType,
	dstFileNum base.DiskFileNum,
	opts objstorage.CreateOptions,
) (objstorage.ObjectMetadata, error) {
	shared := opts.PreferSharedStorage && p.st.Remote.CreateOnShared != remote.CreateOnSharedNone
	if !shared && srcFS == p.st.FS {
		// Wrap the normal filesystem with one which wraps newly created files with
		// vfs.NewSyncingFile.
		fs := vfs.NewSyncingFS(p.st.FS, vfs.SyncingFileOptions{
			NoSyncOnClose: p.st.NoSyncOnClose,
			BytesPerSync:  p.st.BytesPerSync,
		})
		dstPath := p.vfsPath(dstFileType, dstFileNum)
		if err := vfs.LinkOrCopy(fs, srcFilePath, dstPath); err != nil {
			return objstorage.ObjectMetadata{}, err
		}

		meta := objstorage.ObjectMetadata{
			DiskFileNum: dstFileNum,
			FileType:    dstFileType,
		}
		p.addMetadata(meta)
		return meta, nil
	}
	// Create the object and copy the data.
	w, meta, err := p.Create(ctx, dstFileType, dstFileNum, opts)
	if err != nil {
		return objstorage.ObjectMetadata{}, err
	}
	f, err := srcFS.Open(srcFilePath, vfs.SequentialReadsOption)
	if err != nil {
		return objstorage.ObjectMetadata{}, err
	}
	defer f.Close()
	buf := make([]byte, 64*1024)
	for {
		n, readErr := f.Read(buf)
		if readErr != nil && readErr != io.EOF {
			w.Abort()
			return objstorage.ObjectMetadata{}, readErr
		}

		if n > 0 {
			if err := w.Write(buf[:n]); err != nil {
				w.Abort()
				return objstorage.ObjectMetadata{}, err
			}
		}

		if readErr == io.EOF {
			break
		}
	}
	if err := w.Finish(); err != nil {
		return objstorage.ObjectMetadata{}, err
	}
	return meta, nil
}
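
// A sketch of ingesting an external local file via LinkOrCopyFromLocal; the
// source path, destination file number, and use of base.FileTypeTable are
// hypothetical:
//
//	meta, err := p.LinkOrCopyFromLocal(
//		ctx, vfs.Default, "/tmp/external.sst",
//		base.FileTypeTable, dstFileNum, objstorage.CreateOptions{},
//	)
//	if err != nil {
//		return err
//	}
//	_ = meta // e.g. record meta.DiskFileNum in higher-level metadata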

// Lookup is part of the objstorage.Provider interface.
func (p *provider) Lookup(
	fileType base.FileType, fileNum base.DiskFileNum,
) (objstorage.ObjectMetadata, error) {
	p.mu.RLock()
	defer p.mu.RUnlock()
	meta, ok := p.mu.knownObjects[fileNum]
	if !ok {
		return objstorage.ObjectMetadata{}, errors.Wrapf(
			os.ErrNotExist,
			"file %s (type %d) unknown to the objstorage provider",
			errors.Safe(fileNum), errors.Safe(fileType),
		)
	}
	if meta.FileType != fileType {
		return objstorage.ObjectMetadata{}, errors.AssertionFailedf(
			"file %s type mismatch (known type %d, expected type %d)",
			errors.Safe(fileNum), errors.Safe(meta.FileType), errors.Safe(fileType),
		)
	}
	return meta, nil
}

// Path is part of the objstorage.Provider interface.
func (p *provider) Path(meta objstorage.ObjectMetadata) string {
	if !meta.IsRemote() {
		return p.vfsPath(meta.FileType, meta.DiskFileNum)
	}
	return p.remotePath(meta)
}

// Size returns the size of the object.
func (p *provider) Size(meta objstorage.ObjectMetadata) (int64, error) {
	if !meta.IsRemote() {
		return p.vfsSize(meta.FileType, meta.DiskFileNum)
	}
	return p.remoteSize(meta)
}

// List is part of the objstorage.Provider interface.
func (p *provider) List() []objstorage.ObjectMetadata {
	p.mu.RLock()
	defer p.mu.RUnlock()
	res := make([]objstorage.ObjectMetadata, 0, len(p.mu.knownObjects))
	for _, meta := range p.mu.knownObjects {
		res = append(res, meta)
	}
	sort.Slice(res, func(i, j int) bool {
		return res[i].DiskFileNum.FileNum() < res[j].DiskFileNum.FileNum()
	})
	return res
}

// Metrics is part of the objstorage.Provider interface.
func (p *provider) Metrics() sharedcache.Metrics {
	if p.remote.cache != nil {
		return p.remote.cache.Metrics()
	}
	return sharedcache.Metrics{}
}

func (p *provider) addMetadata(meta objstorage.ObjectMetadata) {
	if invariants.Enabled {
		meta.AssertValid()
	}
	p.mu.Lock()
	defer p.mu.Unlock()
	p.mu.knownObjects[meta.DiskFileNum] = meta
	if meta.IsRemote() {
		p.mu.remote.catalogBatch.AddObject(remoteobjcat.RemoteObjectMetadata{
			FileNum:        meta.DiskFileNum,
			FileType:       meta.FileType,
			CreatorID:      meta.Remote.CreatorID,
			CreatorFileNum: meta.Remote.CreatorFileNum,
			Locator:        meta.Remote.Locator,
			CleanupMethod:  meta.Remote.CleanupMethod,
		})
	} else {
		p.mu.localObjectsChanged = true
	}
}

func (p *provider) removeMetadata(fileNum base.DiskFileNum) {
	p.mu.Lock()
	defer p.mu.Unlock()

	meta, ok := p.mu.knownObjects[fileNum]
	if !ok {
		return
	}
	delete(p.mu.knownObjects, fileNum)
	if meta.IsRemote() {
		p.mu.remote.catalogBatch.DeleteObject(fileNum)
	} else {
		p.mu.localObjectsChanged = true
	}
}

// protectObject prevents the unreferencing of a remote object until
// unprotectObject is called.
func (p *provider) protectObject(fileNum base.DiskFileNum) {
	p.mu.Lock()
	defer p.mu.Unlock()
	p.mu.protectedObjects[fileNum] = p.mu.protectedObjects[fileNum] + 1
}

func (p *provider) unprotectObject(fileNum base.DiskFileNum) {
	p.mu.Lock()
	defer p.mu.Unlock()
	v := p.mu.protectedObjects[fileNum]
	if invariants.Enabled && v == 0 {
		panic("invalid protection count")
	}
	if v > 1 {
		p.mu.protectedObjects[fileNum] = v - 1
	} else {
		delete(p.mu.protectedObjects, fileNum)
		// TODO(radu): check if the object is still in knownObjects; if not, unref
		// it now.
	}
}

func (p *provider) isProtected(fileNum base.DiskFileNum) bool {
	p.mu.Lock()
	defer p.mu.Unlock()
	return p.mu.protectedObjects[fileNum] > 0
}
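
// protectObject and unprotectObject calls are expected to be balanced; a
// sketch of the intended pairing around work that relies on the object's
// backing (doWorkWithBacking is a placeholder):
//
//	p.protectObject(fileNum)
//	defer p.unprotectObject(fileNum)
//	doWorkWithBacking(fileNum)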