github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/objstorage/objstorageprovider/provider.go

// Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package objstorageprovider

import (
	"context"
	"io"
	"os"
	"sort"
	"sync"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/errors/oserror"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/invariants"
	"github.com/cockroachdb/pebble/objstorage"
	"github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing"
	"github.com/cockroachdb/pebble/objstorage/objstorageprovider/remoteobjcat"
	"github.com/cockroachdb/pebble/objstorage/objstorageprovider/sharedcache"
	"github.com/cockroachdb/pebble/objstorage/remote"
	"github.com/cockroachdb/pebble/vfs"
)

// provider is the implementation of objstorage.Provider.
type provider struct {
	st Settings

	fsDir vfs.File

	tracer *objiotracing.Tracer

	remote remoteSubsystem

	mu struct {
		sync.RWMutex

		remote struct {
			// catalogBatch accumulates remote object creations and deletions until
			// Sync is called.
			catalogBatch remoteobjcat.Batch

			storageObjects map[remote.Locator]remote.Storage
		}

		// localObjectsChanged is set if non-remote objects were created or deleted
		// but Sync was not yet called.
		localObjectsChanged bool

		// knownObjects maintains information about objects that are known to the
		// provider. It is initialized with the list of files in the manifest when
		// we open a DB.
		knownObjects map[base.DiskFileNum]objstorage.ObjectMetadata

		// protectedObjects are objects that cannot be unreferenced because they
		// have outstanding SharedObjectBackingHandles. The value is a count of
		// outstanding handles.
		protectedObjects map[base.DiskFileNum]int
	}
}

var _ objstorage.Provider = (*provider)(nil)

// Settings that must be specified when creating the provider.
type Settings struct {
	Logger base.Logger

	// Local filesystem configuration.
	FS        vfs.FS
	FSDirName string

	// FSDirInitialListing is a listing of FSDirName at the time of calling Open.
	//
	// This is an optional optimization to avoid double listing on Open when the
	// higher layer already has a listing. When nil, we obtain the listing on
	// Open.
	FSDirInitialListing []string

	// FSCleaner cleans obsolete files from the local filesystem.
	//
	// The default cleaner is base.DeleteCleaner.
	FSCleaner base.Cleaner

	// NoSyncOnClose decides whether the implementation will enforce a
	// close-time synchronization (e.g., fdatasync() or sync_file_range())
	// on files it writes to. Setting this to true removes the guarantee for a
	// sync on close. Some implementations can still issue a non-blocking sync.
	NoSyncOnClose bool

	// BytesPerSync enables periodic syncing of files in order to smooth out
	// writes to disk. This option does not provide any persistence guarantee,
	// but is used to avoid latency spikes if the OS automatically decides to
	// write out a large chunk of dirty filesystem buffers.
	BytesPerSync int

	// Fields here are set only if the provider is to support remote objects
	// (experimental).
	Remote struct {
		StorageFactory remote.StorageFactory

		// If CreateOnShared is non-zero, sstables are created on remote storage
		// using the CreateOnSharedLocator (when the PreferSharedStorage create
		// option is true).
		CreateOnShared        remote.CreateOnSharedStrategy
		CreateOnSharedLocator remote.Locator

		// CacheSizeBytes is the size of the on-disk block cache for objects
		// on remote storage. If it is 0, no cache is used.
		CacheSizeBytes int64

		// CacheBlockSize is the block size of the cache; if 0, the default of
		// 32KB is used.
		CacheBlockSize int

		// ShardingBlockSize is the size of a shard block. The cache is split into
		// contiguous ShardingBlockSize units. The units are distributed across
		// multiple independent shards of the cache, via a hash(offset) modulo num
		// shards operation. The cache replacement policies operate at the level
		// of shard, not whole cache. This is done to reduce lock contention.
		//
		// If ShardingBlockSize is 0, the default of 1 MB is used.
		ShardingBlockSize int64

		// CacheShardCount is the number of independent shards the cache
		// leverages. Each shard is the same size, and a hash of filenum & offset
		// maps a read to a certain shard. If set to 0, 2*runtime.GOMAXPROCS is
		// used as the shard count.
		CacheShardCount int

		// TODO(radu): allow the cache to live on another FS/location (e.g. to use
		// instance-local SSD).
	}
}

// DefaultSettings initializes default settings (with no remote storage),
// suitable for tests and tools.
func DefaultSettings(fs vfs.FS, dirName string) Settings {
	return Settings{
		Logger:        base.DefaultLogger,
		FS:            fs,
		FSDirName:     dirName,
		FSCleaner:     base.DeleteCleaner{},
		NoSyncOnClose: false,
		BytesPerSync:  512 * 1024, // 512KB
	}
}

// Open creates the provider.
func Open(settings Settings) (objstorage.Provider, error) {
	// Note: we can't just `return open(settings)` because in an error case we
	// would return (*provider)(nil) which is not objstorage.Provider(nil).
	p, err := open(settings)
	if err != nil {
		return nil, err
	}
	return p, nil
}

func open(settings Settings) (p *provider, _ error) {
	fsDir, err := settings.FS.OpenDir(settings.FSDirName)
	if err != nil {
		return nil, err
	}

	defer func() {
		if p == nil {
			fsDir.Close()
		}
	}()

	p = &provider{
		st:    settings,
		fsDir: fsDir,
	}
	p.mu.knownObjects = make(map[base.DiskFileNum]objstorage.ObjectMetadata)
	p.mu.protectedObjects = make(map[base.DiskFileNum]int)

	if objiotracing.Enabled {
		p.tracer = objiotracing.Open(settings.FS, settings.FSDirName)
	}

	// Add local FS objects.
	if err := p.vfsInit(); err != nil {
		return nil, err
	}

	// Initialize remote subsystem (if configured) and add remote objects.
	if err := p.remoteInit(); err != nil {
		return nil, err
	}

	return p, nil
}

// Close is part of the objstorage.Provider interface.
func (p *provider) Close() error {
	err := p.sharedClose()
	if p.fsDir != nil {
		err = firstError(err, p.fsDir.Close())
		p.fsDir = nil
	}
	if objiotracing.Enabled {
		if p.tracer != nil {
			p.tracer.Close()
			p.tracer = nil
		}
	}
	return err
}

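// The sketch below is illustrative only and not part of the provider
// implementation: it shows one plausible way a caller could construct a
// provider from DefaultSettings and hand it back for later use. The function
// name and error handling are assumptions; DefaultSettings is aimed at tests
// and tools, and within Pebble the provider is created by the DB open path.
func exampleOpenProvider(fs vfs.FS, dirName string) (objstorage.Provider, error) {
	// Remote storage is left unconfigured here, so only local FS objects are used.
	settings := DefaultSettings(fs, dirName)
	p, err := Open(settings)
	if err != nil {
		return nil, err
	}
	// The caller is responsible for calling p.Close() once done with the provider.
	return p, nil
}
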
// OpenForReading opens an existing object.
func (p *provider) OpenForReading(
	ctx context.Context,
	fileType base.FileType,
	fileNum base.DiskFileNum,
	opts objstorage.OpenOptions,
) (objstorage.Readable, error) {
	meta, err := p.Lookup(fileType, fileNum)
	if err != nil {
		if opts.MustExist {
			p.st.Logger.Fatalf("%v", err)
		}
		return nil, err
	}

	var r objstorage.Readable
	if !meta.IsRemote() {
		r, err = p.vfsOpenForReading(ctx, fileType, fileNum, opts)
	} else {
		r, err = p.remoteOpenForReading(ctx, meta, opts)
		if err != nil && p.isNotExistError(meta, err) {
			// Wrap the error so that IsNotExistError functions properly.
			err = errors.Mark(err, os.ErrNotExist)
		}
	}
	if err != nil {
		return nil, err
	}
	if objiotracing.Enabled {
		r = p.tracer.WrapReadable(ctx, r, fileNum)
	}
	return r, nil
}

// Create creates a new object and opens it for writing.
//
// The object is not guaranteed to be durable (accessible in case of crashes)
// until Sync is called.
func (p *provider) Create(
	ctx context.Context,
	fileType base.FileType,
	fileNum base.DiskFileNum,
	opts objstorage.CreateOptions,
) (w objstorage.Writable, meta objstorage.ObjectMetadata, err error) {
	if opts.PreferSharedStorage && p.st.Remote.CreateOnShared != remote.CreateOnSharedNone {
		w, meta, err = p.sharedCreate(ctx, fileType, fileNum, p.st.Remote.CreateOnSharedLocator, opts)
	} else {
		w, meta, err = p.vfsCreate(ctx, fileType, fileNum)
	}
	if err != nil {
		err = errors.Wrapf(err, "creating object %s", errors.Safe(fileNum))
		return nil, objstorage.ObjectMetadata{}, err
	}
	p.addMetadata(meta)
	if objiotracing.Enabled {
		w = p.tracer.WrapWritable(ctx, w, fileNum)
	}
	return w, meta, nil
}

// Remove removes an object.
//
// Note that if the object is remote, the object is only (conceptually) removed
// from this provider. If other providers have references on the remote object,
// it will not be removed.
//
// The object is not guaranteed to be durably removed until Sync is called.
func (p *provider) Remove(fileType base.FileType, fileNum base.DiskFileNum) error {
	meta, err := p.Lookup(fileType, fileNum)
	if err != nil {
		return err
	}

	if !meta.IsRemote() {
		err = p.vfsRemove(fileType, fileNum)
	} else {
		// TODO(radu): implement remote object removal (i.e. deref).
		err = p.sharedUnref(meta)
		if err != nil && p.isNotExistError(meta, err) {
			// Wrap the error so that IsNotExistError functions properly.
			err = errors.Mark(err, os.ErrNotExist)
		}
	}
	if err != nil && !p.IsNotExistError(err) {
		// We want to be able to retry a Remove, so we keep the object in our list.
		// TODO(radu): we should mark the object as "zombie" and not allow any
		// other operations.
		return errors.Wrapf(err, "removing object %s", errors.Safe(fileNum))
	}

	p.removeMetadata(fileNum)
	return err
}

func (p *provider) isNotExistError(meta objstorage.ObjectMetadata, err error) bool {
	if meta.Remote.Storage != nil {
		return meta.Remote.Storage.IsNotExistError(err)
	}
	return oserror.IsNotExist(err)
}

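// The sketch below is illustrative only and not part of the provider
// implementation: it walks the Create -> Write -> Finish -> Sync sequence
// implied by Create's durability contract. The function name, file type, and
// payload are assumptions; real callers obtain the DiskFileNum from Pebble's
// own file-number allocation.
func exampleCreateAndSyncObject(
	ctx context.Context, p objstorage.Provider, fileNum base.DiskFileNum, payload []byte,
) error {
	w, _, err := p.Create(ctx, base.FileTypeTable, fileNum, objstorage.CreateOptions{})
	if err != nil {
		return err
	}
	if err := w.Write(payload); err != nil {
		w.Abort()
		return err
	}
	if err := w.Finish(); err != nil {
		return err
	}
	// Per Create's contract, the object is not guaranteed to be durable until
	// Sync flushes the provider metadata.
	return p.Sync()
}
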
// IsNotExistError is part of the objstorage.Provider interface.
func (p *provider) IsNotExistError(err error) bool {
	// We use errors.Mark(err, os.ErrNotExist) for not-exist errors coming from
	// remote.Storage.
	return oserror.IsNotExist(err)
}

// Sync flushes the metadata from creation or removal of objects since the last Sync.
func (p *provider) Sync() error {
	if err := p.vfsSync(); err != nil {
		return err
	}
	if err := p.sharedSync(); err != nil {
		return err
	}
	return nil
}

// LinkOrCopyFromLocal creates a new object that is either a copy of a given
// local file or a hard link (if the new object is created on the same FS, and
// if the FS supports it).
//
// The object is not guaranteed to be durable (accessible in case of crashes)
// until Sync is called.
func (p *provider) LinkOrCopyFromLocal(
	ctx context.Context,
	srcFS vfs.FS,
	srcFilePath string,
	dstFileType base.FileType,
	dstFileNum base.DiskFileNum,
	opts objstorage.CreateOptions,
) (objstorage.ObjectMetadata, error) {
	shared := opts.PreferSharedStorage && p.st.Remote.CreateOnShared != remote.CreateOnSharedNone
	if !shared && srcFS == p.st.FS {
		// Wrap the normal filesystem with one which wraps newly created files with
		// vfs.NewSyncingFile.
		fs := vfs.NewSyncingFS(p.st.FS, vfs.SyncingFileOptions{
			NoSyncOnClose: p.st.NoSyncOnClose,
			BytesPerSync:  p.st.BytesPerSync,
		})
		dstPath := p.vfsPath(dstFileType, dstFileNum)
		if err := vfs.LinkOrCopy(fs, srcFilePath, dstPath); err != nil {
			return objstorage.ObjectMetadata{}, err
		}

		meta := objstorage.ObjectMetadata{
			DiskFileNum: dstFileNum,
			FileType:    dstFileType,
		}
		p.addMetadata(meta)
		return meta, nil
	}
	// Create the object and copy the data.
	w, meta, err := p.Create(ctx, dstFileType, dstFileNum, opts)
	if err != nil {
		return objstorage.ObjectMetadata{}, err
	}
	f, err := srcFS.Open(srcFilePath, vfs.SequentialReadsOption)
	if err != nil {
		return objstorage.ObjectMetadata{}, err
	}
	defer f.Close()
	buf := make([]byte, 64*1024)
	for {
		n, readErr := f.Read(buf)
		if readErr != nil && readErr != io.EOF {
			w.Abort()
			return objstorage.ObjectMetadata{}, readErr
		}

		if n > 0 {
			if err := w.Write(buf[:n]); err != nil {
				w.Abort()
				return objstorage.ObjectMetadata{}, err
			}
		}

		if readErr == io.EOF {
			break
		}
	}
	if err := w.Finish(); err != nil {
		return objstorage.ObjectMetadata{}, err
	}
	return meta, nil
}

// Lookup is part of the objstorage.Provider interface.
func (p *provider) Lookup(
	fileType base.FileType, fileNum base.DiskFileNum,
) (objstorage.ObjectMetadata, error) {
	p.mu.RLock()
	defer p.mu.RUnlock()
	meta, ok := p.mu.knownObjects[fileNum]
	if !ok {
		return objstorage.ObjectMetadata{}, errors.Wrapf(
			os.ErrNotExist,
			"file %s (type %d) unknown to the objstorage provider",
			errors.Safe(fileNum), errors.Safe(fileType),
		)
	}
	if meta.FileType != fileType {
		return objstorage.ObjectMetadata{}, errors.AssertionFailedf(
			"file %s type mismatch (known type %d, expected type %d)",
			errors.Safe(fileNum), errors.Safe(meta.FileType), errors.Safe(fileType),
		)
	}
	return meta, nil
}

// Path is part of the objstorage.Provider interface.
func (p *provider) Path(meta objstorage.ObjectMetadata) string {
	if !meta.IsRemote() {
		return p.vfsPath(meta.FileType, meta.DiskFileNum)
	}
	return p.remotePath(meta)
}

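// The sketch below is illustrative only and not part of the provider
// implementation: it shows how Lookup, Path, and IsNotExistError can be
// combined to resolve an object's path while tolerating objects the provider
// does not know about. The function name and return shape are assumptions.
func examplePathIfKnown(
	p objstorage.Provider, fileType base.FileType, fileNum base.DiskFileNum,
) (path string, ok bool, err error) {
	meta, lookupErr := p.Lookup(fileType, fileNum)
	switch {
	case lookupErr == nil:
		return p.Path(meta), true, nil
	case p.IsNotExistError(lookupErr):
		// Lookup wraps os.ErrNotExist for unknown objects, so IsNotExistError
		// recognizes this case.
		return "", false, nil
	default:
		return "", false, lookupErr
	}
}
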
// Size returns the size of the object.
func (p *provider) Size(meta objstorage.ObjectMetadata) (int64, error) {
	if !meta.IsRemote() {
		return p.vfsSize(meta.FileType, meta.DiskFileNum)
	}
	return p.remoteSize(meta)
}

// List is part of the objstorage.Provider interface.
func (p *provider) List() []objstorage.ObjectMetadata {
	p.mu.RLock()
	defer p.mu.RUnlock()
	res := make([]objstorage.ObjectMetadata, 0, len(p.mu.knownObjects))
	for _, meta := range p.mu.knownObjects {
		res = append(res, meta)
	}
	sort.Slice(res, func(i, j int) bool {
		return res[i].DiskFileNum.FileNum() < res[j].DiskFileNum.FileNum()
	})
	return res
}

// Metrics is part of the objstorage.Provider interface.
func (p *provider) Metrics() sharedcache.Metrics {
	if p.remote.cache != nil {
		return p.remote.cache.Metrics()
	}
	return sharedcache.Metrics{}
}

func (p *provider) addMetadata(meta objstorage.ObjectMetadata) {
	if invariants.Enabled {
		meta.AssertValid()
	}
	p.mu.Lock()
	defer p.mu.Unlock()
	p.mu.knownObjects[meta.DiskFileNum] = meta
	if meta.IsRemote() {
		p.mu.remote.catalogBatch.AddObject(remoteobjcat.RemoteObjectMetadata{
			FileNum:        meta.DiskFileNum,
			FileType:       meta.FileType,
			CreatorID:      meta.Remote.CreatorID,
			CreatorFileNum: meta.Remote.CreatorFileNum,
			Locator:        meta.Remote.Locator,
			CleanupMethod:  meta.Remote.CleanupMethod,
		})
	} else {
		p.mu.localObjectsChanged = true
	}
}

func (p *provider) removeMetadata(fileNum base.DiskFileNum) {
	p.mu.Lock()
	defer p.mu.Unlock()

	meta, ok := p.mu.knownObjects[fileNum]
	if !ok {
		return
	}
	delete(p.mu.knownObjects, fileNum)
	if meta.IsRemote() {
		p.mu.remote.catalogBatch.DeleteObject(fileNum)
	} else {
		p.mu.localObjectsChanged = true
	}
}

// protectObject prevents the unreferencing of a remote object until
// unprotectObject is called.
func (p *provider) protectObject(fileNum base.DiskFileNum) {
	p.mu.Lock()
	defer p.mu.Unlock()
	p.mu.protectedObjects[fileNum] = p.mu.protectedObjects[fileNum] + 1
}

func (p *provider) unprotectObject(fileNum base.DiskFileNum) {
	p.mu.Lock()
	defer p.mu.Unlock()
	v := p.mu.protectedObjects[fileNum]
	if invariants.Enabled && v == 0 {
		panic("invalid protection count")
	}
	if v > 1 {
		p.mu.protectedObjects[fileNum] = v - 1
	} else {
		delete(p.mu.protectedObjects, fileNum)
		// TODO(radu): check if the object is still in knownObjects; if not, unref
		// it now.
	}
}

func (p *provider) isProtected(fileNum base.DiskFileNum) bool {
	p.mu.Lock()
	defer p.mu.Unlock()
	return p.mu.protectedObjects[fileNum] > 0
}

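// The sketch below is illustrative only and not part of the provider
// implementation: it combines List, Size, and ObjectMetadata.IsRemote to
// aggregate object sizes by locality, e.g. for a diagnostic report. The
// function name and aggregation are assumptions.
func exampleObjectSizesByLocality(p objstorage.Provider) (localBytes, remoteBytes int64, _ error) {
	for _, meta := range p.List() {
		size, err := p.Size(meta)
		if err != nil {
			return 0, 0, err
		}
		if meta.IsRemote() {
			remoteBytes += size
		} else {
			localBytes += size
		}
	}
	return localBytes, remoteBytes, nil
}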