github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/table_cache.go

// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package pebble

import (
	"bytes"
	"context"
	"fmt"
	"io"
	"runtime/debug"
	"runtime/pprof"
	"sync"
	"sync/atomic"
	"unsafe"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/invariants"
	"github.com/cockroachdb/pebble/internal/keyspan"
	"github.com/cockroachdb/pebble/internal/manifest"
	"github.com/cockroachdb/pebble/internal/private"
	"github.com/cockroachdb/pebble/objstorage"
	"github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing"
	"github.com/cockroachdb/pebble/sstable"
)

var emptyIter = &errorIter{err: nil}
var emptyKeyspanIter = &errorKeyspanIter{err: nil}

// filteredAll is a singleton internalIterator implementation used when an
// sstable does contain point keys, but all the keys are filtered by the active
// PointKeyFilters set in the iterator's IterOptions.
//
// filteredAll implements filteredIter, ensuring the level iterator recognizes
// when it may need to return file boundaries to keep the rangeDelIter open
// during mergingIter operation.
var filteredAll = &filteredAllKeysIter{errorIter: errorIter{err: nil}}

var _ filteredIter = filteredAll

type filteredAllKeysIter struct {
	errorIter
}

func (s *filteredAllKeysIter) MaybeFilteredKeys() bool {
	return true
}

var tableCacheLabels = pprof.Labels("pebble", "table-cache")

// tableCacheOpts contains the db specific fields
// of a table cache. This is stored in the tableCacheContainer
// along with the table cache.
// NB: It is important to make sure that the fields in this
// struct are read-only. Since the fields here are shared
// by every single tableCacheShard, if non read-only fields
// are updated, we could have unnecessary evictions of those
// fields, and the surrounding fields from the CPU caches.
type tableCacheOpts struct {
	// iterCount keeps track of how many iterators are open. It is used to keep
	// track of leaked iterators on a per-db level.
	iterCount *atomic.Int32

	loggerAndTracer LoggerAndTracer
	cacheID         uint64
	objProvider     objstorage.Provider
	opts            sstable.ReaderOptions
	filterMetrics   *sstable.FilterMetricsTracker
}

// tableCacheContainer contains the table cache and
// fields which are unique to the DB.
type tableCacheContainer struct {
	tableCache *TableCache

	// dbOpts contains fields relevant to the table cache
	// which are unique to each DB.
	dbOpts tableCacheOpts
}

// newTableCacheContainer will panic if the underlying cache in the table cache
// doesn't match Options.Cache.
func newTableCacheContainer(
	tc *TableCache, cacheID uint64, objProvider objstorage.Provider, opts *Options, size int,
) *tableCacheContainer {
	// We will release a ref to table cache acquired here when tableCacheContainer.close is called.
	if tc != nil {
		if tc.cache != opts.Cache {
			panic("pebble: underlying cache for the table cache and db are different")
		}
		tc.Ref()
	} else {
		// NewTableCache should create a ref to tc which the container should
		// drop whenever it is closed.
		tc = NewTableCache(opts.Cache, opts.Experimental.TableCacheShards, size)
	}

	t := &tableCacheContainer{}
	t.tableCache = tc
	t.dbOpts.loggerAndTracer = opts.LoggerAndTracer
	t.dbOpts.cacheID = cacheID
	t.dbOpts.objProvider = objProvider
	t.dbOpts.opts = opts.MakeReaderOptions()
	t.dbOpts.filterMetrics = &sstable.FilterMetricsTracker{}
	t.dbOpts.iterCount = new(atomic.Int32)
	return t
}

// Before calling close, make sure that there will be no further need
// to access any of the files associated with the store.
func (c *tableCacheContainer) close() error {
	// We want to do some cleanup work here. Check for leaked iterators
	// by the DB using this container. Note that we'll still perform cleanup
	// below in the case that there are leaked iterators.
	var err error
	if v := c.dbOpts.iterCount.Load(); v > 0 {
		err = errors.Errorf("leaked iterators: %d", errors.Safe(v))
	}

	// Release nodes here.
	for _, shard := range c.tableCache.shards {
		if shard != nil {
			shard.removeDB(&c.dbOpts)
		}
	}
	return firstError(err, c.tableCache.Unref())
}

func (c *tableCacheContainer) newIters(
	ctx context.Context,
	file *manifest.FileMetadata,
	opts *IterOptions,
	internalOpts internalIterOpts,
) (internalIterator, keyspan.FragmentIterator, error) {
	return c.tableCache.getShard(file.FileBacking.DiskFileNum).newIters(ctx, file, opts, internalOpts, &c.dbOpts)
}

func (c *tableCacheContainer) newRangeKeyIter(
	file *manifest.FileMetadata, opts keyspan.SpanIterOptions,
) (keyspan.FragmentIterator, error) {
	return c.tableCache.getShard(file.FileBacking.DiskFileNum).newRangeKeyIter(file, opts, &c.dbOpts)
}

// getTableProperties returns the properties associated with the backing physical
// table if the input metadata belongs to a virtual sstable.
func (c *tableCacheContainer) getTableProperties(file *fileMetadata) (*sstable.Properties, error) {
	return c.tableCache.getShard(file.FileBacking.DiskFileNum).getTableProperties(file, &c.dbOpts)
}

func (c *tableCacheContainer) evict(fileNum base.DiskFileNum) {
	c.tableCache.getShard(fileNum).evict(fileNum, &c.dbOpts, false)
}

func (c *tableCacheContainer) metrics() (CacheMetrics, FilterMetrics) {
	var m CacheMetrics
	for i := range c.tableCache.shards {
		s := c.tableCache.shards[i]
		s.mu.RLock()
		m.Count += int64(len(s.mu.nodes))
		s.mu.RUnlock()
		m.Hits += s.hits.Load()
		m.Misses += s.misses.Load()
	}
	m.Size = m.Count * int64(unsafe.Sizeof(sstable.Reader{}))
	f := c.dbOpts.filterMetrics.Load()
	return m, f
}
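
// The counters aggregated in metrics() above surface through the public
// (*DB).Metrics call. A sketch of the caller-facing form, assuming the
// exported Metrics struct reports these values in its TableCache and Filter
// fields:
//
//	m := db.Metrics()
//	fmt.Printf("table cache: %d tables, %d hits, %d misses\n",
//		m.TableCache.Count, m.TableCache.Hits, m.TableCache.Misses)
//	fmt.Printf("bloom filters: %d hits, %d misses\n", m.Filter.Hits, m.Filter.Misses)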

func (c *tableCacheContainer) estimateSize(
	meta *fileMetadata, lower, upper []byte,
) (size uint64, err error) {
	if meta.Virtual {
		err = c.withVirtualReader(
			meta.VirtualMeta(),
			func(r sstable.VirtualReader) (err error) {
				size, err = r.EstimateDiskUsage(lower, upper)
				return err
			},
		)
	} else {
		err = c.withReader(
			meta.PhysicalMeta(),
			func(r *sstable.Reader) (err error) {
				size, err = r.EstimateDiskUsage(lower, upper)
				return err
			},
		)
	}
	if err != nil {
		return 0, err
	}
	return size, nil
}
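
// estimateSize is used by the public (*DB).EstimateDiskUsage call for sstables
// that partially overlap the requested span (fully contained tables contribute
// their whole size). A sketch of the caller-facing form:
//
//	usage, err := db.EstimateDiskUsage([]byte("a"), []byte("z"))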

// createCommonReader creates a Reader for this file. isForeign, if true for
// virtual sstables, is passed into the vSSTable reader so its iterators can
// collapse obsolete points accordingly.
func createCommonReader(
	v *tableCacheValue, file *fileMetadata, isForeign bool,
) sstable.CommonReader {
	// TODO(bananabrick): We suffer an allocation if file is a virtual sstable.
	var cr sstable.CommonReader = v.reader
	if file.Virtual {
		virtualReader := sstable.MakeVirtualReader(
			v.reader, file.VirtualMeta(), isForeign,
		)
		cr = &virtualReader
	}
	return cr
}

func (c *tableCacheContainer) withCommonReader(
	meta *fileMetadata, fn func(sstable.CommonReader) error,
) error {
	s := c.tableCache.getShard(meta.FileBacking.DiskFileNum)
	v := s.findNode(meta, &c.dbOpts)
	defer s.unrefValue(v)
	if v.err != nil {
		return v.err
	}
	provider := c.dbOpts.objProvider
	objMeta, err := provider.Lookup(fileTypeTable, meta.FileBacking.DiskFileNum)
	if err != nil {
		return err
	}
	return fn(createCommonReader(v, meta, provider.IsSharedForeign(objMeta)))
}

func (c *tableCacheContainer) withReader(meta physicalMeta, fn func(*sstable.Reader) error) error {
	s := c.tableCache.getShard(meta.FileBacking.DiskFileNum)
	v := s.findNode(meta.FileMetadata, &c.dbOpts)
	defer s.unrefValue(v)
	if v.err != nil {
		return v.err
	}
	return fn(v.reader)
}

// withVirtualReader fetches a VirtualReader associated with a virtual sstable.
func (c *tableCacheContainer) withVirtualReader(
	meta virtualMeta, fn func(sstable.VirtualReader) error,
) error {
	s := c.tableCache.getShard(meta.FileBacking.DiskFileNum)
	v := s.findNode(meta.FileMetadata, &c.dbOpts)
	defer s.unrefValue(v)
	if v.err != nil {
		return v.err
	}
	provider := c.dbOpts.objProvider
	objMeta, err := provider.Lookup(fileTypeTable, meta.FileBacking.DiskFileNum)
	if err != nil {
		return err
	}
	return fn(sstable.MakeVirtualReader(v.reader, meta, provider.IsSharedForeign(objMeta)))
}

func (c *tableCacheContainer) iterCount() int64 {
	return int64(c.dbOpts.iterCount.Load())
}

// TableCache is a shareable cache for open sstables.
type TableCache struct {
	refs atomic.Int64

	cache  *Cache
	shards []*tableCacheShard
}

// Ref adds a reference to the table cache. Once tableCache.init returns,
// the table cache only remains valid if there is at least one reference
// to it.
func (c *TableCache) Ref() {
	v := c.refs.Add(1)
	// We don't want the reference count to ever go from 0 -> 1,
	// because a reference count of 0 implies that we've closed the cache.
	if v <= 1 {
		panic(fmt.Sprintf("pebble: inconsistent reference count: %d", v))
	}
}

// Unref removes a reference to the table cache.
func (c *TableCache) Unref() error {
	v := c.refs.Add(-1)
	switch {
	case v < 0:
		panic(fmt.Sprintf("pebble: inconsistent reference count: %d", v))
	case v == 0:
		var err error
		for i := range c.shards {
			// The cache shard is not allocated yet, nothing to close.
			if c.shards[i] == nil {
				continue
			}
			err = firstError(err, c.shards[i].Close())
		}

		// Unref the cache which we create a reference to when the tableCache
		// is first instantiated.
		c.cache.Unref()
		return err
	}
	return nil
}

// NewTableCache will create a reference to the table cache. It is the caller's
// responsibility to call TableCache.Unref when it no longer holds a reference
// to the table cache.
func NewTableCache(cache *Cache, numShards int, size int) *TableCache {
	if size == 0 {
		panic("pebble: cannot create a table cache of size 0")
	} else if numShards == 0 {
		panic("pebble: cannot create a table cache with 0 shards")
	}

	c := &TableCache{}
	c.cache = cache
	c.cache.Ref()

	c.shards = make([]*tableCacheShard, numShards)
	for i := range c.shards {
		c.shards[i] = &tableCacheShard{}
		c.shards[i].init(size / len(c.shards))
	}

	// Hold a ref to the cache here.
	c.refs.Store(1)

	return c
}
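
// openTwoStoresWithSharedTableCache is an illustrative sketch, not upstream
// API: the function name is made up, and it assumes the exported Options
// fields Cache and TableCache from the public Options struct. It shows the
// intended ref-counting pattern for a TableCache shared by several DBs:
// NewTableCache and each Open take their own references, so the creator drops
// its references once the DBs are open, and the caches are fully released only
// after every DB is closed.
func openTwoStoresWithSharedTableCache(dirA, dirB string) (*DB, *DB, error) {
	cache := NewCache(128 << 20) // shared block cache; must match Options.Cache
	defer cache.Unref()
	tableCache := NewTableCache(cache, 8 /* numShards */, 4096 /* size */)
	defer tableCache.Unref()

	dbA, err := Open(dirA, &Options{Cache: cache, TableCache: tableCache})
	if err != nil {
		return nil, nil, err
	}
	dbB, err := Open(dirB, &Options{Cache: cache, TableCache: tableCache})
	if err != nil {
		_ = dbA.Close()
		return nil, nil, err
	}
	return dbA, dbB, nil
}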

func (c *TableCache) getShard(fileNum base.DiskFileNum) *tableCacheShard {
	return c.shards[uint64(fileNum.FileNum())%uint64(len(c.shards))]
}

type tableCacheKey struct {
	cacheID uint64
	fileNum base.DiskFileNum
}

type tableCacheShard struct {
	hits      atomic.Int64
	misses    atomic.Int64
	iterCount atomic.Int32

	size int

	mu struct {
		sync.RWMutex
		nodes map[tableCacheKey]*tableCacheNode
		// The iters map is only created and populated in race builds.
		iters map[io.Closer][]byte

		handHot  *tableCacheNode
		handCold *tableCacheNode
		handTest *tableCacheNode

		coldTarget int
		sizeHot    int
		sizeCold   int
		sizeTest   int
	}
	releasing       sync.WaitGroup
	releasingCh     chan *tableCacheValue
	releaseLoopExit sync.WaitGroup
}

func (c *tableCacheShard) init(size int) {
	c.size = size

	c.mu.nodes = make(map[tableCacheKey]*tableCacheNode)
	c.mu.coldTarget = size
	c.releasingCh = make(chan *tableCacheValue, 100)
	c.releaseLoopExit.Add(1)
	go c.releaseLoop()

	if invariants.RaceEnabled {
		c.mu.iters = make(map[io.Closer][]byte)
	}
}

func (c *tableCacheShard) releaseLoop() {
	pprof.Do(context.Background(), tableCacheLabels, func(context.Context) {
		defer c.releaseLoopExit.Done()
		for v := range c.releasingCh {
			v.release(c)
		}
	})
}

// checkAndIntersectFilters checks the specific table and block property filters
// for intersection with any available table and block-level properties. Returns
// true for ok if this table should be read by this iterator.
func (c *tableCacheShard) checkAndIntersectFilters(
	v *tableCacheValue,
	tableFilter func(userProps map[string]string) bool,
	blockPropertyFilters []BlockPropertyFilter,
	boundLimitedFilter sstable.BoundLimitedBlockPropertyFilter,
) (ok bool, filterer *sstable.BlockPropertiesFilterer, err error) {
	if tableFilter != nil &&
		!tableFilter(v.reader.Properties.UserProperties) {
		return false, nil, nil
	}

	if boundLimitedFilter != nil || len(blockPropertyFilters) > 0 {
		filterer, err = sstable.IntersectsTable(
			blockPropertyFilters,
			boundLimitedFilter,
			v.reader.Properties.UserProperties,
		)
		// NB: IntersectsTable will return a nil filterer if the table-level
		// properties indicate there's no intersection with the provided filters.
		if filterer == nil || err != nil {
			return false, nil, err
		}
	}
	return true, filterer, nil
}
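
// A sketch of the TableFilter hook consulted by checkAndIntersectFilters: the
// caller-supplied predicate sees each sstable's user properties, and returning
// false skips the table's point keys entirely (its range deletions are still
// observed). The property key "my.app.region" is a made-up example, and error
// handling is elided:
//
//	iter, _ := db.NewIter(&IterOptions{
//		TableFilter: func(userProps map[string]string) bool {
//			return userProps["my.app.region"] == "eu"
//		},
//	})
//	defer iter.Close()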

func (c *tableCacheShard) newIters(
	ctx context.Context,
	file *manifest.FileMetadata,
	opts *IterOptions,
	internalOpts internalIterOpts,
	dbOpts *tableCacheOpts,
) (internalIterator, keyspan.FragmentIterator, error) {
	// TODO(sumeer): constructing the Reader should also use a plumbed context,
	// since parts of the sstable are read during the construction. The Reader
	// should not remember that context since the Reader can be long-lived.

	// Calling findNode gives us the responsibility of decrementing v's
	// refCount. If opening the underlying table resulted in error, then we
	// decrement this straight away. Otherwise, we pass that responsibility to
	// the sstable iterator, which decrements when it is closed.
	v := c.findNode(file, dbOpts)
	if v.err != nil {
		defer c.unrefValue(v)
		return nil, nil, v.err
	}

	hideObsoletePoints := false
	var pointKeyFilters []BlockPropertyFilter
	if opts != nil {
		// This code is appending (at most one filter) in-place to
		// opts.PointKeyFilters even though the slice is shared for iterators in
		// the same iterator tree. This is acceptable since all the following
		// properties are true:
		// - The iterator tree is single threaded, so the shared backing for the
		//   slice is being mutated in a single threaded manner.
		// - Each shallow copy of the slice has its own notion of length.
		// - The appended element is always the obsoleteKeyBlockPropertyFilter
		//   struct, which is stateless, so overwriting that struct when creating
		//   one sstable iterator is harmless to other sstable iterators that are
		//   relying on that struct.
		//
		// An alternative would be to have different slices for different sstable
		// iterators, but that requires more work to avoid allocations.
		hideObsoletePoints, pointKeyFilters =
			v.reader.TryAddBlockPropertyFilterForHideObsoletePoints(
				opts.snapshotForHideObsoletePoints, file.LargestSeqNum, opts.PointKeyFilters)
	}
	ok := true
	var filterer *sstable.BlockPropertiesFilterer
	var err error
	if opts != nil {
		ok, filterer, err = c.checkAndIntersectFilters(v, opts.TableFilter,
			pointKeyFilters, internalOpts.boundLimitedFilter)
	}
	if err != nil {
		c.unrefValue(v)
		return nil, nil, err
	}

	provider := dbOpts.objProvider
	// Check if this file is a foreign file.
	objMeta, err := provider.Lookup(fileTypeTable, file.FileBacking.DiskFileNum)
	if err != nil {
		c.unrefValue(v)
		return nil, nil, err
	}

	// Note: This suffers an allocation for virtual sstables.
	cr := createCommonReader(v, file, provider.IsSharedForeign(objMeta))

	// NB: range-del iterator does not maintain a reference to the table, nor
	// does it need to read from it after creation.
	rangeDelIter, err := cr.NewRawRangeDelIter()
	if err != nil {
		c.unrefValue(v)
		return nil, nil, err
	}

	if !ok {
		c.unrefValue(v)
		// Return an empty iterator. This iterator has no mutable state, so
		// using a singleton is fine.
		// NB: We still return the potentially non-empty rangeDelIter. This
		// ensures the iterator observes the file's range deletions even if the
		// block property filters exclude all the file's point keys. The range
		// deletions may still delete keys lower in the LSM in files that DO
		// match the active filters.
		//
		// The point iterator returned must implement the filteredIter
		// interface, so that the level iterator surfaces file boundaries when
		// range deletions are present.
		return filteredAll, rangeDelIter, err
	}

	var iter sstable.Iterator
	useFilter := true
	if opts != nil {
		useFilter = manifest.LevelToInt(opts.level) != 6 || opts.UseL6Filters
		ctx = objiotracing.WithLevel(ctx, manifest.LevelToInt(opts.level))
	}
	tableFormat, err := v.reader.TableFormat()
	if err != nil {
		if rangeDelIter != nil {
			_ = rangeDelIter.Close()
		}
		c.unrefValue(v)
		return nil, nil, err
	}
	var rp sstable.ReaderProvider
	if tableFormat >= sstable.TableFormatPebblev3 && v.reader.Properties.NumValueBlocks > 0 {
		rp = &tableCacheShardReaderProvider{c: c, file: file, dbOpts: dbOpts}
	}

	if provider.IsSharedForeign(objMeta) {
		if tableFormat < sstable.TableFormatPebblev4 {
			return nil, nil, errors.New("pebble: shared foreign sstable has a lower table format than expected")
		}
		hideObsoletePoints = true
	}
	if internalOpts.bytesIterated != nil {
		iter, err = cr.NewCompactionIter(internalOpts.bytesIterated, rp, internalOpts.bufferPool)
	} else {
		iter, err = cr.NewIterWithBlockPropertyFiltersAndContextEtc(
			ctx, opts.GetLowerBound(), opts.GetUpperBound(), filterer, hideObsoletePoints, useFilter,
			internalOpts.stats, rp)
	}
	if err != nil {
		if rangeDelIter != nil {
			_ = rangeDelIter.Close()
		}
		c.unrefValue(v)
		return nil, nil, err
	}
	// NB: v.closeHook takes responsibility for calling unrefValue(v) here. Take
	// care to avoid introducing an allocation here by adding a closure.
	iter.SetCloseHook(v.closeHook)

	c.iterCount.Add(1)
	dbOpts.iterCount.Add(1)
	if invariants.RaceEnabled {
		c.mu.Lock()
		c.mu.iters[iter] = debug.Stack()
		c.mu.Unlock()
	}
	return iter, rangeDelIter, nil
}

func (c *tableCacheShard) newRangeKeyIter(
	file *manifest.FileMetadata, opts keyspan.SpanIterOptions, dbOpts *tableCacheOpts,
) (keyspan.FragmentIterator, error) {
	// Calling findNode gives us the responsibility of decrementing v's
	// refCount. If opening the underlying table resulted in error, then we
	// decrement this straight away. Otherwise, we pass that responsibility to
	// the sstable iterator, which decrements when it is closed.
	v := c.findNode(file, dbOpts)
	if v.err != nil {
		defer c.unrefValue(v)
		return nil, v.err
	}

	ok := true
	var err error
	// Don't filter a table's range keys if the file contains RANGEKEYDELs.
	// The RANGEKEYDELs may delete range keys in other levels. Skipping the
	// file's range key blocks may surface deleted range keys below. This is
	// done here, rather than deferring to the block-property collector in order
	// to maintain parity with point keys and the treatment of RANGEDELs.
	if v.reader.Properties.NumRangeKeyDels == 0 {
		ok, _, err = c.checkAndIntersectFilters(v, nil, opts.RangeKeyFilters, nil)
	}
	if err != nil {
		c.unrefValue(v)
		return nil, err
	}
	if !ok {
		c.unrefValue(v)
		// Return the empty iterator. This iterator has no mutable state, so
		// using a singleton is fine.
		return emptyKeyspanIter, err
	}

	var iter keyspan.FragmentIterator
	if file.Virtual {
		provider := dbOpts.objProvider
		var objMeta objstorage.ObjectMetadata
		objMeta, err = provider.Lookup(fileTypeTable, file.FileBacking.DiskFileNum)
		if err == nil {
			virtualReader := sstable.MakeVirtualReader(
				v.reader, file.VirtualMeta(), provider.IsSharedForeign(objMeta),
			)
			iter, err = virtualReader.NewRawRangeKeyIter()
		}
	} else {
		iter, err = v.reader.NewRawRangeKeyIter()
	}

	// iter is a block iter that holds the entire value of the block in memory.
	// No need to hold onto a ref of the cache value.
	c.unrefValue(v)

	if err != nil {
		return nil, err
	}

	if iter == nil {
		// NewRawRangeKeyIter can return nil even if there's no error. However,
		// the keyspan.LevelIter expects a non-nil iterator if err is nil.
		return emptyKeyspanIter, nil
	}

	return iter, nil
}

type tableCacheShardReaderProvider struct {
	c      *tableCacheShard
	file   *manifest.FileMetadata
	dbOpts *tableCacheOpts
	v      *tableCacheValue
}

var _ sstable.ReaderProvider = &tableCacheShardReaderProvider{}

// GetReader implements sstable.ReaderProvider. Note that it is not the
// responsibility of tableCacheShardReaderProvider to ensure that the file
// continues to exist. The ReaderProvider is used in iterators where the
// top-level iterator is pinning the read state and preventing the files from
// being deleted.
//
// The caller must call tableCacheShardReaderProvider.Close.
//
// Note that currently the Reader returned here is only used to read value
// blocks. This reader shouldn't be used for other purposes like reading keys
// outside of virtual sstable bounds.
//
// TODO(bananabrick): We could return a wrapper over the Reader to ensure
// that the reader isn't used for other purposes.
func (rp *tableCacheShardReaderProvider) GetReader() (*sstable.Reader, error) {
	// Calling findNode gives us the responsibility of decrementing v's
	// refCount.
	v := rp.c.findNode(rp.file, rp.dbOpts)
	if v.err != nil {
		defer rp.c.unrefValue(v)
		return nil, v.err
	}
	rp.v = v
	return v.reader, nil
}

// Close implements sstable.ReaderProvider.
func (rp *tableCacheShardReaderProvider) Close() {
	rp.c.unrefValue(rp.v)
	rp.v = nil
}

// getTableProperties returns the sstable properties for the target file.
func (c *tableCacheShard) getTableProperties(
	file *fileMetadata, dbOpts *tableCacheOpts,
) (*sstable.Properties, error) {
	// Calling findNode gives us the responsibility of decrementing v's refCount here.
	v := c.findNode(file, dbOpts)
	defer c.unrefValue(v)

	if v.err != nil {
		return nil, v.err
	}
	return &v.reader.Properties, nil
}

// releaseNode releases a node from the tableCacheShard.
//
// c.mu must be held when calling this.
func (c *tableCacheShard) releaseNode(n *tableCacheNode) {
	c.unlinkNode(n)
	c.clearNode(n)
}

// unlinkNode removes a node from the tableCacheShard, leaving the shard
// reference in place.
//
// c.mu must be held when calling this.
func (c *tableCacheShard) unlinkNode(n *tableCacheNode) {
	key := tableCacheKey{n.cacheID, n.fileNum}
	delete(c.mu.nodes, key)

	switch n.ptype {
	case tableCacheNodeHot:
		c.mu.sizeHot--
	case tableCacheNodeCold:
		c.mu.sizeCold--
	case tableCacheNodeTest:
		c.mu.sizeTest--
	}

	if n == c.mu.handHot {
		c.mu.handHot = c.mu.handHot.prev()
	}
	if n == c.mu.handCold {
		c.mu.handCold = c.mu.handCold.prev()
	}
	if n == c.mu.handTest {
		c.mu.handTest = c.mu.handTest.prev()
	}

	if n.unlink() == n {
		// This was the last entry in the cache.
		c.mu.handHot = nil
		c.mu.handCold = nil
		c.mu.handTest = nil
	}

	n.links.prev = nil
	n.links.next = nil
}

func (c *tableCacheShard) clearNode(n *tableCacheNode) {
	if v := n.value; v != nil {
		n.value = nil
		c.unrefValue(v)
	}
}

// unrefValue decrements the reference count for the specified value, releasing
// it if the reference count fell to 0. Note that the value has a reference if
// it is present in tableCacheShard.mu.nodes, so a reference count of 0 means
// the node has already been removed from that map.
func (c *tableCacheShard) unrefValue(v *tableCacheValue) {
	if v.refCount.Add(-1) == 0 {
		c.releasing.Add(1)
		c.releasingCh <- v
	}
}

// findNode returns the node for the table with the given file number, creating
// that node if it didn't already exist. The caller is responsible for
// decrementing the returned node's refCount.
func (c *tableCacheShard) findNode(meta *fileMetadata, dbOpts *tableCacheOpts) *tableCacheValue {
	v := c.findNodeInternal(meta, dbOpts)

	// Loading a file before its global sequence number is known (eg,
	// during ingest before entering the commit pipeline) can pollute
	// the cache with incorrect state. In invariant builds, verify
	// that the global sequence number of the returned reader matches.
	if invariants.Enabled {
		if v.reader != nil && meta.LargestSeqNum == meta.SmallestSeqNum &&
			v.reader.Properties.GlobalSeqNum != meta.SmallestSeqNum {
			panic(errors.AssertionFailedf("file %s loaded from table cache with the wrong global sequence number %d",
				meta, v.reader.Properties.GlobalSeqNum))
		}
	}
	return v
}

func (c *tableCacheShard) findNodeInternal(
	meta *fileMetadata, dbOpts *tableCacheOpts,
) *tableCacheValue {
	if refs := meta.Refs(); refs <= 0 {
		panic(errors.AssertionFailedf("attempting to load file %s with refs=%d from table cache",
			meta, refs))
	}
	// Fast-path for a hit in the cache.
	c.mu.RLock()
	key := tableCacheKey{dbOpts.cacheID, meta.FileBacking.DiskFileNum}
	if n := c.mu.nodes[key]; n != nil && n.value != nil {
		// Fast-path hit.
		//
		// The caller is responsible for decrementing the refCount.
		v := n.value
		v.refCount.Add(1)
		c.mu.RUnlock()
		n.referenced.Store(true)
		c.hits.Add(1)
		<-v.loaded
		return v
	}
	c.mu.RUnlock()

	c.mu.Lock()

	n := c.mu.nodes[key]
	switch {
	case n == nil:
		// Slow-path miss of a non-existent node.
		n = &tableCacheNode{
			fileNum: meta.FileBacking.DiskFileNum,
			ptype:   tableCacheNodeCold,
		}
		c.addNode(n, dbOpts)
		c.mu.sizeCold++

	case n.value != nil:
		// Slow-path hit of a hot or cold node.
		//
		// The caller is responsible for decrementing the refCount.
		v := n.value
		v.refCount.Add(1)
		n.referenced.Store(true)
		c.hits.Add(1)
		c.mu.Unlock()
		<-v.loaded
		return v

	default:
		// Slow-path miss of a test node.
		c.unlinkNode(n)
		c.mu.coldTarget++
		if c.mu.coldTarget > c.size {
			c.mu.coldTarget = c.size
		}

		n.referenced.Store(false)
		n.ptype = tableCacheNodeHot
		c.addNode(n, dbOpts)
		c.mu.sizeHot++
	}

	c.misses.Add(1)

	v := &tableCacheValue{
		loaded: make(chan struct{}),
	}
	v.refCount.Store(2)
	// Cache the closure invoked when an iterator is closed. This avoids an
	// allocation on every call to newIters.
	v.closeHook = func(i sstable.Iterator) error {
		if invariants.RaceEnabled {
			c.mu.Lock()
			delete(c.mu.iters, i)
			c.mu.Unlock()
		}
		c.unrefValue(v)
		c.iterCount.Add(-1)
		dbOpts.iterCount.Add(-1)
		return nil
	}
	n.value = v

	c.mu.Unlock()

	// Note adding to the cache lists must complete before we begin loading the
	// table, as a failure during load will result in the node being unlinked.
	pprof.Do(context.Background(), tableCacheLabels, func(context.Context) {
		v.load(
			loadInfo{
				backingFileNum: meta.FileBacking.DiskFileNum,
				smallestSeqNum: meta.SmallestSeqNum,
				largestSeqNum:  meta.LargestSeqNum,
			}, c, dbOpts)
	})
	return v
}
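
// A note on the reference counts managed above (a summary of the code, not
// additional behavior): findNodeInternal stores an initial refCount of 2
// because two owners exist as soon as the value is published, the cache map
// entry itself (dropped via clearNode when the node is evicted or the shard is
// closed) and the caller of findNode (dropped via unrefValue, typically from
// the iterator's closeHook). Concurrent lookups that hit the node take their
// own reference and then block on <-v.loaded, so they only observe the reader
// (or the load error) once load has completed.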

func (c *tableCacheShard) addNode(n *tableCacheNode, dbOpts *tableCacheOpts) {
	c.evictNodes()
	n.cacheID = dbOpts.cacheID
	key := tableCacheKey{n.cacheID, n.fileNum}
	c.mu.nodes[key] = n

	n.links.next = n
	n.links.prev = n
	if c.mu.handHot == nil {
		// First element.
		c.mu.handHot = n
		c.mu.handCold = n
		c.mu.handTest = n
	} else {
		c.mu.handHot.link(n)
	}

	if c.mu.handCold == c.mu.handHot {
		c.mu.handCold = c.mu.handCold.prev()
	}
}

func (c *tableCacheShard) evictNodes() {
	for c.size <= c.mu.sizeHot+c.mu.sizeCold && c.mu.handCold != nil {
		c.runHandCold()
	}
}

func (c *tableCacheShard) runHandCold() {
	n := c.mu.handCold
	if n.ptype == tableCacheNodeCold {
		if n.referenced.Load() {
			n.referenced.Store(false)
			n.ptype = tableCacheNodeHot
			c.mu.sizeCold--
			c.mu.sizeHot++
		} else {
			c.clearNode(n)
			n.ptype = tableCacheNodeTest
			c.mu.sizeCold--
			c.mu.sizeTest++
			for c.size < c.mu.sizeTest && c.mu.handTest != nil {
				c.runHandTest()
			}
		}
	}

	c.mu.handCold = c.mu.handCold.next()

	for c.size-c.mu.coldTarget <= c.mu.sizeHot && c.mu.handHot != nil {
		c.runHandHot()
	}
}

func (c *tableCacheShard) runHandHot() {
	if c.mu.handHot == c.mu.handTest && c.mu.handTest != nil {
		c.runHandTest()
		if c.mu.handHot == nil {
			return
		}
	}

	n := c.mu.handHot
	if n.ptype == tableCacheNodeHot {
		if n.referenced.Load() {
			n.referenced.Store(false)
		} else {
			n.ptype = tableCacheNodeCold
			c.mu.sizeHot--
			c.mu.sizeCold++
		}
	}

	c.mu.handHot = c.mu.handHot.next()
}

func (c *tableCacheShard) runHandTest() {
	if c.mu.sizeCold > 0 && c.mu.handTest == c.mu.handCold && c.mu.handCold != nil {
		c.runHandCold()
		if c.mu.handTest == nil {
			return
		}
	}

	n := c.mu.handTest
	if n.ptype == tableCacheNodeTest {
		c.mu.coldTarget--
		if c.mu.coldTarget < 0 {
			c.mu.coldTarget = 0
		}
		c.unlinkNode(n)
		c.clearNode(n)
	}

	c.mu.handTest = c.mu.handTest.next()
}
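
// The three clock hands above implement a CLOCK-Pro-style replacement policy
// over the shard's node ring (a summary of the code, not additional behavior):
//   - handCold promotes referenced cold nodes to hot and demotes unreferenced
//     ones to non-resident test nodes, releasing their readers via clearNode.
//   - handHot clears the referenced bit on hot nodes and demotes unreferenced
//     hot nodes back to cold.
//   - handTest retires test nodes entirely and shrinks coldTarget, while a
//     miss on a test node in findNodeInternal grows coldTarget, adaptively
//     rebalancing capacity between the cold and hot sets.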

func (c *tableCacheShard) evict(fileNum base.DiskFileNum, dbOpts *tableCacheOpts, allowLeak bool) {
	c.mu.Lock()
	key := tableCacheKey{dbOpts.cacheID, fileNum}
	n := c.mu.nodes[key]
	var v *tableCacheValue
	if n != nil {
		// NB: This is equivalent to tableCacheShard.releaseNode(), but we perform
		// the tableCacheNode.release() call synchronously below to ensure the
		// sstable file descriptor is closed before returning. Note that
		// tableCacheShard.releasing needs to be incremented while holding
		// tableCacheShard.mu in order to avoid a race with Close().
		c.unlinkNode(n)
		v = n.value
		if v != nil {
			if !allowLeak {
				if t := v.refCount.Add(-1); t != 0 {
					dbOpts.loggerAndTracer.Fatalf("sstable %s: refcount is not zero: %d\n%s", fileNum, t, debug.Stack())
				}
			}
			c.releasing.Add(1)
		}
	}

	c.mu.Unlock()

	if v != nil {
		v.release(c)
	}

	dbOpts.opts.Cache.EvictFile(dbOpts.cacheID, fileNum)
}

// removeDB evicts any nodes which have a reference to the DB
// associated with dbOpts.cacheID. Make sure that there will
// be no more accesses to the files associated with the DB.
func (c *tableCacheShard) removeDB(dbOpts *tableCacheOpts) {
	var fileNums []base.DiskFileNum

	c.mu.RLock()
	// Collect the fileNums which need to be cleaned.
	var firstNode *tableCacheNode
	node := c.mu.handHot
	for node != firstNode {
		if firstNode == nil {
			firstNode = node
		}

		if node.cacheID == dbOpts.cacheID {
			fileNums = append(fileNums, node.fileNum)
		}
		node = node.next()
	}
	c.mu.RUnlock()

	// Evict all the nodes associated with the DB.
	// This should synchronously close all the files
	// associated with the DB.
	for _, fileNum := range fileNums {
		c.evict(fileNum, dbOpts, true)
	}
}

func (c *tableCacheShard) Close() error {
	c.mu.Lock()
	defer c.mu.Unlock()

	// Check for leaked iterators. Note that we'll still perform cleanup below in
	// the case that there are leaked iterators.
	var err error
	if v := c.iterCount.Load(); v > 0 {
		if !invariants.RaceEnabled {
			err = errors.Errorf("leaked iterators: %d", errors.Safe(v))
		} else {
			var buf bytes.Buffer
			for _, stack := range c.mu.iters {
				fmt.Fprintf(&buf, "%s\n", stack)
			}
			err = errors.Errorf("leaked iterators: %d\n%s", errors.Safe(v), buf.String())
		}
	}

	for c.mu.handHot != nil {
		n := c.mu.handHot
		if n.value != nil {
			if n.value.refCount.Add(-1) == 0 {
				c.releasing.Add(1)
				c.releasingCh <- n.value
			}
		}
		c.unlinkNode(n)
	}
	c.mu.nodes = nil
	c.mu.handHot = nil
	c.mu.handCold = nil
	c.mu.handTest = nil

	// Only shut down the releasing goroutine if there were no leaked
	// iterators. If there were leaked iterators, we leave the goroutine running
	// and the releasingCh open so that a subsequent iterator close can
	// complete. This behavior is used by iterator leak tests. Leaking the
	// goroutine for these tests is less bad than not closing the iterator,
	// which triggers other warnings about block cache handles not being
	// released.
	if err != nil {
		c.releasing.Wait()
		return err
	}

	close(c.releasingCh)
	c.releasing.Wait()
	c.releaseLoopExit.Wait()
	return err
}

type tableCacheValue struct {
	closeHook func(i sstable.Iterator) error
	reader    *sstable.Reader
	err       error
	loaded    chan struct{}
	// Reference count for the value. The reader is closed when the reference
	// count drops to zero.
	refCount atomic.Int32
}

type loadInfo struct {
	backingFileNum base.DiskFileNum
	largestSeqNum  uint64
	smallestSeqNum uint64
}

func (v *tableCacheValue) load(loadInfo loadInfo, c *tableCacheShard, dbOpts *tableCacheOpts) {
	// Try opening the file first.
	var f objstorage.Readable
	var err error
	f, err = dbOpts.objProvider.OpenForReading(
		context.TODO(), fileTypeTable, loadInfo.backingFileNum, objstorage.OpenOptions{MustExist: true},
	)
	if err == nil {
		cacheOpts := private.SSTableCacheOpts(dbOpts.cacheID, loadInfo.backingFileNum).(sstable.ReaderOption)
		v.reader, err = sstable.NewReader(f, dbOpts.opts, cacheOpts, dbOpts.filterMetrics)
	}
	if err != nil {
		v.err = errors.Wrapf(
			err, "pebble: backing file %s error", errors.Safe(loadInfo.backingFileNum.FileNum()))
	}
	if v.err == nil && loadInfo.smallestSeqNum == loadInfo.largestSeqNum {
		v.reader.Properties.GlobalSeqNum = loadInfo.largestSeqNum
	}
	if v.err != nil {
		c.mu.Lock()
		defer c.mu.Unlock()
		// Lookup the node in the cache again as it might have already been
		// removed.
		key := tableCacheKey{dbOpts.cacheID, loadInfo.backingFileNum}
		n := c.mu.nodes[key]
		if n != nil && n.value == v {
			c.releaseNode(n)
		}
	}
	close(v.loaded)
}

func (v *tableCacheValue) release(c *tableCacheShard) {
	<-v.loaded
	// Nothing to be done about an error at this point. Close the reader if it is
	// open.
	if v.reader != nil {
		_ = v.reader.Close()
	}
	c.releasing.Done()
}

type tableCacheNodeType int8

const (
	tableCacheNodeTest tableCacheNodeType = iota
	tableCacheNodeCold
	tableCacheNodeHot
)

func (p tableCacheNodeType) String() string {
	switch p {
	case tableCacheNodeTest:
		return "test"
	case tableCacheNodeCold:
		return "cold"
	case tableCacheNodeHot:
		return "hot"
	}
	return "unknown"
}

type tableCacheNode struct {
	fileNum base.DiskFileNum
	value   *tableCacheValue

	links struct {
		next *tableCacheNode
		prev *tableCacheNode
	}
	ptype tableCacheNodeType
	// referenced is atomically set to indicate that this entry has been accessed
	// since the last time one of the clock hands swept it.
	referenced atomic.Bool

	// Storing the cache id associated with the DB instance here
	// avoids the need to thread the dbOpts struct through many functions.
	cacheID uint64
}

func (n *tableCacheNode) next() *tableCacheNode {
	if n == nil {
		return nil
	}
	return n.links.next
}

func (n *tableCacheNode) prev() *tableCacheNode {
	if n == nil {
		return nil
	}
	return n.links.prev
}

func (n *tableCacheNode) link(s *tableCacheNode) {
	s.links.prev = n.links.prev
	s.links.prev.links.next = s
	s.links.next = n
	s.links.next.links.prev = s
}

func (n *tableCacheNode) unlink() *tableCacheNode {
	next := n.links.next
	n.links.prev.links.next = n.links.next
	n.links.next.links.prev = n.links.prev
	n.links.prev = n
	n.links.next = n
	return next
}
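
// exampleNodeRing is an illustrative sketch (the function name is made up and
// nothing in the package calls it) of the ring invariants that link and unlink
// maintain: nodes form a circular doubly-linked list, link inserts its argument
// immediately before the receiver, and unlink leaves the removed node
// self-linked while returning the next node so callers can keep sweeping.
func exampleNodeRing() {
	a := &tableCacheNode{}
	a.links.next, a.links.prev = a, a // a single node links to itself

	// link fully rewrites b's pointers, so b needs no prior initialization.
	b := &tableCacheNode{}
	a.link(b) // ring is now a <-> b

	next := b.unlink() // removes b; a points at itself again
	_ = next           // next == a, useful when unlinking during a sweep
}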