github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/table_cache.go

// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package pebble

import (
	"bytes"
	"context"
	"fmt"
	"io"
	"runtime/debug"
	"runtime/pprof"
	"sync"
	"sync/atomic"
	"unsafe"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/invariants"
	"github.com/cockroachdb/pebble/internal/keyspan"
	"github.com/cockroachdb/pebble/internal/manifest"
	"github.com/cockroachdb/pebble/internal/private"
	"github.com/cockroachdb/pebble/objstorage"
	"github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing"
	"github.com/cockroachdb/pebble/sstable"
)

var emptyIter = &errorIter{err: nil}
var emptyKeyspanIter = &errorKeyspanIter{err: nil}

// filteredAll is a singleton internalIterator implementation used when an
// sstable does contain point keys, but all the keys are filtered by the active
// PointKeyFilters set in the iterator's IterOptions.
//
// filteredAll implements filteredIter, ensuring the level iterator recognizes
// when it may need to return file boundaries to keep the rangeDelIter open
// during mergingIter operation.
var filteredAll = &filteredAllKeysIter{errorIter: errorIter{err: nil}}

var _ filteredIter = filteredAll

type filteredAllKeysIter struct {
	errorIter
}

func (s *filteredAllKeysIter) MaybeFilteredKeys() bool {
	return true
}

var tableCacheLabels = pprof.Labels("pebble", "table-cache")

// tableCacheOpts contains the db specific fields
// of a table cache. This is stored in the tableCacheContainer
// along with the table cache.
// NB: It is important to make sure that the fields in this
// struct are read-only. Since the fields here are shared
// by every single tableCacheShard, if non read-only fields
// are updated, we could have unnecessary evictions of those
// fields, and the surrounding fields from the CPU caches.
type tableCacheOpts struct {
	// iterCount keeps track of how many iterators are open. It is used to keep
	// track of leaked iterators on a per-db level.
	iterCount *atomic.Int32

	loggerAndTracer   LoggerAndTracer
	cacheID           uint64
	objProvider       objstorage.Provider
	opts              sstable.ReaderOptions
	filterMetrics     *sstable.FilterMetricsTracker
	sstStatsCollector *sstable.CategoryStatsCollector
}

// tableCacheContainer contains the table cache and
// fields which are unique to the DB.
type tableCacheContainer struct {
	tableCache *TableCache

	// dbOpts contains fields relevant to the table cache
	// which are unique to each DB.
	dbOpts tableCacheOpts
}

// newTableCacheContainer will panic if the underlying cache in the table cache
// doesn't match Options.Cache.
func newTableCacheContainer(
	tc *TableCache,
	cacheID uint64,
	objProvider objstorage.Provider,
	opts *Options,
	size int,
	sstStatsCollector *sstable.CategoryStatsCollector,
) *tableCacheContainer {
	// We will release a ref to table cache acquired here when tableCacheContainer.close is called.
	if tc != nil {
		if tc.cache != opts.Cache {
			panic("pebble: underlying cache for the table cache and db are different")
		}
		tc.Ref()
	} else {
		// NewTableCache should create a ref to tc which the container should
		// drop whenever it is closed.
		tc = NewTableCache(opts.Cache, opts.Experimental.TableCacheShards, size)
	}

	t := &tableCacheContainer{}
	t.tableCache = tc
	t.dbOpts.loggerAndTracer = opts.LoggerAndTracer
	t.dbOpts.cacheID = cacheID
	t.dbOpts.objProvider = objProvider
	t.dbOpts.opts = opts.MakeReaderOptions()
	t.dbOpts.filterMetrics = &sstable.FilterMetricsTracker{}
	t.dbOpts.iterCount = new(atomic.Int32)
	t.dbOpts.sstStatsCollector = sstStatsCollector
	return t
}

// Before calling close, make sure that there will be no further need
// to access any of the files associated with the store.
func (c *tableCacheContainer) close() error {
	// We want to do some cleanup work here. Check for leaked iterators
	// by the DB using this container. Note that we'll still perform cleanup
	// below in the case that there are leaked iterators.
	var err error
	if v := c.dbOpts.iterCount.Load(); v > 0 {
		err = errors.Errorf("leaked iterators: %d", errors.Safe(v))
	}

	// Release nodes here.
	for _, shard := range c.tableCache.shards {
		if shard != nil {
			shard.removeDB(&c.dbOpts)
		}
	}
	return firstError(err, c.tableCache.Unref())
}

func (c *tableCacheContainer) newIters(
	ctx context.Context,
	file *manifest.FileMetadata,
	opts *IterOptions,
	internalOpts internalIterOpts,
) (internalIterator, keyspan.FragmentIterator, error) {
	return c.tableCache.getShard(file.FileBacking.DiskFileNum).newIters(ctx, file, opts, internalOpts, &c.dbOpts)
}

func (c *tableCacheContainer) newRangeKeyIter(
	file *manifest.FileMetadata, opts keyspan.SpanIterOptions,
) (keyspan.FragmentIterator, error) {
	return c.tableCache.getShard(file.FileBacking.DiskFileNum).newRangeKeyIter(file, opts, &c.dbOpts)
}

// getTableProperties returns the properties associated with the backing physical
// table if the input metadata belongs to a virtual sstable.
func (c *tableCacheContainer) getTableProperties(file *fileMetadata) (*sstable.Properties, error) {
	return c.tableCache.getShard(file.FileBacking.DiskFileNum).getTableProperties(file, &c.dbOpts)
}

func (c *tableCacheContainer) evict(fileNum base.DiskFileNum) {
	c.tableCache.getShard(fileNum).evict(fileNum, &c.dbOpts, false)
}

func (c *tableCacheContainer) metrics() (CacheMetrics, FilterMetrics) {
	var m CacheMetrics
	for i := range c.tableCache.shards {
		s := c.tableCache.shards[i]
		s.mu.RLock()
		m.Count += int64(len(s.mu.nodes))
		s.mu.RUnlock()
		m.Hits += s.hits.Load()
		m.Misses += s.misses.Load()
	}
	m.Size = m.Count * int64(unsafe.Sizeof(sstable.Reader{}))
	f := c.dbOpts.filterMetrics.Load()
	return m, f
}

func (c *tableCacheContainer) estimateSize(
	meta *fileMetadata, lower, upper []byte,
) (size uint64, err error) {
	if meta.Virtual {
		err = c.withVirtualReader(
			meta.VirtualMeta(),
			func(r sstable.VirtualReader) (err error) {
				size, err = r.EstimateDiskUsage(lower, upper)
				return err
			},
		)
	} else {
		err = c.withReader(
			meta.PhysicalMeta(),
			func(r *sstable.Reader) (err error) {
				size, err = r.EstimateDiskUsage(lower, upper)
				return err
			},
		)
	}
	if err != nil {
		return 0, err
	}
	return size, nil
}

// createCommonReader creates a Reader for this file. isForeign, if true for
// virtual sstables, is passed into the vSSTable reader so its iterators can
// collapse obsolete points accordingly.
func createCommonReader(
	v *tableCacheValue, file *fileMetadata, isForeign bool,
) sstable.CommonReader {
	// TODO(bananabrick): We suffer an allocation if file is a virtual sstable.
	var cr sstable.CommonReader = v.reader
	if file.Virtual {
		virtualReader := sstable.MakeVirtualReader(
			v.reader, file.VirtualMeta(), isForeign,
		)
		cr = &virtualReader
	}
	return cr
}

func (c *tableCacheContainer) withCommonReader(
	meta *fileMetadata, fn func(sstable.CommonReader) error,
) error {
	s := c.tableCache.getShard(meta.FileBacking.DiskFileNum)
	v := s.findNode(meta, &c.dbOpts)
	defer s.unrefValue(v)
	if v.err != nil {
		return v.err
	}
	provider := c.dbOpts.objProvider
	objMeta, err := provider.Lookup(fileTypeTable, meta.FileBacking.DiskFileNum)
	if err != nil {
		return err
	}
	return fn(createCommonReader(v, meta, provider.IsSharedForeign(objMeta)))
}

func (c *tableCacheContainer) withReader(meta physicalMeta, fn func(*sstable.Reader) error) error {
	s := c.tableCache.getShard(meta.FileBacking.DiskFileNum)
	v := s.findNode(meta.FileMetadata, &c.dbOpts)
	defer s.unrefValue(v)
	if v.err != nil {
		return v.err
	}
	return fn(v.reader)
}

// withVirtualReader fetches a VirtualReader associated with a virtual sstable.
func (c *tableCacheContainer) withVirtualReader(
	meta virtualMeta, fn func(sstable.VirtualReader) error,
) error {
	s := c.tableCache.getShard(meta.FileBacking.DiskFileNum)
	v := s.findNode(meta.FileMetadata, &c.dbOpts)
	defer s.unrefValue(v)
	if v.err != nil {
		return v.err
	}
	provider := c.dbOpts.objProvider
	objMeta, err := provider.Lookup(fileTypeTable, meta.FileBacking.DiskFileNum)
	if err != nil {
		return err
	}
	return fn(sstable.MakeVirtualReader(v.reader, meta, provider.IsSharedForeign(objMeta)))
}

func (c *tableCacheContainer) iterCount() int64 {
	return int64(c.dbOpts.iterCount.Load())
}

// TableCache is a shareable cache for open sstables.
type TableCache struct {
	refs atomic.Int64

	cache  *Cache
	shards []*tableCacheShard
}

// Ref adds a reference to the table cache. Once tableCache.init returns,
// the table cache only remains valid if there is at least one reference
// to it.
func (c *TableCache) Ref() {
	v := c.refs.Add(1)
	// We don't want the reference count to ever go from 0 -> 1,
	// because a reference count of 0 implies that we've closed the cache.
	if v <= 1 {
		panic(fmt.Sprintf("pebble: inconsistent reference count: %d", v))
	}
}

// Unref removes a reference to the table cache.
func (c *TableCache) Unref() error {
	v := c.refs.Add(-1)
	switch {
	case v < 0:
		panic(fmt.Sprintf("pebble: inconsistent reference count: %d", v))
	case v == 0:
		var err error
		for i := range c.shards {
			// The cache shard is not allocated yet, nothing to close.
			if c.shards[i] == nil {
				continue
			}
			err = firstError(err, c.shards[i].Close())
		}

		// Unref the cache which we create a reference to when the tableCache
		// is first instantiated.
		c.cache.Unref()
		return err
	}
	return nil
}

// NewTableCache will create a reference to the table cache. It is the caller's
// responsibility to call tableCache.Unref if they will no longer hold a
// reference to the table cache.
func NewTableCache(cache *Cache, numShards int, size int) *TableCache {
	if size == 0 {
		panic("pebble: cannot create a table cache of size 0")
	} else if numShards == 0 {
		panic("pebble: cannot create a table cache with 0 shards")
	}

	c := &TableCache{}
	c.cache = cache
	c.cache.Ref()

	c.shards = make([]*tableCacheShard, numShards)
	for i := range c.shards {
		c.shards[i] = &tableCacheShard{}
		c.shards[i].init(size / len(c.shards))
	}

	// Hold a ref to the cache here.
	c.refs.Store(1)

	return c
}
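// Illustrative sketch only (not part of this file): one TableCache can back
// multiple DBs, provided every DB uses the same block Cache the table cache
// was built with. The sketch assumes pebble's public Options.Cache and
// Options.TableCache fields; the sizes and directory names are made up:
//
//	cache := pebble.NewCache(128 << 20)
//	tc := pebble.NewTableCache(cache, 8 /* shards */, 1000 /* table capacity */)
//	db1, err := pebble.Open("db1", &pebble.Options{Cache: cache, TableCache: tc})
//	// ... handle err, use db1 ...
//	db2, err := pebble.Open("db2", &pebble.Options{Cache: cache, TableCache: tc})
//	// ... handle err, use db2 ...
//	// Drop this code's references; each open DB holds its own ref on tc and cache.
//	tc.Unref()
//	cache.Unref()
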
func (c *TableCache) getShard(fileNum base.DiskFileNum) *tableCacheShard {
	return c.shards[uint64(fileNum.FileNum())%uint64(len(c.shards))]
}

type tableCacheKey struct {
	cacheID uint64
	fileNum base.DiskFileNum
}

type tableCacheShard struct {
	hits      atomic.Int64
	misses    atomic.Int64
	iterCount atomic.Int32

	size int

	mu struct {
		sync.RWMutex
		nodes map[tableCacheKey]*tableCacheNode
		// The iters map is only created and populated in race builds.
		iters map[io.Closer][]byte

		handHot  *tableCacheNode
		handCold *tableCacheNode
		handTest *tableCacheNode

		coldTarget int
		sizeHot    int
		sizeCold   int
		sizeTest   int
	}
	releasing       sync.WaitGroup
	releasingCh     chan *tableCacheValue
	releaseLoopExit sync.WaitGroup
}

func (c *tableCacheShard) init(size int) {
	c.size = size

	c.mu.nodes = make(map[tableCacheKey]*tableCacheNode)
	c.mu.coldTarget = size
	c.releasingCh = make(chan *tableCacheValue, 100)
	c.releaseLoopExit.Add(1)
	go c.releaseLoop()

	if invariants.RaceEnabled {
		c.mu.iters = make(map[io.Closer][]byte)
	}
}

func (c *tableCacheShard) releaseLoop() {
	pprof.Do(context.Background(), tableCacheLabels, func(context.Context) {
		defer c.releaseLoopExit.Done()
		for v := range c.releasingCh {
			v.release(c)
		}
	})
}

// checkAndIntersectFilters checks the specific table and block property filters
// for intersection with any available table and block-level properties. Returns
// true for ok if this table should be read by this iterator.
func (c *tableCacheShard) checkAndIntersectFilters(
	v *tableCacheValue,
	tableFilter func(userProps map[string]string) bool,
	blockPropertyFilters []BlockPropertyFilter,
	boundLimitedFilter sstable.BoundLimitedBlockPropertyFilter,
) (ok bool, filterer *sstable.BlockPropertiesFilterer, err error) {
	if tableFilter != nil &&
		!tableFilter(v.reader.Properties.UserProperties) {
		return false, nil, nil
	}

	if boundLimitedFilter != nil || len(blockPropertyFilters) > 0 {
		filterer, err = sstable.IntersectsTable(
			blockPropertyFilters,
			boundLimitedFilter,
			v.reader.Properties.UserProperties,
		)
		// NB: IntersectsTable will return a nil filterer if the table-level
		// properties indicate there's no intersection with the provided filters.
		if filterer == nil || err != nil {
			return false, nil, err
		}
	}
	return true, filterer, nil
}

func (c *tableCacheShard) newIters(
	ctx context.Context,
	file *manifest.FileMetadata,
	opts *IterOptions,
	internalOpts internalIterOpts,
	dbOpts *tableCacheOpts,
) (internalIterator, keyspan.FragmentIterator, error) {
	// TODO(sumeer): constructing the Reader should also use a plumbed context,
	// since parts of the sstable are read during the construction. The Reader
	// should not remember that context since the Reader can be long-lived.

	// Calling findNode gives us the responsibility of decrementing v's
	// refCount. If opening the underlying table resulted in error, then we
	// decrement this straight away. Otherwise, we pass that responsibility to
	// the sstable iterator, which decrements when it is closed.
	v := c.findNode(file, dbOpts)
	if v.err != nil {
		defer c.unrefValue(v)
		return nil, nil, v.err
	}

	hideObsoletePoints := false
	var pointKeyFilters []BlockPropertyFilter
	if opts != nil {
		// This code is appending (at most one filter) in-place to
		// opts.PointKeyFilters even though the slice is shared for iterators in
		// the same iterator tree. This is acceptable since all the following
		// properties are true:
		// - The iterator tree is single threaded, so the shared backing for the
		//   slice is being mutated in a single threaded manner.
		// - Each shallow copy of the slice has its own notion of length.
		// - The appended element is always the obsoleteKeyBlockPropertyFilter
		//   struct, which is stateless, so overwriting that struct when creating
		//   one sstable iterator is harmless to other sstable iterators that are
		//   relying on that struct.
		//
		// An alternative would be to have different slices for different sstable
		// iterators, but that requires more work to avoid allocations.
		hideObsoletePoints, pointKeyFilters =
			v.reader.TryAddBlockPropertyFilterForHideObsoletePoints(
				opts.snapshotForHideObsoletePoints, file.LargestSeqNum, opts.PointKeyFilters)
	}
	ok := true
	var filterer *sstable.BlockPropertiesFilterer
	var err error
	if opts != nil {
		ok, filterer, err = c.checkAndIntersectFilters(v, opts.TableFilter,
			pointKeyFilters, internalOpts.boundLimitedFilter)
	}
	if err != nil {
		c.unrefValue(v)
		return nil, nil, err
	}

	provider := dbOpts.objProvider
	// Check if this file is a foreign file.
	objMeta, err := provider.Lookup(fileTypeTable, file.FileBacking.DiskFileNum)
	if err != nil {
		return nil, nil, err
	}

	// Note: This suffers an allocation for virtual sstables.
	cr := createCommonReader(v, file, provider.IsSharedForeign(objMeta))

	// NB: range-del iterator does not maintain a reference to the table, nor
	// does it need to read from it after creation.
	rangeDelIter, err := cr.NewRawRangeDelIter()
	if err != nil {
		c.unrefValue(v)
		return nil, nil, err
	}

	if !ok {
		c.unrefValue(v)
		// Return an empty iterator. This iterator has no mutable state, so
		// using a singleton is fine.
		// NB: We still return the potentially non-empty rangeDelIter. This
		// ensures the iterator observes the file's range deletions even if the
		// block property filters exclude all the file's point keys. The range
		// deletions may still delete keys lower in the LSM in files that DO
		// match the active filters.
		//
		// The point iterator returned must implement the filteredIter
		// interface, so that the level iterator surfaces file boundaries when
		// range deletions are present.
		return filteredAll, rangeDelIter, err
	}

	var iter sstable.Iterator
	useFilter := true
	if opts != nil {
		useFilter = manifest.LevelToInt(opts.level) != 6 || opts.UseL6Filters
		ctx = objiotracing.WithLevel(ctx, manifest.LevelToInt(opts.level))
	}
	tableFormat, err := v.reader.TableFormat()
	if err != nil {
		return nil, nil, err
	}
	var rp sstable.ReaderProvider
	if tableFormat >= sstable.TableFormatPebblev3 && v.reader.Properties.NumValueBlocks > 0 {
		rp = &tableCacheShardReaderProvider{c: c, file: file, dbOpts: dbOpts}
	}

	if provider.IsSharedForeign(objMeta) {
		if tableFormat < sstable.TableFormatPebblev4 {
			return nil, nil, errors.New("pebble: shared foreign sstable has a lower table format than expected")
		}
		hideObsoletePoints = true
	}
	var categoryAndQoS sstable.CategoryAndQoS
	if opts != nil {
		categoryAndQoS = opts.CategoryAndQoS
	}
	if internalOpts.bytesIterated != nil {
		iter, err = cr.NewCompactionIter(
			internalOpts.bytesIterated, categoryAndQoS, dbOpts.sstStatsCollector, rp,
			internalOpts.bufferPool)
	} else {
		iter, err = cr.NewIterWithBlockPropertyFiltersAndContextEtc(
			ctx, opts.GetLowerBound(), opts.GetUpperBound(), filterer, hideObsoletePoints, useFilter,
			internalOpts.stats, categoryAndQoS, dbOpts.sstStatsCollector, rp)
	}
	if err != nil {
		if rangeDelIter != nil {
			_ = rangeDelIter.Close()
		}
		c.unrefValue(v)
		return nil, nil, err
	}
	// NB: v.closeHook takes responsibility for calling unrefValue(v) here. Take
	// care to avoid introducing an allocation here by adding a closure.
	iter.SetCloseHook(v.closeHook)

	c.iterCount.Add(1)
	dbOpts.iterCount.Add(1)
	if invariants.RaceEnabled {
		c.mu.Lock()
		c.mu.iters[iter] = debug.Stack()
		c.mu.Unlock()
	}
	return iter, rangeDelIter, nil
}

func (c *tableCacheShard) newRangeKeyIter(
	file *manifest.FileMetadata, opts keyspan.SpanIterOptions, dbOpts *tableCacheOpts,
) (keyspan.FragmentIterator, error) {
	// Calling findNode gives us the responsibility of decrementing v's
	// refCount. If opening the underlying table resulted in error, then we
	// decrement this straight away. Otherwise, we pass that responsibility to
	// the sstable iterator, which decrements when it is closed.
	v := c.findNode(file, dbOpts)
	if v.err != nil {
		defer c.unrefValue(v)
		return nil, v.err
	}

	ok := true
	var err error
	// Don't filter a table's range keys if the file contains RANGEKEYDELs.
	// The RANGEKEYDELs may delete range keys in other levels. Skipping the
	// file's range key blocks may surface deleted range keys below. This is
	// done here, rather than deferring to the block-property collector in order
	// to maintain parity with point keys and the treatment of RANGEDELs.
	if v.reader.Properties.NumRangeKeyDels == 0 {
		ok, _, err = c.checkAndIntersectFilters(v, nil, opts.RangeKeyFilters, nil)
	}
	if err != nil {
		c.unrefValue(v)
		return nil, err
	}
	if !ok {
		c.unrefValue(v)
		// Return the empty iterator. This iterator has no mutable state, so
		// using a singleton is fine.
		return emptyKeyspanIter, err
	}

	var iter keyspan.FragmentIterator
	if file.Virtual {
		provider := dbOpts.objProvider
		var objMeta objstorage.ObjectMetadata
		objMeta, err = provider.Lookup(fileTypeTable, file.FileBacking.DiskFileNum)
		if err == nil {
			virtualReader := sstable.MakeVirtualReader(
				v.reader, file.VirtualMeta(), provider.IsSharedForeign(objMeta),
			)
			iter, err = virtualReader.NewRawRangeKeyIter()
		}
	} else {
		iter, err = v.reader.NewRawRangeKeyIter()
	}

	// iter is a block iter that holds the entire value of the block in memory.
	// No need to hold onto a ref of the cache value.
	c.unrefValue(v)

	if err != nil {
		return nil, err
	}

	if iter == nil {
		// NewRawRangeKeyIter can return nil even if there's no error. However,
		// the keyspan.LevelIter expects a non-nil iterator if err is nil.
		return emptyKeyspanIter, nil
	}

	return iter, nil
}

type tableCacheShardReaderProvider struct {
	c      *tableCacheShard
	file   *manifest.FileMetadata
	dbOpts *tableCacheOpts
	v      *tableCacheValue
}

var _ sstable.ReaderProvider = &tableCacheShardReaderProvider{}

// GetReader implements sstable.ReaderProvider. Note that it is not the
// responsibility of tableCacheShardReaderProvider to ensure that the file
// continues to exist. The ReaderProvider is used in iterators where the
// top-level iterator is pinning the read state and preventing the files from
// being deleted.
//
// The caller must call tableCacheShardReaderProvider.Close.
//
// Note that currently the Reader returned here is only used to read value
// blocks. This reader shouldn't be used for other purposes like reading keys
// outside of virtual sstable bounds.
//
// TODO(bananabrick): We could return a wrapper over the Reader to ensure
// that the reader isn't used for other purposes.
func (rp *tableCacheShardReaderProvider) GetReader() (*sstable.Reader, error) {
	// Calling findNode gives us the responsibility of decrementing v's
	// refCount.
	v := rp.c.findNode(rp.file, rp.dbOpts)
	if v.err != nil {
		defer rp.c.unrefValue(v)
		return nil, v.err
	}
	rp.v = v
	return v.reader, nil
}

// Close implements sstable.ReaderProvider.
func (rp *tableCacheShardReaderProvider) Close() {
	rp.c.unrefValue(rp.v)
	rp.v = nil
}

// getTableProperties returns the sstable properties for the target file.
func (c *tableCacheShard) getTableProperties(
	file *fileMetadata, dbOpts *tableCacheOpts,
) (*sstable.Properties, error) {
	// Calling findNode gives us the responsibility of decrementing v's refCount here.
	v := c.findNode(file, dbOpts)
	defer c.unrefValue(v)

	if v.err != nil {
		return nil, v.err
	}
	return &v.reader.Properties, nil
}

// releaseNode releases a node from the tableCacheShard.
//
// c.mu must be held when calling this.
func (c *tableCacheShard) releaseNode(n *tableCacheNode) {
	c.unlinkNode(n)
	c.clearNode(n)
}

// unlinkNode removes a node from the tableCacheShard, leaving the shard
// reference in place.
//
// c.mu must be held when calling this.
func (c *tableCacheShard) unlinkNode(n *tableCacheNode) {
	key := tableCacheKey{n.cacheID, n.fileNum}
	delete(c.mu.nodes, key)

	switch n.ptype {
	case tableCacheNodeHot:
		c.mu.sizeHot--
	case tableCacheNodeCold:
		c.mu.sizeCold--
	case tableCacheNodeTest:
		c.mu.sizeTest--
	}

	if n == c.mu.handHot {
		c.mu.handHot = c.mu.handHot.prev()
	}
	if n == c.mu.handCold {
		c.mu.handCold = c.mu.handCold.prev()
	}
	if n == c.mu.handTest {
		c.mu.handTest = c.mu.handTest.prev()
	}

	if n.unlink() == n {
		// This was the last entry in the cache.
		c.mu.handHot = nil
		c.mu.handCold = nil
		c.mu.handTest = nil
	}

	n.links.prev = nil
	n.links.next = nil
}

func (c *tableCacheShard) clearNode(n *tableCacheNode) {
	if v := n.value; v != nil {
		n.value = nil
		c.unrefValue(v)
	}
}

// unrefValue decrements the reference count for the specified value, releasing
// it if the reference count fell to 0. Note that the value has a reference if
// it is present in tableCacheShard.mu.nodes, so a reference count of 0 means
// the node has already been removed from that map.
func (c *tableCacheShard) unrefValue(v *tableCacheValue) {
	if v.refCount.Add(-1) == 0 {
		c.releasing.Add(1)
		c.releasingCh <- v
	}
}

// findNode returns the node for the table with the given file number, creating
// that node if it didn't already exist. The caller is responsible for
// decrementing the returned node's refCount.
func (c *tableCacheShard) findNode(meta *fileMetadata, dbOpts *tableCacheOpts) *tableCacheValue {
	v := c.findNodeInternal(meta, dbOpts)

	// Loading a file before its global sequence number is known (eg,
	// during ingest before entering the commit pipeline) can pollute
	// the cache with incorrect state. In invariant builds, verify
	// that the global sequence number of the returned reader matches.
	if invariants.Enabled {
		if v.reader != nil && meta.LargestSeqNum == meta.SmallestSeqNum &&
			v.reader.Properties.GlobalSeqNum != meta.SmallestSeqNum {
			panic(errors.AssertionFailedf("file %s loaded from table cache with the wrong global sequence number %d",
				meta, v.reader.Properties.GlobalSeqNum))
		}
	}
	return v
}
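// The machinery below (findNodeInternal, addNode, evictNodes, and the
// runHand* functions) implements a CLOCK-Pro-style replacement policy: nodes
// live on a single circular list and are classified as hot, cold, or test (a
// test node is a ghost entry whose value has been released but whose recency
// is still tracked). Three clock hands sweep the list: handCold evicts or
// promotes cold nodes, handHot demotes unreferenced hot nodes to cold, and
// handTest retires test nodes. coldTarget adapts the hot/cold split: a hit on
// a test node grows it, while retiring a test node shrinks it. This is a
// summary of the code below, not a formal specification of CLOCK-Pro.
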
func (c *tableCacheShard) findNodeInternal(
	meta *fileMetadata, dbOpts *tableCacheOpts,
) *tableCacheValue {
	if refs := meta.Refs(); refs <= 0 {
		panic(errors.AssertionFailedf("attempting to load file %s with refs=%d from table cache",
			meta, refs))
	}
	// Fast-path for a hit in the cache.
	c.mu.RLock()
	key := tableCacheKey{dbOpts.cacheID, meta.FileBacking.DiskFileNum}
	if n := c.mu.nodes[key]; n != nil && n.value != nil {
		// Fast-path hit.
		//
		// The caller is responsible for decrementing the refCount.
		v := n.value
		v.refCount.Add(1)
		c.mu.RUnlock()
		n.referenced.Store(true)
		c.hits.Add(1)
		<-v.loaded
		return v
	}
	c.mu.RUnlock()

	c.mu.Lock()

	n := c.mu.nodes[key]
	switch {
	case n == nil:
		// Slow-path miss of a non-existent node.
		n = &tableCacheNode{
			fileNum: meta.FileBacking.DiskFileNum,
			ptype:   tableCacheNodeCold,
		}
		c.addNode(n, dbOpts)
		c.mu.sizeCold++

	case n.value != nil:
		// Slow-path hit of a hot or cold node.
		//
		// The caller is responsible for decrementing the refCount.
		v := n.value
		v.refCount.Add(1)
		n.referenced.Store(true)
		c.hits.Add(1)
		c.mu.Unlock()
		<-v.loaded
		return v

	default:
		// Slow-path miss of a test node.
		c.unlinkNode(n)
		c.mu.coldTarget++
		if c.mu.coldTarget > c.size {
			c.mu.coldTarget = c.size
		}

		n.referenced.Store(false)
		n.ptype = tableCacheNodeHot
		c.addNode(n, dbOpts)
		c.mu.sizeHot++
	}

	c.misses.Add(1)

	v := &tableCacheValue{
		loaded: make(chan struct{}),
	}
	v.refCount.Store(2)
	// Cache the closure invoked when an iterator is closed. This avoids an
	// allocation on every call to newIters.
	v.closeHook = func(i sstable.Iterator) error {
		if invariants.RaceEnabled {
			c.mu.Lock()
			delete(c.mu.iters, i)
			c.mu.Unlock()
		}
		c.unrefValue(v)
		c.iterCount.Add(-1)
		dbOpts.iterCount.Add(-1)
		return nil
	}
	n.value = v

	c.mu.Unlock()

	// Note adding to the cache lists must complete before we begin loading the
	// table as a failure during load will result in the node being unlinked.
	pprof.Do(context.Background(), tableCacheLabels, func(context.Context) {
		v.load(
			loadInfo{
				backingFileNum: meta.FileBacking.DiskFileNum,
				smallestSeqNum: meta.SmallestSeqNum,
				largestSeqNum:  meta.LargestSeqNum,
			}, c, dbOpts)
	})
	return v
}

func (c *tableCacheShard) addNode(n *tableCacheNode, dbOpts *tableCacheOpts) {
	c.evictNodes()
	n.cacheID = dbOpts.cacheID
	key := tableCacheKey{n.cacheID, n.fileNum}
	c.mu.nodes[key] = n

	n.links.next = n
	n.links.prev = n
	if c.mu.handHot == nil {
		// First element.
		c.mu.handHot = n
		c.mu.handCold = n
		c.mu.handTest = n
	} else {
		c.mu.handHot.link(n)
	}

	if c.mu.handCold == c.mu.handHot {
		c.mu.handCold = c.mu.handCold.prev()
	}
}

func (c *tableCacheShard) evictNodes() {
	for c.size <= c.mu.sizeHot+c.mu.sizeCold && c.mu.handCold != nil {
		c.runHandCold()
	}
}

func (c *tableCacheShard) runHandCold() {
	n := c.mu.handCold
	if n.ptype == tableCacheNodeCold {
		if n.referenced.Load() {
			n.referenced.Store(false)
			n.ptype = tableCacheNodeHot
			c.mu.sizeCold--
			c.mu.sizeHot++
		} else {
			c.clearNode(n)
			n.ptype = tableCacheNodeTest
			c.mu.sizeCold--
			c.mu.sizeTest++
			for c.size < c.mu.sizeTest && c.mu.handTest != nil {
				c.runHandTest()
			}
		}
	}

	c.mu.handCold = c.mu.handCold.next()

	for c.size-c.mu.coldTarget <= c.mu.sizeHot && c.mu.handHot != nil {
		c.runHandHot()
	}
}

func (c *tableCacheShard) runHandHot() {
	if c.mu.handHot == c.mu.handTest && c.mu.handTest != nil {
		c.runHandTest()
		if c.mu.handHot == nil {
			return
		}
	}

	n := c.mu.handHot
	if n.ptype == tableCacheNodeHot {
		if n.referenced.Load() {
			n.referenced.Store(false)
		} else {
			n.ptype = tableCacheNodeCold
			c.mu.sizeHot--
			c.mu.sizeCold++
		}
	}

	c.mu.handHot = c.mu.handHot.next()
}

func (c *tableCacheShard) runHandTest() {
	if c.mu.sizeCold > 0 && c.mu.handTest == c.mu.handCold && c.mu.handCold != nil {
		c.runHandCold()
		if c.mu.handTest == nil {
			return
		}
	}

	n := c.mu.handTest
	if n.ptype == tableCacheNodeTest {
		c.mu.coldTarget--
		if c.mu.coldTarget < 0 {
			c.mu.coldTarget = 0
		}
		c.unlinkNode(n)
		c.clearNode(n)
	}

	c.mu.handTest = c.mu.handTest.next()
}
func (c *tableCacheShard) evict(fileNum base.DiskFileNum, dbOpts *tableCacheOpts, allowLeak bool) {
	c.mu.Lock()
	key := tableCacheKey{dbOpts.cacheID, fileNum}
	n := c.mu.nodes[key]
	var v *tableCacheValue
	if n != nil {
		// NB: This is equivalent to tableCacheShard.releaseNode(), but we perform
		// the tableCacheNode.release() call synchronously below to ensure the
		// sstable file descriptor is closed before returning. Note that
		// tableCacheShard.releasing needs to be incremented while holding
		// tableCacheShard.mu in order to avoid a race with Close().
		c.unlinkNode(n)
		v = n.value
		if v != nil {
			if !allowLeak {
				if t := v.refCount.Add(-1); t != 0 {
					dbOpts.loggerAndTracer.Fatalf("sstable %s: refcount is not zero: %d\n%s", fileNum, t, debug.Stack())
				}
			}
			c.releasing.Add(1)
		}
	}

	c.mu.Unlock()

	if v != nil {
		v.release(c)
	}

	dbOpts.opts.Cache.EvictFile(dbOpts.cacheID, fileNum)
}

// removeDB evicts any nodes which have a reference to the DB
// associated with dbOpts.cacheID. Make sure that there will
// be no more accesses to the files associated with the DB.
func (c *tableCacheShard) removeDB(dbOpts *tableCacheOpts) {
	var fileNums []base.DiskFileNum

	c.mu.RLock()
	// Collect the fileNums which need to be cleaned.
	var firstNode *tableCacheNode
	node := c.mu.handHot
	for node != firstNode {
		if firstNode == nil {
			firstNode = node
		}

		if node.cacheID == dbOpts.cacheID {
			fileNums = append(fileNums, node.fileNum)
		}
		node = node.next()
	}
	c.mu.RUnlock()

	// Evict all the nodes associated with the DB.
	// This should synchronously close all the files
	// associated with the DB.
	for _, fileNum := range fileNums {
		c.evict(fileNum, dbOpts, true)
	}
}

func (c *tableCacheShard) Close() error {
	c.mu.Lock()
	defer c.mu.Unlock()

	// Check for leaked iterators. Note that we'll still perform cleanup below in
	// the case that there are leaked iterators.
	var err error
	if v := c.iterCount.Load(); v > 0 {
		if !invariants.RaceEnabled {
			err = errors.Errorf("leaked iterators: %d", errors.Safe(v))
		} else {
			var buf bytes.Buffer
			for _, stack := range c.mu.iters {
				fmt.Fprintf(&buf, "%s\n", stack)
			}
			err = errors.Errorf("leaked iterators: %d\n%s", errors.Safe(v), buf.String())
		}
	}

	for c.mu.handHot != nil {
		n := c.mu.handHot
		if n.value != nil {
			if n.value.refCount.Add(-1) == 0 {
				c.releasing.Add(1)
				c.releasingCh <- n.value
			}
		}
		c.unlinkNode(n)
	}
	c.mu.nodes = nil
	c.mu.handHot = nil
	c.mu.handCold = nil
	c.mu.handTest = nil

	// Only shut down the releasing goroutine if there were no leaked
	// iterators. If there were leaked iterators, we leave the goroutine running
	// and the releasingCh open so that a subsequent iterator close can
	// complete. This behavior is used by iterator leak tests. Leaking the
	// goroutine for these tests is less bad than not closing the iterator, which
	// triggers other warnings about block cache handles not being released.
	if err != nil {
		c.releasing.Wait()
		return err
	}

	close(c.releasingCh)
	c.releasing.Wait()
	c.releaseLoopExit.Wait()
	return err
}

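// tableCacheValue holds an open sstable.Reader (or the error from opening it)
// together with its reference count. findNodeInternal creates a value with a
// refCount of 2: one reference is owned by the tableCacheNode that holds it
// and one is transferred to the caller of findNode, to be released via
// unrefValue or via closeHook when an sstable iterator is closed. The loaded
// channel is closed once load finishes; concurrent lookups that hit the node
// block on <-v.loaded before using v.reader. When the last reference is
// dropped, the value is handed to releasingCh and the reader is closed
// asynchronously by releaseLoop (evict is the exception and releases
// synchronously).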
type tableCacheValue struct {
	closeHook func(i sstable.Iterator) error
	reader    *sstable.Reader
	err       error
	loaded    chan struct{}
	// Reference count for the value. The reader is closed when the reference
	// count drops to zero.
	refCount atomic.Int32
}

type loadInfo struct {
	backingFileNum base.DiskFileNum
	largestSeqNum  uint64
	smallestSeqNum uint64
}

func (v *tableCacheValue) load(loadInfo loadInfo, c *tableCacheShard, dbOpts *tableCacheOpts) {
	// Try opening the file first.
	var f objstorage.Readable
	var err error
	f, err = dbOpts.objProvider.OpenForReading(
		context.TODO(), fileTypeTable, loadInfo.backingFileNum, objstorage.OpenOptions{MustExist: true},
	)
	if err == nil {
		cacheOpts := private.SSTableCacheOpts(dbOpts.cacheID, loadInfo.backingFileNum).(sstable.ReaderOption)
		v.reader, err = sstable.NewReader(f, dbOpts.opts, cacheOpts, dbOpts.filterMetrics)
	}
	if err != nil {
		v.err = errors.Wrapf(
			err, "pebble: backing file %s error", errors.Safe(loadInfo.backingFileNum.FileNum()))
	}
	if v.err == nil && loadInfo.smallestSeqNum == loadInfo.largestSeqNum {
		v.reader.Properties.GlobalSeqNum = loadInfo.largestSeqNum
	}
	if v.err != nil {
		c.mu.Lock()
		defer c.mu.Unlock()
		// Lookup the node in the cache again as it might have already been
		// removed.
		key := tableCacheKey{dbOpts.cacheID, loadInfo.backingFileNum}
		n := c.mu.nodes[key]
		if n != nil && n.value == v {
			c.releaseNode(n)
		}
	}
	close(v.loaded)
}

func (v *tableCacheValue) release(c *tableCacheShard) {
	<-v.loaded
	// Nothing to be done about an error at this point. Close the reader if it is
	// open.
	if v.reader != nil {
		_ = v.reader.Close()
	}
	c.releasing.Done()
}

type tableCacheNodeType int8

const (
	tableCacheNodeTest tableCacheNodeType = iota
	tableCacheNodeCold
	tableCacheNodeHot
)

func (p tableCacheNodeType) String() string {
	switch p {
	case tableCacheNodeTest:
		return "test"
	case tableCacheNodeCold:
		return "cold"
	case tableCacheNodeHot:
		return "hot"
	}
	return "unknown"
}

type tableCacheNode struct {
	fileNum base.DiskFileNum
	value   *tableCacheValue

	links struct {
		next *tableCacheNode
		prev *tableCacheNode
	}
	ptype tableCacheNodeType
	// referenced is atomically set to indicate that this entry has been accessed
	// since the last time one of the clock hands swept it.
	referenced atomic.Bool

	// Storing the cache id associated with the DB instance here
	// avoids the need to thread the dbOpts struct through many functions.
	cacheID uint64
}

// next returns the node following n in the circular list, or nil if n is nil.
func (n *tableCacheNode) next() *tableCacheNode {
	if n == nil {
		return nil
	}
	return n.links.next
}

// prev returns the node preceding n in the circular list, or nil if n is nil.
func (n *tableCacheNode) prev() *tableCacheNode {
	if n == nil {
		return nil
	}
	return n.links.prev
}

// link splices s into the circular list immediately before n.
func (n *tableCacheNode) link(s *tableCacheNode) {
	s.links.prev = n.links.prev
	s.links.prev.links.next = s
	s.links.next = n
	s.links.next.links.prev = s
}

// unlink removes n from the circular list, leaving n self-linked, and returns
// the node that followed n.
func (n *tableCacheNode) unlink() *tableCacheNode {
	next := n.links.next
	n.links.prev.links.next = n.links.next
	n.links.next.links.prev = n.links.prev
	n.links.prev = n
	n.links.next = n
	return next
}