github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/table_cache.go

// Copyright 2020 The LevelDB-Go and Pebble and Bitalostored Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package bitalostable

import (
	"bytes"
	"context"
	"fmt"
	"io"
	"runtime/debug"
	"runtime/pprof"
	"sync"
	"sync/atomic"
	"unsafe"

	"github.com/cockroachdb/errors"
	"github.com/zuoyebang/bitalostable/internal/base"
	"github.com/zuoyebang/bitalostable/internal/invariants"
	"github.com/zuoyebang/bitalostable/internal/keyspan"
	"github.com/zuoyebang/bitalostable/internal/manifest"
	"github.com/zuoyebang/bitalostable/internal/private"
	"github.com/zuoyebang/bitalostable/sstable"
	"github.com/zuoyebang/bitalostable/vfs"
)

var emptyIter = &errorIter{err: nil}
var emptyKeyspanIter = &errorKeyspanIter{err: nil}

// filteredAll is a singleton internalIterator implementation used when an
// sstable does contain point keys, but all the keys are filtered by the active
// PointKeyFilters set in the iterator's IterOptions.
//
// filteredAll implements filteredIter, ensuring the level iterator recognizes
// when it may need to return file boundaries to keep the rangeDelIter open
// during mergingIter operation.
var filteredAll = &filteredAllKeysIter{errorIter: errorIter{err: nil}}

var _ filteredIter = filteredAll

type filteredAllKeysIter struct {
	errorIter
}

func (s *filteredAllKeysIter) MaybeFilteredKeys() bool {
	return true
}

var tableCacheLabels = pprof.Labels("bitalostable", "table-cache")

// tableCacheOpts contains the db specific fields
// of a table cache. This is stored in the tableCacheContainer
// along with the table cache.
// NB: It is important to make sure that the fields in this
// struct are read-only. Since the fields here are shared
// by every single tableCacheShard, if non read-only fields
// are updated, we could have unnecessary evictions of those
// fields, and the surrounding fields from the CPU caches.
type tableCacheOpts struct {
	atomic struct {
		// iterCount in the tableCacheOpts keeps track of iterators
		// opened or closed by a DB. It's used to keep track of
		// leaked iterators on a per-db level.
		iterCount *int32
	}

	logger        Logger
	cacheID       uint64
	dirname       string
	fs            vfs.FS
	opts          sstable.ReaderOptions
	filterMetrics *FilterMetrics
}

// tableCacheContainer contains the table cache and
// fields which are unique to the DB.
type tableCacheContainer struct {
	tableCache *TableCache

	// dbOpts contains fields relevant to the table cache
	// which are unique to each DB.
	dbOpts tableCacheOpts
}

// newTableCacheContainer will panic if the underlying cache in the table cache
// doesn't match Options.Cache.
func newTableCacheContainer(
	tc *TableCache, cacheID uint64, dirname string, fs vfs.FS, opts *Options, size int,
) *tableCacheContainer {
	// We will release a ref to table cache acquired here when tableCacheContainer.close is called.
	if tc != nil {
		if tc.cache != opts.Cache {
			panic("bitalostable: underlying cache for the table cache and db are different")
		}
		tc.Ref()
	} else {
		// NewTableCache should create a ref to tc which the container should
		// drop whenever it is closed.
		tc = NewTableCache(opts.Cache, opts.Experimental.TableCacheShards, size)
	}

	t := &tableCacheContainer{}
	t.tableCache = tc
	t.dbOpts.logger = opts.Logger
	t.dbOpts.cacheID = cacheID
	t.dbOpts.dirname = dirname
	t.dbOpts.fs = fs
	t.dbOpts.opts = opts.MakeReaderOptions()
	t.dbOpts.filterMetrics = &FilterMetrics{}
	t.dbOpts.atomic.iterCount = new(int32)
	return t
}
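
// Illustrative sketch (not part of the package API): how a table cache is
// typically shared between two DBs via their containers. The cache IDs,
// directories and opts.FS shown here are placeholders assumed for
// illustration; within this package the DB supplies them.
//
//	tc := NewTableCache(opts.Cache, opts.Experimental.TableCacheShards, 1000)
//	c1 := newTableCacheContainer(tc, cacheID1, dir1, opts.FS, opts, 1000)
//	c2 := newTableCacheContainer(tc, cacheID2, dir2, opts.FS, opts, 1000)
//	// ... use c1/c2 ...
//	_ = c1.close() // drops c1's ref on tc
//	_ = c2.close() // drops c2's ref on tc
//	_ = tc.Unref() // drops the creator's ref; shards close when refs reach 0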

// Before calling close, make sure that there will be no further need
// to access any of the files associated with the store.
func (c *tableCacheContainer) close() error {
	// We want to do some cleanup work here. Check for leaked iterators
	// by the DB using this container. Note that we'll still perform cleanup
	// below in the case that there are leaked iterators.
	var err error
	if v := atomic.LoadInt32(c.dbOpts.atomic.iterCount); v > 0 {
		err = errors.Errorf("leaked iterators: %d", errors.Safe(v))
	}

	// Release nodes here.
	for _, shard := range c.tableCache.shards {
		if shard != nil {
			shard.removeDB(&c.dbOpts)
		}
	}
	return firstError(err, c.tableCache.Unref())
}

func (c *tableCacheContainer) newIters(
	file *manifest.FileMetadata, opts *IterOptions, internalOpts internalIterOpts,
) (internalIterator, keyspan.FragmentIterator, error) {
	return c.tableCache.getShard(file.FileNum).newIters(file, opts, internalOpts, &c.dbOpts)
}

func (c *tableCacheContainer) newRangeKeyIter(
	file *manifest.FileMetadata, opts *keyspan.SpanIterOptions,
) (keyspan.FragmentIterator, error) {
	return c.tableCache.getShard(file.FileNum).newRangeKeyIter(file, opts, &c.dbOpts)
}

func (c *tableCacheContainer) getTableProperties(file *fileMetadata) (*sstable.Properties, error) {
	return c.tableCache.getShard(file.FileNum).getTableProperties(file, &c.dbOpts)
}

func (c *tableCacheContainer) evict(fileNum FileNum) {
	c.tableCache.getShard(fileNum).evict(fileNum, &c.dbOpts, false)
}

func (c *tableCacheContainer) metrics() (CacheMetrics, FilterMetrics) {
	var m CacheMetrics
	for i := range c.tableCache.shards {
		s := c.tableCache.shards[i]
		s.mu.RLock()
		m.Count += int64(len(s.mu.nodes))
		s.mu.RUnlock()
		m.Hits += atomic.LoadInt64(&s.atomic.hits)
		m.Misses += atomic.LoadInt64(&s.atomic.misses)
	}
	m.Size = m.Count * int64(unsafe.Sizeof(sstable.Reader{}))
	f := FilterMetrics{
		Hits:   atomic.LoadInt64(&c.dbOpts.filterMetrics.Hits),
		Misses: atomic.LoadInt64(&c.dbOpts.filterMetrics.Misses),
	}
	return m, f
}

func (c *tableCacheContainer) withReader(meta *fileMetadata, fn func(*sstable.Reader) error) error {
	s := c.tableCache.getShard(meta.FileNum)
	v := s.findNode(meta, &c.dbOpts)
	defer s.unrefValue(v)
	if v.err != nil {
		base.MustExist(c.dbOpts.fs, v.filename, c.dbOpts.logger, v.err)
		return v.err
	}
	return fn(v.reader)
}

func (c *tableCacheContainer) iterCount() int64 {
	return int64(atomic.LoadInt32(c.dbOpts.atomic.iterCount))
}
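
// Illustrative sketch (assumption; meta is a placeholder *fileMetadata): the
// per-DB iterator accounting above is what lets close() report leaks.
//
//	iter, rangeDelIter, _ := c.newIters(meta, nil, internalIterOpts{})
//	// On success, both dbOpts.atomic.iterCount and the shard's iterCount are +1.
//	_ = rangeDelIter
//	_ = iter.Close() // the close hook decrements both counters and unrefs the reader
//	err := c.close() // reports "leaked iterators: N" if an iterator was left open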

// TableCache is a shareable cache for open sstables.
type TableCache struct {
	// atomic contains fields which are accessed atomically. Go allocations
	// are guaranteed to be 64-bit aligned which we take advantage of by
	// placing the 64-bit fields which we access atomically at the beginning
	// of the TableCache struct. For more information, see
	// https://golang.org/pkg/sync/atomic/#pkg-note-BUG.
	atomic struct {
		refs int64
	}

	cache  *Cache
	shards []*tableCacheShard
}

// Ref adds a reference to the table cache. Once tableCache.init returns,
// the table cache only remains valid if there is at least one reference
// to it.
func (c *TableCache) Ref() {
	v := atomic.AddInt64(&c.atomic.refs, 1)
	// We don't want the reference count to ever go from 0 -> 1,
	// because a reference count of 0 implies that we've closed the cache.
	if v <= 1 {
		panic(fmt.Sprintf("bitalostable: inconsistent reference count: %d", v))
	}
}

// Unref removes a reference to the table cache.
func (c *TableCache) Unref() error {
	v := atomic.AddInt64(&c.atomic.refs, -1)
	switch {
	case v < 0:
		panic(fmt.Sprintf("bitalostable: inconsistent reference count: %d", v))
	case v == 0:
		var err error
		for i := range c.shards {
			// The cache shard is not allocated yet, nothing to close.
			if c.shards[i] == nil {
				continue
			}
			err = firstError(err, c.shards[i].Close())
		}

		// Unref the cache which we create a reference to when the tableCache
		// is first instantiated.
		c.cache.Unref()
		return err
	}
	return nil
}

// NewTableCache will create a reference to the table cache. It is the caller's
// responsibility to call tableCache.Unref when it no longer holds a reference
// to the table cache.
func NewTableCache(cache *Cache, numShards int, size int) *TableCache {
	if size == 0 {
		panic("bitalostable: cannot create a table cache of size 0")
	} else if numShards == 0 {
		panic("bitalostable: cannot create a table cache with 0 shards")
	}

	c := &TableCache{}
	c.cache = cache
	c.cache.Ref()

	c.shards = make([]*tableCacheShard, numShards)
	for i := range c.shards {
		c.shards[i] = &tableCacheShard{}
		c.shards[i].init(size / len(c.shards))
	}

	// Hold a ref to the cache here.
	c.atomic.refs = 1

	return c
}

func (c *TableCache) getShard(fileNum FileNum) *tableCacheShard {
	return c.shards[uint64(fileNum)%uint64(len(c.shards))]
}

type tableCacheKey struct {
	cacheID uint64
	fileNum FileNum
}

type tableCacheShard struct {
	// WARNING: The following struct `atomic` contains fields which are accessed
	// atomically.
	//
	// Go allocations are guaranteed to be 64-bit aligned which we take advantage
	// of by placing the 64-bit fields which we access atomically at the beginning
	// of the DB struct. For more information, see https://golang.org/pkg/sync/atomic/#pkg-note-BUG.
	atomic struct {
		hits      int64
		misses    int64
		iterCount int32
	}

	size int

	mu struct {
		sync.RWMutex
		nodes map[tableCacheKey]*tableCacheNode
		// The iters map is only created and populated in race builds.
		iters map[io.Closer][]byte

		handHot  *tableCacheNode
		handCold *tableCacheNode
		handTest *tableCacheNode

		coldTarget int
		sizeHot    int
		sizeCold   int
		sizeTest   int
	}
	releasing       sync.WaitGroup
	releasingCh     chan *tableCacheValue
	releaseLoopExit sync.WaitGroup
}
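
// Illustrative sketch (assumption): shard selection is a simple modulo over
// the file number, so all cache state for a given file lives in exactly one
// shard and shards never coordinate with each other. The *Cache value below
// is a placeholder.
//
//	tc := NewTableCache(cache, 4 /* shards */, 1000 /* size */)
//	s := tc.getShard(FileNum(11)) // 11 % 4 == 3 -> tc.shards[3]
//	_ = s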

func (c *tableCacheShard) init(size int) {
	c.size = size

	c.mu.nodes = make(map[tableCacheKey]*tableCacheNode)
	c.mu.coldTarget = size
	c.releasingCh = make(chan *tableCacheValue, 100)
	c.releaseLoopExit.Add(1)
	go c.releaseLoop()

	if invariants.RaceEnabled {
		c.mu.iters = make(map[io.Closer][]byte)
	}
}

func (c *tableCacheShard) releaseLoop() {
	pprof.Do(context.Background(), tableCacheLabels, func(context.Context) {
		defer c.releaseLoopExit.Done()
		for v := range c.releasingCh {
			v.release(c)
		}
	})
}

// checkAndIntersectFilters checks the specific table and block property filters
// for intersection with any available table and block-level properties. Returns
// true for ok if this table should be read by this iterator.
func (c *tableCacheShard) checkAndIntersectFilters(
	v *tableCacheValue,
	tableFilter func(userProps map[string]string) bool,
	blockPropertyFilters []BlockPropertyFilter,
	boundLimitedFilter sstable.BoundLimitedBlockPropertyFilter,
) (ok bool, filterer *sstable.BlockPropertiesFilterer, err error) {
	if tableFilter != nil &&
		!tableFilter(v.reader.Properties.UserProperties) {
		return false, nil, nil
	}

	if boundLimitedFilter != nil || len(blockPropertyFilters) > 0 {
		filterer = sstable.NewBlockPropertiesFilterer(blockPropertyFilters, boundLimitedFilter)
		intersects, err :=
			filterer.IntersectsUserPropsAndFinishInit(v.reader.Properties.UserProperties)
		if err != nil {
			return false, nil, err
		}
		if !intersects {
			return false, nil, nil
		}
	}
	return true, filterer, nil
}
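
// Illustrative sketch (assumption): how a caller-provided TableFilter interacts
// with checkAndIntersectFilters. A table whose user properties fail the filter
// is skipped before a BlockPropertiesFilterer is even constructed. The
// "my.marker" property name is hypothetical.
//
//	opts := &IterOptions{
//		TableFilter: func(userProps map[string]string) bool {
//			_, ok := userProps["my.marker"] // only read tables carrying this marker
//			return ok
//		},
//	}
//	iter, rangeDelIter, err := shard.newIters(meta, opts, internalIterOpts{}, dbOpts)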

func (c *tableCacheShard) newIters(
	file *manifest.FileMetadata,
	opts *IterOptions,
	internalOpts internalIterOpts,
	dbOpts *tableCacheOpts,
) (internalIterator, keyspan.FragmentIterator, error) {
	// Calling findNode gives us the responsibility of decrementing v's
	// refCount. If opening the underlying table resulted in error, then we
	// decrement this straight away. Otherwise, we pass that responsibility to
	// the sstable iterator, which decrements when it is closed.
	v := c.findNode(file, dbOpts)
	if v.err != nil {
		defer c.unrefValue(v)
		base.MustExist(dbOpts.fs, v.filename, dbOpts.logger, v.err)
		return nil, nil, v.err
	}

	ok := true
	var filterer *sstable.BlockPropertiesFilterer
	var err error
	if opts != nil {
		ok, filterer, err = c.checkAndIntersectFilters(v, opts.TableFilter,
			opts.PointKeyFilters, internalOpts.boundLimitedFilter)
	}
	if err != nil {
		c.unrefValue(v)
		return nil, nil, err
	}

	// NB: range-del iterator does not maintain a reference to the table, nor
	// does it need to read from it after creation.
	rangeDelIter, err := v.reader.NewRawRangeDelIter()
	if err != nil {
		c.unrefValue(v)
		return nil, nil, err
	}

	if !ok {
		c.unrefValue(v)
		// Return an empty iterator. This iterator has no mutable state, so
		// using a singleton is fine.
		// NB: We still return the potentially non-empty rangeDelIter. This
		// ensures the iterator observes the file's range deletions even if the
		// block property filters exclude all the file's point keys. The range
		// deletions may still delete keys lower in the LSM in files that DO
		// match the active filters.
		//
		// The point iterator returned must implement the filteredIter
		// interface, so that the level iterator surfaces file boundaries when
		// range deletions are present.
		return filteredAll, rangeDelIter, err
	}

	var iter sstable.Iterator
	useFilter := true
	if opts != nil {
		useFilter = manifest.LevelToInt(opts.level) != 6 || opts.UseL6Filters
	}
	if internalOpts.bytesIterated != nil {
		iter, err = v.reader.NewCompactionIter(internalOpts.bytesIterated)
	} else {
		iter, err = v.reader.NewIterWithBlockPropertyFilters(
			opts.GetLowerBound(), opts.GetUpperBound(), filterer, useFilter, internalOpts.stats)
	}
	if err != nil {
		if rangeDelIter != nil {
			_ = rangeDelIter.Close()
		}
		c.unrefValue(v)
		return nil, nil, err
	}
	// NB: v.closeHook takes responsibility for calling unrefValue(v) here. Take
	// care to avoid introducing an allocation here by adding a closure.
	iter.SetCloseHook(v.closeHook)

	atomic.AddInt32(&c.atomic.iterCount, 1)
	atomic.AddInt32(dbOpts.atomic.iterCount, 1)
	if invariants.RaceEnabled {
		c.mu.Lock()
		c.mu.iters[iter] = debug.Stack()
		c.mu.Unlock()
	}
	return iter, rangeDelIter, nil
}
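
// Illustrative sketch (assumption): ownership after newIters succeeds. The
// returned point iterator owns a table reference via its close hook; the
// range-del iterator does not hold a reference, so it only needs Close for
// its own resources.
//
//	iter, rangeDelIter, err := shard.newIters(meta, nil, internalIterOpts{}, dbOpts)
//	if err != nil {
//		return err // newIters already dropped its table reference
//	}
//	defer iter.Close() // runs v.closeHook: unrefs the reader, decrements iterCount
//	if rangeDelIter != nil {
//		defer rangeDelIter.Close()
//	}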

func (c *tableCacheShard) newRangeKeyIter(
	file *manifest.FileMetadata, opts *keyspan.SpanIterOptions, dbOpts *tableCacheOpts,
) (keyspan.FragmentIterator, error) {
	// Calling findNode gives us the responsibility of decrementing v's
	// refCount. If opening the underlying table resulted in error, then we
	// decrement this straight away. Otherwise, we pass that responsibility to
	// the sstable iterator, which decrements when it is closed.
	v := c.findNode(file, dbOpts)
	if v.err != nil {
		defer c.unrefValue(v)
		base.MustExist(dbOpts.fs, v.filename, dbOpts.logger, v.err)
		return nil, v.err
	}

	ok := true
	var err error
	// Don't filter a table's range keys if the file contains RANGEKEYDELs.
	// The RANGEKEYDELs may delete range keys in other levels. Skipping the
	// file's range key blocks may surface deleted range keys below. This is
	// done here, rather than deferring to the block-property collector, in
	// order to maintain parity with point keys and the treatment of RANGEDELs.
	if opts != nil && v.reader.Properties.NumRangeKeyDels == 0 {
		ok, _, err = c.checkAndIntersectFilters(v, nil, opts.RangeKeyFilters, nil)
	}
	if err != nil {
		c.unrefValue(v)
		return nil, err
	}
	if !ok {
		c.unrefValue(v)
		// Return the empty iterator. This iterator has no mutable state, so
		// using a singleton is fine.
		return emptyKeyspanIter, err
	}

	var iter keyspan.FragmentIterator
	iter, err = v.reader.NewRawRangeKeyIter()
	// iter is a block iter that holds the entire value of the block in memory.
	// No need to hold onto a ref of the cache value.
	c.unrefValue(v)

	if err != nil || iter == nil {
		return nil, err
	}

	return iter, nil
}

// getTableProperties returns the sstable properties for the target file.
func (c *tableCacheShard) getTableProperties(
	file *fileMetadata, dbOpts *tableCacheOpts,
) (*sstable.Properties, error) {
	// Calling findNode gives us the responsibility of decrementing v's refCount here.
	v := c.findNode(file, dbOpts)
	defer c.unrefValue(v)

	if v.err != nil {
		return nil, v.err
	}
	return &v.reader.Properties, nil
}

// releaseNode releases a node from the tableCacheShard.
//
// c.mu must be held when calling this.
func (c *tableCacheShard) releaseNode(n *tableCacheNode) {
	c.unlinkNode(n)
	c.clearNode(n)
}

// unlinkNode removes a node from the tableCacheShard, leaving the shard
// reference in place.
//
// c.mu must be held when calling this.
func (c *tableCacheShard) unlinkNode(n *tableCacheNode) {
	key := tableCacheKey{n.cacheID, n.meta.FileNum}
	delete(c.mu.nodes, key)

	switch n.ptype {
	case tableCacheNodeHot:
		c.mu.sizeHot--
	case tableCacheNodeCold:
		c.mu.sizeCold--
	case tableCacheNodeTest:
		c.mu.sizeTest--
	}

	if n == c.mu.handHot {
		c.mu.handHot = c.mu.handHot.prev()
	}
	if n == c.mu.handCold {
		c.mu.handCold = c.mu.handCold.prev()
	}
	if n == c.mu.handTest {
		c.mu.handTest = c.mu.handTest.prev()
	}

	if n.unlink() == n {
		// This was the last entry in the cache.
		c.mu.handHot = nil
		c.mu.handCold = nil
		c.mu.handTest = nil
	}

	n.links.prev = nil
	n.links.next = nil
}

func (c *tableCacheShard) clearNode(n *tableCacheNode) {
	if v := n.value; v != nil {
		n.value = nil
		c.unrefValue(v)
	}
}

// unrefValue decrements the reference count for the specified value, releasing
// it if the reference count fell to 0. Note that the value has a reference if
// it is present in tableCacheShard.mu.nodes, so a reference count of 0 means
// the node has already been removed from that map.
func (c *tableCacheShard) unrefValue(v *tableCacheValue) {
	if atomic.AddInt32(&v.refCount, -1) == 0 {
		c.releasing.Add(1)
		c.releasingCh <- v
	}
}
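
// Illustrative sketch (assumption): the findNode/unrefValue pairing used
// throughout this file. Every findNode call transfers one reference to the
// caller, which must be returned via unrefValue once the reader is no longer
// needed; tableCacheContainer.withReader is the canonical wrapper.
//
//	v := shard.findNode(meta, dbOpts)
//	defer shard.unrefValue(v) // release happens asynchronously via releasingCh
//	if v.err != nil {
//		return v.err
//	}
//	props := v.reader.Properties // safe while the reference is held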

// findNode returns the node for the table with the given file number, creating
// that node if it didn't already exist. The caller is responsible for
// decrementing the returned node's refCount.
func (c *tableCacheShard) findNode(meta *fileMetadata, dbOpts *tableCacheOpts) *tableCacheValue {
	// Fast-path for a hit in the cache.
	c.mu.RLock()
	key := tableCacheKey{dbOpts.cacheID, meta.FileNum}
	if n := c.mu.nodes[key]; n != nil && n.value != nil {
		// Fast-path hit.
		//
		// The caller is responsible for decrementing the refCount.
		v := n.value
		atomic.AddInt32(&v.refCount, 1)
		c.mu.RUnlock()
		atomic.StoreInt32(&n.referenced, 1)
		atomic.AddInt64(&c.atomic.hits, 1)
		<-v.loaded
		return v
	}
	c.mu.RUnlock()

	c.mu.Lock()

	n := c.mu.nodes[key]
	switch {
	case n == nil:
		// Slow-path miss of a non-existent node.
		n = &tableCacheNode{
			meta:  meta,
			ptype: tableCacheNodeCold,
		}
		c.addNode(n, dbOpts)
		c.mu.sizeCold++

	case n.value != nil:
		// Slow-path hit of a hot or cold node.
		//
		// The caller is responsible for decrementing the refCount.
		v := n.value
		atomic.AddInt32(&v.refCount, 1)
		atomic.StoreInt32(&n.referenced, 1)
		atomic.AddInt64(&c.atomic.hits, 1)
		c.mu.Unlock()
		<-v.loaded
		return v

	default:
		// Slow-path miss of a test node.
		c.unlinkNode(n)
		c.mu.coldTarget++
		if c.mu.coldTarget > c.size {
			c.mu.coldTarget = c.size
		}

		atomic.StoreInt32(&n.referenced, 0)
		n.ptype = tableCacheNodeHot
		c.addNode(n, dbOpts)
		c.mu.sizeHot++
	}

	atomic.AddInt64(&c.atomic.misses, 1)

	v := &tableCacheValue{
		loaded:   make(chan struct{}),
		refCount: 2,
	}
	// Cache the closure invoked when an iterator is closed. This avoids an
	// allocation on every call to newIters.
	v.closeHook = func(i sstable.Iterator) error {
		if invariants.RaceEnabled {
			c.mu.Lock()
			delete(c.mu.iters, i)
			c.mu.Unlock()
		}
		c.unrefValue(v)
		atomic.AddInt32(&c.atomic.iterCount, -1)
		atomic.AddInt32(dbOpts.atomic.iterCount, -1)
		return nil
	}
	n.value = v

	c.mu.Unlock()

	// Note that adding to the cache lists must complete before we begin loading
	// the table, as a failure during load will result in the node being unlinked.
	pprof.Do(context.Background(), tableCacheLabels, func(context.Context) {
		v.load(meta, c, dbOpts)
	})
	return v
}

func (c *tableCacheShard) addNode(n *tableCacheNode, dbOpts *tableCacheOpts) {
	c.evictNodes()
	n.cacheID = dbOpts.cacheID
	key := tableCacheKey{n.cacheID, n.meta.FileNum}
	c.mu.nodes[key] = n

	n.links.next = n
	n.links.prev = n
	if c.mu.handHot == nil {
		// First element.
		c.mu.handHot = n
		c.mu.handCold = n
		c.mu.handTest = n
	} else {
		c.mu.handHot.link(n)
	}

	if c.mu.handCold == c.mu.handHot {
		c.mu.handCold = c.mu.handCold.prev()
	}
}

func (c *tableCacheShard) evictNodes() {
	for c.size <= c.mu.sizeHot+c.mu.sizeCold && c.mu.handCold != nil {
		c.runHandCold()
	}
}

func (c *tableCacheShard) runHandCold() {
	n := c.mu.handCold
	if n.ptype == tableCacheNodeCold {
		if atomic.LoadInt32(&n.referenced) == 1 {
			atomic.StoreInt32(&n.referenced, 0)
			n.ptype = tableCacheNodeHot
			c.mu.sizeCold--
			c.mu.sizeHot++
		} else {
			c.clearNode(n)
			n.ptype = tableCacheNodeTest
			c.mu.sizeCold--
			c.mu.sizeTest++
			for c.size < c.mu.sizeTest && c.mu.handTest != nil {
				c.runHandTest()
			}
		}
	}

	c.mu.handCold = c.mu.handCold.next()

	for c.size-c.mu.coldTarget <= c.mu.sizeHot && c.mu.handHot != nil {
		c.runHandHot()
	}
}

func (c *tableCacheShard) runHandHot() {
	if c.mu.handHot == c.mu.handTest && c.mu.handTest != nil {
		c.runHandTest()
		if c.mu.handHot == nil {
			return
		}
	}

	n := c.mu.handHot
	if n.ptype == tableCacheNodeHot {
		if atomic.LoadInt32(&n.referenced) == 1 {
			atomic.StoreInt32(&n.referenced, 0)
		} else {
			n.ptype = tableCacheNodeCold
			c.mu.sizeHot--
			c.mu.sizeCold++
		}
	}

	c.mu.handHot = c.mu.handHot.next()
}

func (c *tableCacheShard) runHandTest() {
	if c.mu.sizeCold > 0 && c.mu.handTest == c.mu.handCold && c.mu.handCold != nil {
		c.runHandCold()
		if c.mu.handTest == nil {
			return
		}
	}

	n := c.mu.handTest
	if n.ptype == tableCacheNodeTest {
		c.mu.coldTarget--
		if c.mu.coldTarget < 0 {
			c.mu.coldTarget = 0
		}
		c.unlinkNode(n)
		c.clearNode(n)
	}

	c.mu.handTest = c.mu.handTest.next()
}
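
// Illustrative sketch (assumption): the three clock hands above implement a
// CLOCK-Pro style policy. A miss inserts a cold node. handCold either promotes
// a referenced cold node to hot or demotes an unreferenced one to a test node
// (dropping its reader); handTest retires test nodes and shrinks coldTarget;
// handHot demotes unreferenced hot nodes back to cold. A miss on a test node
// grows coldTarget and re-inserts the node as hot. Roughly:
//
//	for size <= sizeHot+sizeCold { // evictNodes, called from addNode
//		runHandCold()              // promote cold->hot or demote cold->test;
//	}                              // in turn drives handHot and handTest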

func (c *tableCacheShard) evict(fileNum FileNum, dbOpts *tableCacheOpts, allowLeak bool) {
	c.mu.Lock()
	key := tableCacheKey{dbOpts.cacheID, fileNum}
	n := c.mu.nodes[key]
	var v *tableCacheValue
	if n != nil {
		// NB: This is equivalent to tableCacheShard.releaseNode(), but we perform
		// the tableCacheNode.release() call synchronously below to ensure the
		// sstable file descriptor is closed before returning. Note that
		// tableCacheShard.releasing needs to be incremented while holding
		// tableCacheShard.mu in order to avoid a race with Close().
		c.unlinkNode(n)
		v = n.value
		if v != nil {
			if !allowLeak {
				if t := atomic.AddInt32(&v.refCount, -1); t != 0 {
					dbOpts.logger.Fatalf("sstable %s: refcount is not zero: %d\n%s", fileNum, t, debug.Stack())
				}
			}
			c.releasing.Add(1)
		}
	}

	c.mu.Unlock()

	if v != nil {
		v.release(c)
	}

	dbOpts.opts.Cache.EvictFile(dbOpts.cacheID, fileNum)
}

// removeDB evicts any nodes which have a reference to the DB
// associated with dbOpts.cacheID. Make sure that there will
// be no more accesses to the files associated with the DB.
func (c *tableCacheShard) removeDB(dbOpts *tableCacheOpts) {
	var fileNums []base.FileNum

	c.mu.RLock()
	// Collect the fileNums which need to be cleaned.
	var firstNode *tableCacheNode
	node := c.mu.handHot
	for node != firstNode {
		if firstNode == nil {
			firstNode = node
		}

		if node.cacheID == dbOpts.cacheID {
			fileNums = append(fileNums, node.meta.FileNum)
		}
		node = node.next()
	}
	c.mu.RUnlock()

	// Evict all the nodes associated with the DB.
	// This should synchronously close all the files
	// associated with the DB.
	for _, fNum := range fileNums {
		c.evict(fNum, dbOpts, true)
	}
}
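
// Illustrative sketch (assumption): eviction is driven per file, typically once
// an sstable becomes obsolete. Evicting through the container picks the right
// shard, closes the reader synchronously, and also purges the file's blocks
// from the block cache via Cache.EvictFile.
//
//	c.evict(meta.FileNum) // tableCacheContainer.evict -> shard.evict(..., false)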

func (c *tableCacheShard) Close() error {
	c.mu.Lock()
	defer c.mu.Unlock()

	// Check for leaked iterators. Note that we'll still perform cleanup below in
	// the case that there are leaked iterators.
	var err error
	if v := atomic.LoadInt32(&c.atomic.iterCount); v > 0 {
		if !invariants.RaceEnabled {
			err = errors.Errorf("leaked iterators: %d", errors.Safe(v))
		} else {
			var buf bytes.Buffer
			for _, stack := range c.mu.iters {
				fmt.Fprintf(&buf, "%s\n", stack)
			}
			err = errors.Errorf("leaked iterators: %d\n%s", errors.Safe(v), buf.String())
		}
	}

	for c.mu.handHot != nil {
		n := c.mu.handHot
		if n.value != nil {
			if atomic.AddInt32(&n.value.refCount, -1) == 0 {
				c.releasing.Add(1)
				c.releasingCh <- n.value
			}
		}
		c.unlinkNode(n)
	}
	c.mu.nodes = nil
	c.mu.handHot = nil
	c.mu.handCold = nil
	c.mu.handTest = nil

	// Only shut down the releasing goroutine if there were no leaked
	// iterators. If there were leaked iterators, we leave the goroutine running
	// and the releasingCh open so that a subsequent iterator close can
	// complete. This behavior is used by iterator leak tests. Leaking the
	// goroutine for these tests is less bad than not closing the iterator, which
	// triggers other warnings about block cache handles not being released.
	if err != nil {
		c.releasing.Wait()
		return err
	}

	close(c.releasingCh)
	c.releasing.Wait()
	c.releaseLoopExit.Wait()
	return err
}

type tableCacheValue struct {
	closeHook func(i sstable.Iterator) error
	reader    *sstable.Reader
	filename  string
	err       error
	loaded    chan struct{}
	// Reference count for the value. The reader is closed when the reference
	// count drops to zero.
	refCount int32
}

func (v *tableCacheValue) load(meta *fileMetadata, c *tableCacheShard, dbOpts *tableCacheOpts) {
	// Try opening the fileTypeTable first.
	var f vfs.File
	v.filename = base.MakeFilepath(dbOpts.fs, dbOpts.dirname, fileTypeTable, meta.FileNum)
	f, v.err = dbOpts.fs.Open(v.filename, vfs.RandomReadsOption)
	if v.err == nil {
		cacheOpts := private.SSTableCacheOpts(dbOpts.cacheID, meta.FileNum).(sstable.ReaderOption)
		reopenOpt := sstable.FileReopenOpt{FS: dbOpts.fs, Filename: v.filename}
		v.reader, v.err = sstable.NewReader(f, dbOpts.opts, cacheOpts, dbOpts.filterMetrics, reopenOpt)
	}
	if v.err == nil {
		if meta.SmallestSeqNum == meta.LargestSeqNum {
			v.reader.Properties.GlobalSeqNum = meta.LargestSeqNum
		}
	}
	if v.err != nil {
		c.mu.Lock()
		defer c.mu.Unlock()
		// Lookup the node in the cache again as it might have already been
		// removed.
		key := tableCacheKey{dbOpts.cacheID, meta.FileNum}
		n := c.mu.nodes[key]
		if n != nil && n.value == v {
			c.releaseNode(n)
		}
	}
	close(v.loaded)
}

func (v *tableCacheValue) release(c *tableCacheShard) {
	<-v.loaded
	// Nothing to be done about an error at this point. Close the reader if it is
	// open.
	if v.reader != nil {
		_ = v.reader.Close()
	}
	c.releasing.Done()
}

type tableCacheNodeType int8

const (
	tableCacheNodeTest tableCacheNodeType = iota
	tableCacheNodeCold
	tableCacheNodeHot
)

func (p tableCacheNodeType) String() string {
	switch p {
	case tableCacheNodeTest:
		return "test"
	case tableCacheNodeCold:
		return "cold"
	case tableCacheNodeHot:
		return "hot"
	}
	return "unknown"
}

type tableCacheNode struct {
	meta  *fileMetadata
	value *tableCacheValue

	links struct {
		next *tableCacheNode
		prev *tableCacheNode
	}
	ptype tableCacheNodeType
	// referenced is atomically set to indicate that this entry has been accessed
	// since the last time one of the clock hands swept it.
	referenced int32

	// Storing the cache id associated with the DB instance here
	// avoids the need to thread the dbOpts struct through many functions.
	cacheID uint64
}

func (n *tableCacheNode) next() *tableCacheNode {
	if n == nil {
		return nil
	}
	return n.links.next
}

func (n *tableCacheNode) prev() *tableCacheNode {
	if n == nil {
		return nil
	}
	return n.links.prev
}

func (n *tableCacheNode) link(s *tableCacheNode) {
	s.links.prev = n.links.prev
	s.links.prev.links.next = s
	s.links.next = n
	s.links.next.links.prev = s
}

func (n *tableCacheNode) unlink() *tableCacheNode {
	next := n.links.next
	n.links.prev.links.next = n.links.next
	n.links.next.links.prev = n.links.prev
	n.links.prev = n
	n.links.next = n
	return next
}