github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/table_cache.go

// Copyright 2013 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package pebble

import (
	"bytes"
	"errors"
	"fmt"
	"runtime"
	"runtime/debug"
	"sync"
	"sync/atomic"

	"github.com/petermattis/pebble/internal/base"
	"github.com/petermattis/pebble/sstable"
	"github.com/petermattis/pebble/vfs"
)

const defaultTableCacheHitBuffer = 64

var emptyIter = &errorIter{err: nil}

type tableCache struct {
	shards []tableCacheShard
}

func (c *tableCache) init(
	dbNum uint64, dirname string, fs vfs.FS, opts *Options, size, hitBuffer int,
) {
	c.shards = make([]tableCacheShard, runtime.NumCPU())
	for i := range c.shards {
		c.shards[i].init(dbNum, dirname, fs, opts, size/len(c.shards), hitBuffer)
	}
}

func (c *tableCache) getShard(fileNum uint64) *tableCacheShard {
	return &c.shards[fileNum%uint64(len(c.shards))]
}

func (c *tableCache) newIters(
	meta *fileMetadata, opts *IterOptions, bytesIterated *uint64,
) (internalIterator, internalIterator, error) {
	return c.getShard(meta.FileNum).newIters(meta, opts, bytesIterated)
}

func (c *tableCache) evict(fileNum uint64) {
	c.getShard(fileNum).evict(fileNum)
}

func (c *tableCache) Close() error {
	for i := range c.shards {
		err := c.shards[i].Close()
		if err != nil {
			return err
		}
	}
	return nil
}

type tableCacheShard struct {
	dbNum   uint64
	dirname string
	fs      vfs.FS
	opts    *Options
	size    int

	mu struct {
		sync.RWMutex
		nodes map[uint64]*tableCacheNode
		// The iters map is only created and populated in race builds.
		iters map[sstable.Iterator][]byte
		lru   tableCacheNode
	}

	iterCount int32
	releasing sync.WaitGroup
	hitsPool  *sync.Pool
}

func (c *tableCacheShard) init(
	dbNum uint64, dirname string, fs vfs.FS, opts *Options, size, hitBuffer int,
) {
	c.dbNum = dbNum
	c.dirname = dirname
	c.fs = fs
	c.opts = opts
	c.size = size
	c.mu.nodes = make(map[uint64]*tableCacheNode)
	c.mu.lru.next = &c.mu.lru
	c.mu.lru.prev = &c.mu.lru
	c.hitsPool = &sync.Pool{
		New: func() interface{} {
			return &tableCacheHits{
				hits:  make([]*tableCacheNode, 0, hitBuffer),
				shard: c,
			}
		},
	}

	if raceEnabled {
		c.mu.iters = make(map[sstable.Iterator][]byte)
	}
}

func (c *tableCacheShard) newIters(
	meta *fileMetadata, opts *IterOptions, bytesIterated *uint64,
) (internalIterator, internalIterator, error) {
	// Calling findNode gives us the responsibility of decrementing n's
	// refCount. If opening the underlying table resulted in error, then we
	// decrement this straight away. Otherwise, we pass that responsibility to
	// the sstable iterator, which decrements when it is closed.
	n := c.findNode(meta)
	<-n.loaded
	if n.err != nil {
		c.unrefNode(n)
		return nil, nil, n.err
	}

	if opts != nil &&
		opts.TableFilter != nil &&
		!opts.TableFilter(n.reader.Properties.UserProperties) {
		// Return the empty iterator. This iterator has no mutable state, so
		// using a singleton is fine.
		// The empty iterator does not take a reference on the node, so release
		// the reference acquired by findNode above.
		c.unrefNode(n)
		return emptyIter, nil, nil
	}
	var iter sstable.Iterator
	if bytesIterated != nil {
		iter = n.reader.NewCompactionIter(bytesIterated)
	} else {
		iter = n.reader.NewIter(opts.GetLowerBound(), opts.GetUpperBound())
	}
	atomic.AddInt32(&c.iterCount, 1)
	if raceEnabled {
		c.mu.Lock()
		c.mu.iters[iter] = debug.Stack()
		c.mu.Unlock()
	}
	iter.SetCloseHook(n.closeHook)

	// NB: range-del iterator does not maintain a reference to the table, nor
	// does it need to read from it after creation.
	if rangeDelIter := n.reader.NewRangeDelIter(); rangeDelIter != nil {
		return iter, rangeDelIter, nil
	}
	// NB: Translate a nil range-del iterator into a nil interface.
	return iter, nil, nil
}

// releaseNode releases a node from the tableCacheShard.
//
// c.mu must be held when calling this.
func (c *tableCacheShard) releaseNode(n *tableCacheNode) {
	delete(c.mu.nodes, n.meta.FileNum)
	n.next.prev = n.prev
	n.prev.next = n.next
	n.prev = nil
	n.next = nil
	c.unrefNode(n)
}

// unrefNode decrements the reference count for the specified node, releasing
// it if the reference count fell to 0. Note that the node has a reference if
// it is present in tableCacheShard.mu.nodes, so a reference count of 0 means
// the node has already been removed from that map.
func (c *tableCacheShard) unrefNode(n *tableCacheNode) {
	if atomic.AddInt32(&n.refCount, -1) == 0 {
		c.releasing.Add(1)
		go n.release(c)
	}
}

// findNode returns the node for the table with the given file number, creating
// that node if it didn't already exist. The caller is responsible for
// decrementing the returned node's refCount.
func (c *tableCacheShard) findNode(meta *fileMetadata) *tableCacheNode {
	// Fast-path for a hit in the cache. We grab the lock in shared mode, and use
	// a batching mechanism to perform updates to the LRU list.
	c.mu.RLock()
	if n := c.mu.nodes[meta.FileNum]; n != nil {
		// The caller is responsible for decrementing the refCount.
		atomic.AddInt32(&n.refCount, 1)
		c.mu.RUnlock()

		// Record a hit for the node. This has to be done with tableCacheShard.mu
		// unlocked as it might result in a call to
		// tableCacheShard.recordHits. Note that the sync.Pool acts as a
		// thread-local cache of the accesses. This is lossy (a GC can result in
		// the sync.Pool being cleared), but that is ok as we don't need perfect
		// accuracy for the LRU list.
		hits := c.hitsPool.Get().(*tableCacheHits)
		hits.recordHit(n)
		c.hitsPool.Put(hits)
		return n
	}
	c.mu.RUnlock()

	c.mu.Lock()
	defer c.mu.Unlock()

	{
		// Flush the thread-local hits buffer as we already have the shard locked
		// exclusively.
		hits := c.hitsPool.Get().(*tableCacheHits)
		hits.flushLocked()
		c.hitsPool.Put(hits)
	}

	n := c.mu.nodes[meta.FileNum]
	if n == nil {
		n = &tableCacheNode{
			// Cache the closure invoked when an iterator is closed. This avoids an
			// allocation on every call to newIters.
			closeHook: func(i sstable.Iterator) error {
				if raceEnabled {
					c.mu.Lock()
					delete(c.mu.iters, i)
					c.mu.Unlock()
				}
				c.unrefNode(n)
				atomic.AddInt32(&c.iterCount, -1)
				return nil
			},
			meta:     meta,
			refCount: 1,
			loaded:   make(chan struct{}),
		}
		c.mu.nodes[meta.FileNum] = n
		if len(c.mu.nodes) > c.size {
			// Release the tail node.
			c.releaseNode(c.mu.lru.prev)
		}
		go n.load(c)
	} else {
		// Remove n from the doubly-linked list.
		n.next.prev = n.prev
		n.prev.next = n.next
	}
	// Insert n at the front of the doubly-linked list.
	n.next = c.mu.lru.next
	n.prev = &c.mu.lru
	n.next.prev = n
	n.prev.next = n
	// The caller is responsible for decrementing the refCount.
	atomic.AddInt32(&n.refCount, 1)
	return n
}

func (c *tableCacheShard) evict(fileNum uint64) {
	c.mu.Lock()
	if n := c.mu.nodes[fileNum]; n != nil {
		c.releaseNode(n)
	}
	c.mu.Unlock()

	c.opts.Cache.EvictFile(c.dbNum, fileNum)
}

func (c *tableCacheShard) recordHits(hits []*tableCacheNode) {
	c.mu.Lock()
	c.recordHitsLocked(hits)
	c.mu.Unlock()
}

func (c *tableCacheShard) recordHitsLocked(hits []*tableCacheNode) {
	for _, n := range hits {
		if n.next == nil || n.prev == nil {
			// The node is no longer on the LRU list.
			continue
		}
		// Remove n from the doubly-linked list.
		n.next.prev = n.prev
		n.prev.next = n.next
		// Insert n at the front of the doubly-linked list.
		n.next = c.mu.lru.next
		n.prev = &c.mu.lru
		n.next.prev = n
		n.prev.next = n
	}
}

func (c *tableCacheShard) Close() error {
	c.mu.Lock()
	defer c.mu.Unlock()

	if v := atomic.LoadInt32(&c.iterCount); v > 0 {
		if !raceEnabled {
			return fmt.Errorf("leaked iterators: %d", v)
		}
		var buf bytes.Buffer
		fmt.Fprintf(&buf, "leaked iterators: %d\n", v)
		for _, stack := range c.mu.iters {
			fmt.Fprintf(&buf, "%s\n", stack)
		}
		return errors.New(buf.String())
	}

	for n := c.mu.lru.next; n != &c.mu.lru; n = n.next {
		if atomic.AddInt32(&n.refCount, -1) == 0 {
			c.releasing.Add(1)
			go n.release(c)
		}
	}
	c.mu.nodes = nil
	c.mu.lru.next = nil
	c.mu.lru.prev = nil

	c.releasing.Wait()
	return nil
}

type tableCacheNode struct {
	closeHook func(i sstable.Iterator) error

	meta   *fileMetadata
	reader *sstable.Reader
	err    error
	loaded chan struct{}

	// The remaining fields are protected by the tableCacheShard mutex.

	next, prev *tableCacheNode
	refCount   int32
}

func (n *tableCacheNode) load(c *tableCacheShard) {
	// Open the table file (fileTypeTable).
	f, err := c.fs.Open(base.MakeFilename(c.dirname, fileTypeTable, n.meta.FileNum),
		vfs.RandomReadsOption)
	if err != nil {
		n.err = err
		close(n.loaded)
		return
	}
	n.reader, n.err = sstable.NewReader(f, c.dbNum, n.meta.FileNum, c.opts)
	if n.meta.SmallestSeqNum == n.meta.LargestSeqNum {
		n.reader.Properties.GlobalSeqNum = n.meta.LargestSeqNum
	}
	close(n.loaded)
}

func (n *tableCacheNode) release(c *tableCacheShard) {
	<-n.loaded
	// Nothing to be done about an error at this point. Close the reader if it is
	// open.
	if n.reader != nil {
		_ = n.reader.Close()
	}
	c.releasing.Done()
}

// tableCacheHits batches a set of node accesses in order to amortize exclusive
// lock acquisition.
type tableCacheHits struct {
	hits  []*tableCacheNode
	shard *tableCacheShard
}

func (f *tableCacheHits) recordHit(n *tableCacheNode) {
	f.hits = append(f.hits, n)
	if len(f.hits) == cap(f.hits) {
		f.shard.recordHits(f.hits)
		f.hits = f.hits[:0]
	}
}

func (f *tableCacheHits) flushLocked() {
	f.shard.recordHitsLocked(f.hits)
	f.hits = f.hits[:0]
}
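
// The sketch below is illustrative only and is not part of the original
// table_cache.go. Using hypothetical names (demoShardedLRU, demoShard,
// demoHitBuffer), it shows the two ideas the cache above relies on: picking a
// shard by fileNum % numShards so that unrelated tables contend on different
// locks, and buffering accesses so the exclusive lock protecting per-shard
// state is taken once per batch rather than once per access, mirroring
// tableCacheHits.recordHit and flushLocked.
func demoShardedLRU(fileNums []uint64) map[uint64]int {
	const demoHitBuffer = 4 // flush the hit batch after this many accesses

	type demoShard struct {
		mu   sync.Mutex
		hits map[uint64]int // per-file access counts, standing in for LRU moves
	}
	shards := make([]demoShard, runtime.NumCPU())
	for i := range shards {
		shards[i].hits = make(map[uint64]int)
	}

	// Buffer accesses and apply them to the owning shard under its lock only
	// when the buffer fills.
	pending := make([]uint64, 0, demoHitBuffer)
	flush := func() {
		for _, fn := range pending {
			s := &shards[fn%uint64(len(shards))]
			s.mu.Lock()
			s.hits[fn]++ // a real cache would move the node to the LRU front here
			s.mu.Unlock()
		}
		pending = pending[:0]
	}
	for _, fn := range fileNums {
		pending = append(pending, fn)
		if len(pending) == cap(pending) {
			flush()
		}
	}
	flush()

	// Merge the per-shard counts for inspection.
	total := make(map[uint64]int)
	for i := range shards {
		for fn, n := range shards[i].hits {
			total[fn] += n
		}
	}
	return total
}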