github.com/cockroachdb/pebble@v1.1.2/objstorage/objstorageprovider/sharedcache/shared_cache.go

// Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package sharedcache

import (
	"context"
	"fmt"
	"io"
	"math/bits"
	"sync"
	"sync/atomic"
	"time"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/invariants"
	"github.com/cockroachdb/pebble/objstorage/remote"
	"github.com/cockroachdb/pebble/vfs"
	"github.com/prometheus/client_golang/prometheus"
)

// IOBuckets and ChannelWriteBuckets are exported so that package pebble can in
// turn export metrics with these buckets in CRDB.
var (
	IOBuckets           = prometheus.ExponentialBucketsRange(float64(time.Millisecond*1), float64(10*time.Second), 50)
	ChannelWriteBuckets = prometheus.ExponentialBucketsRange(float64(time.Microsecond*1), float64(10*time.Second), 50)
)

// Cache is a persistent cache backed by a local filesystem. It is intended
// to cache data that is in slower shared storage (e.g. S3), hence the
// package name 'sharedcache'.
type Cache struct {
	shards       []shard
	writeWorkers writeWorkers

	bm                blockMath
	shardingBlockSize int64

	logger  base.Logger
	metrics internalMetrics
}

// Metrics is a struct containing metrics exported by the secondary cache.
// TODO(josh): Reconsider the set of metrics exported by the secondary cache
// before we release the secondary cache to users. We choose to export many metrics
// right now, so we learn a lot from the benchmarking we are doing over the 23.2
// cycle.
type Metrics struct {
	// The number of sstable bytes stored in the cache.
	Size int64
	// The count of cache blocks in the cache (not sstable blocks).
	Count int64

	// The number of calls to ReadAt.
	TotalReads int64
	// The number of calls to ReadAt that require reading data from 2+ shards.
	MultiShardReads int64
	// The number of calls to ReadAt that require reading data from 2+ cache blocks.
	MultiBlockReads int64
	// The number of calls to ReadAt where all data returned was read from the cache.
	ReadsWithFullHit int64
	// The number of calls to ReadAt where some data returned was read from the cache.
	ReadsWithPartialHit int64
	// The number of calls to ReadAt where no data returned was read from the cache.
	ReadsWithNoHit int64

	// The number of times a cache block was evicted from the cache.
	Evictions int64
	// The number of times writing a cache block to the cache failed.
	WriteBackFailures int64

	// The latency of calls to get some data from the cache.
	GetLatency prometheus.Histogram
	// The latency of reads of a single cache block from disk.
	DiskReadLatency prometheus.Histogram
	// The latency of handing data off to the write-back channel. Generally
	// should be low, but can be high if the channel is full.
	QueuePutLatency prometheus.Histogram
	// The latency of calls to put some data read from block storage into the cache.
	PutLatency prometheus.Histogram
	// The latency of writes of a single cache block to disk.
	DiskWriteLatency prometheus.Histogram
}

// See docs at Metrics.
type internalMetrics struct {
	count atomic.Int64

	totalReads          atomic.Int64
	multiShardReads     atomic.Int64
	multiBlockReads     atomic.Int64
	readsWithFullHit    atomic.Int64
	readsWithPartialHit atomic.Int64
	readsWithNoHit      atomic.Int64

	evictions         atomic.Int64
	writeBackFailures atomic.Int64

	getLatency       prometheus.Histogram
	diskReadLatency  prometheus.Histogram
	queuePutLatency  prometheus.Histogram
	putLatency       prometheus.Histogram
	diskWriteLatency prometheus.Histogram
}

const (
	// writeWorkersPerShard is used to establish the number of worker goroutines
	// that perform writes to the cache.
	writeWorkersPerShard = 4
	// writeTasksPerWorker is used to establish how many tasks can be queued up
	// until we have to block.
	writeTasksPerWorker = 4
)

// Open opens a cache. If there is no existing cache at fsDir, a new one
// is created.
func Open(
	fs vfs.FS,
	logger base.Logger,
	fsDir string,
	blockSize int,
	// shardingBlockSize is the size of a shard block. The cache is split into contiguous
	// shardingBlockSize units. The units are distributed across multiple independent shards
	// of the cache, via a hash(offset) modulo num shards operation. The cache replacement
	// policies operate at the level of shard, not whole cache. This is done to reduce lock
	// contention.
	shardingBlockSize int64,
	sizeBytes int64,
	numShards int,
) (*Cache, error) {
	if minSize := shardingBlockSize * int64(numShards); sizeBytes < minSize {
		// Up the size so that we have one block per shard. In practice, this should
		// only happen in tests.
		sizeBytes = minSize
	}

	c := &Cache{
		logger:            logger,
		bm:                makeBlockMath(blockSize),
		shardingBlockSize: shardingBlockSize,
	}
	c.shards = make([]shard, numShards)
	blocksPerShard := sizeBytes / int64(numShards) / int64(blockSize)
	for i := range c.shards {
		if err := c.shards[i].init(c, fs, fsDir, i, blocksPerShard, blockSize, shardingBlockSize); err != nil {
			return nil, err
		}
	}

	c.writeWorkers.Start(c, numShards*writeWorkersPerShard)

	c.metrics.getLatency = prometheus.NewHistogram(prometheus.HistogramOpts{Buckets: IOBuckets})
	c.metrics.diskReadLatency = prometheus.NewHistogram(prometheus.HistogramOpts{Buckets: IOBuckets})
	c.metrics.putLatency = prometheus.NewHistogram(prometheus.HistogramOpts{Buckets: IOBuckets})
	c.metrics.diskWriteLatency = prometheus.NewHistogram(prometheus.HistogramOpts{Buckets: IOBuckets})

	// Measures a channel write, so lower min.
	c.metrics.queuePutLatency = prometheus.NewHistogram(prometheus.HistogramOpts{Buckets: ChannelWriteBuckets})

	return c, nil
}

// Close closes the cache. Methods such as ReadAt should not be called after Close is
// called.
func (c *Cache) Close() error {
	c.writeWorkers.Stop()

	var retErr error
	for i := range c.shards {
		if err := c.shards[i].close(); err != nil && retErr == nil {
			retErr = err
		}
	}
	c.shards = nil
	return retErr
}
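
// The sketch below is editorial illustration only and is not called anywhere
// in this package: it shows one plausible way to open a small cache over an
// in-memory filesystem, read the (initially empty) metrics, and close it. The
// sizes and the choice of base.DefaultLogger are assumptions made for the
// sketch, not requirements of the API.
func exampleOpenAndClose() {
	fs := vfs.NewMem()
	// 32 KB cache blocks, 1 MB sharding blocks, a 32 MB cache split across 8 shards.
	c, err := Open(fs, base.DefaultLogger, "", 32<<10, 1<<20, 32<<20, 8)
	if err != nil {
		panic(err)
	}
	defer c.Close()

	m := c.Metrics()
	fmt.Printf("cache holds %d blocks (%d bytes)\n", m.Count, m.Size)
}
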
// Metrics returns metrics for the cache. Callers should not mutate
// the returned histograms, which are pointer types.
func (c *Cache) Metrics() Metrics {
	return Metrics{
		Count:               c.metrics.count.Load(),
		Size:                c.metrics.count.Load() * int64(c.bm.BlockSize()),
		TotalReads:          c.metrics.totalReads.Load(),
		MultiShardReads:     c.metrics.multiShardReads.Load(),
		MultiBlockReads:     c.metrics.multiBlockReads.Load(),
		ReadsWithFullHit:    c.metrics.readsWithFullHit.Load(),
		ReadsWithPartialHit: c.metrics.readsWithPartialHit.Load(),
		ReadsWithNoHit:      c.metrics.readsWithNoHit.Load(),
		Evictions:           c.metrics.evictions.Load(),
		WriteBackFailures:   c.metrics.writeBackFailures.Load(),
		GetLatency:          c.metrics.getLatency,
		DiskReadLatency:     c.metrics.diskReadLatency,
		QueuePutLatency:     c.metrics.queuePutLatency,
		PutLatency:          c.metrics.putLatency,
		DiskWriteLatency:    c.metrics.diskWriteLatency,
	}
}

// ReadFlags contains options for Cache.ReadAt.
type ReadFlags struct {
	// ReadOnly instructs ReadAt to not write any new data into the cache; it is
	// used when the data is unlikely to be used again.
	ReadOnly bool
}

// ReadAt performs a read from an object, attempting to use cached data when
// possible.
func (c *Cache) ReadAt(
	ctx context.Context,
	fileNum base.DiskFileNum,
	p []byte,
	ofs int64,
	objReader remote.ObjectReader,
	objSize int64,
	flags ReadFlags,
) error {
	c.metrics.totalReads.Add(1)
	if ofs >= objSize {
		if invariants.Enabled {
			panic(fmt.Sprintf("invalid ReadAt offset %v %v", ofs, objSize))
		}
		return io.EOF
	}
	// TODO(radu): for compaction reads, we may not want to read from the cache at
	// all.
	{
		start := time.Now()
		n, err := c.get(fileNum, p, ofs)
		c.metrics.getLatency.Observe(float64(time.Since(start)))
		if err != nil {
			return err
		}
		if n == len(p) {
			// Everything was in cache!
			c.metrics.readsWithFullHit.Add(1)
			return nil
		}
		if n == 0 {
			c.metrics.readsWithNoHit.Add(1)
		} else {
			c.metrics.readsWithPartialHit.Add(1)
		}

		// The code below does not need the original ofs: now that the read from
		// the cache is done, the relevant offset is ofs + int64(n). The same
		// applies to p.
		ofs += int64(n)
		p = p[n:]

		if invariants.Enabled {
			if n != 0 && c.bm.Remainder(ofs) != 0 {
				panic(fmt.Sprintf("after non-zero read from cache, ofs is not block-aligned: %v %v", ofs, n))
			}
		}
	}

	if flags.ReadOnly {
		return objReader.ReadAt(ctx, p, ofs)
	}

	// We must do reads with offset & size that are multiples of the block size.
	// Otherwise, later cache hits may return incorrect zeroed results from the cache.
	firstBlockInd := c.bm.Block(ofs)
	adjustedOfs := c.bm.BlockOffset(firstBlockInd)

	// The adjusted length covers what is left to read plus the adjustment of the
	// offset, rounded up to a multiple of the block size.
	sizeOfOffAdjustment := int(ofs - adjustedOfs)
	adjustedLen := int(c.bm.RoundUp(int64(len(p) + sizeOfOffAdjustment)))
	adjustedP := make([]byte, adjustedLen)

	// Read the rest from the object. We may need to cap the length to avoid past EOF reads.
	eofCap := int64(adjustedLen)
	if adjustedOfs+eofCap > objSize {
		eofCap = objSize - adjustedOfs
	}
	if err := objReader.ReadAt(ctx, adjustedP[:eofCap], adjustedOfs); err != nil {
		return err
	}
	copy(p, adjustedP[sizeOfOffAdjustment:])

	start := time.Now()
	c.writeWorkers.QueueWrite(fileNum, adjustedP, adjustedOfs)
	c.metrics.queuePutLatency.Observe(float64(time.Since(start)))

	return nil
}

// get attempts to read the requested data from the cache, if it is already
// there.
//
// If all data is available, returns n = len(p).
//
// If data is partially available, a prefix of the data is read; returns n < len(p)
// and no error. If no prefix is available, returns n = 0 and no error.
func (c *Cache) get(fileNum base.DiskFileNum, p []byte, ofs int64) (n int, _ error) {
	// The data extent might cross shard boundaries, hence the loop. In the hot
	// path, max two iterations of this loop will be executed, since reads are sized
	// in units of sstable block size.
	var multiShard bool
	for {
		shard := c.getShard(fileNum, ofs+int64(n))
		cappedLen := len(p[n:])
		if toBoundary := int(c.shardingBlockSize - ((ofs + int64(n)) % c.shardingBlockSize)); cappedLen > toBoundary {
			cappedLen = toBoundary
		}
		numRead, err := shard.get(fileNum, p[n:n+cappedLen], ofs+int64(n))
		if err != nil {
			return n, err
		}
		n += numRead
		if numRead < cappedLen {
			// We only read a prefix from this shard.
			return n, nil
		}
		if n == len(p) {
			// We are done.
			return n, nil
		}
		// Data extent crosses shard boundary, continue with next shard.
		if !multiShard {
			c.metrics.multiShardReads.Add(1)
			multiShard = true
		}
	}
}

// set attempts to write the requested data to the cache. Both ofs & len(p) must
// be multiples of the block size.
//
// If not all of p is written to the cache, set returns a non-nil error.
func (c *Cache) set(fileNum base.DiskFileNum, p []byte, ofs int64) error {
	if invariants.Enabled {
		if c.bm.Remainder(ofs) != 0 || c.bm.Remainder(int64(len(p))) != 0 {
			panic(fmt.Sprintf("set with ofs & len not multiples of block size: %v %v", ofs, len(p)))
		}
	}

	// The data extent might cross shard boundaries, hence the loop. In the hot
	// path, max two iterations of this loop will be executed, since reads are sized
	// in units of sstable block size.
	n := 0
	for {
		shard := c.getShard(fileNum, ofs+int64(n))
		cappedLen := len(p[n:])
		if toBoundary := int(c.shardingBlockSize - ((ofs + int64(n)) % c.shardingBlockSize)); cappedLen > toBoundary {
			cappedLen = toBoundary
		}
		err := shard.set(fileNum, p[n:n+cappedLen], ofs+int64(n))
		if err != nil {
			return err
		}
		// set returns an error if cappedLen bytes aren't written to the shard.
		n += cappedLen
		if n == len(p) {
			// We are done.
			return nil
		}
		// Data extent crosses shard boundary, continue with next shard.
	}
}
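
// The following is an editorial sketch, not used by the cache itself: it walks
// through the offset-alignment arithmetic that ReadAt performs before issuing
// a remote read, assuming a hypothetical 32 KB cache block size and the
// concrete offsets noted in the comments.
func exampleReadAtAlignment() {
	bm := makeBlockMath(32 << 10)

	// Suppose the portion of a read that missed the cache starts at object
	// offset 100_000 and is 10_000 bytes long.
	ofs := int64(100_000)
	remaining := 10_000

	// ReadAt aligns the remote read to cache block boundaries so that the data
	// later written back to the cache is block-aligned.
	firstBlockInd := bm.Block(ofs)                // block 3, since 3*32_768 = 98_304
	adjustedOfs := bm.BlockOffset(firstBlockInd)  // 98_304
	sizeOfOffAdjustment := int(ofs - adjustedOfs) // 1_696
	// 10_000 + 1_696 rounded up to a block boundary is 32_768.
	adjustedLen := int(bm.RoundUp(int64(remaining + sizeOfOffAdjustment)))

	fmt.Printf("remote read at offset %d of length %d\n", adjustedOfs, adjustedLen)
}
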
func (c *Cache) getShard(fileNum base.DiskFileNum, ofs int64) *shard {
	const prime64 = 1099511628211
	hash := uint64(fileNum.FileNum())*prime64 + uint64(ofs/c.shardingBlockSize)
	// TODO(josh): Instance change ops are often run in production. Such an operation
	// updates len(c.shards); see openSharedCache. As a result, the behavior of this
	// function changes, and the cache empties out at restart time. We may want a better
	// story here eventually.
	return &c.shards[hash%uint64(len(c.shards))]
}

type shard struct {
	cache             *Cache
	file              vfs.File
	sizeInBlocks      int64
	bm                blockMath
	shardingBlockSize int64
	mu                struct {
		sync.Mutex
		// TODO(josh): None of these datastructures are space-efficient.
		// Focusing on correctness to start.
		where  whereMap
		blocks []cacheBlockState
		// Head of LRU list (doubly-linked circular).
		lruHead cacheBlockIndex
		// Head of free list (singly-linked chain).
		freeHead cacheBlockIndex
	}
}

type cacheBlockState struct {
	lock    lockState
	logical logicalBlockID

	// next is the next block in the LRU or free list (or invalidBlockIndex if it
	// is the last block in the free list).
	next cacheBlockIndex

	// prev is the previous block in the LRU list. It is not used when the block
	// is in the free list.
	prev cacheBlockIndex
}

// Maps a logical block in an SST to an index of the cache block with the
// file contents (to the "cache block index").
type whereMap map[logicalBlockID]cacheBlockIndex

type logicalBlockID struct {
	filenum       base.DiskFileNum
	cacheBlockIdx cacheBlockIndex
}

type lockState int64

const (
	unlocked lockState = 0
	// >0 lockState tracks the number of distinct readers of some cache block / logical block
	// which is in the secondary cache. It is used to ensure that a cache block is not evicted
	// and overwritten, while there are active readers.
	readLockTakenInc = 1
	// -1 lockState indicates that some cache block is currently being populated with data from
	// blob storage. It is used to ensure that a cache block is not read or evicted again, while
	// it is being populated.
	writeLockTaken = -1
)

func (s *shard) init(
	cache *Cache,
	fs vfs.FS,
	fsDir string,
	shardIdx int,
	sizeInBlocks int64,
	blockSize int,
	shardingBlockSize int64,
) error {
	*s = shard{
		cache:        cache,
		sizeInBlocks: sizeInBlocks,
	}
	if blockSize < 1024 || shardingBlockSize%int64(blockSize) != 0 {
		return errors.Newf("invalid block size %d (must divide %d)", blockSize, shardingBlockSize)
	}
	s.bm = makeBlockMath(blockSize)
	s.shardingBlockSize = shardingBlockSize
	file, err := fs.OpenReadWrite(fs.PathJoin(fsDir, fmt.Sprintf("SHARED-CACHE-%03d", shardIdx)))
	if err != nil {
		return err
	}
	// TODO(radu): truncate file if necessary (especially important if we restart
	// with more shards).
	if err := file.Preallocate(0, int64(blockSize)*sizeInBlocks); err != nil {
		return err
	}
	s.file = file

	// TODO(josh): Right now, the secondary cache is not persistent. All existing
	// cache contents will be over-written, since all metadata is only stored in
	// memory.
	s.mu.where = make(whereMap)
	s.mu.blocks = make([]cacheBlockState, sizeInBlocks)
	s.mu.lruHead = invalidBlockIndex
	s.mu.freeHead = invalidBlockIndex
	for i := range s.mu.blocks {
		s.freePush(cacheBlockIndex(i))
	}

	return nil
}

func (s *shard) close() error {
	defer func() {
		s.file = nil
	}()
	return s.file.Close()
}
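
// Editorial sketch, not part of the cache: it illustrates the lockState
// encoding defined above, in which a positive value counts concurrent readers,
// writeLockTaken (-1) marks a block that is being populated, and unlocked (0)
// means the block may be evicted.
func exampleLockStateTransitions() {
	var l lockState = unlocked
	l += readLockTakenInc // first reader arrives; l == 1
	l += readLockTakenInc // a second reader; l == 2
	l -= readLockTakenInc
	l -= readLockTakenInc // both readers done; l == unlocked again
	if l != unlocked {
		panic("unexpected lock state in sketch")
	}
	l = writeLockTaken // the block is now being populated and cannot be read or evicted
	_ = l
}
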
// freePush pushes a block to the front of the free list.
func (s *shard) freePush(index cacheBlockIndex) {
	s.mu.blocks[index].next = s.mu.freeHead
	s.mu.freeHead = index
}

// freePop removes the block from the front of the free list. Must not be called
// if the list is empty (i.e. freeHead = invalidBlockIndex).
func (s *shard) freePop() cacheBlockIndex {
	index := s.mu.freeHead
	s.mu.freeHead = s.mu.blocks[index].next
	return index
}

// lruInsertFront inserts a block at the front of the LRU list.
func (s *shard) lruInsertFront(index cacheBlockIndex) {
	b := &s.mu.blocks[index]
	if s.mu.lruHead == invalidBlockIndex {
		b.next = index
		b.prev = index
	} else {
		b.next = s.mu.lruHead
		h := &s.mu.blocks[s.mu.lruHead]
		b.prev = h.prev
		s.mu.blocks[h.prev].next = index
		h.prev = index
	}
	s.mu.lruHead = index
}

func (s *shard) lruNext(index cacheBlockIndex) cacheBlockIndex {
	return s.mu.blocks[index].next
}

func (s *shard) lruPrev(index cacheBlockIndex) cacheBlockIndex {
	return s.mu.blocks[index].prev
}

// lruUnlink removes a block from the LRU list.
func (s *shard) lruUnlink(index cacheBlockIndex) {
	b := &s.mu.blocks[index]
	if b.next == index {
		s.mu.lruHead = invalidBlockIndex
	} else {
		s.mu.blocks[b.prev].next = b.next
		s.mu.blocks[b.next].prev = b.prev
		if s.mu.lruHead == index {
			s.mu.lruHead = b.next
		}
	}
	b.next, b.prev = invalidBlockIndex, invalidBlockIndex
}

// get attempts to read the requested data from the shard. The data must not
// cross a shard boundary.
//
// If all data is available, returns n = len(p).
//
// If data is partially available, a prefix of the data is read; returns n < len(p)
// and no error. If no prefix is available, returns n = 0 and no error.
//
// TODO(josh): Today, if there are two cache blocks needed to satisfy a read, and the
// first block is not in the cache and the second one is, we will read both from
// blob storage. We should fix this. This is not an unlikely scenario if we are doing
// a reverse scan, since those iterate over sstable blocks in reverse order and due to
// cache block aligned reads will have read the suffix of the sstable block that will
// be needed next.
func (s *shard) get(fileNum base.DiskFileNum, p []byte, ofs int64) (n int, _ error) {
	if invariants.Enabled {
		if ofs/s.shardingBlockSize != (ofs+int64(len(p))-1)/s.shardingBlockSize {
			panic(fmt.Sprintf("get crosses shard boundary: %v %v", ofs, len(p)))
		}
		s.assertShardStateIsConsistent()
	}

	// The data extent might cross cache block boundaries, hence the loop. In the hot
	// path, max two iterations of this loop will be executed, since reads are sized
	// in units of sstable block size.
	var multiBlock bool
	for {
		k := logicalBlockID{
			filenum:       fileNum,
			cacheBlockIdx: s.bm.Block(ofs + int64(n)),
		}
		s.mu.Lock()
		cacheBlockIdx, ok := s.mu.where[k]
		// TODO(josh): Multiple reads within the same few milliseconds (anything that is smaller
		// than blob storage read latency) that miss on the same logical block ID will not necessarily
		// be rare. We may want to do only one read, with the later readers blocking on the first read
		// completing. This could be implemented either here or in the primary block cache. See
		// https://github.com/cockroachdb/pebble/pull/2586 for additional discussion.
		if !ok {
			s.mu.Unlock()
			return n, nil
		}
		if s.mu.blocks[cacheBlockIdx].lock == writeLockTaken {
			// In practice, if we have two reads of the same SST block in close succession, we
			// would expect the second to hit in the in-memory block cache. So it's not worth
			// optimizing this case here.
			s.mu.Unlock()
			return n, nil
		}
		s.mu.blocks[cacheBlockIdx].lock += readLockTakenInc
		// Move to front of the LRU list.
		s.lruUnlink(cacheBlockIdx)
		s.lruInsertFront(cacheBlockIdx)
		s.mu.Unlock()

		readAt := s.bm.BlockOffset(cacheBlockIdx)
		readSize := s.bm.BlockSize()
		if n == 0 { // if first read
			rem := s.bm.Remainder(ofs)
			readAt += rem
			readSize -= int(rem)
		}

		if len(p[n:]) <= readSize {
			start := time.Now()
			numRead, err := s.file.ReadAt(p[n:], readAt)
			s.cache.metrics.diskReadLatency.Observe(float64(time.Since(start)))
			s.dropReadLock(cacheBlockIdx)
			return n + numRead, err
		}
		start := time.Now()
		numRead, err := s.file.ReadAt(p[n:n+readSize], readAt)
		s.cache.metrics.diskReadLatency.Observe(float64(time.Since(start)))
		s.dropReadLock(cacheBlockIdx)
		if err != nil {
			return 0, err
		}

		// Note that numRead == readSize, since we checked for an error above.
		n += numRead

		if !multiBlock {
			s.cache.metrics.multiBlockReads.Add(1)
			multiBlock = true
		}
	}
}

// set attempts to write the requested data to the shard. The data must not
// cross a shard boundary, and both ofs & len(p) must be multiples of the
// block size.
//
// If not all of p is written to the shard, set returns a non-nil error.
func (s *shard) set(fileNum base.DiskFileNum, p []byte, ofs int64) error {
	if invariants.Enabled {
		if ofs/s.shardingBlockSize != (ofs+int64(len(p))-1)/s.shardingBlockSize {
			panic(fmt.Sprintf("set crosses shard boundary: %v %v", ofs, len(p)))
		}
		if s.bm.Remainder(ofs) != 0 || s.bm.Remainder(int64(len(p))) != 0 {
			panic(fmt.Sprintf("set with ofs & len not multiples of block size: %v %v", ofs, len(p)))
		}
		s.assertShardStateIsConsistent()
	}

	// The data extent might cross cache block boundaries, hence the loop. In the hot
	// path, max two iterations of this loop will be executed, since reads are sized
	// in units of sstable block size.
	n := 0
	for {
		if n == len(p) {
			return nil
		}
		if invariants.Enabled {
			if n > len(p) {
				panic(fmt.Sprintf("set with n greater than len(p): %v %v", n, len(p)))
			}
		}

		// If the logical block is already in the cache, we should skip doing a set.
		k := logicalBlockID{
			filenum:       fileNum,
			cacheBlockIdx: s.bm.Block(ofs + int64(n)),
		}
		s.mu.Lock()
		if _, ok := s.mu.where[k]; ok {
			s.mu.Unlock()
			n += s.bm.BlockSize()
			continue
		}

		var cacheBlockIdx cacheBlockIndex
		if s.mu.freeHead == invalidBlockIndex {
			if invariants.Enabled && s.mu.lruHead == invalidBlockIndex {
				panic("both LRU and free lists empty")
			}

			// Find the last element in the LRU list which is not locked.
			for idx := s.lruPrev(s.mu.lruHead); ; idx = s.lruPrev(idx) {
				if lock := s.mu.blocks[idx].lock; lock == unlocked {
					cacheBlockIdx = idx
					break
				}
				if idx == s.mu.lruHead {
					// No unlocked block to evict.
					//
					// TODO(josh): We may want to block until a block frees up, instead of returning
					// an error here. But I think we can do that later on, e.g. after running some
					// production experiments.
					s.mu.Unlock()
					return errors.New("no block to evict so skipping write to cache")
				}
			}
			s.cache.metrics.evictions.Add(1)
			s.lruUnlink(cacheBlockIdx)
			delete(s.mu.where, s.mu.blocks[cacheBlockIdx].logical)
		} else {
			s.cache.metrics.count.Add(1)
			cacheBlockIdx = s.freePop()
		}

		s.lruInsertFront(cacheBlockIdx)
		s.mu.where[k] = cacheBlockIdx
		s.mu.blocks[cacheBlockIdx].logical = k
		s.mu.blocks[cacheBlockIdx].lock = writeLockTaken
		s.mu.Unlock()

		writeAt := s.bm.BlockOffset(cacheBlockIdx)

		writeSize := s.bm.BlockSize()
		if len(p[n:]) <= writeSize {
			writeSize = len(p[n:])
		}

		start := time.Now()
		_, err := s.file.WriteAt(p[n:n+writeSize], writeAt)
		s.cache.metrics.diskWriteLatency.Observe(float64(time.Since(start)))
		if err != nil {
			// Free the block.
			s.mu.Lock()
			defer s.mu.Unlock()

			delete(s.mu.where, k)
			s.lruUnlink(cacheBlockIdx)
			s.freePush(cacheBlockIdx)
			return err
		}
		s.dropWriteLock(cacheBlockIdx)
		n += writeSize
	}
}

// Doesn't inline currently. This might be okay, but something to keep in mind.
func (s *shard) dropReadLock(cacheBlockInd cacheBlockIndex) {
	s.mu.Lock()
	s.mu.blocks[cacheBlockInd].lock -= readLockTakenInc
	if invariants.Enabled && s.mu.blocks[cacheBlockInd].lock < 0 {
		panic(fmt.Sprintf("unexpected lock state %v in dropReadLock", s.mu.blocks[cacheBlockInd].lock))
	}
	s.mu.Unlock()
}

// Doesn't inline currently. This might be okay, but something to keep in mind.
func (s *shard) dropWriteLock(cacheBlockInd cacheBlockIndex) {
	s.mu.Lock()
	if invariants.Enabled && s.mu.blocks[cacheBlockInd].lock != writeLockTaken {
		panic(fmt.Sprintf("unexpected lock state %v in dropWriteLock", s.mu.blocks[cacheBlockInd].lock))
	}
	s.mu.blocks[cacheBlockInd].lock = unlocked
	s.mu.Unlock()
}

func (s *shard) assertShardStateIsConsistent() {
	s.mu.Lock()
	defer s.mu.Unlock()

	lruLen := 0
	if s.mu.lruHead != invalidBlockIndex {
		for b := s.mu.lruHead; ; {
			lruLen++
			if idx, ok := s.mu.where[s.mu.blocks[b].logical]; !ok || idx != b {
				panic("block in LRU list with no entry in where map")
			}
			b = s.lruNext(b)
			if b == s.mu.lruHead {
				break
			}
		}
	}
	if lruLen != len(s.mu.where) {
		panic(fmt.Sprintf("lru list len is %d but where map has %d entries", lruLen, len(s.mu.where)))
	}
	freeLen := 0
	for n := s.mu.freeHead; n != invalidBlockIndex; n = s.mu.blocks[n].next {
		freeLen++
	}

	if lruLen+freeLen != int(s.sizeInBlocks) {
		panic(fmt.Sprintf("%d lru blocks and %d free blocks don't add up to %d", lruLen, freeLen, s.sizeInBlocks))
	}
	for i := range s.mu.blocks {
		if state := s.mu.blocks[i].lock; state < writeLockTaken {
			panic(fmt.Sprintf("lock state %v is not allowed", state))
		}
	}
}

// cacheBlockIndex is the index of a blockSize-aligned cache block.
type cacheBlockIndex int64

// invalidBlockIndex is used for the head of a list when the list is empty.
const invalidBlockIndex cacheBlockIndex = -1

// blockMath is a helper type for performing conversions between offsets and
// block indexes.
type blockMath struct {
	blockSizeBits int8
}

func makeBlockMath(blockSize int) blockMath {
	bm := blockMath{
		blockSizeBits: int8(bits.Len64(uint64(blockSize)) - 1),
	}
	if blockSize != (1 << bm.blockSizeBits) {
		panic(fmt.Sprintf("blockSize %d is not a power of 2", blockSize))
	}
	return bm
}

func (bm blockMath) mask() int64 {
	return (1 << bm.blockSizeBits) - 1
}

// BlockSize returns the block size.
func (bm blockMath) BlockSize() int {
	return 1 << bm.blockSizeBits
}

// Block returns the block index containing the given offset.
func (bm blockMath) Block(offset int64) cacheBlockIndex {
	return cacheBlockIndex(offset >> bm.blockSizeBits)
}

// Remainder returns the offset relative to the start of the cache block.
func (bm blockMath) Remainder(offset int64) int64 {
	return offset & bm.mask()
}

// BlockOffset returns the object offset where the given block starts.
func (bm blockMath) BlockOffset(block cacheBlockIndex) int64 {
	return int64(block) << bm.blockSizeBits
}

// RoundUp rounds up the given value to the closest multiple of block size.
func (bm blockMath) RoundUp(x int64) int64 {
	return (x + bm.mask()) & ^(bm.mask())
}

type writeWorkers struct {
	doneCh        chan struct{}
	doneWaitGroup sync.WaitGroup

	numWorkers int
	tasksCh    chan writeTask
}

type writeTask struct {
	fileNum base.DiskFileNum
	p       []byte
	offset  int64
}

// Start starts the worker goroutines.
func (w *writeWorkers) Start(c *Cache, numWorkers int) {
	doneCh := make(chan struct{})
	tasksCh := make(chan writeTask, numWorkers*writeTasksPerWorker)

	w.numWorkers = numWorkers
	w.doneCh = doneCh
	w.tasksCh = tasksCh
	w.doneWaitGroup.Add(numWorkers)
	for i := 0; i < numWorkers; i++ {
		go func() {
			defer w.doneWaitGroup.Done()
			for {
				select {
				case <-doneCh:
					return
				case task, ok := <-tasksCh:
					if !ok {
						// The tasks channel was closed; this is used in testing code to
						// ensure all writes are completed.
						return
					}
					// TODO(radu): set() can perform multiple writes; perhaps each one
					// should be its own task.
					start := time.Now()
					err := c.set(task.fileNum, task.p, task.offset)
					c.metrics.putLatency.Observe(float64(time.Since(start)))
					if err != nil {
						c.metrics.writeBackFailures.Add(1)
						// TODO(radu): throttle logs.
						c.logger.Infof("writing back to cache after miss failed: %v", err)
					}
				}
			}
		}()
	}
}

// Stop stops the worker goroutines and waits for any in-progress writes to
// complete. Any queued writes not yet started are discarded.
func (w *writeWorkers) Stop() {
	close(w.doneCh)
	w.doneCh = nil
	w.tasksCh = nil
	w.doneWaitGroup.Wait()
}

// QueueWrite adds a write task to the queue. Can block if the queue is full.
func (w *writeWorkers) QueueWrite(fileNum base.DiskFileNum, p []byte, offset int64) {
	w.tasksCh <- writeTask{
		fileNum: fileNum,
		p:       p,
		offset:  offset,
	}
}
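
// Editorial sketch, not exercised by this package: concrete values for the
// blockMath conversions, assuming a hypothetical 64 KB cache block size.
func exampleBlockMath() {
	bm := makeBlockMath(64 << 10)

	// Offset 200_000 falls in block 3, since 3*65_536 = 196_608.
	if got := bm.Block(200_000); got != 3 {
		panic(fmt.Sprintf("unexpected block index %d", got))
	}
	// The offset within that block is 200_000 - 196_608 = 3_392.
	if got := bm.Remainder(200_000); got != 3_392 {
		panic(fmt.Sprintf("unexpected remainder %d", got))
	}
	// Block 3 starts at offset 196_608.
	if got := bm.BlockOffset(3); got != 196_608 {
		panic(fmt.Sprintf("unexpected block offset %d", got))
	}
	// Rounding 200_000 up to a block boundary gives 4*65_536 = 262_144.
	if got := bm.RoundUp(200_000); got != 262_144 {
		panic(fmt.Sprintf("unexpected rounded value %d", got))
	}
}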