github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/raftentry/cache.go

// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

// Package raftentry provides a cache for entries to avoid extra
// deserializations.
package raftentry

import (
	"math"
	"sync/atomic"
	"unsafe"

	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
	"github.com/cockroachdb/errors"
	"go.etcd.io/etcd/raft/raftpb"
)

// Cache is a specialized data structure for storing deserialized raftpb.Entry
// values tailored to the access patterns of the storage package.
// Cache is safe for concurrent access.
type Cache struct {
	metrics  Metrics
	maxBytes int32

	// accessed with atomics
	bytes   int32
	entries int32

	mu    syncutil.Mutex
	lru   partitionList
	parts map[roachpb.RangeID]*partition
}

// Design
//
// Cache is designed to be a shared store-wide object which incurs low
// contention for operations on different ranges while maintaining a global
// memory policy. This is achieved through the use of a two-level locking
// scheme. Cache.mu is acquired to access any data in the cache (Add, Clear,
// Get, or Scan) in order to locate the partition for the operation and update
// the LRU state. In the case of Add operations, partitions are lazily
// constructed under the lock. In addition to partition location, Add
// operations record the maximal amount of space that the write may add to the
// cache, accepting that in certain cases, less space may actually be consumed
// leading to unnecessary evictions. Once a partition has been located (or not
// found) and LRU state has been appropriately modified, operations release
// Cache.mu and proceed by operating on the partition under its RWMutex.
//
// This disjoint, two-level locking pattern permits the "anomaly" whereby a
// partition may be accessed and evicted concurrently. This condition is made
// safe in the implementation by using atomics to update the cache bookkeeping
// and by taking care to not mutate the partition's cache state upon eviction.
// As noted above, the Cache and partition's bookkeeping is updated with an
// initial estimate of the byte size of an addition while holding Cache.mu.
// Because empty additions are elided, this initial bookkeeping guarantees that
// the cacheSize of the partition is non-zero while an Add operation proceeds
// unless the partition has been evicted. The updated value of partition.size
// is recorded before releasing Cache.mu. When a partition mutation operation
// concludes, the Cache's stats need to be updated such that they reflect the
// new reality. This update (Cache.recordUpdate) is mediated through the use of
// an atomic compare-and-swap operation on partition.size. If the operation
// succeeds, then we know that future evictions of this partition will see the
// new updated partition.size and so any delta from what was optimistically
// recorded in the Cache stats should be updated (using atomics, see
// add(Bytes|Entries)).
// If the operation fails, then we know that any changes just made to the
// partition are no longer stored in the cache and thus the Cache stats shall
// not change.
//
// This approach admits several undesirable conditions; fortunately, they
// aren't practical concerns.
//
// 1) Evicted partitions are reclaimed asynchronously only after operations
//    concurrent with evictions complete.
// 2) Memory reuse with object pools is difficult.

type partition struct {
	id roachpb.RangeID

	mu      syncutil.RWMutex
	ringBuf // implements rangeCache, embedded to avoid interface and allocation

	size cacheSize // accessed with atomics

	next, prev *partition // accessed under Cache.mu
}

const partitionSize = int32(unsafe.Sizeof(partition{}))

// rangeCache represents the interface that the partition uses.
// It is never used explicitly, but any new implementation that replaces
// ringBuf must implement the interface below.
type rangeCache interface {
	add(ent []raftpb.Entry) (bytesAdded, entriesAdded int32)
	truncateFrom(lo uint64) (bytesRemoved, entriesRemoved int32)
	clearTo(hi uint64) (bytesRemoved, entriesRemoved int32)
	get(index uint64) (raftpb.Entry, bool)
	scan(ents []raftpb.Entry, lo, hi, maxBytes uint64) (
		_ []raftpb.Entry, bytes uint64, nextIdx uint64, exceededMaxBytes bool)
}

// ringBuf implements rangeCache.
var _ rangeCache = (*ringBuf)(nil)

// NewCache creates a cache with a max size.
// Size must be less than math.MaxInt32.
func NewCache(maxBytes uint64) *Cache {
	if maxBytes > math.MaxInt32 {
		maxBytes = math.MaxInt32
	}
	return &Cache{
		maxBytes: int32(maxBytes),
		metrics:  makeMetrics(),
		parts:    map[roachpb.RangeID]*partition{},
	}
}

// Metrics returns a struct which contains metrics for the raft entry cache.
func (c *Cache) Metrics() Metrics {
	return c.metrics
}

// Drop drops all cached entries associated with the specified range.
func (c *Cache) Drop(id roachpb.RangeID) {
	c.mu.Lock()
	defer c.mu.Unlock()
	p := c.getPartLocked(id, false /* create */, false /* recordUse */)
	if p != nil {
		c.updateGauges(c.evictPartitionLocked(p))
	}
}

// Add inserts ents into the cache. If truncate is true, the method also
// removes all entries with indices equal to or greater than the indices of
// the entries provided. ents is expected to consist of entries with a
// contiguous sequence of indices.
func (c *Cache) Add(id roachpb.RangeID, ents []raftpb.Entry, truncate bool) {
	if len(ents) == 0 {
		return
	}
	bytesGuessed := analyzeEntries(ents)
	add := bytesGuessed <= c.maxBytes
	if !add {
		bytesGuessed = 0
	}

	c.mu.Lock()
	// Get p and move the partition to the front of the LRU.
	p := c.getPartLocked(id, add /* create */, true /* recordUse */)
	if bytesGuessed > 0 {
		c.evictLocked(bytesGuessed)
		if len(c.parts) == 0 { // Get p again if we evicted everything.
			p = c.getPartLocked(id, true /* create */, false /* recordUse */)
		}
		// Use the atomic (load|set)Size partition methods to avoid a race
		// condition on p.size and to ensure that p.size.bytes() reflects the
		// number of bytes in c.bytes associated with p in the face of
		// concurrent updates due to calls to c.recordUpdate.
		for {
			prev := p.loadSize()
			if p.setSize(prev, prev.add(bytesGuessed, 0)) {
				break
			}
		}
	}
	c.mu.Unlock()
	if p == nil {
		// The partition did not exist and we did not create it.
		// Only possible if !add.
		return
	}

	p.mu.Lock()
	defer p.mu.Unlock()
	var bytesAdded, entriesAdded, bytesRemoved, entriesRemoved int32
	if add {
		bytesAdded, entriesAdded = p.add(ents)
	}
	if truncate {
		truncIdx := ents[0].Index
		if add {
			// Some entries were already overwritten.
			truncIdx = ents[len(ents)-1].Index + 1
		}
		bytesRemoved, entriesRemoved = p.truncateFrom(truncIdx)
	}
	c.recordUpdate(p, bytesAdded-bytesRemoved, bytesGuessed, entriesAdded-entriesRemoved)
}

// Clear removes all entries on the given range with index less than hi.
func (c *Cache) Clear(id roachpb.RangeID, hi uint64) {
	c.mu.Lock()
	p := c.getPartLocked(id, false /* create */, false /* recordUse */)
	if p == nil {
		c.mu.Unlock()
		return
	}
	c.mu.Unlock()
	p.mu.Lock()
	defer p.mu.Unlock()
	bytesRemoved, entriesRemoved := p.clearTo(hi)
	c.recordUpdate(p, -1*bytesRemoved, 0, -1*entriesRemoved)
}

// Get returns the entry for the specified index and true for the second
// return value. If the index is not present in the cache, false is returned.
func (c *Cache) Get(id roachpb.RangeID, idx uint64) (e raftpb.Entry, ok bool) {
	c.metrics.Accesses.Inc(1)
	c.mu.Lock()
	p := c.getPartLocked(id, false /* create */, true /* recordUse */)
	c.mu.Unlock()
	if p == nil {
		return e, false
	}
	p.mu.RLock()
	defer p.mu.RUnlock()
	e, ok = p.get(idx)
	if ok {
		c.metrics.Hits.Inc(1)
	}
	return e, ok
}

// Scan returns entries between [lo, hi) for the specified range. If any
// entries are returned for the specified indices, they will start with index
// lo and proceed sequentially without gaps until 1) all entries exclusive of
// hi are fetched, 2) fetching another entry would add up to more than
// maxBytes of data, or 3) a cache miss occurs. The returned size reflects the
// size of the returned entries.
func (c *Cache) Scan(
	ents []raftpb.Entry, id roachpb.RangeID, lo, hi, maxBytes uint64,
) (_ []raftpb.Entry, bytes uint64, nextIdx uint64, exceededMaxBytes bool) {
	c.metrics.Accesses.Inc(1)
	c.mu.Lock()
	p := c.getPartLocked(id, false /* create */, true /* recordUse */)
	c.mu.Unlock()
	if p == nil {
		return ents, 0, lo, false
	}
	p.mu.RLock()
	defer p.mu.RUnlock()

	ents, bytes, nextIdx, exceededMaxBytes = p.scan(ents, lo, hi, maxBytes)
	if nextIdx == hi || exceededMaxBytes {
		// Only consider an access a "hit" if it returns all requested entries
		// or stops short because of a maximum bytes limit.
		c.metrics.Hits.Inc(1)
	}
	return ents, bytes, nextIdx, exceededMaxBytes
}

func (c *Cache) getPartLocked(id roachpb.RangeID, create, recordUse bool) *partition {
	part := c.parts[id]
	if create && part == nil {
		part = c.lru.pushFront(id)
		c.parts[id] = part
		c.addBytes(partitionSize)
	}
	if recordUse && part != nil {
		c.lru.moveToFront(part)
	}
	return part
}

// evictLocked adds toAdd to the current cache byte size and evicts partitions
// until the cache is below the maxBytes threshold. toAdd must be smaller than
// c.maxBytes.
func (c *Cache) evictLocked(toAdd int32) {
	bytes := c.addBytes(toAdd)
	for bytes > c.maxBytes && len(c.parts) > 0 {
		bytes, _ = c.evictPartitionLocked(c.lru.back())
	}
}

func (c *Cache) evictPartitionLocked(p *partition) (updatedBytes, updatedEntries int32) {
	delete(c.parts, p.id)
	c.lru.remove(p)
	pBytes, pEntries := p.evict()
	return c.addBytes(-1 * pBytes), c.addEntries(-1 * pEntries)
}

// recordUpdate adjusts the partition and cache bookkeeping to account for the
// changes which actually occurred in an update relative to the guess made
// before the update.
func (c *Cache) recordUpdate(p *partition, bytesAdded, bytesGuessed, entriesAdded int32) {
	// This method is always called while p.mu is held.
	// The below code takes care to ensure that all bytes in c due to p are
	// updated appropriately.

	// NB: The loop and atomics are used because p.size can be modified
	// concurrently with calls to recordUpdate. All updates to p.size outside
	// of this function occur while c.mu is held inside of c.Add. They occur
	// when either:
	//
	// 1) a new write adds its guessed write size to p
	// 2) p is evicted to make room for a write
	//
	// Thus p.size is either increasing or becomes evicted while we attempt to
	// record the update to p. Once p is evicted it stays evicted forever.
	// These facts combine to ensure that p.size never becomes negative from
	// the below call to add.

	delta := bytesAdded - bytesGuessed
	for {
		curSize := p.loadSize()
		if curSize == evicted {
			return
		}
		newSize := curSize.add(delta, entriesAdded)
		if updated := p.setSize(curSize, newSize); updated {
			c.updateGauges(c.addBytes(delta), c.addEntries(entriesAdded))
			return
		}
	}
}

func (c *Cache) addBytes(toAdd int32) int32 {
	return atomic.AddInt32(&c.bytes, toAdd)
}

func (c *Cache) addEntries(toAdd int32) int32 {
	return atomic.AddInt32(&c.entries, toAdd)
}

func (c *Cache) updateGauges(bytes, entries int32) {
	c.metrics.Bytes.Update(int64(bytes))
	c.metrics.Size.Update(int64(entries))
}

var initialSize = newCacheSize(partitionSize, 0)

func newPartition(id roachpb.RangeID) *partition {
	return &partition{
		id:   id,
		size: initialSize,
	}
}

const evicted cacheSize = 0

func (p *partition) evict() (bytes, entries int32) {
	// Atomically setting size to evicted signals that the partition has been
	// evicted. Changes to p which happen concurrently with the eviction should
	// not be reflected in the Cache. The loop in recordUpdate detects the
	// action of this call.
	cs := p.loadSize()
	for !p.setSize(cs, evicted) {
		cs = p.loadSize()
	}
	return cs.bytes(), cs.entries()
}

func (p *partition) loadSize() cacheSize {
	return cacheSize(atomic.LoadUint64((*uint64)(&p.size)))
}

func (p *partition) setSize(orig, new cacheSize) bool {
	return atomic.CompareAndSwapUint64((*uint64)(&p.size), uint64(orig), uint64(new))
}

// analyzeEntries calculates the size in bytes of ents and ensures that the
// entries in ents have contiguous indices.
func analyzeEntries(ents []raftpb.Entry) (size int32) {
	var prevIndex uint64
	for i, e := range ents {
		if i != 0 && e.Index != prevIndex+1 {
			panic(errors.Errorf("invalid non-contiguous set of entries %d and %d", prevIndex, e.Index))
		}
		prevIndex = e.Index
		size += int32(e.Size())
	}
	return
}

// cacheSize stores int32 counters for numbers of bytes and entries in a
// single 64-bit word.
type cacheSize uint64

func newCacheSize(bytes, entries int32) cacheSize {
	return cacheSize((uint64(entries) << 32) | uint64(bytes))
}

func (cs cacheSize) entries() int32 {
	return int32(cs >> 32)
}

func (cs cacheSize) bytes() int32 {
	return int32(cs & math.MaxUint32)
}

// add constructs a new cacheSize with signed additions to entries and bytes.
// It is illegal to use values that will make cs negative.
func (cs cacheSize) add(bytes, entries int32) cacheSize {
	return newCacheSize(cs.bytes()+bytes, cs.entries()+entries)
}

// partitionList is a doubly-linked circular list of *partition elements. The
// code is derived from the stdlib container/list but customized to partition
// in order to avoid a separate allocation for every element.
type partitionList struct {
	root partition
}

func (l *partitionList) lazyInit() {
	if l.root.next == nil {
		l.root.next = &l.root
		l.root.prev = &l.root
	}
}

func (l *partitionList) pushFront(id roachpb.RangeID) *partition {
	l.lazyInit()
	return l.insert(newPartition(id), &l.root)
}

func (l *partitionList) moveToFront(p *partition) {
	l.insert(l.remove(p), &l.root)
}

func (l *partitionList) insert(e, at *partition) *partition {
	n := at.next
	at.next = e
	e.prev = at
	e.next = n
	n.prev = e
	return e
}

func (l *partitionList) back() *partition {
	if l.root.prev == nil || l.root.prev == &l.root {
		return nil
	}
	return l.root.prev
}

func (l *partitionList) remove(e *partition) *partition {
	if e == &l.root {
		panic("cannot remove root list node")
	}
	if e.next != nil {
		e.prev.next = e.next
		e.next.prev = e.prev
		e.next = nil // avoid memory leaks
		e.prev = nil // avoid memory leaks
	}
	return e
}
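
// The two functions below are illustrative sketches added for exposition and
// are not part of the original implementation. They assume only the exported
// Cache API and the unexported cacheSize helpers defined above; the range ID,
// entry indices, and byte budget used are hypothetical.

// exampleCacheUsage sketches how a caller might exercise the Cache API: add a
// contiguous run of entries for a range, read them back with Get and Scan,
// and finally drop the range's cached state.
func exampleCacheUsage() {
	c := NewCache(1 << 20) // hypothetical 1 MiB budget shared by all ranges

	// Two contiguous entries for a hypothetical range 1. truncate=true also
	// removes any cached entries at indices beyond the last added index.
	ents := []raftpb.Entry{{Index: 5, Term: 1}, {Index: 6, Term: 1}}
	c.Add(1, ents, true /* truncate */)

	// Point lookup of a single cached entry.
	if e, ok := c.Get(1, 5); ok {
		_ = e
	}

	// Scan [5, 7) with an effectively unlimited byte budget; nextIdx reports
	// how far the contiguous cached run extended.
	scanned, bytes, nextIdx, _ := c.Scan(nil, 1, 5, 7, math.MaxUint64)
	_, _, _ = scanned, bytes, nextIdx

	// Drop all cached entries for the range.
	c.Drop(1)
}

// exampleCacheSizePacking sketches how cacheSize packs the two int32 counters
// into a single 64-bit word (entries in the high 32 bits, bytes in the low 32
// bits) so that both can be loaded and compare-and-swapped atomically.
func exampleCacheSizePacking() {
	cs := newCacheSize(128 /* bytes */, 3 /* entries */)
	cs = cs.add(64, 1)              // now 192 bytes, 4 entries
	_, _ = cs.bytes(), cs.entries() // 192, 4
}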