github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/raftentry/cache.go

// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

// Package raftentry provides a cache for entries to avoid extra
// deserializations.
package raftentry

import (
	"math"
	"sync/atomic"
	"unsafe"

	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
	"github.com/cockroachdb/errors"
	"go.etcd.io/etcd/raft/raftpb"
)

// Cache is a specialized data structure for storing deserialized raftpb.Entry
// values tailored to the access patterns of the storage package.
// Cache is safe for concurrent access.
type Cache struct {
	metrics  Metrics
	maxBytes int32

	// accessed with atomics
	bytes   int32
	entries int32

	mu    syncutil.Mutex
	lru   partitionList
	parts map[roachpb.RangeID]*partition
}

// Design
//
// Cache is designed to be a shared store-wide object which incurs low
// contention for operations on different ranges while maintaining a global
// memory policy. This is achieved through the use of a two-level locking
// scheme. Cache.mu is acquired to access any data in the cache (Add, Clear,
// Get, or Scan) in order to locate the partition for the operation and update
// the LRU state. In the case of Add operations, partitions are lazily
// constructed under the lock. In addition to partition location, Add
// operations record the maximal amount of space that the write may add to the
// cache, accepting that in certain cases, less space may actually be consumed
// leading to unnecessary evictions. Once a partition has been located (or not
// found) and LRU state has been appropriately modified, operations release
// Cache.mu and proceed by operating on the partition under its RWMutex.
//
// This disjoint, two-level locking pattern permits the "anomaly" whereby a
// partition may be accessed and evicted concurrently. This condition is made
// safe in the implementation by using atomics to update the cache bookkeeping
// and by taking care to not mutate the partition's cache state upon eviction.
// As noted above, the Cache and partition's bookkeeping is updated with an
// initial estimate of the byte size of an addition while holding Cache.mu.
// Because empty additions are elided, this initial bookkeeping guarantees that
// the cacheSize of the partition is non-zero while an Add operation proceeds
// unless the partition has been evicted. The updated value of partition.size
// is recorded before releasing Cache.mu. When a partition mutation operation
// concludes, the Cache's stats need to be updated such that they reflect the
// new reality. This update (Cache.recordUpdate) is mediated through the use of
// an atomic compare-and-swap operation on partition.size. If the operation
// succeeds, then we know that future evictions of this partition will see the
// new updated partition.size and so any delta from what was optimistically
// recorded in the Cache stats should be updated (using atomics, see
// add(Bytes|Entries)).
// If the operation fails, then we know that any changes just made to the
// partition are no longer stored in the cache and thus the Cache stats shall
// not change.
//
// This approach admits several undesirable conditions; fortunately, they
// aren't practical concerns.
//
// 1) Evicted partitions are reclaimed asynchronously only after operations
//    concurrent with evictions complete.
// 2) Memory reuse with object pools is difficult.

type partition struct {
	id roachpb.RangeID

	mu      syncutil.RWMutex
	ringBuf // implements rangeCache, embedded to avoid interface and allocation

	size cacheSize // accessed with atomics

	next, prev *partition // accessed under Cache.mu
}

const partitionSize = int32(unsafe.Sizeof(partition{}))

// rangeCache represents the interface that the partition uses.
// It is never used explicitly, but any new implementation that replaces
// ringBuf must implement the interface below.
type rangeCache interface {
	add(ent []raftpb.Entry) (bytesAdded, entriesAdded int32)
	truncateFrom(lo uint64) (bytesRemoved, entriesRemoved int32)
	clearTo(hi uint64) (bytesRemoved, entriesRemoved int32)
	get(index uint64) (raftpb.Entry, bool)
	scan(ents []raftpb.Entry, lo, hi, maxBytes uint64) (
		_ []raftpb.Entry, bytes uint64, nextIdx uint64, exceededMaxBytes bool)
}

// ringBuf implements rangeCache.
var _ rangeCache = (*ringBuf)(nil)

// NewCache creates a cache with a max size.
// Size must be less than math.MaxInt32.
func NewCache(maxBytes uint64) *Cache {
	if maxBytes > math.MaxInt32 {
		maxBytes = math.MaxInt32
	}
	return &Cache{
		maxBytes: int32(maxBytes),
		metrics:  makeMetrics(),
		parts:    map[roachpb.RangeID]*partition{},
	}
}

// Metrics returns a struct which contains metrics for the raft entry cache.
func (c *Cache) Metrics() Metrics {
	return c.metrics
}

// Drop drops all cached entries associated with the specified range.
func (c *Cache) Drop(id roachpb.RangeID) {
	c.mu.Lock()
	defer c.mu.Unlock()
	p := c.getPartLocked(id, false /* create */, false /* recordUse */)
	if p != nil {
		c.updateGauges(c.evictPartitionLocked(p))
	}
}

// Add inserts ents into the cache. If truncate is true, the method also
// removes all entries with indices equal to or greater than the indices of
// the entries provided. ents is expected to consist of entries with a
// contiguous sequence of indices.
func (c *Cache) Add(id roachpb.RangeID, ents []raftpb.Entry, truncate bool) {
	if len(ents) == 0 {
		return
	}
	bytesGuessed := analyzeEntries(ents)
	add := bytesGuessed <= c.maxBytes
	if !add {
		bytesGuessed = 0
	}

	c.mu.Lock()
	// Get p and move the partition to the front of the LRU.
	p := c.getPartLocked(id, add /* create */, true /* recordUse */)
	if bytesGuessed > 0 {
		c.evictLocked(bytesGuessed)
		if len(c.parts) == 0 { // Get p again if we evicted everything.
			p = c.getPartLocked(id, true /* create */, false /* recordUse */)
		}
		// Use the atomic (load|set)Size partition methods to avoid a race
		// condition on p.size and to ensure that p.size.bytes() reflects the
		// number of bytes in c.bytes associated with p in the face of
		// concurrent updates due to calls to c.recordUpdate.
		for {
			prev := p.loadSize()
			if p.setSize(prev, prev.add(bytesGuessed, 0)) {
				break
			}
		}
	}
	c.mu.Unlock()
	if p == nil {
		// The partition did not exist and we did not create it.
		// Only possible if !add.
		return
	}

	p.mu.Lock()
	defer p.mu.Unlock()
	var bytesAdded, entriesAdded, bytesRemoved, entriesRemoved int32
	if add {
		bytesAdded, entriesAdded = p.add(ents)
	}
	if truncate {
		truncIdx := ents[0].Index
		if add {
			// Some entries were already overwritten.
			truncIdx = ents[len(ents)-1].Index + 1
		}
		bytesRemoved, entriesRemoved = p.truncateFrom(truncIdx)
	}
	c.recordUpdate(p, bytesAdded-bytesRemoved, bytesGuessed, entriesAdded-entriesRemoved)
}

// Clear removes all entries on the given range with index less than hi.
func (c *Cache) Clear(id roachpb.RangeID, hi uint64) {
	c.mu.Lock()
	p := c.getPartLocked(id, false /* create */, false /* recordUse */)
	if p == nil {
		c.mu.Unlock()
		return
	}
	c.mu.Unlock()
	p.mu.Lock()
	defer p.mu.Unlock()
	bytesRemoved, entriesRemoved := p.clearTo(hi)
	c.recordUpdate(p, -1*bytesRemoved, 0, -1*entriesRemoved)
}

// Get returns the entry for the specified index and true for the second
// return value. If the index is not present in the cache, false is returned.
func (c *Cache) Get(id roachpb.RangeID, idx uint64) (e raftpb.Entry, ok bool) {
	c.metrics.Accesses.Inc(1)
	c.mu.Lock()
	p := c.getPartLocked(id, false /* create */, true /* recordUse */)
	c.mu.Unlock()
	if p == nil {
		return e, false
	}
	p.mu.RLock()
	defer p.mu.RUnlock()
	e, ok = p.get(idx)
	if ok {
		c.metrics.Hits.Inc(1)
	}
	return e, ok
}

// Scan returns entries between [lo, hi) for the specified range. If any
// entries are returned for the specified indices, they will start with index
// lo and proceed sequentially without gaps until 1) all entries exclusive of
// hi are fetched, 2) fetching another entry would add up to more than
// maxBytes of data, or 3) a cache miss occurs. The returned size reflects the
// size of the returned entries.
func (c *Cache) Scan(
	ents []raftpb.Entry, id roachpb.RangeID, lo, hi, maxBytes uint64,
) (_ []raftpb.Entry, bytes uint64, nextIdx uint64, exceededMaxBytes bool) {
	c.metrics.Accesses.Inc(1)
	c.mu.Lock()
	p := c.getPartLocked(id, false /* create */, true /* recordUse */)
	c.mu.Unlock()
	if p == nil {
		return ents, 0, lo, false
	}
	p.mu.RLock()
	defer p.mu.RUnlock()

	ents, bytes, nextIdx, exceededMaxBytes = p.scan(ents, lo, hi, maxBytes)
	if nextIdx == hi || exceededMaxBytes {
		// Only consider an access a "hit" if it returns all requested entries
		// or stops short because of a maximum bytes limit.
		c.metrics.Hits.Inc(1)
	}
	return ents, bytes, nextIdx, exceededMaxBytes
}

func (c *Cache) getPartLocked(id roachpb.RangeID, create, recordUse bool) *partition {
	part := c.parts[id]
	if create && part == nil {
		part = c.lru.pushFront(id)
		c.parts[id] = part
		c.addBytes(partitionSize)
	}
	if recordUse && part != nil {
		c.lru.moveToFront(part)
	}
	return part
}

// evictLocked adds toAdd to the current cache byte size and evicts partitions
// until the cache is below the maxBytes threshold. toAdd must be smaller than
// c.maxBytes.
func (c *Cache) evictLocked(toAdd int32) {
	bytes := c.addBytes(toAdd)
	for bytes > c.maxBytes && len(c.parts) > 0 {
		bytes, _ = c.evictPartitionLocked(c.lru.back())
	}
}

func (c *Cache) evictPartitionLocked(p *partition) (updatedBytes, updatedEntries int32) {
	delete(c.parts, p.id)
	c.lru.remove(p)
	pBytes, pEntries := p.evict()
	return c.addBytes(-1 * pBytes), c.addEntries(-1 * pEntries)
}

// recordUpdate adjusts the partition and cache bookkeeping to account for the
// changes which actually occurred in an update relative to the guess made
// before the update.
func (c *Cache) recordUpdate(p *partition, bytesAdded, bytesGuessed, entriesAdded int32) {
	// This method is always called while p.mu is held.
	// The below code takes care to ensure that all bytes in c due to p are
	// updated appropriately.

	// NB: The loop and atomics are used because p.size can be modified
	// concurrently with calls to recordUpdate. All updates to p.size outside
	// of this function occur while c.mu is held inside of c.Add. They occur
	// when either:
	//
	// 1) a new write adds its guessed write size to p
	// 2) p is evicted to make room for a write
	//
	// Thus p.size is either increasing or becomes evicted while we attempt to
	// record the update to p. Once p is evicted it stays evicted forever.
	// These facts combine to ensure that p.size never becomes negative from
	// the below call to add.

	delta := bytesAdded - bytesGuessed
	for {
		curSize := p.loadSize()
		if curSize == evicted {
			return
		}
		newSize := curSize.add(delta, entriesAdded)
		if updated := p.setSize(curSize, newSize); updated {
			c.updateGauges(c.addBytes(delta), c.addEntries(entriesAdded))
			return
		}
	}
}

func (c *Cache) addBytes(toAdd int32) int32 {
	return atomic.AddInt32(&c.bytes, toAdd)
}

func (c *Cache) addEntries(toAdd int32) int32 {
	return atomic.AddInt32(&c.entries, toAdd)
}

func (c *Cache) updateGauges(bytes, entries int32) {
	c.metrics.Bytes.Update(int64(bytes))
	c.metrics.Size.Update(int64(entries))
}

var initialSize = newCacheSize(partitionSize, 0)

func newPartition(id roachpb.RangeID) *partition {
	return &partition{
		id:   id,
		size: initialSize,
	}
}

const evicted cacheSize = 0

func (p *partition) evict() (bytes, entries int32) {
	// Atomically setting size to evicted signals that the partition has been
	// evicted. Changes to p which happen concurrently with the eviction should
	// not be reflected in the Cache. The loop in recordUpdate detects the
	// action of this call.
	cs := p.loadSize()
	for !p.setSize(cs, evicted) {
		cs = p.loadSize()
	}
	return cs.bytes(), cs.entries()
}

func (p *partition) loadSize() cacheSize {
	return cacheSize(atomic.LoadUint64((*uint64)(&p.size)))
}

func (p *partition) setSize(orig, new cacheSize) bool {
	return atomic.CompareAndSwapUint64((*uint64)(&p.size), uint64(orig), uint64(new))
}

// analyzeEntries calculates the size in bytes of ents and ensures that the
// entries in ents have contiguous indices.
func analyzeEntries(ents []raftpb.Entry) (size int32) {
	var prevIndex uint64
	for i, e := range ents {
		if i != 0 && e.Index != prevIndex+1 {
			panic(errors.Errorf("invalid non-contiguous set of entries %d and %d", prevIndex, e.Index))
		}
		prevIndex = e.Index
		size += int32(e.Size())
	}
	return
}

// cacheSize stores int32 counters for numbers of bytes and entries in a
// single 64-bit word.
type cacheSize uint64

func newCacheSize(bytes, entries int32) cacheSize {
	return cacheSize((uint64(entries) << 32) | uint64(bytes))
}

func (cs cacheSize) entries() int32 {
	return int32(cs >> 32)
}

func (cs cacheSize) bytes() int32 {
	return int32(cs & math.MaxUint32)
}

// add constructs a new cacheSize with signed additions to entries and bytes.
// It is illegal to use values that will make cs negative.
func (cs cacheSize) add(bytes, entries int32) cacheSize {
	return newCacheSize(cs.bytes()+bytes, cs.entries()+entries)
}

// partitionList is a doubly-linked circular list of *partition elements. The
// code is derived from the stdlib container/list but customized to partition
// in order to avoid a separate allocation for every element.
type partitionList struct {
	root partition
}

func (l *partitionList) lazyInit() {
	if l.root.next == nil {
		l.root.next = &l.root
		l.root.prev = &l.root
	}
}

func (l *partitionList) pushFront(id roachpb.RangeID) *partition {
	l.lazyInit()
	return l.insert(newPartition(id), &l.root)
}

func (l *partitionList) moveToFront(p *partition) {
	l.insert(l.remove(p), &l.root)
}

func (l *partitionList) insert(e, at *partition) *partition {
	n := at.next
	at.next = e
	e.prev = at
	e.next = n
	n.prev = e
	return e
}

func (l *partitionList) back() *partition {
	if l.root.prev == nil || l.root.prev == &l.root {
		return nil
	}
	return l.root.prev
}

func (l *partitionList) remove(e *partition) *partition {
	if e == &l.root {
		panic("cannot remove root list node")
	}
	if e.next != nil {
		e.prev.next = e.next
		e.next.prev = e.prev
		e.next = nil // avoid memory leaks
		e.prev = nil // avoid memory leaks
	}
	return e
}
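
// The two functions below are illustrative sketches added for exposition and
// are not part of the original implementation. They assume only the exported
// Cache API and the unexported cacheSize helpers defined above; the range ID,
// entry indices, and byte budget used are hypothetical.

// exampleCacheUsage sketches how a caller might exercise the Cache API: add a
// contiguous run of entries for a range, read them back with Get and Scan,
// and finally drop the range's cached state.
func exampleCacheUsage() {
	c := NewCache(1 << 20) // hypothetical 1 MiB budget shared by all ranges

	// Two contiguous entries for a hypothetical range 1. truncate=true also
	// removes any cached entries at indices beyond the last added index.
	ents := []raftpb.Entry{{Index: 5, Term: 1}, {Index: 6, Term: 1}}
	c.Add(1, ents, true /* truncate */)

	// Point lookup of a single cached entry.
	if e, ok := c.Get(1, 5); ok {
		_ = e
	}

	// Scan [5, 7) with an effectively unlimited byte budget; nextIdx reports
	// how far the contiguous cached run extended.
	scanned, bytes, nextIdx, _ := c.Scan(nil, 1, 5, 7, math.MaxUint64)
	_, _, _ = scanned, bytes, nextIdx

	// Drop all cached entries for the range.
	c.Drop(1)
}

// exampleCacheSizePacking sketches how cacheSize packs the two int32 counters
// into a single 64-bit word (entries in the high 32 bits, bytes in the low 32
// bits) so that both can be loaded and compare-and-swapped atomically.
func exampleCacheSizePacking() {
	cs := newCacheSize(128 /* bytes */, 3 /* entries */)
	cs = cs.add(64, 1)              // now 192 bytes, 4 entries
	_, _ = cs.bytes(), cs.entries() // 192, 4
}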