github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/tscache/tree_impl.go (about)

     1  // Copyright 2017 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package tscache
    12  
    13  import (
    14  	"fmt"
    15  	"unsafe"
    16  
    17  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    18  	"github.com/cockroachdb/cockroach/pkg/util/cache"
    19  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    20  	"github.com/cockroachdb/cockroach/pkg/util/interval"
    21  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    22  	"github.com/cockroachdb/cockroach/pkg/util/uuid"
    23  )
    24  
    25  const (
    26  	// defaultTreeImplSize is the default size in bytes for a treeImpl timestamp
    27  	// cache. Note that the timestamp cache can use more memory than this
    28  	// because it holds on to all entries that are younger than
    29  	// MinRetentionWindow.
    30  	defaultTreeImplSize = 64 << 20 // 64 MB
    31  )
    32  
    33  func makeCacheEntry(key cache.IntervalKey, value cacheValue) *cache.Entry {
    34  	alloc := struct {
    35  		key   cache.IntervalKey
    36  		value cacheValue
    37  		entry cache.Entry
    38  	}{
    39  		key:   key,
    40  		value: value,
    41  	}
    42  	alloc.entry.Key = &alloc.key
    43  	alloc.entry.Value = &alloc.value
    44  	return &alloc.entry
    45  }
    46  
    47  var cacheEntryOverhead = uint64(unsafe.Sizeof(cache.IntervalKey{}) +
    48  	unsafe.Sizeof(cacheValue{}) + unsafe.Sizeof(cache.Entry{}))
    49  
    50  func cacheEntrySize(start, end interval.Comparable) uint64 {
    51  	n := uint64(cap(start))
    52  	if end != nil && len(start) > 0 && len(end) > 0 && &end[0] != &start[0] {
    53  		// If the end key exists and is not sharing memory with the start key,
    54  		// account for its memory usage.
    55  		n += uint64(cap(end))
    56  	}
    57  	n += cacheEntryOverhead
    58  	return n
    59  }
    60  
    61  // treeImpl implements the Cache interface. It maintains an interval tree FIFO
    62  // cache of keys or key ranges and the timestamps at which they were most
    63  // recently read or written. If a timestamp was read or written by a
    64  // transaction, the txn ID is stored with the timestamp to avoid advancing
    65  // timestamps on successive requests from the same transaction.
    66  type treeImpl struct {
    67  	syncutil.RWMutex
    68  
    69  	cache            *cache.IntervalCache
    70  	lowWater, latest hlc.Timestamp
    71  
    72  	bytes    uint64
    73  	maxBytes uint64
    74  	metrics  Metrics
    75  }
    76  
    77  var _ Cache = &treeImpl{}
    78  
    79  // newTreeImpl returns a new treeImpl with the supplied hybrid clock.
    80  func newTreeImpl(clock *hlc.Clock) *treeImpl {
    81  	tc := &treeImpl{
    82  		cache:    cache.NewIntervalCache(cache.Config{Policy: cache.CacheFIFO}),
    83  		maxBytes: uint64(defaultTreeImplSize),
    84  		metrics:  makeMetrics(),
    85  	}
    86  	tc.clear(clock.Now())
    87  	tc.cache.Config.ShouldEvict = tc.shouldEvict
    88  	tc.cache.Config.OnEvicted = tc.onEvicted
    89  	return tc
    90  }
    91  
    92  // clear clears the cache and resets the low-water mark.
    93  func (tc *treeImpl) clear(lowWater hlc.Timestamp) {
    94  	tc.Lock()
    95  	defer tc.Unlock()
    96  	tc.cache.Clear()
    97  	tc.lowWater = lowWater
    98  	tc.latest = tc.lowWater
    99  }
   100  
   101  // len returns the total number of read and write intervals in the cache.
   102  func (tc *treeImpl) len() int {
   103  	tc.RLock()
   104  	defer tc.RUnlock()
   105  	return tc.cache.Len()
   106  }
   107  
// Add implements the Cache interface. It records that the span [start, end)
// was accessed at timestamp ts by transaction txnID. An empty end means the
// single key start, which is stored as [start, start.Next()).
//
// tc.latest is always forwarded to ts. The span itself is only recorded when
// ts is above the low-water mark. Entries already in the cache that overlap
// the span are reconciled one by one so that the cache keeps non-overlapping
// intervals:
//   - older entries are deleted, truncated or split around the new span;
//   - newer entries cause the new span to be truncated or split around them;
//   - entries with an equal timestamp and the same txnID are merged into the
//     new span (or swallow it);
//   - entries with an equal timestamp and a different txnID have the
//     overlapping portion marked as owned by no transaction (noTxnID).
//
// tc.bytes is kept in sync with every insertion and in-place key change.
func (tc *treeImpl) Add(start, end roachpb.Key, ts hlc.Timestamp, txnID uuid.UUID) {
	// This gives us a memory-efficient end key if end is empty.
	// start is re-sliced from end so both keys share one backing array.
	if len(end) == 0 {
		end = start.Next()
		start = end[:len(start)]
	}

	tc.Lock()
	defer tc.Unlock()
	tc.latest.Forward(ts)

	// Only add to the cache if the timestamp is more recent than the
	// low water mark.
	if tc.lowWater.Less(ts) {

		// addRange inserts r with the (possibly updated) ts/txnID captured from
		// the enclosing scope and charges its size to tc.bytes.
		addRange := func(r interval.Range) {
			value := cacheValue{ts: ts, txnID: txnID}
			key := tc.cache.MakeKey(r.Start, r.End)
			entry := makeCacheEntry(key, value)
			tc.bytes += cacheEntrySize(r.Start, r.End)
			tc.cache.AddEntry(entry)
		}
		// addEntryAfter inserts a pre-built entry positioned after an existing
		// one and charges its size to tc.bytes.
		addEntryAfter := func(entry, after *cache.Entry) {
			ck := entry.Key.(*cache.IntervalKey)
			tc.bytes += cacheEntrySize(ck.Start, ck.End)
			tc.cache.AddEntryAfter(entry, after)
		}

		// r is the portion of the new span that still has to be inserted; the
		// loop below shrinks it as newer/equal existing entries are found.
		r := interval.Range{
			Start: interval.Comparable(start),
			End:   interval.Comparable(end),
		}

		// Check existing, overlapping entries and truncate/split/remove if
		// superseded and in the past. If existing entries are in the future,
		// subtract from the range/ranges that need to be added to cache.
		for _, entry := range tc.cache.GetOverlaps(r.Start, r.End) {
			cv := entry.Value.(*cacheValue)
			key := entry.Key.(*cache.IntervalKey)
			// sCmp/eCmp compare the new span's start/end against the existing
			// entry's start/end (<0: new is smaller, >0: new is larger).
			sCmp := r.Start.Compare(key.Start)
			eCmp := r.End.Compare(key.End)
			// Some of the cases below adjust cv and key in-place (in a manner that
			// maintains the IntervalCache invariants). These in-place modifications
			// change the size of the entry. To capture all of these modifications we
			// compute the current size of the entry and then use the new size at the
			// end of this iteration to update Cache.bytes.
			oldSize := cacheEntrySize(key.Start, key.End)
			if cv.ts.Less(ts) {
				// The existing interval has a timestamp less than the new
				// interval. Compare interval ranges to determine how to
				// modify existing interval.
				switch {
				case sCmp == 0 && eCmp == 0:
					// New and old are equal; replace old with new and avoid the need to insert new.
					//
					// New: ------------
					// Old: ------------
					//
					// New: ------------
					// Old:
					*cv = cacheValue{ts: ts, txnID: txnID}
					tc.cache.MoveToEnd(entry)
					return
				case sCmp <= 0 && eCmp >= 0:
					// New contains or is equal to old; delete old.
					//
					// New: ------------      ------------      ------------
					// Old:   --------    or    ----------  or  ----------
					//
					// New: ------------      ------------      ------------
					// Old:
					tc.cache.DelEntry(entry)
					continue // DelEntry adjusted tc.bytes, don't do it again
				case sCmp > 0 && eCmp < 0:
					// Old contains new; split up old into two.
					//
					// New:     ----
					// Old: ------------
					//
					// New:     ----
					// Old: ----    ----
					oldEnd := key.End
					key.End = r.Start

					newKey := tc.cache.MakeKey(r.End, oldEnd)
					newEntry := makeCacheEntry(newKey, *cv)
					addEntryAfter(newEntry, entry)
				case eCmp >= 0:
					// Left partial overlap; truncate old end.
					//
					// New:     --------          --------
					// Old: --------      or  ------------
					//
					// New:     --------          --------
					// Old: ----              ----
					key.End = r.Start
				case sCmp <= 0:
					// Right partial overlap; truncate old start.
					//
					// New: --------          --------
					// Old:     --------  or  ------------
					//
					// New: --------          --------
					// Old:         ----              ----
					key.Start = r.End
				default:
					panic(fmt.Sprintf("no overlap between %v and %v", key.Range, r))
				}
			} else if ts.Less(cv.ts) {
				// The existing interval has a timestamp greater than the new interval.
				// Compare interval ranges to determine how to modify new interval before
				// adding it to the timestamp cache.
				switch {
				case sCmp >= 0 && eCmp <= 0:
					// Old contains or is equal to new; no need to add.
					//
					// Old: -----------      -----------      -----------      -----------
					// New:    -----     or  -----------  or  --------     or     --------
					//
					// Old: -----------      -----------      -----------      -----------
					// New:
					return
				case sCmp < 0 && eCmp > 0:
					// New contains old; split up new into two. We can add the left piece
					// immediately because it is guaranteed to be before the rest of the
					// overlaps.
					//
					// Old:    ------
					// New: ------------
					//
					// Old:    ------
					// New: ---      ---
					lr := interval.Range{Start: r.Start, End: key.Start}
					addRange(lr)

					r.Start = key.End
				case eCmp > 0:
					// Left partial overlap; truncate new start.
					//
					// Old: --------          --------
					// New:     --------  or  ------------
					//
					// Old: --------          --------
					// New:         ----              ----
					r.Start = key.End
				case sCmp < 0:
					// Right partial overlap; truncate new end.
					//
					// Old:     --------          --------
					// New: --------      or  ------------
					//
					// Old:     --------          --------
					// New: ----              ----
					r.End = key.Start
				default:
					panic(fmt.Sprintf("no overlap between %v and %v", key.Range, r))
				}
			} else if cv.txnID == txnID {
				// The existing interval has a timestamp equal to the new
				// interval, and the same transaction ID.
				switch {
				case sCmp >= 0 && eCmp <= 0:
					// Old contains or is equal to new; no need to add.
					//
					// New:    -----     or  -----------  or  --------     or     --------
					// Old: -----------      -----------      -----------      -----------
					//
					// New:
					// Old: -----------      -----------      -----------      -----------
					return
				case sCmp <= 0 && eCmp >= 0:
					// New contains old; delete old.
					//
					// New: ------------      ------------      ------------
					// Old:   --------    or    ----------  or  ----------
					//
					// New: ------------      ------------      ------------
					// Old:
					tc.cache.DelEntry(entry)
					continue // DelEntry adjusted tc.bytes, don't do it again
				case eCmp >= 0:
					// Left partial overlap; truncate old end.
					//
					// New:     --------          --------
					// Old: --------      or  ------------
					//
					// New:     --------          --------
					// Old: ----              ----
					key.End = r.Start
				case sCmp <= 0:
					// Right partial overlap; truncate old start.
					//
					// New: --------          --------
					// Old:     --------  or  ------------
					//
					// New: --------          --------
					// Old:         ----              ----
					key.Start = r.End
				default:
					panic(fmt.Sprintf("no overlap between %v and %v", key.Range, r))
				}
			} else {
				// The existing interval has a timestamp equal to the new
				// interval and a different transaction ID.
				switch {
				case sCmp == 0 && eCmp == 0:
					// New and old are equal. Segment is no longer owned by any
					// transaction.
					//
					// New: ------------
					// Old: ------------
					//
					// New:
					// Nil: ============
					// Old:
					cv.txnID = noTxnID
					tc.bytes += cacheEntrySize(key.Start, key.End) - oldSize
					return
				case sCmp == 0 && eCmp > 0:
					// New contains old, left-aligned. Clear ownership of the
					// existing segment and truncate new.
					//
					// New: ------------
					// Old: ----------
					//
					// New:           --
					// Nil: ==========
					// Old:
					cv.txnID = noTxnID
					r.Start = key.End
				case sCmp < 0 && eCmp == 0:
					// New contains old, right-aligned. Clear ownership of the
					// existing segment and truncate new.
					//
					// New: ------------
					// Old:   ----------
					//
					// New: --
					// Nil:   ==========
					// Old:
					cv.txnID = noTxnID
					r.End = key.Start
				case sCmp < 0 && eCmp > 0:
					// New contains old; split into three segments with the
					// overlap owned by no txn.
					//
					// New: ------------
					// Old:   --------
					//
					// New: --        --
					// Nil:   ========
					// Old:
					cv.txnID = noTxnID

					newKey := tc.cache.MakeKey(r.Start, key.Start)
					newEntry := makeCacheEntry(newKey, cacheValue{ts: ts, txnID: txnID})
					addEntryAfter(newEntry, entry)
					r.Start = key.End
				case sCmp > 0 && eCmp < 0:
					// Old contains new; split up old into two. New segment is
					// owned by no txn.
					//
					// New:     ----
					// Old: ------------
					//
					// New:
					// Nil:     ====
					// Old: ----    ----
					// Overwriting txnID makes the final addRange(r) insert the
					// overlap with no owner.
					txnID = noTxnID
					oldEnd := key.End
					key.End = r.Start

					newKey := tc.cache.MakeKey(r.End, oldEnd)
					newEntry := makeCacheEntry(newKey, *cv)
					addEntryAfter(newEntry, entry)
				case eCmp == 0:
					// Old contains new, right-aligned; truncate old end and clear
					// ownership of new segment.
					//
					// New:     --------
					// Old: ------------
					//
					// New:
					// Nil:     ========
					// Old: ----
					txnID = noTxnID
					key.End = r.Start
				case sCmp == 0:
					// Old contains new, left-aligned; truncate old start and
					// clear ownership of new segment.
					// New: --------
					// Old: ------------
					//
					// New:
					// Nil: ========
					// Old:         ----
					txnID = noTxnID
					key.Start = r.End
				case eCmp > 0:
					// Left partial overlap; truncate old end and split new into
					// segments owned by no txn (the overlap) and the new txn.
					//
					// New:     --------
					// Old: --------
					//
					// New:         ----
					// Nil:     ====
					// Old: ----
					// After the swap, [key.End, r.Start) is exactly the overlap.
					key.End, r.Start = r.Start, key.End

					newKey := tc.cache.MakeKey(key.End, r.Start)
					newCV := cacheValue{ts: cv.ts}
					newEntry := makeCacheEntry(newKey, newCV)
					addEntryAfter(newEntry, entry)
				case sCmp < 0:
					// Right partial overlap; truncate old start and split new into
					// segments owned by no txn (the overlap) and the new txn.
					//
					// New: --------
					// Old:     --------
					//
					// New: ----
					// Nil:     ====
					// Old:         ----
					// After the swap, [r.End, key.Start) is exactly the overlap.
					key.Start, r.End = r.End, key.Start

					newKey := tc.cache.MakeKey(r.End, key.Start)
					newCV := cacheValue{ts: cv.ts}
					newEntry := makeCacheEntry(newKey, newCV)
					addEntryAfter(newEntry, entry)
				default:
					panic(fmt.Sprintf("no overlap between %v and %v", key.Range, r))
				}
			}
			// Account for any in-place change to the existing entry's key. The
			// arithmetic is unsigned, so a shrink is applied via wraparound.
			tc.bytes += cacheEntrySize(key.Start, key.End) - oldSize
		}
		// Insert whatever is left of the new span.
		addRange(r)
	}
}
   448  
   449  // SetLowWater implements the Cache interface.
   450  func (tc *treeImpl) SetLowWater(start, end roachpb.Key, ts hlc.Timestamp) {
   451  	tc.Add(start, end, ts, noTxnID)
   452  }
   453  
   454  // getLowWater implements the Cache interface.
   455  func (tc *treeImpl) getLowWater() hlc.Timestamp {
   456  	tc.RLock()
   457  	defer tc.RUnlock()
   458  	return tc.lowWater
   459  }
   460  
   461  // GetMax implements the Cache interface.
   462  func (tc *treeImpl) GetMax(start, end roachpb.Key) (hlc.Timestamp, uuid.UUID) {
   463  	return tc.getMax(start, end)
   464  }
   465  
   466  func (tc *treeImpl) getMax(start, end roachpb.Key) (hlc.Timestamp, uuid.UUID) {
   467  	tc.Lock()
   468  	defer tc.Unlock()
   469  	if len(end) == 0 {
   470  		end = start.Next()
   471  	}
   472  	maxTS := tc.lowWater
   473  	maxTxnID := noTxnID
   474  	for _, o := range tc.cache.GetOverlaps(start, end) {
   475  		ce := o.Value.(*cacheValue)
   476  		if maxTS.Less(ce.ts) {
   477  			maxTS = ce.ts
   478  			maxTxnID = ce.txnID
   479  		} else if maxTS == ce.ts && maxTxnID != ce.txnID {
   480  			maxTxnID = noTxnID
   481  		}
   482  	}
   483  	return maxTS, maxTxnID
   484  }
   485  
   486  // shouldEvict returns true if the cache entry's timestamp is no
   487  // longer within the MinRetentionWindow.
   488  func (tc *treeImpl) shouldEvict(size int, key, value interface{}) bool {
   489  	if tc.bytes <= tc.maxBytes {
   490  		return false
   491  	}
   492  	ce := value.(*cacheValue)
   493  	// In case low water mark was set higher, evict any entries
   494  	// which occurred before it.
   495  	if ce.ts.Less(tc.lowWater) {
   496  		return true
   497  	}
   498  	// Compute the edge of the cache window.
   499  	edge := tc.latest
   500  	edge.WallTime -= MinRetentionWindow.Nanoseconds()
   501  	// We evict and update the low water mark if the proposed evictee's
   502  	// timestamp is <= than the edge of the window.
   503  	if ce.ts.LessEq(edge) {
   504  		tc.lowWater = ce.ts
   505  		return true
   506  	}
   507  	return false
   508  }
   509  
   510  // onEvicted is called when an entry is evicted from the cache.
   511  func (tc *treeImpl) onEvicted(k, v interface{}) {
   512  	ck := k.(*cache.IntervalKey)
   513  	reqSize := cacheEntrySize(ck.Start, ck.End)
   514  	if tc.bytes < reqSize {
   515  		panic(fmt.Sprintf("bad reqSize: %d < %d", tc.bytes, reqSize))
   516  	}
   517  	tc.bytes -= reqSize
   518  }
   519  
   520  // Metrics implements the Cache interface.
   521  func (tc *treeImpl) Metrics() Metrics {
   522  	return tc.metrics
   523  }