github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/tscache/interval_skl.go

     1  // Copyright 2017 Andy Kimball
     2  // Copyright 2017 The Cockroach Authors.
     3  //
     4  // Use of this software is governed by the Business Source License
     5  // included in the file licenses/BSL.txt.
     6  //
     7  // As of the Change Date specified in that file, in accordance with
     8  // the Business Source License, use of this software will be governed
     9  // by the Apache License, Version 2.0, included in the file
    10  // licenses/APL.txt.
    11  
    12  package tscache
    13  
    14  import (
    15  	"bytes"
    16  	"container/list"
    17  	"context"
    18  	"encoding/binary"
    19  	"fmt"
    20  	"sync/atomic"
    21  	"time"
    22  	"unsafe"
    23  
    24  	"github.com/andy-kimball/arenaskl"
    25  	"github.com/cockroachdb/cockroach/pkg/util"
    26  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    27  	"github.com/cockroachdb/cockroach/pkg/util/log"
    28  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    29  	"github.com/cockroachdb/cockroach/pkg/util/uuid"
    30  	"github.com/cockroachdb/errors"
    31  )
    32  
    33  // rangeOptions are passed to AddRange to indicate the bounds of the range. By
    34  // default, the "from" and "to" keys are inclusive. Setting these bit flags
    35  // indicates that one or both is exclusive instead.
    36  type rangeOptions int
    37  
    38  const (
    39  	// excludeFrom indicates that the range does not include the starting key.
    40  	excludeFrom = rangeOptions(1 << iota)
    41  
    42  	// excludeTo indicates that the range does not include the ending key.
    43  	excludeTo
    44  )
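
// As a rough illustration of how these flags compose when passed to AddRange
// (the keys "a", "c" and the value "val" below are placeholders, not part of
// this package):
//
//   AddRange(a, c, 0, val)                      // covers [a, c]
//   AddRange(a, c, excludeFrom, val)            // covers (a, c]
//   AddRange(a, c, excludeTo, val)              // covers [a, c)
//   AddRange(a, c, excludeFrom|excludeTo, val)  // covers (a, c)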
    45  
    46  // nodeOptions are meta tags on skiplist nodes that indicate the status and role
    47  // of that node in the intervalSkl. The options are bit flags that can be
    48  // independently added and removed.
    49  //
    50  // Each node in the intervalSkl holds a key and, optionally, the latest read
    51  // timestamp for that key. In addition, the node optionally holds the latest
    52  // read timestamp for the range of keys between itself and the next key that is
    53  // present in the skiplist. This space between keys is called the "gap", and the
    54  // timestamp for that range is called the "gap timestamp". Here is a simplified
    55  // representation that would result after these ranges were added to an empty
    56  // intervalSkl:
    57  //   ["apple", "orange") = 200
    58  //   ["kiwi", "raspberry"] = 100
    59  //
    60  //   "apple"    "orange"   "raspberry"
    61  //   keyts=200  keyts=100  keyts=100
    62  //   gapts=200  gapts=100  gapts=0
    63  //
    64  // That is, the range from apple (inclusive) to orange (exclusive) has a read
    65  // timestamp of 200. The range from orange (inclusive) to raspberry (inclusive)
    66  // has a read timestamp of 100. All other keys have a read timestamp of 0.
    67  type nodeOptions int
    68  
    69  const (
    70  	// initialized indicates that the node has been created and fully
    71  	// initialized. Key and gap values are final, and can now be used.
    72  	initialized = 1 << iota
    73  
    74  	// cantInit indicates that the node should never be allowed to initialize.
    75  	// This is set on nodes which were unable to ratchet their values at some
    76  	// point because of a full arena. In this case, the node's values should
     77  	// never become final and any goroutines trying to initialize it will be
    78  	// forced to create it again in a new page when they notice this flag.
    79  	cantInit
    80  
    81  	// hasKey indicates that the node has an associated key value. If this is
    82  	// not set, then the key timestamp is assumed to be zero and the key is
    83  	// assumed to not have a corresponding txnID.
    84  	hasKey
    85  
    86  	// hasGap indicates that the node has an associated gap value. If this is
    87  	// not set, then the gap timestamp is assumed to be zero and the gap is
    88  	// assumed to not have a corresponding txnID.
    89  	hasGap
    90  )
    91  
    92  const (
    93  	encodedTsSize    = int(unsafe.Sizeof(int64(0)) + unsafe.Sizeof(int32(0)))
    94  	encodedTxnIDSize = int(unsafe.Sizeof(uuid.UUID{}))
    95  	encodedValSize   = encodedTsSize + encodedTxnIDSize
    96  
    97  	// initialSklPageSize is the initial size of each page in the sklImpl's
    98  	// intervalSkl. The pages start small to limit the memory footprint of
    99  	// the data structure for short-lived tests. Reducing this size can hurt
   100  	// performance but it decreases the risk of OOM failures when many tests
   101  	// are running concurrently.
   102  	initialSklPageSize = 128 << 10 // 128 KB
   103  	// maximumSklPageSize is the maximum size of each page in the sklImpl's
   104  	// intervalSkl. A long-running server is expected to settle on pages of
   105  	// this size under steady-state load.
   106  	maximumSklPageSize = 32 << 20 // 32 MB
   107  
   108  	defaultMinSklPages = 2
   109  )
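
// For reference, the encoded sizes above expand to:
//
//   encodedTsSize    = 8 (int64 WallTime) + 4 (int32 Logical) = 12 bytes
//   encodedTxnIDSize = 16 bytes (uuid.UUID)
//   encodedValSize   = 12 + 16                                = 28 bytes
//
// so a node carrying both a key value and a gap value stores 56 bytes of
// encoded values in the arena, in addition to its key and skiplist overhead.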
   110  
   111  // initialSklAllocSize is the amount of space in its arena that an empty
   112  // arenaskl.Skiplist consumes.
   113  var initialSklAllocSize = func() int {
   114  	a := arenaskl.NewArena(1000)
   115  	_ = arenaskl.NewSkiplist(a)
   116  	return int(a.Size())
   117  }()
   118  
   119  // intervalSkl efficiently tracks the latest logical time at which any key or
   120  // range of keys has been accessed. Keys are binary values of any length, and
   121  // times are represented as hybrid logical timestamps (see hlc package). The
   122  // data structure guarantees that the read timestamp of any given key or range
   123  // will never decrease. In other words, if a lookup returns timestamp A and
   124  // repeating the same lookup returns timestamp B, then B >= A.
   125  //
   126  // Add and lookup operations do not block or interfere with one another, which
   127  // enables predictable operation latencies. Also, the impact of the structure on
   128  // the GC is virtually nothing, even when the structure is very large. These
   129  // properties are enabled by employing a lock-free skiplist implementation that
   130  // uses an arena allocator. Skiplist nodes refer to one another by offset into
   131  // the arena rather than by pointer, so the GC has very few objects to track.
   132  //
   133  //
   134  // The data structure can conceptually be thought of as being parameterized over
   135  // a key and a value type, such that the key implements a Comparable interface
   136  // (see interval.Comparable) and the value implements a Ratchetable interface:
   137  //
   138  //   type Ratchetable interface {
   139  //     Ratchet(other Ratchetable) (changed bool)
   140  //   }
   141  //
   142  // In other words, if Go supported zero-cost abstractions, this type might look
   143  // like:
   144  //
   145  //   type intervalSkl<K: Comparable, V: Ratchetable>
   146  //
   147  type intervalSkl struct {
   148  	// rotMutex synchronizes page rotation with all other operations. The read
   149  	// lock is acquired by the Add and Lookup operations. The write lock is
   150  	// acquired only when the pages are rotated. Since that is very rare, the
   151  	// vast majority of operations can proceed without blocking.
   152  	rotMutex syncutil.RWMutex
   153  
   154  	// The following fields are used to enforce a minimum retention window on
   155  	// all timestamp intervals. intervalSkl promises to retain all timestamp
   156  	// intervals until they are at least this old before allowing the floor
   157  	// timestamp to ratchet and subsume them. If clock is nil then no minimum
   158  	// retention policy will be employed.
   159  	clock  *hlc.Clock
   160  	minRet time.Duration
   161  
   162  	// The size of the last allocated page in the data structure, in bytes. When
    163  	// a page fills, a new page will be allocated, the pages will be rotated, and
   164  	// older entries will be discarded. Page sizes grow exponentially as pages
   165  	// are allocated up to a maximum of maximumSklPageSize. The value will never
   166  	// regress over the lifetime of an intervalSkl instance.
   167  	//
    168  	// The entire data structure is typically bound to a maximum size of
   169  	// maximumSklPageSize*minPages. However, this limit can be violated if the
   170  	// intervalSkl needs to grow larger to enforce a minimum retention policy.
   171  	pageSize      uint32
   172  	pageSizeFixed bool // testing only
   173  
   174  	// The linked list maintains fixed-size skiplist pages, ordered by creation
   175  	// time such that the first page is the one most recently created. When the
   176  	// first page fills, a new empty page is prepended to the front of the list
   177  	// and all others are pushed back. This first page is the only sklPage that
    178  	// is written to; all others are immutable after they have left the front of
   179  	// the list. However, earlier pages are accessed whenever necessary during
   180  	// lookups. Pages are evicted when they become too old, subject to a minimum
   181  	// retention policy described above.
   182  	pages    list.List // List<*sklPage>
   183  	minPages int
   184  
   185  	// In order to ensure that timestamps never decrease, intervalSkl maintains
   186  	// a floor timestamp, which is the minimum timestamp that can be returned by
   187  	// the lookup operations. When the earliest page is discarded, its current
   188  	// maximum timestamp becomes the new floor timestamp for the overall
   189  	// intervalSkl.
   190  	floorTS hlc.Timestamp
   191  
   192  	metrics sklMetrics
   193  }
   194  
    195  // newIntervalSkl creates a new interval skiplist with the given clock, minimum
    196  // retention duration, and metrics.
   197  func newIntervalSkl(clock *hlc.Clock, minRet time.Duration, metrics sklMetrics) *intervalSkl {
   198  	s := intervalSkl{
   199  		clock:    clock,
   200  		minRet:   minRet,
   201  		pageSize: initialSklPageSize / 2, // doubled in pushNewPage
   202  		minPages: defaultMinSklPages,
   203  		metrics:  metrics,
   204  	}
   205  	s.pushNewPage(0 /* maxWallTime */, nil /* arena */)
   206  	s.metrics.Pages.Update(1)
   207  	return &s
   208  }
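
// A minimal usage sketch (illustrative only; the clock, metrics, and txnID
// values below are stand-ins constructed elsewhere):
//
//   s := newIntervalSkl(clock, 10*time.Second, metrics)
//   s.Add([]byte("apple"), cacheValue{ts: clock.Now(), txnID: txnID})
//   s.AddRange([]byte("kiwi"), []byte("orange"), excludeTo, cacheValue{ts: clock.Now(), txnID: txnID})
//   v := s.LookupTimestamp([]byte("kiwi"))
//   // v.ts will never regress on a repeated lookup of "kiwi".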
   209  
    210  // Add marks a single key as having been read at the given timestamp. Once
   211  // Add completes, future lookups of this key are guaranteed to return an equal
   212  // or greater timestamp.
   213  func (s *intervalSkl) Add(key []byte, val cacheValue) {
   214  	s.AddRange(nil, key, 0, val)
   215  }
   216  
   217  // AddRange marks the given range of keys [from, to] as having been read at the
   218  // given timestamp. The starting and ending points of the range are inclusive by
   219  // default, but can be excluded by passing the applicable range options. nil can
   220  // be passed as the "from" key, in which case only the end key will be added.
   221  // nil can also be passed as the "to" key, in which case an open range will be
   222  // added spanning [from, infinity). However, it is illegal to pass nil for both
   223  // "from" and "to". It is also illegal for "from" > "to", which would be an
   224  // inverted range.
   225  //
   226  // intervalSkl defines the domain of possible keys to span ["", nil). A range
   227  // with a starting key of []byte("") is treated as a closed range beginning at
   228  // the minimum key. A range with an ending key of []byte(nil) is treated as an
    229  // open range extending to infinity (as such, excludeTo has no effect on it). A
   230  // range starting at []byte("") and ending at []byte(nil) will span all keys.
   231  //
   232  // If some or all of the range was previously read at a higher timestamp, then
   233  // the range is split into sub-ranges that are each marked with the maximum read
   234  // timestamp for that sub-range. Once AddRange completes, future lookups at any
   235  // point in the range are guaranteed to return an equal or greater timestamp.
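//
// As an illustrative sketch of these rules (keys and val are placeholders):
//
//   AddRange(nil, []byte("k"), 0, val)                  // just the key "k"
//   AddRange([]byte("k"), nil, 0, val)                  // ["k", infinity)
//   AddRange([]byte(""), nil, 0, val)                   // every key
//   AddRange([]byte("a"), []byte("c"), excludeTo, val)  // ["a", "c")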
   236  func (s *intervalSkl) AddRange(from, to []byte, opt rangeOptions, val cacheValue) {
   237  	if from == nil && to == nil {
   238  		panic("from and to keys cannot be nil")
   239  	}
   240  	if encodedRangeSize(from, to, opt) > int(s.maximumPageSize())-initialSklAllocSize {
   241  		// Without this check, we could fall into an infinite page rotation loop
   242  		// if a range would take up more space than available in an empty page.
   243  		panic("key range too large to fit in any page")
   244  	}
   245  
   246  	if to != nil {
   247  		cmp := 0
   248  		if from != nil {
   249  			cmp = bytes.Compare(from, to)
   250  		}
   251  
   252  		switch {
   253  		case cmp > 0:
   254  			// Starting key is after ending key. This shouldn't happen. Determine
   255  			// the index where the keys diverged and panic.
   256  			d := 0
   257  			for d < len(from) && d < len(to) {
   258  				if from[d] != to[d] {
   259  					break
   260  				}
   261  				d++
   262  			}
   263  			msg := fmt.Sprintf("inverted range (issue #32149): key lens = [%d,%d), diff @ index %d",
   264  				len(from), len(to), d)
   265  			log.Errorf(context.Background(), "%s, [%s,%s)", msg, from, to)
   266  			panic(log.Safe(msg))
   267  		case cmp == 0:
   268  			// Starting key is same as ending key, so just add single node.
   269  			if opt == (excludeFrom | excludeTo) {
   270  				// Both from and to keys are excluded, so range is zero length.
   271  				return
   272  			}
   273  
   274  			// Just add the ending key.
   275  			from = nil
   276  			opt = 0
   277  		}
   278  	}
   279  
   280  	for {
   281  		// Try to add the range to the later page.
   282  		filledPage := s.addRange(from, to, opt, val)
   283  		if filledPage == nil {
   284  			break
   285  		}
   286  
   287  		// The page was filled up, so rotate the pages and then try again.
   288  		s.rotatePages(filledPage)
   289  	}
   290  }
   291  
   292  // addRange marks the given range of keys [from, to] as having been read at the
   293  // given timestamp. The key range and the rangeOptions observe the same behavior
   294  // as is specified for AddRange above. Notably, addRange treats nil "from" and
   295  // "to" arguments in accordance with AddRange's contract. It returns nil if the
   296  // operation was successful, or a pointer to an sklPage if the operation failed
   297  // because that page was full.
   298  func (s *intervalSkl) addRange(from, to []byte, opt rangeOptions, val cacheValue) *sklPage {
   299  	// Acquire the rotation mutex read lock so that the page will not be rotated
   300  	// while add or lookup operations are in progress.
   301  	s.rotMutex.RLock()
   302  	defer s.rotMutex.RUnlock()
   303  
   304  	// If floor ts is >= requested timestamp, then no need to perform a search
   305  	// or add any records.
   306  	if val.ts.LessEq(s.floorTS) {
   307  		return nil
   308  	}
   309  
   310  	fp := s.frontPage()
   311  
   312  	var it arenaskl.Iterator
   313  	it.Init(fp.list)
   314  
   315  	// Start by ensuring that the ending node has been created (unless "to" is
   316  	// nil, in which case the range extends indefinitely). Do this before creating
   317  	// the start node, so that the range won't extend past the end point during
   318  	// the period between creating the two endpoints. Since we need the ending node
   319  	// to be initialized before creating the starting node, we pass mustInit = true.
   320  	var err error
   321  	if to != nil {
   322  		if (opt & excludeTo) == 0 {
   323  			err = fp.addNode(&it, to, val, hasKey, true /* mustInit */)
   324  		} else {
   325  			err = fp.addNode(&it, to, val, 0, true /* mustInit */)
   326  		}
   327  
   328  		if errors.Is(err, arenaskl.ErrArenaFull) {
   329  			return fp
   330  		}
   331  	}
   332  
   333  	// If from is nil, then the "range" is just a single key. We already
   334  	// asserted above that if from == nil then to != nil.
   335  	if from == nil {
   336  		return nil
   337  	}
   338  
   339  	// Ensure that the starting node has been created.
   340  	if (opt & excludeFrom) == 0 {
   341  		err = fp.addNode(&it, from, val, hasKey|hasGap, false /* mustInit */)
   342  	} else {
   343  		err = fp.addNode(&it, from, val, hasGap, false /* mustInit */)
   344  	}
   345  
   346  	if errors.Is(err, arenaskl.ErrArenaFull) {
   347  		return fp
   348  	}
   349  
   350  	// Seek to the node immediately after the "from" node.
   351  	//
   352  	// If there are no nodes after the "from" node (only possible if to == nil),
   353  	// then ensureFloorValue below will be a no-op because no other nodes need
   354  	// to be adjusted.
   355  	if !it.Valid() || !bytes.Equal(it.Key(), from) {
   356  		// We will only reach this state if we didn't need to add a node at
   357  		// "from" due to the previous gap value being larger than val. The fast
   358  		// path for this case is in sklPage.addNode. For all other times, adding
   359  		// the new node will have positioned the iterator at "from".
   360  		//
   361  		// If Seek returns false then we're already at the following node, so
   362  		// there's no need to call Next.
   363  		if it.Seek(from) {
   364  			it.Next()
   365  		}
   366  	} else {
   367  		it.Next()
   368  	}
   369  
   370  	// Now iterate forwards and ensure that all nodes between the start and
   371  	// end (exclusive) have timestamps that are >= the range timestamp. end
   372  	// is exclusive because we already added a node at that key.
   373  	if !fp.ensureFloorValue(&it, to, val) {
   374  		// Page is filled up, so rotate pages and try again.
   375  		return fp
   376  	}
   377  
   378  	return nil
   379  }
   380  
   381  // frontPage returns the front page of the intervalSkl.
   382  func (s *intervalSkl) frontPage() *sklPage {
   383  	return s.pages.Front().Value.(*sklPage)
   384  }
   385  
   386  // pushNewPage prepends a new empty page to the front of the pages list. It
   387  // accepts an optional arena argument to facilitate re-use.
   388  func (s *intervalSkl) pushNewPage(maxWallTime int64, arena *arenaskl.Arena) {
   389  	size := s.nextPageSize()
   390  	if arena != nil && arena.Cap() == size {
   391  		// Re-use the provided arena, if possible.
   392  		arena.Reset()
   393  	} else {
   394  		// Otherwise, construct new memory arena.
   395  		arena = arenaskl.NewArena(size)
   396  	}
   397  	p := newSklPage(arena)
   398  	p.maxWallTime = maxWallTime
   399  	s.pages.PushFront(p)
   400  }
   401  
   402  // nextPageSize returns the size that the next allocated page should use.
   403  func (s *intervalSkl) nextPageSize() uint32 {
   404  	if s.pageSizeFixed || s.pageSize == maximumSklPageSize {
   405  		return s.pageSize
   406  	}
   407  	s.pageSize *= 2
   408  	if s.pageSize > maximumSklPageSize {
   409  		s.pageSize = maximumSklPageSize
   410  	}
   411  	return s.pageSize
   412  }
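
// To make the growth policy concrete: newIntervalSkl starts pageSize at
// initialSklPageSize/2 = 64 KB, so successive page allocations are sized
//
//   128 KB, 256 KB, 512 KB, ..., 16 MB, 32 MB, 32 MB, ...
//
// doubling on each allocation until maximumSklPageSize is reached.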
   413  
   414  // maximumPageSize returns the maximum page size that this instance of the
   415  // intervalSkl will be able to accommodate. The method takes into consideration
   416  // whether the page size is fixed or dynamic.
   417  func (s *intervalSkl) maximumPageSize() uint32 {
   418  	if s.pageSizeFixed {
   419  		return s.pageSize
   420  	}
   421  	return maximumSklPageSize
   422  }
   423  
   424  // rotatePages makes the later page the earlier page, and then discards the
   425  // earlier page. The max timestamp of the earlier page becomes the new floor
   426  // timestamp, in order to guarantee that timestamp lookups never return decreasing
   427  // values.
   428  func (s *intervalSkl) rotatePages(filledPage *sklPage) {
   429  	// Acquire the rotation mutex write lock to lock the entire intervalSkl.
   430  	s.rotMutex.Lock()
   431  	defer s.rotMutex.Unlock()
   432  
   433  	fp := s.frontPage()
   434  	if filledPage != fp {
   435  		// Another thread already rotated the pages, so don't do anything more.
   436  		return
   437  	}
   438  
   439  	// Determine the minimum timestamp a page must contain to be within the
   440  	// minimum retention window. If clock is nil, we have no minimum retention
   441  	// window.
   442  	minTSToRetain := hlc.MaxTimestamp
   443  	if s.clock != nil {
   444  		minTSToRetain = s.clock.Now()
   445  		minTSToRetain.WallTime -= s.minRet.Nanoseconds()
   446  	}
   447  
   448  	// Iterate over the pages in reverse, evicting pages that are no longer
   449  	// needed and ratcheting up the floor timestamp in the process.
   450  	//
   451  	// If possible, keep a reference to an evicted page's arena so that we can
   452  	// re-use it. This is safe because we're holding the rotation mutex write
   453  	// lock, so there cannot be concurrent readers and no reader will ever
   454  	// access evicted pages once we unlock.
   455  	back := s.pages.Back()
   456  	var oldArena *arenaskl.Arena
   457  	for s.pages.Len() >= s.minPages {
   458  		bp := back.Value.(*sklPage)
   459  		bpMaxTS := hlc.Timestamp{WallTime: bp.maxWallTime}
   460  		if minTSToRetain.LessEq(bpMaxTS) {
   461  			// The back page's maximum timestamp is within the time
   462  			// window we've promised to retain, so we can't evict it.
   463  			break
   464  		}
   465  
   466  		// Max timestamp of the back page becomes the new floor timestamp.
   467  		s.floorTS.Forward(bpMaxTS)
   468  
   469  		// Evict the page.
   470  		oldArena = bp.list.Arena()
   471  		evict := back
   472  		back = back.Prev()
   473  		s.pages.Remove(evict)
   474  	}
   475  
   476  	// Push a new empty page on the front of the pages list. We give this page
   477  	// the maxWallTime of the old front page. This assures that the maxWallTime
   478  	// for a page is always equal to or greater than that for all earlier pages.
   479  	// In other words, it assures that the maxWallTime for a page is not only
   480  	// the maximum timestamp for all values it contains, but also for all values
   481  	// any earlier pages contain.
   482  	s.pushNewPage(fp.maxWallTime, oldArena)
   483  
   484  	// Update metrics.
   485  	s.metrics.Pages.Update(int64(s.pages.Len()))
   486  	s.metrics.PageRotations.Inc(1)
   487  }
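
// A rough worked example of the retention check above: with minRet = 10s and
// a clock reading of t = 100s, minTSToRetain is t = 90s. A back page whose
// maxWallTime corresponds to t = 85s is evicted (assuming the list still holds
// at least minPages pages) and its max timestamp ratchets the floor, while a
// back page at t = 95s is retained, even if that keeps more than minPages
// pages alive.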
   488  
   489  // LookupTimestamp returns the latest timestamp value at which the given key was
   490  // read. If this operation is repeated with the same key, it will always result
   491  // in an equal or greater timestamp.
   492  func (s *intervalSkl) LookupTimestamp(key []byte) cacheValue {
   493  	return s.LookupTimestampRange(nil, key, 0)
   494  }
   495  
   496  // LookupTimestampRange returns the latest timestamp value of any key within the
   497  // specified range. If this operation is repeated with the same range, it will
   498  // always result in an equal or greater timestamp.
   499  func (s *intervalSkl) LookupTimestampRange(from, to []byte, opt rangeOptions) cacheValue {
   500  	if from == nil && to == nil {
   501  		panic("from and to keys cannot be nil")
   502  	}
   503  
   504  	// Acquire the rotation mutex read lock so that the page will not be rotated
   505  	// while add or lookup operations are in progress.
   506  	s.rotMutex.RLock()
   507  	defer s.rotMutex.RUnlock()
   508  
   509  	// Iterate over the pages, performing the lookup on each and remembering the
   510  	// maximum value we've seen so far.
   511  	var val cacheValue
   512  	for e := s.pages.Front(); e != nil; e = e.Next() {
   513  		p := e.Value.(*sklPage)
   514  
   515  		// If the maximum value's timestamp is greater than the max timestamp in
   516  		// the current page, then there's no need to do the lookup in this page.
   517  		// There's also no reason to do the lookup in any earlier pages either,
   518  		// because rotatePages assures that a page will never have a max
   519  		// timestamp smaller than that of any page earlier than it.
   520  		//
   521  		// NB: if the max timestamp of the current page is equal to the maximum
   522  		// value's timestamp, then we still need to perform the lookup. This is
   523  		// because the current page's max timestamp _may_ (if the hlc.Timestamp
   524  		// ceil operation in sklPage.ratchetMaxTimestamp was a no-op) correspond
   525  		// to a real range's timestamp, and this range _may_ overlap with our
   526  		// lookup range. If that is the case and that other range has a
   527  		// different txnID than our current cacheValue result (val), then we
   528  		// need to remove the txnID from our result, per the ratcheting policy
   529  		// for cacheValues. This is tested in TestIntervalSklMaxPageTS.
   530  		maxTS := hlc.Timestamp{WallTime: atomic.LoadInt64(&p.maxWallTime)}
   531  		if maxTS.Less(val.ts) {
   532  			break
   533  		}
   534  
   535  		val2 := p.lookupTimestampRange(from, to, opt)
   536  		val, _ = ratchetValue(val, val2)
   537  	}
   538  
    539  	// Return the higher value from the page lookups and the floor
   540  	// timestamp.
   541  	floorVal := cacheValue{ts: s.floorTS, txnID: noTxnID}
   542  	val, _ = ratchetValue(val, floorVal)
   543  
   544  	return val
   545  }
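
// A small worked example of the early exit above: with pages [P1 (front), P2,
// P3] whose max wall times are 300, 200, and 100, a lookup that already found
// a value at timestamp 250 in P1 skips P2 (200 < 250) and, because max
// timestamps never increase toward the back of the list, P3 as well.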
   546  
   547  // FloorTS returns the receiver's floor timestamp.
   548  func (s *intervalSkl) FloorTS() hlc.Timestamp {
   549  	s.rotMutex.RLock()
   550  	defer s.rotMutex.RUnlock()
   551  	return s.floorTS
   552  }
   553  
   554  // sklPage maintains a skiplist based on a fixed-size arena. When the arena has
   555  // filled up, it returns arenaskl.ErrArenaFull. At that point, a new fixed page
   556  // must be allocated and used instead.
   557  type sklPage struct {
   558  	list        *arenaskl.Skiplist
   559  	maxWallTime int64 // accessed atomically
   560  	isFull      int32 // accessed atomically
   561  }
   562  
   563  func newSklPage(arena *arenaskl.Arena) *sklPage {
   564  	return &sklPage{list: arenaskl.NewSkiplist(arena)}
   565  }
   566  
   567  func (p *sklPage) lookupTimestampRange(from, to []byte, opt rangeOptions) cacheValue {
   568  	if to != nil {
   569  		cmp := 0
   570  		if from != nil {
   571  			cmp = bytes.Compare(from, to)
   572  		}
   573  
   574  		if cmp > 0 {
   575  			// Starting key is after ending key, so range is zero length.
   576  			return cacheValue{}
   577  		}
   578  		if cmp == 0 {
   579  			// Starting key is same as ending key.
   580  			if opt == (excludeFrom | excludeTo) {
   581  				// Both from and to keys are excluded, so range is zero length.
   582  				return cacheValue{}
   583  			}
   584  
   585  			// Scan over a single key.
   586  			from = to
   587  			opt = 0
   588  		}
   589  	}
   590  
   591  	var it arenaskl.Iterator
   592  	it.Init(p.list)
   593  	it.SeekForPrev(from)
   594  
   595  	return p.maxInRange(&it, from, to, opt)
   596  }
   597  
   598  // addNode adds a new node at key with the provided value if one does not exist.
   599  // If one does exist, it ratchets the existing node's value instead.
   600  //
   601  // If the mustInit flag is set, the function will ensure that the node is
   602  // initialized by the time the method returns, even if a different goroutine
   603  // created the node. If the flag is not set and a different goroutine created
   604  // the node, the method won't try to help.
   605  func (p *sklPage) addNode(
   606  	it *arenaskl.Iterator, key []byte, val cacheValue, opt nodeOptions, mustInit bool,
   607  ) error {
   608  	// Array with constant size will remain on the stack.
   609  	var arr [encodedValSize * 2]byte
   610  	var keyVal, gapVal cacheValue
   611  
   612  	if (opt & hasKey) != 0 {
   613  		keyVal = val
   614  	}
   615  
   616  	if (opt & hasGap) != 0 {
   617  		gapVal = val
   618  	}
   619  
   620  	if !it.SeekForPrev(key) {
   621  		// The key was not found. Scan for the previous gap value.
   622  		prevGapVal := p.incomingGapVal(it, key)
   623  
   624  		var err error
   625  		if it.Valid() && bytes.Equal(it.Key(), key) {
   626  			// Another thread raced and added a node at key while we were
   627  			// scanning backwards. Ratchet the new node.
   628  			err = arenaskl.ErrRecordExists
   629  		} else {
   630  			// There is still no node at key. If the previous node has a gap
   631  			// value that would not be updated with the new value, then there is
   632  			// no need to add another node, since its timestamp would be the
   633  			// same as the gap timestamp and its txnID would be the same as the
   634  			// gap txnID.
   635  			if _, update := ratchetValue(prevGapVal, val); !update {
   636  				return nil
   637  			}
   638  
   639  			// Ratchet max timestamp before adding the node.
   640  			p.ratchetMaxTimestamp(val.ts)
   641  
   642  			// Ensure that a new node is created. It needs to stay in the
   643  			// initializing state until the gap value of its preceding node
   644  			// has been found and used to ratchet this node's value. During
   645  			// the search for the gap value, this node acts as a sentinel
   646  			// for other ongoing operations - when they see this node they're
   647  			// forced to stop and ratchet its value before they can continue.
   648  			b, meta := encodeValueSet(arr[:0], keyVal, gapVal)
   649  			err = it.Add(key, b, meta)
   650  		}
   651  
   652  		switch {
   653  		case errors.Is(err, arenaskl.ErrArenaFull):
   654  			atomic.StoreInt32(&p.isFull, 1)
   655  			return err
   656  		case errors.Is(err, arenaskl.ErrRecordExists):
   657  			// Another thread raced and added the node, so just ratchet its
   658  			// values instead (down below).
   659  		case err == nil:
   660  			// Add was successful, so finish initialization by scanning for gap
    661  	// value and using it to ratchet the new node's values.
   662  			return p.ensureInitialized(it, key)
   663  		default:
   664  			panic(fmt.Sprintf("unexpected error: %v", err))
   665  		}
   666  	}
   667  
   668  	// If mustInit is set to true then we're promising that the node will be
   669  	// initialized by the time this method returns. Ensure this by helping out
   670  	// the goroutine that created the node.
   671  	if (it.Meta()&initialized) == 0 && mustInit {
   672  		if err := p.ensureInitialized(it, key); err != nil {
   673  			return err
   674  		}
   675  	}
   676  
   677  	// Ratchet up the timestamps on the existing node, but don't set the
   678  	// initialized bit. If mustInit is set then we already made sure the node
   679  	// was initialized. If mustInit is not set then we don't require it to be
   680  	// initialized.
   681  	if opt == 0 {
   682  		// Don't need to set either key or gap value, so done.
   683  		return nil
   684  	}
   685  	return p.ratchetValueSet(it, always, keyVal, gapVal, false /* setInit */)
   686  }
   687  
   688  // ensureInitialized ensures that the node at the specified key is initialized.
   689  // It does so by first scanning backwards to the first initialized node and
   690  // using its gap value as the initial "previous gap value". It then scans
   691  // forward until it reaches the desired key, ratcheting any uninitialized nodes
   692  // it encounters (but not initializing them), and updating the candidate
   693  // "previous gap value" as it goes. Finally, it initializes the node with the
   694  // "previous gap value".
   695  //
   696  // Iterating backwards and then forwards solves potential race conditions with
    697  // other threads. During backwards iteration, other goroutines can be inserting new
   698  // nodes between the previous node and the lookup node, which could change the
   699  // choice for the "previous gap value". The solution is two-fold:
   700  //
   701  // 1. Add new nodes in two phases - initializing and then initialized. Nodes in
   702  //    the initializing state act as a synchronization point between goroutines
   703  //    that are adding a particular node and goroutines that are scanning for gap
   704  //    values. Scanning goroutines encounter the initializing nodes and are
   705  //    forced to ratchet them before continuing. If they fail to ratchet them
   706  //    because an arena is full, the nodes must never be initialized so they are
   707  //    set to cantInit. This is critical for correctness, because if one of these
   708  //    initializing nodes was not ratcheted when encountered during a forward
   709  //    scan and later initialized, we could see a ratchet inversion. For example,
   710  //    the inversion would occur if:
   711  //    - 1: a goroutine is scanning forwards after finding a previous gap value
   712  //         from node A in which it plans to initialize node C.
   713  //    - 2: node B is created and initialized between node A and node C with a
   714  //         larger value than either.
   715  //    - 1: the iterator scanning forwards to node C is already past node B when
   716  //         it is created.
   717  //    - 3: a lookup for the timestamp of node C comes in. Since it's not
   718  //         initialized, it uses node B's gap value.
   719  //    - 1: the iterator reaches node C and initializes it with node A's gap
   720  //         value, which is smaller than node B's.
    721  //    - 4: another lookup for the timestamp of node C comes in. It returns the
    722  //         node's newly initialized value, which is smaller than the one it
   723  //         reported before.
   724  //    Ratcheting initializing nodes when encountered with the current gap value
   725  //    avoids this race.
   726  //
   727  //    However, only a goroutine that saw a node in an uninitialized state before
   728  //    scanning backwards can switch it from initializing to initialized. This
   729  //    enforces a "happens-before" relationship between the creation of a node
   730  //    and the discovery of the gap value that is used when initializing it. If
   731  //    any goroutine was able to initialize a node, then this relationship would
   732  //    not exist and we could experience races where a newly inserted node A's
   733  //    call to ensureFloorValue could come before the insertion of a node B, but
   734  //    node B could be initialized with a gap value discovered before the
   735  //    insertion of node A. For more on this, see the discussion in #19672.
   736  //
   737  // 2. After the gap value of the first initialized node with a key less than or
   738  //    equal to the desired key has been found, the scanning goroutine will scan
   739  //    forwards until it reaches the original key. It will ratchet any
   740  //    uninitialized nodes along the way and inherit the gap value from them as
   741  //    it goes. By the time it reaches the original key, it has a valid gap
   742  //    value, which we have called the "previous gap value". At this point, if
   743  //    the node at key is uninitialized, the node can be initialized with the
   744  //    "previous gap value".
   745  //
   746  // It is an error to call ensureInitialized on a key without a node. When
   747  // finished, the iterator will be positioned the same as if it.Seek(key) had
   748  // been called.
   749  func (p *sklPage) ensureInitialized(it *arenaskl.Iterator, key []byte) error {
   750  	// Determine the incoming gap value.
   751  	prevGapVal := p.incomingGapVal(it, key)
   752  
   753  	// Make sure we're on the right key again.
   754  	if util.RaceEnabled && !bytes.Equal(it.Key(), key) {
   755  		panic("no node found")
   756  	}
   757  
   758  	// If the node isn't initialized, initialize it.
   759  	return p.ratchetValueSet(it, onlyIfUninitialized, prevGapVal, prevGapVal, true /* setInit */)
   760  }
   761  
   762  // ensureFloorValue scans from the current position of the iterator to the
   763  // provided key, ratcheting all initialized or uninitialized nodes as it goes
   764  // with the provided value. It returns a boolean indicating whether it was
   765  // successful (true) or whether it saw an ErrArenaFull while ratcheting (false).
   766  func (p *sklPage) ensureFloorValue(it *arenaskl.Iterator, to []byte, val cacheValue) bool {
   767  	for it.Valid() {
   768  		util.RacePreempt()
   769  
    770  		// If "to" is not nil (i.e. not an open range), then it is treated as
    771  		// an exclusive bound.
   772  		if to != nil && bytes.Compare(it.Key(), to) >= 0 {
   773  			break
   774  		}
   775  
   776  		if atomic.LoadInt32(&p.isFull) == 1 {
   777  			// Page is full, so stop iterating. The caller will then be able to
   778  			// release the read lock and rotate the pages. Not doing this could
   779  			// result in forcing all other operations to wait for this thread to
   780  			// completely finish iteration. That could take a long time if this
   781  			// range is very large.
   782  			return false
   783  		}
   784  
   785  		// Don't clear the initialization bit, since we don't have the gap
   786  		// timestamp from the previous node, and don't need an initialized node
   787  		// for this operation anyway.
   788  		err := p.ratchetValueSet(it, always, val, val, false /* setInit */)
   789  		switch {
   790  		case err == nil:
   791  			// Continue scanning.
   792  		case errors.Is(err, arenaskl.ErrArenaFull):
   793  			// Page is too full to ratchet value, so stop iterating.
   794  			return false
   795  		default:
   796  			panic(fmt.Sprintf("unexpected error: %v", err))
   797  		}
   798  
   799  		it.Next()
   800  	}
   801  
   802  	return true
   803  }
   804  
   805  func (p *sklPage) ratchetMaxTimestamp(ts hlc.Timestamp) {
   806  	// Cheat and just use the max wall time portion of the timestamp, since it's
   807  	// fine for the max timestamp to be a bit too large. This is the case
   808  	// because it's always safe to increase the timestamp in a range. It's also
   809  	// always safe to remove the transaction ID from a range. Either of these
   810  	// changes may force a transaction to lose "ownership" over a range of keys,
   811  	// but they'll never allow a transaction to gain "ownership" over a range of
   812  	// keys that it wouldn't otherwise have. In other words, it's ok for the
   813  	// intervalSkl to produce false negatives but never ok for it to produce
   814  	// false positives.
   815  	//
   816  	// We could use an atomic.Value to store a "MaxValue" cacheValue for a given
   817  	// page, but this would be more expensive and it's not clear that it would
   818  	// be worth it.
   819  	new := ts.WallTime
   820  	if ts.Logical > 0 {
   821  		new++
   822  	}
   823  
   824  	for {
   825  		old := atomic.LoadInt64(&p.maxWallTime)
   826  		if new <= old {
   827  			break
   828  		}
   829  
   830  		if atomic.CompareAndSwapInt64(&p.maxWallTime, old, new) {
   831  			break
   832  		}
   833  	}
   834  }
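
// For example, ratcheting with {WallTime: 100, Logical: 0} leaves maxWallTime
// at 100, while {WallTime: 100, Logical: 3} rounds up to 101. The page-level
// maximum may therefore overshoot the true maximum by less than one nanosecond
// of wall time, which is safe per the reasoning above.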
   835  
   836  // ratchetPolicy defines the behavior a ratcheting attempt should take when
   837  // trying to ratchet a node. Certain operations require nodes to be ratcheted
   838  // regardless of whether they're already initialized or not. Other operations
   839  // only want nodes that are uninitialized to be ratcheted.
   840  type ratchetPolicy bool
   841  
   842  const (
   843  	// always is a policy to ratchet a node regardless of whether it is already
   844  	// initialized or not.
   845  	always ratchetPolicy = false
   846  	// onlyIfUninitialized is a policy to only ratchet a node if it has not been
   847  	// initialized yet.
   848  	onlyIfUninitialized ratchetPolicy = true
   849  )
   850  
   851  // ratchetValueSet will update the current node's key and gap values to the
   852  // maximum of their current values or the given values. If setInit is true, then
   853  // the initialized bit will be set, indicating that the node is now fully
   854  // initialized and its values can now be relied upon.
   855  //
   856  // The method will return ErrArenaFull if the arena was too full to ratchet the
   857  // node's value set. In that case, the node will be marked with the "cantInit"
   858  // flag because its values should never be trusted in isolation.
   859  func (p *sklPage) ratchetValueSet(
   860  	it *arenaskl.Iterator, policy ratchetPolicy, keyVal, gapVal cacheValue, setInit bool,
   861  ) error {
   862  	// Array with constant size will remain on the stack.
   863  	var arr [encodedValSize * 2]byte
   864  
   865  	for {
   866  		util.RacePreempt()
   867  
   868  		meta := it.Meta()
   869  		inited := (meta & initialized) != 0
   870  		if inited && policy == onlyIfUninitialized {
   871  			// If the node is already initialized and the policy is
   872  			// onlyIfUninitialized, return. If this isn't the first ratcheting
   873  			// attempt then we must have raced with node initialization before.
   874  			return nil
   875  		}
   876  		if (meta & cantInit) != 0 {
   877  			// If the meta has the cantInit flag set to true, we fail with an
   878  			// ErrArenaFull error to force the current goroutine to retry on a
   879  			// new page.
   880  			return arenaskl.ErrArenaFull
   881  		}
   882  
   883  		newMeta := meta
   884  		updateInit := setInit && !inited
   885  		if updateInit {
   886  			newMeta |= initialized
   887  		}
   888  
   889  		var keyValUpdate, gapValUpdate bool
   890  		oldKeyVal, oldGapVal := decodeValueSet(it.Value(), meta)
   891  		keyVal, keyValUpdate = ratchetValue(oldKeyVal, keyVal)
   892  		gapVal, gapValUpdate = ratchetValue(oldGapVal, gapVal)
   893  		updateVals := keyValUpdate || gapValUpdate
   894  
   895  		if updateVals {
   896  			// If we're updating the values (and maybe the init flag) then we
   897  			// need to call it.Set. This can return an ErrArenaFull, which we
   898  			// must handle with care.
   899  
   900  			// Ratchet the max timestamp.
   901  			keyTs, gapTs := keyVal.ts, gapVal.ts
   902  			if gapTs.Less(keyTs) {
   903  				p.ratchetMaxTimestamp(keyTs)
   904  			} else {
   905  				p.ratchetMaxTimestamp(gapTs)
   906  			}
   907  
   908  			// Remove the hasKey and hasGap flags from the meta. These will be
   909  			// replaced below.
   910  			newMeta &^= (hasKey | hasGap)
   911  
   912  			// Update the values, possibly preserving the init bit.
   913  			b, valMeta := encodeValueSet(arr[:0], keyVal, gapVal)
   914  			newMeta |= valMeta
   915  
   916  			err := it.Set(b, newMeta)
   917  			switch {
   918  			case err == nil:
   919  				// Success.
   920  				return nil
   921  			case errors.Is(err, arenaskl.ErrRecordUpdated):
   922  				// Record was updated by another thread, so restart ratchet attempt.
   923  				continue
   924  			case errors.Is(err, arenaskl.ErrArenaFull):
   925  				// The arena was full which means that we were unable to ratchet
   926  				// the value of this node. Mark the page as full and make sure
   927  				// that the node is moved to the "cantInit" state if it hasn't
   928  				// been initialized yet. This is critical because if the node
   929  				// was initialized after this, its value set would be relied
   930  				// upon to stand on its own even though it would be missing the
   931  				// ratcheting we tried to perform here.
   932  				atomic.StoreInt32(&p.isFull, 1)
   933  
   934  				if !inited && (meta&cantInit) == 0 {
   935  					err := it.SetMeta(meta | cantInit)
   936  					switch {
   937  					case errors.Is(err, arenaskl.ErrRecordUpdated):
   938  						// Record was updated by another thread, so restart
   939  						// ratchet attempt.
   940  						continue
   941  					case errors.Is(err, arenaskl.ErrArenaFull):
   942  						panic(fmt.Sprintf("SetMeta with larger meta should not return %v", err))
   943  					}
   944  				}
   945  				return arenaskl.ErrArenaFull
   946  			default:
   947  				panic(fmt.Sprintf("unexpected error: %v", err))
   948  			}
   949  		} else if updateInit {
   950  			// If we're only updating the init flag and not the values, we can
   951  			// use it.SetMeta instead of it.Set, which avoids allocating new
   952  			// chunks in the arena.
   953  			err := it.SetMeta(newMeta)
   954  			switch {
   955  			case err == nil:
   956  				// Success.
   957  				return nil
   958  			case errors.Is(err, arenaskl.ErrRecordUpdated):
   959  				// Record was updated by another thread, so restart ratchet attempt.
   960  				continue
   961  			case errors.Is(err, arenaskl.ErrArenaFull):
   962  				panic(fmt.Sprintf("SetMeta with larger meta should not return %v", err))
   963  			default:
   964  				panic(fmt.Sprintf("unexpected error: %v", err))
   965  			}
   966  		} else {
   967  			return nil
   968  		}
   969  	}
   970  }
   971  
   972  // maxInRange scans the range of keys between from and to and returns the
   973  // maximum (initialized or uninitialized) value found. When finished, the
   974  // iterator will be positioned the same as if it.Seek(to) had been called.
   975  func (p *sklPage) maxInRange(it *arenaskl.Iterator, from, to []byte, opt rangeOptions) cacheValue {
   976  	// Determine the previous gap value. This will move the iterator to the
   977  	// first node >= from.
   978  	prevGapVal := p.incomingGapVal(it, from)
   979  
   980  	if !it.Valid() {
   981  		// No more nodes.
   982  		return prevGapVal
   983  	} else if bytes.Equal(it.Key(), from) {
   984  		// Found a node at from.
   985  		if (it.Meta() & initialized) != 0 {
   986  			// The node was initialized. Ignore the previous gap value.
   987  			prevGapVal = cacheValue{}
   988  		}
   989  	} else {
   990  		// No node at from. Remove excludeFrom option.
   991  		opt &^= excludeFrom
   992  	}
   993  
   994  	// Scan the rest of the way. Notice that we provide the previous gap value.
   995  	// This is important for two reasons:
   996  	// 1. it will be counted towards the maxVal result.
   997  	// 2. it will be used to ratchet uninitialized nodes that the scan sees
   998  	//    before any initialized nodes.
   999  	_, maxVal := p.scanTo(it, to, opt, prevGapVal)
  1000  	return maxVal
  1001  }
  1002  
  1003  // incomingGapVal determines the gap value active at the specified key by first
  1004  // scanning backwards to the first initialized node and then scanning forwards
  1005  // to the specified key. If there is already a node at key then the previous gap
  1006  // value will be returned. When finished, the iterator will be positioned the
  1007  // same as if it.Seek(key) had been called.
  1008  //
  1009  // During forward iteration, if another goroutine inserts a new gap node in the
  1010  // interval between the previous node and the original key, then either:
  1011  //
  1012  // 1. The forward iteration finds it and looks up its gap value. That node's gap
  1013  //    value now becomes the new "previous gap value", and iteration continues.
  1014  //
   1015  // 2. The new node is created after the iterator has moved past its position. As
  1016  //    part of node creation, the creator had to scan backwards to find the gap
  1017  //    value of the previous node. It is guaranteed to find a gap value that is
  1018  //    >= the gap value found by the original goroutine.
  1019  //
  1020  // This means that no matter what gets inserted, or when it gets inserted, the
  1021  // scanning goroutine is guaranteed to end up with a value that will never
  1022  // decrease on future lookups, which is the critical invariant.
  1023  func (p *sklPage) incomingGapVal(it *arenaskl.Iterator, key []byte) cacheValue {
  1024  	// Iterate backwards to the nearest initialized node.
  1025  	prevInitNode(it)
  1026  
  1027  	// Iterate forwards to key, remembering the last gap value.
  1028  	prevGapVal, _ := p.scanTo(it, key, 0, cacheValue{})
  1029  	return prevGapVal
  1030  }
  1031  
  1032  // scanTo scans from the current iterator position until the key "to". While
  1033  // scanning, any uninitialized values are ratcheted with the current gap value,
  1034  // which is essential to avoiding ratchet inversions (see the comment on
  1035  // ensureInitialized).
  1036  //
  1037  // The function then returns the maximum value seen along with the gap value at
   1038  // the end of the scan. If the iterator is positioned at a key > "to", the
   1039  // function will return the initial gap value. The function takes an optional
   1040  // initial gap value argument, used to initialize the running maximum and gap
  1041  // values. When finished, the iterator will be positioned the same as if
  1042  // it.Seek(to) had been called.
  1043  func (p *sklPage) scanTo(
  1044  	it *arenaskl.Iterator, to []byte, opt rangeOptions, initGapVal cacheValue,
  1045  ) (prevGapVal, maxVal cacheValue) {
  1046  	prevGapVal, maxVal = initGapVal, initGapVal
  1047  	first := true
  1048  	for {
  1049  		util.RacePreempt()
  1050  
  1051  		if !it.Valid() {
  1052  			// No more nodes, which can happen for open ranges.
  1053  			return
  1054  		}
  1055  
  1056  		toCmp := bytes.Compare(it.Key(), to)
  1057  		if to == nil {
  1058  			// to == nil means open range, so toCmp will always be -1.
  1059  			toCmp = -1
  1060  		}
  1061  		if toCmp > 0 || (toCmp == 0 && (opt&excludeTo) != 0) {
  1062  			// Past the end key or we don't want to consider the end key.
  1063  			return
  1064  		}
  1065  
  1066  		// Ratchet uninitialized nodes. We pass onlyIfUninitialized, so if
  1067  		// the node is already initialized then this is a no-op.
  1068  		ratchetErr := p.ratchetValueSet(it, onlyIfUninitialized,
  1069  			prevGapVal, prevGapVal, false /* setInit */)
  1070  
  1071  		// Decode the current node's value set.
  1072  		keyVal, gapVal := decodeValueSet(it.Value(), it.Meta())
  1073  		if errors.Is(ratchetErr, arenaskl.ErrArenaFull) {
  1074  			// If we failed to ratchet an uninitialized node above, the desired
  1075  			// ratcheting won't be reflected in the decoded values. Perform the
  1076  			// ratcheting manually.
  1077  			keyVal, _ = ratchetValue(keyVal, prevGapVal)
  1078  			gapVal, _ = ratchetValue(gapVal, prevGapVal)
  1079  		}
  1080  
  1081  		if !(first && (opt&excludeFrom) != 0) {
   1082  			// Unless this is the first key and opt says to exclude the first
   1083  			// key, ratchet the maxVal with the key value.
  1084  			maxVal, _ = ratchetValue(maxVal, keyVal)
  1085  		}
  1086  
  1087  		if toCmp == 0 {
  1088  			// We're on the scan's end key, so return the max value seen.
  1089  			return
  1090  		}
  1091  
  1092  		// Ratchet the maxVal by the current gapVal.
  1093  		maxVal, _ = ratchetValue(maxVal, gapVal)
  1094  
  1095  		// Haven't yet reached the scan's end key, so keep iterating.
  1096  		prevGapVal = gapVal
  1097  		first = false
  1098  		it.Next()
  1099  	}
  1100  }
  1101  
  1102  // prevInitNode moves the iterator backwards to the nearest initialized node. If
  1103  // the iterator is already positioned on an initialized node then this function
  1104  // is a no-op.
  1105  func prevInitNode(it *arenaskl.Iterator) {
  1106  	for {
  1107  		util.RacePreempt()
  1108  
  1109  		if !it.Valid() {
  1110  			// No more previous nodes, so use the zero value.
  1111  			it.SeekToFirst()
  1112  			break
  1113  		}
  1114  
  1115  		if (it.Meta() & initialized) != 0 {
  1116  			// Found an initialized node.
  1117  			break
  1118  		}
  1119  
  1120  		// Haven't yet reached an initialized node, so keep iterating.
  1121  		it.Prev()
  1122  	}
  1123  }
  1124  
  1125  func decodeValueSet(b []byte, meta uint16) (keyVal, gapVal cacheValue) {
  1126  	if (meta & hasKey) != 0 {
  1127  		b, keyVal = decodeValue(b)
  1128  	}
  1129  
  1130  	if (meta & hasGap) != 0 {
  1131  		_, gapVal = decodeValue(b)
  1132  	}
  1133  
  1134  	return
  1135  }
  1136  
  1137  func encodeValueSet(b []byte, keyVal, gapVal cacheValue) (ret []byte, meta uint16) {
  1138  	if keyVal.ts.WallTime != 0 || keyVal.ts.Logical != 0 {
  1139  		b = encodeValue(b, keyVal)
  1140  		meta |= hasKey
  1141  	}
  1142  
  1143  	if gapVal.ts.WallTime != 0 || gapVal.ts.Logical != 0 {
  1144  		b = encodeValue(b, gapVal)
  1145  		meta |= hasGap
  1146  	}
  1147  
  1148  	ret = b
  1149  	return
  1150  }
  1151  
  1152  func decodeValue(b []byte) (ret []byte, val cacheValue) {
  1153  	val.ts.WallTime = int64(binary.BigEndian.Uint64(b))
  1154  	val.ts.Logical = int32(binary.BigEndian.Uint32(b[8:]))
  1155  	var err error
  1156  	if val.txnID, err = uuid.FromBytes(b[encodedTsSize:encodedValSize]); err != nil {
  1157  		panic(err)
  1158  	}
  1159  	ret = b[encodedValSize:]
  1160  	return
  1161  }
  1162  
  1163  func encodeValue(b []byte, val cacheValue) []byte {
  1164  	l := len(b)
  1165  	b = b[:l+encodedValSize]
  1166  	binary.BigEndian.PutUint64(b[l:], uint64(val.ts.WallTime))
  1167  	binary.BigEndian.PutUint32(b[l+8:], uint32(val.ts.Logical))
  1168  	if _, err := val.txnID.MarshalTo(b[l+encodedTsSize:]); err != nil {
  1169  		panic(err)
  1170  	}
  1171  	return b
  1172  }
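
// A round-trip sketch of the two helpers above (the cacheValue literal and
// txnID below are illustrative):
//
//   var arr [encodedValSize]byte
//   b := encodeValue(arr[:0], cacheValue{ts: hlc.Timestamp{WallTime: 5, Logical: 2}, txnID: txnID})
//   _, decoded := decodeValue(b)
//   // decoded now equals the encoded cacheValue.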
  1173  
  1174  func encodedRangeSize(from, to []byte, opt rangeOptions) int {
  1175  	vals := 1
  1176  	if (opt & excludeTo) == 0 {
  1177  		vals++
  1178  	}
  1179  	if (opt & excludeFrom) == 0 {
  1180  		vals++
  1181  	}
  1182  	// This will be an overestimate because nodes will almost
  1183  	// always be smaller than arenaskl.MaxNodeSize.
  1184  	return len(from) + len(to) + (vals * encodedValSize) + (2 * arenaskl.MaxNodeSize)
  1185  }
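
// As a worked example of the overestimate: for from = "kiwi" (4 bytes),
// to = "orange" (6 bytes), and opt = 0, vals is 3, so the estimate is
// 4 + 6 + 3*encodedValSize + 2*arenaskl.MaxNodeSize bytes, comfortably more
// than the two nodes will actually consume in the arena.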