github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/internal/arenaskl/skl.go (about)

     1  /*
     2   * Copyright 2017 Dgraph Labs, Inc. and Contributors
     3   * Modifications copyright (C) 2017 Andy Kimball and Contributors
     4   *
     5   * Licensed under the Apache License, Version 2.0 (the "License");
     6   * you may not use this file except in compliance with the License.
     7   * You may obtain a copy of the License at
     8   *
     9   *     http://www.apache.org/licenses/LICENSE-2.0
    10   *
    11   * Unless required by applicable law or agreed to in writing, software
    12   * distributed under the License is distributed on an "AS IS" BASIS,
    13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14   * See the License for the specific language governing permissions and
    15   * limitations under the License.
    16   */
    17  
    18  /*
    19  Adapted from RocksDB inline skiplist.
    20  
    21  Key differences:
    22  - No optimization for sequential inserts (no "prev").
    23  - No custom comparator.
    24  - Support overwrites. This requires care when we see the same key when inserting.
    25    For RocksDB or LevelDB, overwrites are implemented as a newer sequence number in the key, so
    26  	there is no need for values. We don't intend to support versioning. In-place updates of values
    27  	would be more efficient.
    28  - We discard all non-concurrent code.
    29  - We do not support Splices. This simplifies the code a lot.
    30  - No AllocateNode or other pointer arithmetic.
    31  - We combine the findLessThan, findGreaterOrEqual, etc into one function.
    32  */
    33  
    34  /*
    35  Further adapted from Badger: https://github.com/dgraph-io/badger.
    36  
    37  Key differences:
    38  - Support for previous pointers - doubly linked lists. Note that it's up to higher
    39    level code to deal with the intermediate state that occurs during insertion,
    40    where node A is linked to node B, but node B is not yet linked back to node A.
    41  - Iterator includes mutator functions.
    42  */
    43  
    44  package arenaskl // import "github.com/cockroachdb/pebble/internal/arenaskl"
    45  
    46  import (
    47  	"math"
    48  	"runtime"
    49  	"sync/atomic"
    50  	"unsafe"
    51  
    52  	"github.com/cockroachdb/errors"
    53  	"github.com/cockroachdb/pebble/internal/base"
    54  	"github.com/cockroachdb/pebble/internal/fastrand"
    55  )
    56  
const (
	// maxHeight is the maximum height of a node's tower, and therefore the
	// maximum number of levels in the skiplist.
	maxHeight   = 20
	// maxNodeSize is the size in bytes of a node with a full-height tower.
	maxNodeSize = int(unsafe.Sizeof(node{}))
	// linksSize is the size in bytes of the links for a single tower level.
	linksSize   = int(unsafe.Sizeof(links{}))
	// pValue is the probability that a node present at level h also appears
	// at level h+1 (the inverse of Euler's number; see init below).
	pValue      = 1 / math.E
)

// ErrRecordExists indicates that an entry with the specified key already
// exists in the skiplist. Duplicate entries are not directly supported and
// instead must be handled by the user by appending a unique version suffix to
// keys.
var ErrRecordExists = errors.New("record with this key already exists")
    69  
// Skiplist is a fast, concurrent skiplist implementation that supports forward
// and backward iteration. See batchskl.Skiplist for a non-concurrent
// skiplist. Keys and values are immutable once added to the skiplist and
// deletion is not supported. Instead, higher-level code is expected to add new
// entries that shadow existing entries and perform deletion via tombstones. It
// is up to the user to process these shadow entries and tombstones
// appropriately during retrieval.
type Skiplist struct {
	// arena provides the backing storage from which all nodes, keys, and
	// values are allocated.
	arena  *Arena
	// cmp compares user keys.
	cmp    base.Compare
	// head and tail are sentinel nodes, linked to each other at every level.
	// They are never returned to callers and carry no key or value.
	head   *node
	tail   *node
	height atomic.Uint32 // Current height. 1 <= height <= maxHeight. CAS.

	// If set to true by tests, then extra delays are added to make it easier to
	// detect unusual race conditions.
	testing bool
}
    88  
// Inserter caches the splice (the [prev, next] node pair at each level)
// computed by a previous Add, so that a subsequent Add of a nearby key via
// Inserter.Add can validate and reuse it instead of searching from scratch.
type Inserter struct {
	// spl holds the cached splice; entries at levels below height are valid.
	spl    [maxHeight]splice
	// height is the number of valid levels in spl. Zero invalidates the
	// entire cache, forcing the next splice computation to start fresh.
	height uint32
}
    94  
// Add inserts the key/value pair into list, using the splice cached in ins
// as a search hint and updating the cache for subsequent calls. It returns
// ErrRecordExists if an entry with the same internal key already exists, or
// ErrArenaFull if there isn't enough room in the arena.
func (ins *Inserter) Add(list *Skiplist, key base.InternalKey, value []byte) error {
	return list.addInternal(key, value, ins)
}
    99  
var (
	// probabilities[h] is the largest random uint32 value for which a tower
	// of height greater than h is chosen; populated by init below.
	probabilities [maxHeight]uint32
)
   103  
   104  func init() {
   105  	// Precompute the skiplist probabilities so that only a single random number
   106  	// needs to be generated and so that the optimal pvalue can be used (inverse
   107  	// of Euler's number).
   108  	p := float64(1.0)
   109  	for i := 0; i < maxHeight; i++ {
   110  		probabilities[i] = uint32(float64(math.MaxUint32) * p)
   111  		p *= pValue
   112  	}
   113  }
   114  
   115  // NewSkiplist constructs and initializes a new, empty skiplist. All nodes, keys,
   116  // and values in the skiplist will be allocated from the given arena.
   117  func NewSkiplist(arena *Arena, cmp base.Compare) *Skiplist {
   118  	skl := &Skiplist{}
   119  	skl.Reset(arena, cmp)
   120  	return skl
   121  }
   122  
// Reset the skiplist to empty and re-initialize.
func (s *Skiplist) Reset(arena *Arena, cmp base.Compare) {
	// Allocate head and tail sentinel nodes with full-height towers but no
	// key or value storage.
	head, err := newRawNode(arena, maxHeight, 0, 0)
	if err != nil {
		panic("arenaSize is not large enough to hold the head node")
	}
	// Clear keyOffset so the sentinel carries no key.
	head.keyOffset = 0

	tail, err := newRawNode(arena, maxHeight, 0, 0)
	if err != nil {
		panic("arenaSize is not large enough to hold the tail node")
	}
	tail.keyOffset = 0

	// Link all head/tail levels together so that every level of the empty
	// list is head <-> tail.
	headOffset := arena.getPointerOffset(unsafe.Pointer(head))
	tailOffset := arena.getPointerOffset(unsafe.Pointer(tail))
	for i := 0; i < maxHeight; i++ {
		head.tower[i].nextOffset.Store(tailOffset)
		tail.tower[i].prevOffset.Store(headOffset)
	}

	*s = Skiplist{
		arena: arena,
		cmp:   cmp,
		head:  head,
		tail:  tail,
	}
	// An empty list has height 1: just the base level.
	s.height.Store(1)
}
   154  
// Height returns the height of the highest tower within any of the nodes that
// have ever been allocated as part of this skiplist.
func (s *Skiplist) Height() uint32 { return s.height.Load() }

// Arena returns the arena backing this skiplist.
func (s *Skiplist) Arena() *Arena { return s.arena }

// Size returns the number of bytes that have been allocated from the arena.
func (s *Skiplist) Size() uint32 { return s.arena.Size() }
   164  
// Add adds a new key if it does not yet exist. If the key already exists, then
// Add returns ErrRecordExists. If there isn't enough room in the arena, then
// Add returns ErrArenaFull.
func (s *Skiplist) Add(key base.InternalKey, value []byte) error {
	// A fresh Inserter means no cached splice is consulted; callers inserting
	// keys in nearby order can use Inserter.Add to reuse the splice.
	var ins Inserter
	return s.addInternal(key, value, &ins)
}
   172  
// addInternal inserts key/value into the skiplist, using the (possibly
// stale) splice cached in ins as a search hint and refreshing the cache on
// success. It returns ErrRecordExists if a node with an identical internal
// key already exists, or the allocation error from newNode (e.g.
// ErrArenaFull) if the arena cannot hold the new node.
func (s *Skiplist) addInternal(key base.InternalKey, value []byte, ins *Inserter) error {
	if s.findSplice(key, ins) {
		// Found a matching node, but handle case where it's been deleted.
		return ErrRecordExists
	}

	if s.testing {
		// Add delay to make it easier to test race between this thread
		// and another thread that sees the intermediate state between
		// finding the splice and using it.
		runtime.Gosched()
	}

	nd, height, err := s.newNode(key, value)
	if err != nil {
		return err
	}

	ndOffset := s.arena.getPointerOffset(unsafe.Pointer(nd))

	// We always insert from the base level and up. After you add a node in base
	// level, we cannot create a node in the level above because it would have
	// discovered the node in the base level.
	var found bool
	// invalidateSplice is set when any level's CAS fails and the splice had
	// to be recomputed, making the cached splice in ins untrustworthy.
	var invalidateSplice bool
	for i := 0; i < int(height); i++ {
		prev := ins.spl[i].prev
		next := ins.spl[i].next

		if prev == nil {
			// New node increased the height of the skiplist, so assume that the
			// new level has not yet been populated.
			if next != nil {
				panic("next is expected to be nil, since prev is nil")
			}

			prev = s.head
			next = s.tail
		}

		// +----------------+     +------------+     +----------------+
		// |      prev      |     |     nd     |     |      next      |
		// | prevNextOffset |---->|            |     |                |
		// |                |<----| prevOffset |     |                |
		// |                |     | nextOffset |---->|                |
		// |                |     |            |<----| nextPrevOffset |
		// +----------------+     +------------+     +----------------+
		//
		// 1. Initialize prevOffset and nextOffset to point to prev and next.
		// 2. CAS prevNextOffset to repoint from next to nd.
		// 3. CAS nextPrevOffset to repoint from prev to nd.
		for {
			prevOffset := s.arena.getPointerOffset(unsafe.Pointer(prev))
			nextOffset := s.arena.getPointerOffset(unsafe.Pointer(next))
			nd.tower[i].init(prevOffset, nextOffset)

			// Check whether next has an updated link to prev. If it does not,
			// that can mean one of two things:
			//   1. The thread that added the next node hasn't yet had a chance
			//      to add the prev link (but will shortly).
			//   2. Another thread has added a new node between prev and next.
			nextPrevOffset := next.prevOffset(i)
			if nextPrevOffset != prevOffset {
				// Determine whether #1 or #2 is true by checking whether prev
				// is still pointing to next. As long as the atomic operations
				// have at least acquire/release semantics (no need for
				// sequential consistency), this works, as it is equivalent to
				// the "publication safety" pattern.
				prevNextOffset := prev.nextOffset(i)
				if prevNextOffset == nextOffset {
					// Ok, case #1 is true, so help the other thread along by
					// updating the next node's prev link.
					next.casPrevOffset(i, nextPrevOffset, prevOffset)
				}
			}

			if prev.casNextOffset(i, nextOffset, ndOffset) {
				// Managed to insert nd between prev and next, so update the next
				// node's prev link and go to the next level.
				if s.testing {
					// Add delay to make it easier to test race between this thread
					// and another thread that sees the intermediate state between
					// setting next and setting prev.
					runtime.Gosched()
				}

				next.casPrevOffset(i, prevOffset, ndOffset)
				break
			}

			// CAS failed. We need to recompute prev and next. It is unlikely to
			// be helpful to try to use a different level as we redo the search,
			// because it is unlikely that lots of nodes are inserted between prev
			// and next.
			prev, next, found = s.findSpliceForLevel(key, i, prev)
			if found {
				if i != 0 {
					panic("how can another thread have inserted a node at a non-base level?")
				}

				return ErrRecordExists
			}
			invalidateSplice = true
		}
	}

	// If we had to recompute the splice for a level, invalidate the entire
	// cached splice.
	if invalidateSplice {
		ins.height = 0
	} else {
		// The splice was valid. We inserted a node between spl[i].prev and
		// spl[i].next. Optimistically update spl[i].prev for use in a subsequent
		// call to add.
		for i := uint32(0); i < height; i++ {
			ins.spl[i].prev = nd
		}
	}

	return nil
}
   294  
// NewIter returns a new Iterator object. The lower and upper bound parameters
// control the range of keys the iterator will return. Specifying nil for
// lower or upper bound disables the check for that boundary. Note that lower
// bound is not checked on {SeekGE,First} and upper bound is not checked on
// {SeekLT,Last}. The user is expected to perform that check. Note that it is
// safe for an iterator to be copied by value.
func (s *Skiplist) NewIter(lower, upper []byte) *Iterator {
	// Iterators are pooled; the caller returns them via the iterator's own
	// lifecycle methods rather than this function.
	it := iterPool.Get().(*Iterator)
	*it = Iterator{list: s, nd: s.head, lower: lower, upper: upper}
	return it
}
   306  
// NewFlushIter returns a new flushIterator, which is similar to an Iterator
// but also sets the current number of the bytes that have been iterated
// through into *bytesFlushed.
func (s *Skiplist) NewFlushIter(bytesFlushed *uint64) base.InternalIterator {
	return &flushIterator{
		Iterator:      Iterator{list: s, nd: s.head},
		bytesIterated: bytesFlushed,
	}
}
   316  
   317  func (s *Skiplist) newNode(
   318  	key base.InternalKey, value []byte,
   319  ) (nd *node, height uint32, err error) {
   320  	height = s.randomHeight()
   321  	nd, err = newNode(s.arena, height, key, value)
   322  	if err != nil {
   323  		return
   324  	}
   325  
   326  	// Try to increase s.height via CAS.
   327  	listHeight := s.Height()
   328  	for height > listHeight {
   329  		if s.height.CompareAndSwap(listHeight, height) {
   330  			// Successfully increased skiplist.height.
   331  			break
   332  		}
   333  
   334  		listHeight = s.Height()
   335  	}
   336  
   337  	return
   338  }
   339  
   340  func (s *Skiplist) randomHeight() uint32 {
   341  	rnd := fastrand.Uint32()
   342  
   343  	h := uint32(1)
   344  	for h < maxHeight && rnd <= probabilities[h] {
   345  		h++
   346  	}
   347  
   348  	return h
   349  }
   350  
// findSplice computes, for every level of the list, the [prev, next] node
// pair between which key would be inserted, storing the result in ins. Any
// splice previously cached in ins is validated level by level and reused
// where it still brackets the key. Returns true if a node with the same
// internal key already exists.
func (s *Skiplist) findSplice(key base.InternalKey, ins *Inserter) (found bool) {
	listHeight := s.Height()
	var level int

	prev := s.head
	if ins.height < listHeight {
		// Our cached height is less than the list height, which means there were
		// inserts that increased the height of the list. Recompute the splice from
		// scratch.
		ins.height = listHeight
		level = int(ins.height)
	} else {
		// Our cached height is equal to the list height.
		for ; level < int(listHeight); level++ {
			spl := &ins.spl[level]
			if s.getNext(spl.prev, level) != spl.next {
				// One or more nodes have been inserted between the splice at this
				// level.
				continue
			}
			if spl.prev != s.head && !s.keyIsAfterNode(spl.prev, key) {
				// Key lies before splice.
				level = int(listHeight)
				break
			}
			if spl.next != s.tail && s.keyIsAfterNode(spl.next, key) {
				// Key lies after splice.
				level = int(listHeight)
				break
			}
			// The splice brackets the key!
			prev = spl.prev
			break
		}
	}

	// Descend from the first level whose cached splice was unusable (or from
	// the top when nothing was reusable), recomputing the splice at each
	// level below it.
	for level = level - 1; level >= 0; level-- {
		var next *node
		prev, next, found = s.findSpliceForLevel(key, level, prev)
		if next == nil {
			next = s.tail
		}
		ins.spl[level].init(prev, next)
	}

	return
}
   398  
// findSpliceForLevel scans rightward along the given level, starting at
// start (whose key must sort before key), and returns the pair [prev, next]
// such that key belongs between them in internal-key order. found is true
// when next's internal key (user key and trailer) equals key exactly.
func (s *Skiplist) findSpliceForLevel(
	key base.InternalKey, level int, start *node,
) (prev, next *node, found bool) {
	prev = start

	for {
		// Assume prev.key < key.
		next = s.getNext(prev, level)
		if next == s.tail {
			// Tail node, so done.
			break
		}

		offset, size := next.keyOffset, next.keySize
		nextKey := s.arena.buf[offset : offset+size]
		cmp := s.cmp(key.UserKey, nextKey)
		if cmp < 0 {
			// We are done for this level, since prev.key < key < next.key.
			break
		}
		if cmp == 0 {
			// User-key equality. Among equal user keys, larger trailers sort
			// first.
			if key.Trailer == next.keyTrailer {
				// Internal key equality.
				found = true
				break
			}
			if key.Trailer > next.keyTrailer {
				// We are done for this level, since prev.key < key < next.key.
				break
			}
		}

		// Keep moving right on this level.
		prev = next
	}

	return
}
   438  
   439  func (s *Skiplist) keyIsAfterNode(nd *node, key base.InternalKey) bool {
   440  	ndKey := s.arena.buf[nd.keyOffset : nd.keyOffset+nd.keySize]
   441  	cmp := s.cmp(ndKey, key.UserKey)
   442  	if cmp < 0 {
   443  		return true
   444  	}
   445  	if cmp > 0 {
   446  		return false
   447  	}
   448  	// User-key equality.
   449  	if key.Trailer == nd.keyTrailer {
   450  		// Internal key equality.
   451  		return false
   452  	}
   453  	return key.Trailer < nd.keyTrailer
   454  }
   455  
   456  func (s *Skiplist) getNext(nd *node, h int) *node {
   457  	offset := nd.tower[h].nextOffset.Load()
   458  	return (*node)(s.arena.getPointer(offset))
   459  }
   460  
   461  func (s *Skiplist) getPrev(nd *node, h int) *node {
   462  	offset := nd.tower[h].prevOffset.Load()
   463  	return (*node)(s.arena.getPointer(offset))
   464  }