github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/internal/arenaskl/skl.go

/*
 * Copyright 2017 Dgraph Labs, Inc. and Contributors
 * Modifications copyright (C) 2017 Andy Kimball and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
Adapted from RocksDB inline skiplist.

Key differences:
- No optimization for sequential inserts (no "prev").
- No custom comparator.
- Support overwrites. This requires care when we see the same key when inserting.
  For RocksDB or LevelDB, overwrites are implemented as a newer sequence number in the
  key, so there is no need for values. We don't intend to support versioning. In-place
  updates of values would be more efficient.
- We discard all non-concurrent code.
- We do not support Splices. This simplifies the code a lot.
- No AllocateNode or other pointer arithmetic.
- We combine the findLessThan, findGreaterOrEqual, etc. into one function.
*/

/*
Further adapted from Badger: https://github.com/dgraph-io/badger.

Key differences:
- Support for previous pointers - doubly linked lists. Note that it's up to higher
  level code to deal with the intermediate state that occurs during insertion,
  where node A is linked to node B, but node B is not yet linked back to node A.
- Iterator includes mutator functions.
*/

package arenaskl // import "github.com/zuoyebang/bitalostable/internal/arenaskl"

import (
	"encoding/binary"
	"math"
	"runtime"
	"sync/atomic"
	"unsafe"

	"github.com/cockroachdb/errors"
	"github.com/zuoyebang/bitalostable/internal/base"
	"github.com/zuoyebang/bitalostable/internal/fastrand"
)

const (
	maxHeight   = 20
	maxNodeSize = int(unsafe.Sizeof(node{}))
	linksSize   = int(unsafe.Sizeof(links{}))
	pValue      = 1 / math.E
)

// ErrRecordExists indicates that an entry with the specified key already
// exists in the skiplist. Duplicate entries are not directly supported and
// instead must be handled by the user by appending a unique version suffix to
// keys.
var ErrRecordExists = errors.New("record with this key already exists")

// Skiplist is a fast, concurrent skiplist implementation that supports forward
// and backward iteration. See batchskl.Skiplist for a non-concurrent
// skiplist. Keys and values are immutable once added to the skiplist and
// deletion is not supported. Instead, higher-level code is expected to add new
// entries that shadow existing entries and perform deletion via tombstones. It
// is up to the user to process these shadow entries and tombstones
// appropriately during retrieval.
type Skiplist struct {
	arena  *Arena
	cmp    base.Compare
	head   *node
	tail   *node
	height uint32 // Current height. 1 <= height <= maxHeight. CAS.

	// If set to true by tests, then extra delays are added to make it easier to
	// detect unusual race conditions.
	testing bool
}

// Inserter caches the splice computed by a previous call to Add, allowing a
// subsequent Add of a nearby key to start from the cached position instead of
// re-descending the skiplist from the top. TODO(peter)
type Inserter struct {
	spl    [maxHeight]splice
	height uint32
}

// Add inserts the key/value pair into the skiplist, using and updating the
// cached splice in ins. TODO(peter)
func (ins *Inserter) Add(list *Skiplist, key base.InternalKey, value []byte) error {
	return list.addInternal(key, value, ins)
}

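// exampleInserterReuse is a hedged usage sketch, not part of the original
// file: the function name and the batch shape are illustrative assumptions.
// Reusing a single Inserter across keys added in ascending order lets each
// Add start from the splice cached by the previous one.
func exampleInserterReuse(s *Skiplist, keys []base.InternalKey, vals [][]byte) error {
	var ins Inserter
	for i := range keys {
		// Each successful Add optimistically updates ins.spl so the next
		// call can skip most of the search.
		if err := ins.Add(s, keys[i], vals[i]); err != nil {
			return err
		}
	}
	return nil
}
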
var probabilities [maxHeight]uint32

func init() {
	// Precompute the skiplist probabilities so that only a single random number
	// needs to be generated and so that the optimal pvalue can be used (inverse
	// of Euler's number).
	p := float64(1.0)
	for i := 0; i < maxHeight; i++ {
		probabilities[i] = uint32(float64(math.MaxUint32) * p)
		p *= pValue
	}
}

// NewSkiplist constructs and initializes a new, empty skiplist. All nodes, keys,
// and values in the skiplist will be allocated from the given arena.
func NewSkiplist(arena *Arena, cmp base.Compare) *Skiplist {
	skl := &Skiplist{}
	skl.Reset(arena, cmp)
	return skl
}

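// Hedged construction sketch (commented out; not part of the original file).
// The Arena constructor is defined in arena.go, and its signature below is an
// assumption based on upstream Pebble, where NewArena takes a backing byte
// slice:
//
//	a := NewArena(make([]byte, 1<<20)) // all nodes/keys/values live in this 1 MiB buffer
//	skl := NewSkiplist(a, base.DefaultComparer.Compare)
//
// Once the arena is exhausted, Add returns ErrArenaFull; the skiplist never
// allocates outside its arena.
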
// Reset the skiplist to empty and re-initialize.
func (s *Skiplist) Reset(arena *Arena, cmp base.Compare) {
	// Allocate head and tail sentinel nodes. They carry no key (key and skip
	// offsets are zeroed) and exist only to bound the list.
	head, err := newRawNode(arena, maxHeight, 0, 0)
	if err != nil {
		panic("arenaSize is not large enough to hold the head node")
	}
	head.keyOffset = 0
	head.skipToFirst = 0
	head.skipToLast = 0

	tail, err := newRawNode(arena, maxHeight, 0, 0)
	if err != nil {
		panic("arenaSize is not large enough to hold the tail node")
	}
	tail.keyOffset = 0
	tail.skipToFirst = 0
	tail.skipToLast = 0

	// Link all head/tail levels together.
	headOffset := arena.getPointerOffset(unsafe.Pointer(head))
	tailOffset := arena.getPointerOffset(unsafe.Pointer(tail))
	for i := 0; i < maxHeight; i++ {
		head.tower[i].nextOffset = tailOffset
		tail.tower[i].prevOffset = headOffset
	}

	*s = Skiplist{
		arena:  arena,
		cmp:    cmp,
		head:   head,
		tail:   tail,
		height: 1,
	}
}

// Height returns the height of the highest tower within any of the nodes that
// have ever been allocated as part of this skiplist.
func (s *Skiplist) Height() uint32 { return atomic.LoadUint32(&s.height) }

// Arena returns the arena backing this skiplist.
func (s *Skiplist) Arena() *Arena { return s.arena }

// Size returns the number of bytes that have been allocated from the arena.
func (s *Skiplist) Size() uint32 { return s.arena.Size() }

// Add adds a new key if it does not yet exist. If the key already exists, then
// Add returns ErrRecordExists. If there isn't enough room in the arena, then
// Add returns ErrArenaFull.
func (s *Skiplist) Add(key base.InternalKey, value []byte) error {
	var ins Inserter
	return s.addInternal(key, value, &ins)
}

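// exampleAdd is a hedged sketch; the function name and the handling policy
// are illustrative assumptions, not part of the original file. It shows the
// two failure modes documented on Add: a duplicate internal key, and an
// exhausted arena, which higher-level code typically handles by switching to
// a fresh arena/skiplist.
func exampleAdd(s *Skiplist, key base.InternalKey, value []byte) (full bool, err error) {
	switch err := s.Add(key, value); err {
	case nil:
		return false, nil
	case ErrRecordExists:
		// Internal keys embed a sequence number, so this indicates a reused
		// (user key, trailer) pair rather than an ordinary overwrite.
		return false, err
	case ErrArenaFull:
		// The backing arena is out of space; no further Adds can succeed.
		return true, err
	default:
		return false, err
	}
}
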
func (s *Skiplist) addInternal(key base.InternalKey, value []byte, ins *Inserter) error {
	if s.findSplice(key, ins) {
		// Found a node with a matching internal key. Overwrites are not
		// supported, so report the duplicate to the caller.
		return ErrRecordExists
	}

	if s.testing {
		// Add delay to make it easier to test race between this thread
		// and another thread that sees the intermediate state between
		// finding the splice and using it.
		runtime.Gosched()
	}

	nd, height, err := s.newNode(key, value)
	if err != nil {
		return err
	}

	ndOffset := s.arena.getPointerOffset(unsafe.Pointer(nd))

	// We always insert from the base level and up. Once the node has been added
	// to the base level, we cannot abort insertion at the higher levels, because
	// a concurrent search may already have discovered the node at the base
	// level.
	var found bool
	var invalidateSplice bool
	for i := 0; i < int(height); i++ {
		prev := ins.spl[i].prev
		next := ins.spl[i].next

		if prev == nil {
			// New node increased the height of the skiplist, so assume that the
			// new level has not yet been populated.
			if next != nil {
				panic("next is expected to be nil, since prev is nil")
			}

			prev = s.head
			next = s.tail
		}

		// +----------------+     +------------+     +----------------+
		// |      prev      |     |     nd     |     |      next      |
		// | prevNextOffset |---->|            |     |                |
		// |                |<----| prevOffset |     |                |
		// |                |     | nextOffset |---->|                |
		// |                |     |            |<----| nextPrevOffset |
		// +----------------+     +------------+     +----------------+
		//
		// 1. Initialize prevOffset and nextOffset to point to prev and next.
		// 2. CAS prevNextOffset to repoint from next to nd.
		// 3. CAS nextPrevOffset to repoint from prev to nd.
		for {
			prevOffset := s.arena.getPointerOffset(unsafe.Pointer(prev))
			nextOffset := s.arena.getPointerOffset(unsafe.Pointer(next))
			nd.tower[i].init(prevOffset, nextOffset)

			// Check whether next has an updated link to prev. If it does not,
			// that can mean one of two things:
			//   1. The thread that added the next node hasn't yet had a chance
			//      to add the prev link (but will shortly).
			//   2. Another thread has added a new node between prev and next.
			nextPrevOffset := next.prevOffset(i)
			if nextPrevOffset != prevOffset {
				// Determine whether #1 or #2 is true by checking whether prev
				// is still pointing to next. As long as the atomic operations
				// have at least acquire/release semantics (no need for
				// sequential consistency), this works, as it is equivalent to
				// the "publication safety" pattern.
				prevNextOffset := prev.nextOffset(i)
				if prevNextOffset == nextOffset {
					// Ok, case #1 is true, so help the other thread along by
					// updating the next node's prev link.
					next.casPrevOffset(i, nextPrevOffset, prevOffset)
				}
			}

			if prev.casNextOffset(i, nextOffset, ndOffset) {
				// Managed to insert nd between prev and next, so update the next
				// node's prev link and go to the next level.
				if s.testing {
					// Add delay to make it easier to test race between this thread
					// and another thread that sees the intermediate state between
					// setting next and setting prev.
					runtime.Gosched()
				}

				next.casPrevOffset(i, prevOffset, ndOffset)
				break
			}

			// CAS failed. We need to recompute prev and next. It is unlikely to
			// be helpful to try to use a different level as we redo the search,
			// because it is unlikely that lots of nodes are inserted between prev
			// and next.
			prev, next, found = s.findSpliceForLevel(key, i, prev)
			if found {
				if i != 0 {
					panic("how can another thread have inserted a node at a non-base level?")
				}

				return ErrRecordExists
			}
			invalidateSplice = true
		}
	}

	s.setNodeSkipOffset(nd, ndOffset, key)

	// If we had to recompute the splice for a level, invalidate the entire
	// cached splice.
	if invalidateSplice {
		ins.height = 0
	} else {
		// The splice was valid. We inserted a node between spl[i].prev and
		// spl[i].next. Optimistically update spl[i].prev for use in a subsequent
		// call to add.
		for i := uint32(0); i < height; i++ {
			ins.spl[i].prev = nd
		}
	}

	return nil
}

// setNodeSkipOffset maintains the skip offsets for a run of entries sharing
// the same user key: the newest entry (the head of the run) points forward to
// the run's oldest entry via skipToFirst, and the oldest entry is pointed
// back at the newest via skipToLast, letting getSkipNext/getSkipPrev step
// over the intermediate versions.
func (s *Skiplist) setNodeSkipOffset(nd *node, ndOffset uint32, key base.InternalKey) {
	nextNd := s.getNext(nd, 0)
	if nextNd == s.tail {
		return
	}

	// Only proceed if nd was inserted immediately before an existing entry
	// with the same user key and an older (smaller) trailer.
	offset, size := nextNd.keyOffset, nextNd.keySize
	nextKey := s.arena.buf[offset : offset+size]
	n := int32(size) - 8
	if n < 0 || s.cmp(key.UserKey, nextKey[:n]) != 0 || key.Trailer <= binary.LittleEndian.Uint64(nextKey[n:]) {
		return
	}

	skipToFirstOffset := nextNd.skipToFirstOffset()
	if skipToFirstOffset > 0 {
		// nextNd already headed a run: nd inherits the pointer to the run's
		// oldest entry, which in turn points back at nd as the new head.
		nd.setSkipToFirstOffset(skipToFirstOffset)

		skipToFirstNd := (*node)(s.arena.getPointer(skipToFirstOffset))
		if skipToFirstNd == s.tail {
			return
		}

		skipToFirstNd.setSkipToLastOffset(ndOffset)
	} else {
		// nextNd was a run of one; it becomes the oldest entry of a new run
		// headed by nd.
		nextNdOffset := s.arena.getPointerOffset(unsafe.Pointer(nextNd))
		nd.setSkipToFirstOffset(nextNdOffset)
	}
}

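// Illustration (derived from the logic above): after user key "a" is written
// at sequence numbers 1, 2 and then 3, the level-0 ordering is a#3, a#2, a#1
// (newer trailers sort first among equal user keys) and the skip offsets form:
//
//	a#3 --skipToFirst--> a#1
//	a#1 --skipToLast---> a#3
//
// so getSkipNext(a#3) jumps directly past the run to a#1, and getSkipPrev(a#1)
// jumps back to its head.
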
// NewIter returns a new Iterator object. The lower and upper bound parameters
// control the range of keys the iterator will return. Specifying nil for the
// lower or upper bound disables the check for that boundary. Note that the
// lower bound is not checked on {SeekGE,First} and the upper bound is not
// checked on {SeekLT,Last}. The user is expected to perform that check. Note
// that it is safe for an iterator to be copied by value.
func (s *Skiplist) NewIter(lower, upper []byte) *Iterator {
	it := iterPool.Get().(*Iterator)
	*it = Iterator{list: s, nd: s.head, lower: lower, upper: upper}
	return it
}

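// Hedged iteration sketch (commented out; not part of the original file). The
// Iterator's method signatures live in iterator.go and are assumed here to
// match upstream Pebble of this era, where First/Next return
// (*base.InternalKey, []byte) and Close returns the iterator to iterPool:
//
//	it := skl.NewIter(nil, nil) // no bounds
//	for key, val := it.First(); key != nil; key, val = it.Next() {
//		_, _ = key, val // process entry
//	}
//	it.Close()
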
// NewFlushIter returns a new flushIterator, which is similar to an Iterator
// but also tracks, via the bytesFlushed argument, the cumulative number of
// bytes that have been iterated over.
func (s *Skiplist) NewFlushIter(bytesFlushed *uint64) base.InternalIterator {
	return &flushIterator{
		Iterator:      Iterator{list: s, nd: s.head},
		bytesIterated: bytesFlushed,
	}
}

func (s *Skiplist) newNode(
	key base.InternalKey, value []byte,
) (nd *node, height uint32, err error) {
	height = s.randomHeight()
	nd, err = newNode(s.arena, height, key, value)
	if err != nil {
		return
	}

	// Try to increase s.height via CAS.
	listHeight := s.Height()
	for height > listHeight {
		if atomic.CompareAndSwapUint32(&s.height, listHeight, height) {
			// Successfully increased skiplist.height.
			break
		}

		listHeight = s.Height()
	}

	return
}

// randomHeight returns a height sampled from a truncated geometric
// distribution: P(height >= h) = pValue^(h-1), capped at maxHeight. A single
// random draw is compared against the precomputed cumulative thresholds in
// probabilities.
func (s *Skiplist) randomHeight() uint32 {
	rnd := fastrand.Uint32()

	h := uint32(1)
	for h < maxHeight && rnd <= probabilities[h] {
		h++
	}

	return h
}

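// randomHeightCoinFlips is a hedged, illustrative sketch; it is not used by
// the skiplist and the name is an assumption. It samples the same
// distribution as randomHeight using one coin flip per level with success
// probability pValue, showing why the single-draw table lookup above is
// equivalent: P(height >= h) = pValue^(h-1) either way, but the table needs
// only one random number per insertion.
func randomHeightCoinFlips() uint32 {
	threshold := uint32(float64(math.MaxUint32) * pValue)
	h := uint32(1)
	for h < maxHeight && fastrand.Uint32() <= threshold {
		h++
	}
	return h
}
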
func (s *Skiplist) findSplice(key base.InternalKey, ins *Inserter) (found bool) {
	listHeight := s.Height()
	var level int

	prev := s.head
	if ins.height < listHeight {
		// Our cached height is less than the list height, which means there were
		// inserts that increased the height of the list. Recompute the splice from
		// scratch.
		ins.height = listHeight
		level = int(ins.height)
	} else {
		// Our cached height is equal to the list height.
		for ; level < int(listHeight); level++ {
			spl := &ins.spl[level]
			if s.getNext(spl.prev, level) != spl.next {
				// One or more nodes have been inserted between spl.prev and
				// spl.next at this level; the cached splice is stale here.
				continue
			}
			if spl.prev != s.head && !s.keyIsAfterNode(spl.prev, key) {
				// Key lies before splice.
				level = int(listHeight)
				break
			}
			if spl.next != s.tail && s.keyIsAfterNode(spl.next, key) {
				// Key lies after splice.
				level = int(listHeight)
				break
			}
			// The splice brackets the key!
			prev = spl.prev
			break
		}
	}

	for level = level - 1; level >= 0; level-- {
		var next *node
		prev, next, found = s.findSpliceForLevel(key, level, prev)
		if next == nil {
			next = s.tail
		}
		ins.spl[level].init(prev, next)
	}

	return
}

func (s *Skiplist) findSpliceForLevel(
	key base.InternalKey, level int, start *node,
) (prev, next *node, found bool) {
	prev = start

	for {
		// Assume prev.key < key.
		next = s.getNext(prev, level)
		if next == s.tail {
			// Tail node, so done.
			break
		}

		offset, size := next.keyOffset, next.keySize
		nextKey := s.arena.buf[offset : offset+size]
		n := int32(size) - 8
		cmp := s.cmp(key.UserKey, nextKey[:n])
		if cmp < 0 {
			// We are done for this level, since prev.key < key < next.key.
			break
		}
		if cmp == 0 {
			// User-key equality.
			var nextTrailer uint64
			if n >= 0 {
				nextTrailer = binary.LittleEndian.Uint64(nextKey[n:])
			} else {
				nextTrailer = uint64(base.InternalKeyKindInvalid)
			}
			if key.Trailer == nextTrailer {
				// Internal key equality.
				found = true
				break
			}
			if key.Trailer > nextTrailer {
				// We are done for this level, since prev.key < key < next.key:
				// among equal user keys, a larger trailer sorts first.
				break
			}
		}

		// Keep moving right on this level.
		prev = next
	}

	return
}

// keyIsAfterNode returns true if the given key sorts strictly after nd's key
// in internal key order (user keys ascending; among equal user keys, larger
// trailers first).
func (s *Skiplist) keyIsAfterNode(nd *node, key base.InternalKey) bool {
	ndKey := s.arena.buf[nd.keyOffset : nd.keyOffset+nd.keySize]
	n := int32(nd.keySize) - 8
	cmp := s.cmp(ndKey[:n], key.UserKey)
	if cmp < 0 {
		return true
	}
	if cmp > 0 {
		return false
	}
	// User-key equality.
	var ndTrailer uint64
	if n >= 0 {
		ndTrailer = binary.LittleEndian.Uint64(ndKey[n:])
	} else {
		ndTrailer = uint64(base.InternalKeyKindInvalid)
	}
	if key.Trailer == ndTrailer {
		// Internal key equality.
		return false
	}
	return key.Trailer < ndTrailer
}

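// splitInternalKey is a hedged helper sketch; the name is an assumption and it
// is not used elsewhere in this file. It makes explicit the encoding that
// findSpliceForLevel and keyIsAfterNode decode inline: an internal key stored
// in the arena is the user key followed by an 8-byte little-endian trailer
// (sequence number and kind). It assumes len(ikey) >= 8.
func splitInternalKey(ikey []byte) (userKey []byte, trailer uint64) {
	n := len(ikey) - 8
	return ikey[:n], binary.LittleEndian.Uint64(ikey[n:])
}
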
// getNext returns the node after nd at height h, loading the link atomically.
func (s *Skiplist) getNext(nd *node, h int) *node {
	offset := atomic.LoadUint32(&nd.tower[h].nextOffset)
	return (*node)(s.arena.getPointer(offset))
}

// getPrev returns the node before nd at height h, loading the link atomically.
func (s *Skiplist) getPrev(nd *node, h int) *node {
	offset := atomic.LoadUint32(&nd.tower[h].prevOffset)
	return (*node)(s.arena.getPointer(offset))
}

// getSkipNext returns the node after nd at the base level, following nd's
// skip-to-first offset (if set) to jump over older versions of the same user
// key.
func (s *Skiplist) getSkipNext(nd *node) *node {
	var nextNd *node
	skipToFirstOffset := nd.skipToFirstOffset()
	if skipToFirstOffset > 0 {
		nextNd = (*node)(s.arena.getPointer(skipToFirstOffset))
	} else {
		offset := atomic.LoadUint32(&nd.tower[0].nextOffset)
		nextNd = (*node)(s.arena.getPointer(offset))
	}
	return nextNd
}

// getSkipPrev returns the node before nd at the base level, following nd's
// skip-to-last offset (if set) to jump back over newer versions of the same
// user key.
func (s *Skiplist) getSkipPrev(nd *node) *node {
	var prevNd *node
	skipToLastOffset := nd.skipToLastOffset()
	if skipToLastOffset > 0 {
		prevNd = (*node)(s.arena.getPointer(skipToLastOffset))
	} else {
		offset := atomic.LoadUint32(&nd.tower[0].prevOffset)
		prevNd = (*node)(s.arena.getPointer(offset))
	}
	return prevNd
}