github.com/matrixorigin/matrixone@v1.2.0/pkg/common/arenaskl/skl.go (about)

     1  /*
     2   * Copyright 2017 Dgraph Labs, Inc. and Contributors
     3   * Modifications copyright (C) 2017 Andy Kimball and Contributors
     4   * and copyright (C) 2024 MatrixOrigin Inc.
     5   *
     6   * Licensed under the Apache License, Version 2.0 (the "License");
     7   * you may not use this file except in compliance with the License.
     8   * You may obtain a copy of the License at
     9   *
    10   *     http://www.apache.org/licenses/LICENSE-2.0
    11   *
    12   * Unless required by applicable law or agreed to in writing, software
    13   * distributed under the License is distributed on an "AS IS" BASIS,
    14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    15   * See the License for the specific language governing permissions and
    16   * limitations under the License.
    17   */
    18  
    19  /*
    20  Adapted from RocksDB inline skiplist.
    21  
    22  Key differences:
    23  - No optimization for sequential inserts (no "prev").
    24  - No custom comparator.
    25  - Support overwrites. This requires care when we see the same key when inserting.
    26    For RocksDB or LevelDB, overwrites are implemented as a newer sequence number in the key, so
    27  	there is no need for values. We don't intend to support versioning. In-place updates of values
    28  	would be more efficient.
    29  - We discard all non-concurrent code.
    30  - We do not support Splices. This simplifies the code a lot.
    31  - No AllocateNode or other pointer arithmetic.
    32  - We combine the findLessThan, findGreaterOrEqual, etc into one function.
    33  */
    34  
    35  /*
    36  Further adapted from Badger: https://github.com/dgraph-io/badger.
    37  
    38  Key differences:
    39  - Support for previous pointers - doubly linked lists. Note that it's up to higher
    40    level code to deal with the intermediate state that occurs during insertion,
    41    where node A is linked to node B, but node B is not yet linked back to node A.
    42  - Iterator includes mutator functions.
    43  */
    44  
    45  package arenaskl
    46  
    47  import (
    48  	"math"
    49  	"runtime"
    50  	"sync/atomic"
    51  	"unsafe"
    52  
    53  	"github.com/matrixorigin/matrixone/pkg/common/fastrand"
    54  	"github.com/matrixorigin/matrixone/pkg/common/moerr"
    55  )
    56  
const (
	// maxHeight is the maximum number of levels a tower may have. The head
	// and tail sentinels are allocated with this height, and the
	// probabilities table and the Inserter splice cache are sized by it.
	maxHeight   = 20
	// maxNodeSize is the size in bytes of the node struct.
	maxNodeSize = int(unsafe.Sizeof(node{}))
	// linksSize is the size in bytes of the links struct.
	linksSize   = int(unsafe.Sizeof(links{}))
	// pValue is the factor by which the probability of reaching each
	// successive level shrinks (the inverse of Euler's number); it is consumed
	// by init when the probabilities table is built.
	pValue      = 1 / math.E
)
    63  
// Compare is a comparison function for keys. The skiplist interprets a
// negative result as a < b and a zero result as a == b; any other result is
// treated as a > b.
type Compare func(a, b []byte) int
    66  
// ErrRecordExists indicates that an entry with the specified key already
// exists in the skiplist; it is the error returned by the Add methods when
// the key search reports a match. Duplicate entries are not directly
// supported and instead must be handled by the user by appending a unique
// version suffix to keys.
var ErrRecordExists = moerr.NewKeyAlreadyExistsNoCtx()
    72  
// Skiplist is a fast, concurrent skiplist implementation that supports forward
// and backward iteration. See batchskl.Skiplist for a non-concurrent
// skiplist. Keys and values are immutable once added to the skiplist and
// deletion is not supported. Instead, higher-level code is expected to add new
// entries that shadow existing entries and perform deletion via tombstones. It
// is up to the user to process these shadow entries and tombstones
// appropriately during retrieval.
type Skiplist struct {
	// arena is the buffer from which nodes are allocated and in which node
	// keys are addressed by offset.
	arena  *Arena
	// cmp is the ordering applied to every key comparison.
	cmp    Compare
	// head and tail are full-height sentinel nodes; in an empty list head
	// links directly to tail at every level.
	head   *node
	tail   *node
	height atomic.Uint32 // Current height. 1 <= height <= maxHeight. CAS.

	// If set to true by tests, then extra delays are added to make it easier to
	// detect unusual race conditions.
	testing bool
}
    91  
// Inserter caches the splice (the prev/next pair at each level) located by
// the most recent insertion made through it, so that a later Add through the
// same Inserter can try to resume from that position instead of searching
// from the head. The zero value is ready to use and holds no cached splice.
type Inserter struct {
	// spl holds the cached prev and next nodes for each level.
	spl    [maxHeight]splice
	// height is the list height the cached splice was computed for. A value
	// below the list's current height (including zero) forces the splice to
	// be recomputed from scratch.
	height uint32
}
    97  
// Add inserts key and value into list, reusing and then updating the splice
// cached in ins. It returns ErrRecordExists when the key search reports an
// existing entry with an equal key, or the allocation error if a node for the
// entry cannot be created.
func (ins *Inserter) Add(list *Skiplist, key, value []byte) error {
	return list.addInternal(key, value, ins)
}
   102  
   103  var (
   104  	probabilities [maxHeight]uint32
   105  )
   106  
   107  func init() {
   108  	// Precompute the skiplist probabilities so that only a single random number
   109  	// needs to be generated and so that the optimal pvalue can be used (inverse
   110  	// of Euler's number).
   111  	p := float64(1.0)
   112  	for i := 0; i < maxHeight; i++ {
   113  		probabilities[i] = uint32(float64(math.MaxUint32) * p)
   114  		p *= pValue
   115  	}
   116  }
   117  
   118  // NewSkiplist constructs and initializes a new, empty skiplist. All nodes, keys,
   119  // and values in the skiplist will be allocated from the given arena.
   120  func NewSkiplist(arena *Arena, cmp Compare) *Skiplist {
   121  	skl := &Skiplist{}
   122  	skl.Reset(arena, cmp)
   123  	return skl
   124  }
   125  
   126  // Reset the skiplist to empty and re-initialize.
   127  func (s *Skiplist) Reset(arena *Arena, cmp Compare) {
   128  	// Allocate head and tail nodes.
   129  	head, err := newRawNode(arena, maxHeight, 0, 0)
   130  	if err != nil {
   131  		panic("arenaSize is not large enough to hold the head node")
   132  	}
   133  	head.keyOffset = 0
   134  
   135  	tail, err := newRawNode(arena, maxHeight, 0, 0)
   136  	if err != nil {
   137  		panic("arenaSize is not large enough to hold the tail node")
   138  	}
   139  	tail.keyOffset = 0
   140  
   141  	// Link all head/tail levels together.
   142  	headOffset := arena.getPointerOffset(unsafe.Pointer(head))
   143  	tailOffset := arena.getPointerOffset(unsafe.Pointer(tail))
   144  	for i := 0; i < maxHeight; i++ {
   145  		head.tower[i].nextOffset.Store(tailOffset)
   146  		tail.tower[i].prevOffset.Store(headOffset)
   147  	}
   148  
   149  	*s = Skiplist{
   150  		arena: arena,
   151  		cmp:   cmp,
   152  		head:  head,
   153  		tail:  tail,
   154  	}
   155  	s.height.Store(1)
   156  }
   157  
// Height returns the height of the highest tower within any of the nodes that
// have ever been allocated as part of this skiplist. The value is loaded
// atomically.
func (s *Skiplist) Height() uint32 { return s.height.Load() }
   161  
// Arena returns the arena backing this skiplist, i.e. the one supplied to the
// most recent Reset (NewSkiplist calls Reset).
func (s *Skiplist) Arena() *Arena { return s.arena }
   164  
// Size returns the number of bytes that have been allocated from the arena,
// as reported by the arena's own Size method.
func (s *Skiplist) Size() uint32 { return s.arena.Size() }
   167  
   168  // Add adds a new key if it does not yet exist. If the key already exists, then
   169  // Add returns ErrRecordExists. If there isn't enough room in the arena, then
   170  // Add returns ErrArenaFull.
   171  func (s *Skiplist) Add(key, value []byte) error {
   172  	var ins Inserter
   173  	return s.addInternal(key, value, &ins)
   174  }
   175  
// addInternal inserts key/value into the list, using ins as the splice cache.
// It returns ErrRecordExists if the splice search reports an equal key (either
// up front or while retrying at the base level after a failed CAS), or the
// error from newNode if the node cannot be allocated. On success the new node
// has been linked into every level of its tower, from the base level upward,
// and the cached splice in ins has been either advanced or invalidated.
func (s *Skiplist) addInternal(key, value []byte, ins *Inserter) error {
	if s.findSplice(key, ins) {
		// A node with an equal key is already present; duplicates are
		// rejected rather than overwritten.
		return ErrRecordExists
	}

	if s.testing {
		// Add delay to make it easier to test race between this thread
		// and another thread that sees the intermediate state between
		// finding the splice and using it.
		runtime.Gosched()
	}

	nd, height, err := s.newNode(key, value)
	if err != nil {
		return err
	}

	ndOffset := s.arena.getPointerOffset(unsafe.Pointer(nd))

	// We always insert from the base level and up. After you add a node in base
	// level, we cannot create a node in the level above because it would have
	// discovered the node in the base level.
	var found bool
	var invalidateSplice bool
	for i := 0; i < int(height); i++ {
		prev := ins.spl[i].prev
		next := ins.spl[i].next

		if prev == nil {
			// New node increased the height of the skiplist, so assume that the
			// new level has not yet been populated.
			if next != nil {
				panic("next is expected to be nil, since prev is nil")
			}

			prev = s.head
			next = s.tail
		}

		// +----------------+     +------------+     +----------------+
		// |      prev      |     |     nd     |     |      next      |
		// | prevNextOffset |---->|            |     |                |
		// |                |<----| prevOffset |     |                |
		// |                |     | nextOffset |---->|                |
		// |                |     |            |<----| nextPrevOffset |
		// +----------------+     +------------+     +----------------+
		//
		// 1. Initialize prevOffset and nextOffset to point to prev and next.
		// 2. CAS prevNextOffset to repoint from next to nd.
		// 3. CAS nextPrevOffset to repoint from prev to nd.
		for {
			prevOffset := s.arena.getPointerOffset(unsafe.Pointer(prev))
			nextOffset := s.arena.getPointerOffset(unsafe.Pointer(next))
			nd.tower[i].init(prevOffset, nextOffset)

			// Check whether next has an updated link to prev. If it does not,
			// that can mean one of two things:
			//   1. The thread that added the next node hasn't yet had a chance
			//      to add the prev link (but will shortly).
			//   2. Another thread has added a new node between prev and next.
			nextPrevOffset := next.prevOffset(i)
			if nextPrevOffset != prevOffset {
				// Determine whether #1 or #2 is true by checking whether prev
				// is still pointing to next. As long as the atomic operations
				// have at least acquire/release semantics (no need for
				// sequential consistency), this works, as it is equivalent to
				// the "publication safety" pattern.
				prevNextOffset := prev.nextOffset(i)
				if prevNextOffset == nextOffset {
					// Ok, case #1 is true, so help the other thread along by
					// updating the next node's prev link.
					next.casPrevOffset(i, nextPrevOffset, prevOffset)
				}
			}

			if prev.casNextOffset(i, nextOffset, ndOffset) {
				// Managed to insert nd between prev and next, so update the next
				// node's prev link and go to the next level.
				if s.testing {
					// Add delay to make it easier to test race between this thread
					// and another thread that sees the intermediate state between
					// setting next and setting prev.
					runtime.Gosched()
				}

				// The result of this CAS is intentionally not checked.
				next.casPrevOffset(i, prevOffset, ndOffset)
				break
			}

			// CAS failed. We need to recompute prev and next. It is unlikely to
			// be helpful to try to use a different level as we redo the search,
			// because it is unlikely that lots of nodes are inserted between prev
			// and next.
			prev, next, found = s.findSpliceForLevel(key, i, prev)
			if found {
				if i != 0 {
					panic("how can another thread have inserted a node at a non-base level?")
				}

				return ErrRecordExists
			}
			invalidateSplice = true
		}
	}

	// If we had to recompute the splice for a level, invalidate the entire
	// cached splice. A cached height of zero is always below the list height,
	// which makes the next findSplice rebuild the splice from the head.
	if invalidateSplice {
		ins.height = 0
	} else {
		// The splice was valid. We inserted a node between spl[i].prev and
		// spl[i].next. Optimistically update spl[i].prev for use in a subsequent
		// call to add.
		for i := uint32(0); i < height; i++ {
			ins.spl[i].prev = nd
		}
	}

	return nil
}
   297  
   298  // NewIter returns a new Iterator object. The lower and upper bound parameters
   299  // control the range of keys the iterator will return. Specifying for nil for
   300  // lower or upper bound disables the check for that boundary. Note that lower
   301  // bound is not checked on {SeekGE,First} and upper bound is not check on
   302  // {SeekLT,Last}. The user is expected to perform that check. Note that it is
   303  // safe for an iterator to be copied by value.
   304  func (s *Skiplist) NewIter(lower, upper []byte) *Iterator {
   305  	it := iterPool.Get().(*Iterator)
   306  	*it = Iterator{list: s, nd: s.head, lower: lower, upper: upper}
   307  	return it
   308  }
   309  
   310  func (s *Skiplist) newNode(
   311  	key, value []byte,
   312  ) (nd *node, height uint32, err error) {
   313  	height = s.randomHeight()
   314  	nd, err = newNode(s.arena, height, key, value)
   315  	if err != nil {
   316  		return
   317  	}
   318  
   319  	// Try to increase s.height via CAS.
   320  	listHeight := s.Height()
   321  	for height > listHeight {
   322  		if s.height.CompareAndSwap(listHeight, height) {
   323  			// Successfully increased skiplist.height.
   324  			break
   325  		}
   326  
   327  		listHeight = s.Height()
   328  	}
   329  
   330  	return
   331  }
   332  
   333  func (s *Skiplist) randomHeight() uint32 {
   334  	rnd := fastrand.Uint32()
   335  
   336  	h := uint32(1)
   337  	for h < maxHeight && rnd <= probabilities[h] {
   338  		h++
   339  	}
   340  
   341  	return h
   342  }
   343  
   344  func (s *Skiplist) findSplice(key []byte, ins *Inserter) (found bool) {
   345  	listHeight := s.Height()
   346  	var level int
   347  
   348  	prev := s.head
   349  	if ins.height < listHeight {
   350  		// Our cached height is less than the list height, which means there were
   351  		// inserts that increased the height of the list. Recompute the splice from
   352  		// scratch.
   353  		ins.height = listHeight
   354  		level = int(ins.height)
   355  	} else {
   356  		// Our cached height is equal to the list height.
   357  		for ; level < int(listHeight); level++ {
   358  			spl := &ins.spl[level]
   359  			if s.getNext(spl.prev, level) != spl.next {
   360  				// One or more nodes have been inserted between the splice at this
   361  				// level.
   362  				continue
   363  			}
   364  			if spl.prev != s.head && !s.keyIsAfterNode(spl.prev, key) {
   365  				// Key lies before splice.
   366  				level = int(listHeight)
   367  				break
   368  			}
   369  			if spl.next != s.tail && s.keyIsAfterNode(spl.next, key) {
   370  				// Key lies after splice.
   371  				level = int(listHeight)
   372  				break
   373  			}
   374  			// The splice brackets the key!
   375  			prev = spl.prev
   376  			break
   377  		}
   378  	}
   379  
   380  	for level = level - 1; level >= 0; level-- {
   381  		var next *node
   382  		prev, next, found = s.findSpliceForLevel(key, level, prev)
   383  		if next == nil {
   384  			next = s.tail
   385  		}
   386  		ins.spl[level].init(prev, next)
   387  	}
   388  
   389  	return
   390  }
   391  
   392  func (s *Skiplist) findSpliceForLevel(
   393  	key []byte, level int, start *node,
   394  ) (prev, next *node, found bool) {
   395  	prev = start
   396  
   397  	for {
   398  		// Assume prev.key < key.
   399  		next = s.getNext(prev, level)
   400  		if next == s.tail {
   401  			// Tail node, so done.
   402  			break
   403  		}
   404  
   405  		offset, size := next.keyOffset, next.keySize
   406  		nextKey := s.arena.buf[offset : offset+size]
   407  		cmp := s.cmp(key, nextKey)
   408  		if cmp < 0 {
   409  			// We are done for this level, since prev.key < key < next.key.
   410  			break
   411  		}
   412  		if cmp == 0 {
   413  			found = true
   414  			break
   415  		}
   416  
   417  		// Keep moving right on this level.
   418  		prev = next
   419  	}
   420  
   421  	return
   422  }
   423  
   424  func (s *Skiplist) keyIsAfterNode(nd *node, key []byte) bool {
   425  	ndKey := s.arena.buf[nd.keyOffset : nd.keyOffset+nd.keySize]
   426  	return s.cmp(ndKey, key) < 0
   427  }
   428  
   429  func (s *Skiplist) getNext(nd *node, h int) *node {
   430  	offset := nd.tower[h].nextOffset.Load()
   431  	return (*node)(s.arena.getPointer(offset))
   432  }
   433  
   434  func (s *Skiplist) getPrev(nd *node, h int) *node {
   435  	offset := nd.tower[h].prevOffset.Load()
   436  	return (*node)(s.arena.getPointer(offset))
   437  }