github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/internal/batchskl/skl.go

/*
 * Copyright 2017 Dgraph Labs, Inc. and Contributors
 * Modifications copyright (C) 2017 Andy Kimball and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License")
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
Adapted from RocksDB inline skiplist.

Key differences:
- No optimization for sequential inserts (no "prev").
- No custom comparator.
- Support overwrites. This requires care when we see the same key when inserting.
  For RocksDB or LevelDB, overwrites are implemented as a newer sequence number in the key, so
  there is no need for values. We don't intend to support versioning. In-place updates of values
  would be more efficient.
- We discard all non-concurrent code.
- We do not support Splices. This simplifies the code a lot.
- No AllocateNode or other pointer arithmetic.
- We combine the findLessThan, findGreaterOrEqual, etc into one function.
*/

/*
Further adapted from Badger: https://github.com/dgraph-io/badger.

Key differences:
- Support for previous pointers - doubly linked lists. Note that it's up to higher
  level code to deal with the intermediate state that occurs during insertion,
  where node A is linked to node B, but node B is not yet linked back to node A.
- Iterator includes mutator functions.
*/

/*
Further adapted from arenaskl: https://github.com/andy-kimball/arenaskl

Key differences:
- Removed support for deletion.
- Removed support for concurrency.
- External storage of keys.
- Node storage grows to an arbitrary size.
*/

package batchskl // import "github.com/zuoyebang/bitalostable/internal/batchskl"

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"math"
	"time"
	"unsafe"

	"github.com/cockroachdb/errors"
	"github.com/zuoyebang/bitalostable/internal/base"
	"golang.org/x/exp/rand"
)

const (
	maxHeight    = 20
	maxNodeSize  = int(unsafe.Sizeof(node{}))
	linksSize    = int(unsafe.Sizeof(links{}))
	maxNodesSize = math.MaxUint32
)

var (
	// ErrExists indicates that a duplicate record was inserted. This should never
	// happen for normal usage of batchskl as every key should have a unique
	// sequence number.
	ErrExists = errors.New("record with this key already exists")

	// ErrTooManyRecords is a sentinel error returned when the size of the raw
	// nodes slice exceeds the maximum allowed size (currently 1 << 32 - 1). This
	// corresponds to ~117 M skiplist entries.
	ErrTooManyRecords = errors.New("too many records")
)

type links struct {
	next uint32
	prev uint32
}

type node struct {
	// The offset of the start of the record in the storage.
	offset uint32
	// The offset of the start and end of the key in storage.
	keyStart uint32
	keyEnd   uint32
	// A fixed 8-byte abbreviation of the key, used to avoid retrieval of the key
	// during seek operations. The key retrieval can be expensive purely due to
	// cache misses, while the abbreviatedKey stored here will be in the same
	// cache line as the key offsets and the links, making accessing and
	// comparing against it almost free.
	abbreviatedKey uint64
	// Most nodes do not need to use the full height of the link tower, since the
	// probability of each successive level decreases exponentially. Because
	// these elements are never accessed, they do not need to be allocated.
	// Therefore, when a node is allocated, its memory footprint is deliberately
	// truncated to not include unneeded link elements.
	links [maxHeight]links
}

// Skiplist is a fast, non-concurrent skiplist implementation that supports
// forward and backward iteration. See arenaskl.Skiplist for a concurrent
// skiplist. Keys and values are stored externally from the skiplist via the
// Storage interface. Deletion is not supported. Instead, higher-level code is
// expected to perform deletion via tombstones and needs to process those
// tombstones appropriately during retrieval operations.
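//
// A minimal, illustrative sketch of typical use (not taken from this
// package's tests; base.DefaultComparer, base.InternalKeyKindSet and
// binary.AppendUvarint are assumed to be available in this fork). Every
// entry indexed by the skiplist lives in the shared storage buffer and is
// encoded as a 1-byte kind, a uvarint key length, and the key bytes:
//
//	var storage []byte
//	l := NewSkiplist(&storage, base.DefaultComparer.Compare,
//		base.DefaultComparer.AbbreviatedKey)
//	key := []byte("hello")
//	offset := uint32(len(storage))
//	storage = append(storage, byte(base.InternalKeyKindSet))
//	storage = binary.AppendUvarint(storage, uint64(len(key)))
//	storage = append(storage, key...)
//	if err := l.Add(offset); err != nil {
//		// Handle the error (e.g. ErrTooManyRecords).
//	}
//	it := l.NewIter(nil, nil) // iterate over the indexed keys
//	_ = it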
type Skiplist struct {
	storage        *[]byte
	cmp            base.Compare
	abbreviatedKey base.AbbreviatedKey
	nodes          []byte
	head           uint32
	tail           uint32
	height         uint32 // Current height: 1 <= height <= maxHeight
	rand           rand.PCGSource
}

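// probabilities[i] holds the threshold, scaled to the uint32 range, that a
// single random uint32 must fall at or below for a new node's tower to reach
// level i+1. The values are precomputed in init so that randomHeight needs
// only one random number per insertion.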
var (
	probabilities [maxHeight]uint32
)

func init() {
	const pValue = 1 / math.E

	// Precompute the skiplist probabilities so that only a single random number
	// needs to be generated and so that the optimal pValue can be used (the
	// inverse of Euler's number).
	p := float64(1.0)
	for i := 0; i < maxHeight; i++ {
		probabilities[i] = uint32(float64(math.MaxUint32) * p)
		p *= pValue
	}
}

// NewSkiplist constructs and initializes a new, empty skiplist.
func NewSkiplist(storage *[]byte, cmp base.Compare, abbreviatedKey base.AbbreviatedKey) *Skiplist {
	s := &Skiplist{}
	s.Init(storage, cmp, abbreviatedKey)
	return s
}

// Reset the fields in the skiplist for reuse.
func (s *Skiplist) Reset() {
	*s = Skiplist{
		nodes:  s.nodes[:0],
		height: 1,
	}
	const batchMaxRetainedSize = 1 << 20 // 1 MB
	if cap(s.nodes) > batchMaxRetainedSize {
		s.nodes = nil
	}
}

// Init resets the skiplist to empty and re-initializes it for use with the
// given storage, comparator, and abbreviated-key function.
func (s *Skiplist) Init(storage *[]byte, cmp base.Compare, abbreviatedKey base.AbbreviatedKey) {
	*s = Skiplist{
		storage:        storage,
		cmp:            cmp,
		abbreviatedKey: abbreviatedKey,
		nodes:          s.nodes[:0],
		height:         1,
	}
	s.rand.Seed(uint64(time.Now().UnixNano()))

	const initBufSize = 256
	if cap(s.nodes) < initBufSize {
		s.nodes = make([]byte, 0, initBufSize)
	}

	// Allocate head and tail nodes. While allocating a new node can fail, in the
	// context of initializing the skiplist we consider it unrecoverable.
	var err error
	s.head, err = s.newNode(maxHeight, 0, 0, 0, 0)
	if err != nil {
		panic(err)
	}
	s.tail, err = s.newNode(maxHeight, 0, 0, 0, 0)
	if err != nil {
		panic(err)
	}

	// Link all head/tail levels together.
	headNode := s.node(s.head)
	tailNode := s.node(s.tail)
	for i := uint32(0); i < maxHeight; i++ {
		headNode.links[i].next = s.tail
		tailNode.links[i].prev = s.head
	}
}

// Add adds a new key to the skiplist if it does not yet exist. If the record
// already exists, then Add returns ErrExists.
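//
// keyOffset is the offset of the entry within the shared storage buffer. The
// entry is expected to begin with a 1-byte kind, followed by a uvarint key
// length and the key bytes (the prefix of a batch record); anything stored
// after the key is ignored by the skiplist.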
func (s *Skiplist) Add(keyOffset uint32) error {
	data := (*s.storage)[keyOffset+1:]
	v, n := binary.Uvarint(data)
	if n <= 0 {
		return errors.Errorf("corrupted batch entry: %d", errors.Safe(keyOffset))
	}
	data = data[n:]
	if v > uint64(len(data)) {
		return errors.Errorf("corrupted batch entry: %d", errors.Safe(keyOffset))
	}
	keyStart := 1 + keyOffset + uint32(n)
	keyEnd := keyStart + uint32(v)
	key := data[:v]
	abbreviatedKey := s.abbreviatedKey(key)

	// spl holds the list of next and previous links for each level in the
	// skiplist indicating where the new node will be inserted.
	var spl [maxHeight]splice

	// Fast-path for in-order insertion of keys: compare the new key against the
	// last key.
	prev := s.getPrev(s.tail, 0)
	if prevNode := s.node(prev); prev == s.head ||
		abbreviatedKey > prevNode.abbreviatedKey ||
		(abbreviatedKey == prevNode.abbreviatedKey &&
			s.cmp(key, (*s.storage)[prevNode.keyStart:prevNode.keyEnd]) > 0) {
		for level := uint32(0); level < s.height; level++ {
			spl[level].prev = s.getPrev(s.tail, level)
			spl[level].next = s.tail
		}
	} else {
		s.findSplice(key, abbreviatedKey, &spl)
	}

	height := s.randomHeight()
	// Increase s.height as necessary.
	for ; s.height < height; s.height++ {
		spl[s.height].next = s.tail
		spl[s.height].prev = s.head
	}

	// We always insert from the base level and up. After a node has been added
	// at the base level, we cannot link it at a higher level first, because a
	// search could then discover the node at the higher level without finding
	// it at the base level.
	nd, err := s.newNode(height, keyOffset, keyStart, keyEnd, abbreviatedKey)
	if err != nil {
		return err
	}
	newNode := s.node(nd)
	for level := uint32(0); level < height; level++ {
		next := spl[level].next
		prev := spl[level].prev
		newNode.links[level].next = next
		newNode.links[level].prev = prev
		s.node(next).links[level].prev = nd
		s.node(prev).links[level].next = nd
	}

	return nil
}

// NewIter returns a new Iterator object. The lower and upper bound parameters
// control the range of keys the iterator will return. Specifying nil for the
// lower or upper bound disables the check for that boundary. Note that the
// lower bound is not checked on {SeekGE,First} and the upper bound is not
// checked on {SeekLT,Last}. The user is expected to perform that check. Note
// that it is safe for an iterator to be copied by value.
func (s *Skiplist) NewIter(lower, upper []byte) Iterator {
	return Iterator{list: s, lower: lower, upper: upper}
}

func (s *Skiplist) newNode(
	height,
	offset, keyStart, keyEnd uint32, abbreviatedKey uint64,
) (uint32, error) {
	if height < 1 || height > maxHeight {
		panic("height cannot be less than one or greater than the max height")
	}

	unusedSize := (maxHeight - int(height)) * linksSize
	nodeOffset, err := s.alloc(uint32(maxNodeSize - unusedSize))
	if err != nil {
		return 0, err
	}
	nd := s.node(nodeOffset)

	nd.offset = offset
	nd.keyStart = keyStart
	nd.keyEnd = keyEnd
	nd.abbreviatedKey = abbreviatedKey
	return nodeOffset, nil
}

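// alloc reserves size bytes at the tail of s.nodes and returns the offset of
// the reservation. The backing slice is grown geometrically (at least
// doubling) and is capped at maxNodesSize; if the new record cannot fit below
// that cap, ErrTooManyRecords is returned.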
func (s *Skiplist) alloc(size uint32) (uint32, error) {
	offset := len(s.nodes)

	// We only have a need for memory up to offset + size, but we never want
	// to allocate a node whose tail points into unallocated memory.
	minAllocSize := offset + maxNodeSize
	if cap(s.nodes) < minAllocSize {
		allocSize := cap(s.nodes) * 2
		if allocSize < minAllocSize {
			allocSize = minAllocSize
		}
		// Cap the allocation at the max allowed size to avoid wasted capacity.
		if allocSize > maxNodesSize {
			// The new record may still not fit within the allocation, in which case
			// we return early with an error. This avoids the panic below when we
			// resize the slice. It also avoids the allocation and copy.
			if uint64(offset)+uint64(size) > maxNodesSize {
				return 0, errors.Wrapf(ErrTooManyRecords,
					"alloc of new record (size=%d) would overflow uint32 (current size=%d)",
					uint64(offset)+uint64(size), offset,
				)
			}
			allocSize = maxNodesSize
		}
		tmp := make([]byte, len(s.nodes), allocSize)
		copy(tmp, s.nodes)
		s.nodes = tmp
	}

	newSize := uint32(offset) + size
	s.nodes = s.nodes[:newSize]
	return uint32(offset), nil
}

func (s *Skiplist) node(offset uint32) *node {
	return (*node)(unsafe.Pointer(&s.nodes[offset]))
}

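// randomHeight returns a height in [1, maxHeight] for a new node's link
// tower. A single random uint32 is compared against the precomputed
// per-level probabilities, so each additional level is roughly 1/e as likely
// as the one below it.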
func (s *Skiplist) randomHeight() uint32 {
	rnd := uint32(s.rand.Uint64())
	h := uint32(1)
	for h < maxHeight && rnd <= probabilities[h] {
		h++
	}
	return h
}

func (s *Skiplist) findSplice(key []byte, abbreviatedKey uint64, spl *[maxHeight]splice) {
	prev := s.head

	for level := s.height - 1; ; level-- {
		// The code in this loop is the same as findSpliceForLevel(). For some
		// reason, calling findSpliceForLevel() here is much slower than the
		// inlined code below. The excess time is also caught up in the final
		// return statement, which makes little sense. Revisit in go1.14 or
		// later if inlining improves.

		next := s.getNext(prev, level)
		for next != s.tail {
			// Assume prev.key < key.
			nextNode := s.node(next)
			nextAbbreviatedKey := nextNode.abbreviatedKey
			if abbreviatedKey < nextAbbreviatedKey {
				// We are done for this level, since prev.key < key < next.key.
				break
			}
			if abbreviatedKey == nextAbbreviatedKey {
				if s.cmp(key, (*s.storage)[nextNode.keyStart:nextNode.keyEnd]) <= 0 {
					// We are done for this level, since prev.key < key <= next.key.
					break
				}
			}

			// Keep moving right on this level.
			prev = next
			next = nextNode.links[level].next
		}

		spl[level].prev = prev
		spl[level].next = next
		if level == 0 {
			break
		}
	}
}

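// findSpliceForLevel returns the splice (prev, next) between which key would
// be inserted at the given level, starting the search from the start node.
// Note that findSplice above intentionally inlines an equivalent loop instead
// of calling this function; see the comment there.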
func (s *Skiplist) findSpliceForLevel(
	key []byte, abbreviatedKey uint64, level, start uint32,
) (prev, next uint32) {
	prev = start
	next = s.getNext(prev, level)

	for next != s.tail {
		// Assume prev.key < key.
		nextNode := s.node(next)
		nextAbbreviatedKey := nextNode.abbreviatedKey
		if abbreviatedKey < nextAbbreviatedKey {
			// We are done for this level, since prev.key < key < next.key.
			break
		}
		if abbreviatedKey == nextAbbreviatedKey {
			if s.cmp(key, (*s.storage)[nextNode.keyStart:nextNode.keyEnd]) <= 0 {
				// We are done for this level, since prev.key < key <= next.key.
				break
			}
		}

		// Keep moving right on this level.
		prev = next
		next = nextNode.links[level].next
	}

	return
}

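// getKey reconstructs the InternalKey for the node at offset nd: the user key
// is read from the shared storage, the kind from the entry's first byte, and
// the sequence number is the entry's storage offset with the
// InternalKeySeqNumBatch bit set.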
func (s *Skiplist) getKey(nd uint32) base.InternalKey {
	n := s.node(nd)
	kind := base.InternalKeyKind((*s.storage)[n.offset])
	key := (*s.storage)[n.keyStart:n.keyEnd]
	return base.MakeInternalKey(key, uint64(n.offset)|base.InternalKeySeqNumBatch, kind)
}

func (s *Skiplist) getNext(nd, h uint32) uint32 {
	return s.node(nd).links[h].next
}

func (s *Skiplist) getPrev(nd, h uint32) uint32 {
	return s.node(nd).links[h].prev
}

func (s *Skiplist) debug() string {
	var buf bytes.Buffer
	for level := uint32(0); level < s.height; level++ {
		var count int
		for nd := s.head; nd != s.tail; nd = s.getNext(nd, level) {
			count++
		}
		fmt.Fprintf(&buf, "%d: %d\n", level, count)
	}
	return buf.String()
}

// Silence unused warning.
var _ = (*Skiplist).debug