github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/internal/batchskl/skl.go

github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/internal/batchskl/skl.go (about)

     1  /*
     2   * Copyright 2017 Dgraph Labs, Inc. and Contributors
     3   * Modifications copyright (C) 2017 Andy Kimball and Contributors
     4   *
     5   * Licensed under the Apache License, Version 2.0 (the "License")
     6   * you may not use this file except in compliance with the License.
     7   * You may obtain a copy of the License at
     8   *
     9   *     http://www.apache.org/licenses/LICENSE-2.0
    10   *
    11   * Unless required by applicable law or agreed to in writing, software
    12   * distributed under the License is distributed on an "AS IS" BASIS,
    13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14   * See the License for the specific language governing permissions and
    15   * limitations under the License.
    16   */
    17  
    18  /*
    19  Adapted from RocksDB inline skiplist.
    20  
    21  Key differences:
    22  - No optimization for sequential inserts (no "prev").
    23  - No custom comparator.
    24  - Support overwrites. This requires care when we see the same key when inserting.
    25    For RocksDB or LevelDB, overwrites are implemented as a newer sequence number in the key, so
    26  	there is no need for values. We don't intend to support versioning. In-place updates of values
    27  	would be more efficient.
    28  - We discard all non-concurrent code.
    29  - We do not support Splices. This simplifies the code a lot.
    30  - No AllocateNode or other pointer arithmetic.
    31  - We combine the findLessThan, findGreaterOrEqual, etc into one function.
    32  */
    33  
    34  /*
    35  Further adapted from Badger: https://github.com/dgraph-io/badger.
    36  
    37  Key differences:
    38  - Support for previous pointers - doubly linked lists. Note that it's up to higher
    39    level code to deal with the intermediate state that occurs during insertion,
    40    where node A is linked to node B, but node B is not yet linked back to node A.
    41  - Iterator includes mutator functions.
    42  */
    43  
    44  /*
    45  Further adapted from arenaskl: https://github.com/andy-kimball/arenaskl
    46  
    47  Key differences:
    48  - Removed support for deletion.
    49  - Removed support for concurrency.
    50  - External storage of keys.
    51  - Node storage grows to an arbitrary size.
    52  */
    53  
    54  package batchskl // import "github.com/cockroachdb/pebble/internal/batchskl"
    55  
    56  import (
    57  	"bytes"
    58  	"encoding/binary"
    59  	"fmt"
    60  	"math"
    61  	"time"
    62  	"unsafe"
    63  
    64  	"github.com/cockroachdb/errors"
    65  	"github.com/cockroachdb/pebble/internal/base"
    66  	"github.com/cockroachdb/pebble/internal/constants"
    67  	"golang.org/x/exp/rand"
    68  )
    69  
// Sizing limits for the skiplist:
//   - maxHeight is the number of entries in a node's links array, i.e. the
//     tallest tower a node can have.
//   - maxNodeSize is the size in bytes of a node struct carrying all maxHeight
//     link entries.
//   - linksSize is the size in bytes of a single links (next/prev) pair.
//   - maxNodesSize is the upper bound enforced by alloc on the size of the
//     nodes slice; its value comes from constants.MaxUint32OrInt.
const (
	maxHeight    = 20
	maxNodeSize  = uint64(unsafe.Sizeof(node{}))
	linksSize    = uint64(unsafe.Sizeof(links{}))
	maxNodesSize = constants.MaxUint32OrInt
)
    76  
var (
	// ErrExists indicates that a duplicate record was inserted. This should never
	// happen for normal usage of batchskl as every key should have a unique
	// sequence number.
	ErrExists = errors.New("record with this key already exists")

	// ErrTooManyRecords is a sentinel error returned when the size of the raw
	// nodes slice exceeds the maximum allowed size (currently 1 << 32 - 1). This
	// corresponds to ~117 M skiplist entries. alloc returns it wrapped with
	// additional context, so callers should match it with errors.Is rather
	// than by direct comparison.
	ErrTooManyRecords = errors.New("too many records")
)
    88  
// links holds the forward and backward pointers of a node at a single level
// of the skiplist. Both values are byte offsets into Skiplist.nodes identifying
// the neighboring node at that level.
type links struct {
	// Offset of the following node at this level.
	next uint32
	// Offset of the preceding node at this level.
	prev uint32
}
    93  
// node is a single skiplist entry. Nodes live inline in the Skiplist.nodes
// byte slice and are addressed by their byte offset within it; Skiplist.node
// converts an offset back into a *node.
type node struct {
	// The offset of the start of the record in the storage.
	offset uint32
	// The offset of the start and end of the key in storage.
	keyStart uint32
	keyEnd   uint32
	// A fixed 8-byte abbreviation of the key, used to avoid retrieval of the key
	// during seek operations. The key retrieval can be expensive purely due to
	// cache misses while the abbreviatedKey stored here will be in the same
	// cache line as the key and the links making accessing and comparing against
	// it almost free.
	abbreviatedKey uint64
	// Most nodes do not need to use the full height of the link tower, since the
	// probability of each successive level decreases exponentially. Because
	// these elements are never accessed, they do not need to be allocated.
	// Therefore, when a node is allocated, its memory footprint is deliberately
	// truncated to not include unneeded link elements.
	//
	// Consequently a *node must never be assigned or copied as a whole value:
	// doing so would touch link entries past the end of its allocation.
	links [maxHeight]links
}
   113  
// Skiplist is a fast, non-concurrent skiplist implementation that supports
// forward and backward iteration. See arenaskl.Skiplist for a concurrent
// skiplist. Keys and values are stored externally from the skiplist, in the
// byte slice referenced by the storage field. Deletion is not supported.
// Instead, higher-level code is expected to perform deletion via tombstones
// and needs to process those tombstones appropriately during retrieval
// operations.
//
// Nodes are stored back to back in the nodes byte slice and refer to each
// other by byte offset; head and tail are the offsets of the two sentinel
// nodes allocated by Init.
type Skiplist struct {
	storage        *[]byte
	cmp            base.Compare
	abbreviatedKey base.AbbreviatedKey
	nodes          []byte
	head           uint32
	tail           uint32
	height         uint32 // Current height: 1 <= height <= maxHeight
	rand           rand.PCGSource
}
   130  
var (
	// probabilities holds one threshold per tower height, filled in by init.
	// randomHeight compares a single random uint32 against successive entries
	// to decide how tall a new node's tower should be.
	probabilities [maxHeight]uint32
)
   134  
   135  func init() {
   136  	const pValue = 1 / math.E
   137  
   138  	// Precompute the skiplist probabilities so that only a single random number
   139  	// needs to be generated and so that the optimal pvalue can be used (inverse
   140  	// of Euler's number).
   141  	p := float64(1.0)
   142  	for i := 0; i < maxHeight; i++ {
   143  		probabilities[i] = uint32(float64(math.MaxUint32) * p)
   144  		p *= pValue
   145  	}
   146  }
   147  
   148  // NewSkiplist constructs and initializes a new, empty skiplist.
   149  func NewSkiplist(storage *[]byte, cmp base.Compare, abbreviatedKey base.AbbreviatedKey) *Skiplist {
   150  	s := &Skiplist{}
   151  	s.Init(storage, cmp, abbreviatedKey)
   152  	return s
   153  }
   154  
   155  // Reset the fields in the skiplist for reuse.
   156  func (s *Skiplist) Reset() {
   157  	*s = Skiplist{
   158  		nodes:  s.nodes[:0],
   159  		height: 1,
   160  	}
   161  	const batchMaxRetainedSize = 1 << 20 // 1 MB
   162  	if cap(s.nodes) > batchMaxRetainedSize {
   163  		s.nodes = nil
   164  	}
   165  }
   166  
   167  // Init the skiplist to empty and re-initialize.
   168  func (s *Skiplist) Init(storage *[]byte, cmp base.Compare, abbreviatedKey base.AbbreviatedKey) {
   169  	*s = Skiplist{
   170  		storage:        storage,
   171  		cmp:            cmp,
   172  		abbreviatedKey: abbreviatedKey,
   173  		nodes:          s.nodes[:0],
   174  		height:         1,
   175  	}
   176  	s.rand.Seed(uint64(time.Now().UnixNano()))
   177  
   178  	const initBufSize = 256
   179  	if cap(s.nodes) < initBufSize {
   180  		s.nodes = make([]byte, 0, initBufSize)
   181  	}
   182  
   183  	// Allocate head and tail nodes. While allocating a new node can fail, in the
   184  	// context of initializing the skiplist we consider it unrecoverable.
   185  	var err error
   186  	s.head, err = s.newNode(maxHeight, 0, 0, 0, 0)
   187  	if err != nil {
   188  		panic(err)
   189  	}
   190  	s.tail, err = s.newNode(maxHeight, 0, 0, 0, 0)
   191  	if err != nil {
   192  		panic(err)
   193  	}
   194  
   195  	// Link all head/tail levels together.
   196  	headNode := s.node(s.head)
   197  	tailNode := s.node(s.tail)
   198  	for i := uint32(0); i < maxHeight; i++ {
   199  		headNode.links[i].next = s.tail
   200  		tailNode.links[i].prev = s.head
   201  	}
   202  }
   203  
   204  // Add adds a new key to the skiplist if it does not yet exist. If the record
   205  // already exists, then Add returns ErrRecordExists.
   206  func (s *Skiplist) Add(keyOffset uint32) error {
   207  	data := (*s.storage)[keyOffset+1:]
   208  	v, n := binary.Uvarint(data)
   209  	if n <= 0 {
   210  		return errors.Errorf("corrupted batch entry: %d", errors.Safe(keyOffset))
   211  	}
   212  	data = data[n:]
   213  	if v > uint64(len(data)) {
   214  		return errors.Errorf("corrupted batch entry: %d", errors.Safe(keyOffset))
   215  	}
   216  	keyStart := 1 + keyOffset + uint32(n)
   217  	keyEnd := keyStart + uint32(v)
   218  	key := data[:v]
   219  	abbreviatedKey := s.abbreviatedKey(key)
   220  
   221  	// spl holds the list of next and previous links for each level in the
   222  	// skiplist indicating where the new node will be inserted.
   223  	var spl [maxHeight]splice
   224  
   225  	// Fast-path for in-order insertion of keys: compare the new key against the
   226  	// last key.
   227  	prev := s.getPrev(s.tail, 0)
   228  	if prevNode := s.node(prev); prev == s.head ||
   229  		abbreviatedKey > prevNode.abbreviatedKey ||
   230  		(abbreviatedKey == prevNode.abbreviatedKey &&
   231  			s.cmp(key, (*s.storage)[prevNode.keyStart:prevNode.keyEnd]) > 0) {
   232  		for level := uint32(0); level < s.height; level++ {
   233  			spl[level].prev = s.getPrev(s.tail, level)
   234  			spl[level].next = s.tail
   235  		}
   236  	} else {
   237  		s.findSplice(key, abbreviatedKey, &spl)
   238  	}
   239  
   240  	height := s.randomHeight()
   241  	// Increase s.height as necessary.
   242  	for ; s.height < height; s.height++ {
   243  		spl[s.height].next = s.tail
   244  		spl[s.height].prev = s.head
   245  	}
   246  
   247  	// We always insert from the base level and up. After you add a node in base
   248  	// level, we cannot create a node in the level above because it would have
   249  	// discovered the node in the base level.
   250  	nd, err := s.newNode(height, keyOffset, keyStart, keyEnd, abbreviatedKey)
   251  	if err != nil {
   252  		return err
   253  	}
   254  	newNode := s.node(nd)
   255  	for level := uint32(0); level < height; level++ {
   256  		next := spl[level].next
   257  		prev := spl[level].prev
   258  		newNode.links[level].next = next
   259  		newNode.links[level].prev = prev
   260  		s.node(next).links[level].prev = nd
   261  		s.node(prev).links[level].next = nd
   262  	}
   263  
   264  	return nil
   265  }
   266  
   267  // NewIter returns a new Iterator object. The lower and upper bound parameters
   268  // control the range of keys the iterator will return. Specifying for nil for
   269  // lower or upper bound disables the check for that boundary. Note that lower
   270  // bound is not checked on {SeekGE,First} and upper bound is not check on
   271  // {SeekLT,Last}. The user is expected to perform that check. Note that it is
   272  // safe for an iterator to be copied by value.
   273  func (s *Skiplist) NewIter(lower, upper []byte) Iterator {
   274  	return Iterator{list: s, lower: lower, upper: upper}
   275  }
   276  
   277  func (s *Skiplist) newNode(
   278  	height,
   279  	offset, keyStart, keyEnd uint32, abbreviatedKey uint64,
   280  ) (uint32, error) {
   281  	if height < 1 || height > maxHeight {
   282  		panic("height cannot be less than one or greater than the max height")
   283  	}
   284  
   285  	unusedSize := uint64(maxHeight-int(height)) * linksSize
   286  	nodeOffset, err := s.alloc(uint32(maxNodeSize - unusedSize))
   287  	if err != nil {
   288  		return 0, err
   289  	}
   290  	nd := s.node(nodeOffset)
   291  
   292  	nd.offset = offset
   293  	nd.keyStart = keyStart
   294  	nd.keyEnd = keyEnd
   295  	nd.abbreviatedKey = abbreviatedKey
   296  	return nodeOffset, nil
   297  }
   298  
   299  func (s *Skiplist) alloc(size uint32) (uint32, error) {
   300  	offset := uint64(len(s.nodes))
   301  
   302  	// We only have a need for memory up to offset + size, but we never want
   303  	// to allocate a node whose tail points into unallocated memory.
   304  	minAllocSize := offset + maxNodeSize
   305  	if uint64(cap(s.nodes)) < minAllocSize {
   306  		allocSize := uint64(cap(s.nodes)) * 2
   307  		if allocSize < minAllocSize {
   308  			allocSize = minAllocSize
   309  		}
   310  		// Cap the allocation at the max allowed size to avoid wasted capacity.
   311  		if allocSize > maxNodesSize {
   312  			// The new record may still not fit within the allocation, in which case
   313  			// we return early with an error. This avoids the panic below when we
   314  			// resize the slice. It also avoids the allocation and copy.
   315  			if uint64(offset)+uint64(size) > maxNodesSize {
   316  				return 0, errors.Wrapf(ErrTooManyRecords,
   317  					"alloc of new record (size=%d) would overflow uint32 (current size=%d)",
   318  					uint64(offset)+uint64(size), offset,
   319  				)
   320  			}
   321  			allocSize = maxNodesSize
   322  		}
   323  		tmp := make([]byte, len(s.nodes), allocSize)
   324  		copy(tmp, s.nodes)
   325  		s.nodes = tmp
   326  	}
   327  
   328  	newSize := uint32(offset) + size
   329  	s.nodes = s.nodes[:newSize]
   330  	return uint32(offset), nil
   331  }
   332  
   333  func (s *Skiplist) node(offset uint32) *node {
   334  	return (*node)(unsafe.Pointer(&s.nodes[offset]))
   335  }
   336  
   337  func (s *Skiplist) randomHeight() uint32 {
   338  	rnd := uint32(s.rand.Uint64())
   339  	h := uint32(1)
   340  	for h < maxHeight && rnd <= probabilities[h] {
   341  		h++
   342  	}
   343  	return h
   344  }
   345  
   346  func (s *Skiplist) findSplice(key []byte, abbreviatedKey uint64, spl *[maxHeight]splice) {
   347  	prev := s.head
   348  
   349  	for level := s.height - 1; ; level-- {
   350  		// The code in this loop is the same as findSpliceForLevel(). For some
   351  		// reason, calling findSpliceForLevel() here is much much slower than the
   352  		// inlined code below. The excess time is also caught up in the final
   353  		// return statement which makes little sense. Revisit when in go1.14 or
   354  		// later if inlining improves.
   355  
   356  		next := s.getNext(prev, level)
   357  		for next != s.tail {
   358  			// Assume prev.key < key.
   359  			nextNode := s.node(next)
   360  			nextAbbreviatedKey := nextNode.abbreviatedKey
   361  			if abbreviatedKey < nextAbbreviatedKey {
   362  				// We are done for this level, since prev.key < key < next.key.
   363  				break
   364  			}
   365  			if abbreviatedKey == nextAbbreviatedKey {
   366  				if s.cmp(key, (*s.storage)[nextNode.keyStart:nextNode.keyEnd]) <= 0 {
   367  					// We are done for this level, since prev.key < key <= next.key.
   368  					break
   369  				}
   370  			}
   371  
   372  			// Keep moving right on this level.
   373  			prev = next
   374  			next = nextNode.links[level].next
   375  		}
   376  
   377  		spl[level].prev = prev
   378  		spl[level].next = next
   379  		if level == 0 {
   380  			break
   381  		}
   382  	}
   383  }
   384  
   385  func (s *Skiplist) findSpliceForLevel(
   386  	key []byte, abbreviatedKey uint64, level, start uint32,
   387  ) (prev, next uint32) {
   388  	prev = start
   389  	next = s.getNext(prev, level)
   390  
   391  	for next != s.tail {
   392  		// Assume prev.key < key.
   393  		nextNode := s.node(next)
   394  		nextAbbreviatedKey := nextNode.abbreviatedKey
   395  		if abbreviatedKey < nextAbbreviatedKey {
   396  			// We are done for this level, since prev.key < key < next.key.
   397  			break
   398  		}
   399  		if abbreviatedKey == nextAbbreviatedKey {
   400  			if s.cmp(key, (*s.storage)[nextNode.keyStart:nextNode.keyEnd]) <= 0 {
   401  				// We are done for this level, since prev.key < key < next.key.
   402  				break
   403  			}
   404  		}
   405  
   406  		// Keep moving right on this level.
   407  		prev = next
   408  		next = nextNode.links[level].next
   409  	}
   410  
   411  	return
   412  }
   413  
   414  func (s *Skiplist) getKey(nd uint32) base.InternalKey {
   415  	n := s.node(nd)
   416  	kind := base.InternalKeyKind((*s.storage)[n.offset])
   417  	key := (*s.storage)[n.keyStart:n.keyEnd]
   418  	return base.MakeInternalKey(key, uint64(n.offset)|base.InternalKeySeqNumBatch, kind)
   419  }
   420  
   421  func (s *Skiplist) getNext(nd, h uint32) uint32 {
   422  	return s.node(nd).links[h].next
   423  }
   424  
   425  func (s *Skiplist) getPrev(nd, h uint32) uint32 {
   426  	return s.node(nd).links[h].prev
   427  }
   428  
   429  func (s *Skiplist) debug() string {
   430  	var buf bytes.Buffer
   431  	for level := uint32(0); level < s.height; level++ {
   432  		var count int
   433  		for nd := s.head; nd != s.tail; nd = s.getNext(nd, level) {
   434  			count++
   435  		}
   436  		fmt.Fprintf(&buf, "%d: %d\n", level, count)
   437  	}
   438  	return buf.String()
   439  }
   440  
// Silence unused warning: referencing the debug method here keeps it from
// being reported as unused.
var _ = (*Skiplist).debug