github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/internal/batchskl/skl.go (about)

     1  /*
     2   * Copyright 2017 Dgraph Labs, Inc. and Contributors
     3   * Modifications copyright (C) 2017 Andy Kimball and Contributors
     4   *
     5   * Licensed under the Apache License, Version 2.0 (the "License")
     6   * you may not use this file except in compliance with the License.
     7   * You may obtain a copy of the License at
     8   *
     9   *     http://www.apache.org/licenses/LICENSE-2.0
    10   *
    11   * Unless required by applicable law or agreed to in writing, software
    12   * distributed under the License is distributed on an "AS IS" BASIS,
    13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14   * See the License for the specific language governing permissions and
    15   * limitations under the License.
    16   */
    17  
    18  /*
    19  Adapted from RocksDB inline skiplist.
    20  
    21  Key differences:
    22  - No optimization for sequential inserts (no "prev").
    23  - No custom comparator.
    24  - Support overwrites. This requires care when we see the same key when inserting.
    25    For RocksDB or LevelDB, overwrites are implemented as a newer sequence number in the key, so
    26  	there is no need for values. We don't intend to support versioning. In-place updates of values
    27  	would be more efficient.
    28  - We discard all non-concurrent code.
    29  - We do not support Splices. This simplifies the code a lot.
    30  - No AllocateNode or other pointer arithmetic.
    31  - We combine the findLessThan, findGreaterOrEqual, etc into one function.
    32  */
    33  
    34  /*
    35  Further adapted from Badger: https://github.com/dgraph-io/badger.
    36  
    37  Key differences:
    38  - Support for previous pointers - doubly linked lists. Note that it's up to higher
    39    level code to deal with the intermediate state that occurs during insertion,
    40    where node A is linked to node B, but node B is not yet linked back to node A.
    41  - Iterator includes mutator functions.
    42  */
    43  
    44  /*
    45  Further adapted from arenaskl: https://github.com/andy-kimball/arenaskl
    46  
    47  Key differences:
    48  - Removed support for deletion.
    49  - Removed support for concurrency.
    50  - External storage of keys.
    51  - Node storage grows to an arbitrary size.
    52  */
    53  
    54  package batchskl // import "github.com/petermattis/pebble/internal/batchskl"
    55  
    56  import (
    57  	"bytes"
    58  	"errors"
    59  	"fmt"
    60  	"math"
    61  	"time"
    62  	"unsafe"
    63  
    64  	"github.com/petermattis/pebble/internal/base"
    65  	"golang.org/x/exp/rand"
    66  )
    67  
// Sizing constants for skiplist nodes:
//   - maxHeight is the number of link levels in a full link tower; the head
//     and tail nodes are allocated with this height.
//   - maxNodeSize is the size in bytes of a node with a full link tower.
//   - linksSize is the size in bytes of a single level's next/prev pair, used
//     by newNode to work out how much of a node's tail can be left
//     unallocated.
const (
	maxHeight   = 20
	maxNodeSize = int(unsafe.Sizeof(node{}))
	linksSize   = int(unsafe.Sizeof(links{}))
)
    73  
// ErrExists is returned by Skiplist.Add when the key being added is already
// present in the skiplist.
var ErrExists = errors.New("record with this key already exists")
    75  
// links is one level of a node's link tower. Both fields are offsets into
// Skiplist.nodes identifying the neighboring nodes at that level.
type links struct {
	next uint32
	prev uint32
}
    80  
// node is the layout of a skiplist entry inside Skiplist.nodes. Nodes are not
// allocated individually; they are carved out of the byte buffer by newNode
// and accessed by casting a buffer offset to *node (see Skiplist.node).
type node struct {
	// The offset of the key in storage. See Storage.Get.
	key uint32
	// A fixed 8-byte abbreviation of the key, used to avoid retrieval of the key
	// during seek operations. The key retrieval can be expensive purely due to
	// cache misses while the abbreviatedKey stored here will be in the same
	// cache line as the key and the links making accessing and comparing against
	// it almost free.
	abbreviatedKey uint64
	// Most nodes do not need to use the full height of the link tower, since the
	// probability of each successive level decreases exponentially. Because
	// these elements are never accessed, they do not need to be allocated.
	// Therefore, when a node is allocated, its memory footprint is deliberately
	// truncated to not include unneeded link elements.
	links [maxHeight]links
}
    97  
// Storage defines the storage interface for retrieval and comparison of keys.
// The skiplist itself only stores key offsets; everything it needs to know
// about the keys is obtained through this interface.
type Storage interface {
	// Get returns the key stored at the specified offset.
	Get(offset uint32) base.InternalKey

	// AbbreviatedKey returns a fixed length prefix of the specified key such
	// that AbbreviatedKey(a) < AbbreviatedKey(b) implies a < b and
	// AbbreviatedKey(a) > AbbreviatedKey(b) implies a > b. If
	// AbbreviatedKey(a) == AbbreviatedKey(b) an additional comparison (see
	// Compare) is required to determine how the two keys are ordered.
	AbbreviatedKey(key []byte) uint64

	// Compare returns -1, 0, or +1 depending on whether a is 'less than', 'equal
	// to', or 'greater than' the key stored at offset b.
	Compare(a []byte, b uint32) int
}
   114  
// Skiplist is a fast, non-concurrent skiplist implementation that supports
// forward and backward iteration. See arenaskl.Skiplist for a concurrent
// skiplist. Keys and values are stored externally from the skiplist via the
// Storage interface. Deletion is not supported. Instead, higher-level code is
// expected to perform deletion via tombstones and needs to process those
// tombstones appropriately during retrieval operations.
//
// All nodes live in the nodes byte buffer and are referred to by their byte
// offset within it; head and tail are the offsets of the two sentinel nodes
// that bound every level. rand supplies the randomness for new node heights.
type Skiplist struct {
	storage Storage
	nodes   []byte
	head    uint32
	tail    uint32
	height  uint32 // Current height: 1 <= height <= maxHeight
	rand    rand.PCGSource
}
   129  
   130  var (
   131  	probabilities [maxHeight]uint32
   132  )
   133  
   134  func init() {
   135  	const pValue = 1 / math.E
   136  
   137  	// Precompute the skiplist probabilities so that only a single random number
   138  	// needs to be generated and so that the optimal pvalue can be used (inverse
   139  	// of Euler's number).
   140  	p := float64(1.0)
   141  	for i := 0; i < maxHeight; i++ {
   142  		probabilities[i] = uint32(float64(math.MaxUint32) * p)
   143  		p *= pValue
   144  	}
   145  }
   146  
   147  // NewSkiplist constructs and initializes a new, empty skiplist.
   148  func NewSkiplist(storage Storage, initBufSize int) *Skiplist {
   149  	if initBufSize < 256 {
   150  		initBufSize = 256
   151  	}
   152  	s := &Skiplist{
   153  		storage: storage,
   154  		nodes:   make([]byte, 0, initBufSize),
   155  		height:  1,
   156  	}
   157  	s.rand.Seed(uint64(time.Now().UnixNano()))
   158  
   159  	// Allocate head and tail nodes.
   160  	s.head = s.newNode(maxHeight, 0, 0)
   161  	s.tail = s.newNode(maxHeight, 0, 0)
   162  
   163  	// Link all head/tail levels together.
   164  	for i := uint32(0); i < maxHeight; i++ {
   165  		s.setNext(s.head, i, s.tail)
   166  		s.setPrev(s.tail, i, s.head)
   167  	}
   168  
   169  	return s
   170  }
   171  
   172  // Reset the skiplist to empty and re-initialize.
   173  func (s *Skiplist) Reset(storage Storage, initBufSize int) {
   174  	if initBufSize < 256 {
   175  		initBufSize = 256
   176  	}
   177  	*s = Skiplist{
   178  		storage: storage,
   179  		nodes:   make([]byte, 0, initBufSize),
   180  		height:  1,
   181  	}
   182  
   183  	// Allocate head and tail nodes.
   184  	s.head = s.newNode(maxHeight, 0, 0)
   185  	s.tail = s.newNode(maxHeight, 0, 0)
   186  
   187  	// Link all head/tail levels together.
   188  	for i := uint32(0); i < maxHeight; i++ {
   189  		s.setNext(s.head, i, s.tail)
   190  		s.setPrev(s.tail, i, s.head)
   191  	}
   192  }
   193  
   194  // Add adds a new key to the skiplist if it does not yet exist. If the record
   195  // already exists, then Add returns ErrRecordExists.
   196  func (s *Skiplist) Add(keyOffset uint32) error {
   197  	key := s.storage.Get(keyOffset)
   198  	abbreviatedKey := s.storage.AbbreviatedKey(key.UserKey)
   199  
   200  	var spl [maxHeight]splice
   201  	if s.findSplice(key.UserKey, abbreviatedKey, &spl) {
   202  		return ErrExists
   203  	}
   204  
   205  	height := s.randomHeight()
   206  	nd := s.newNode(height, keyOffset, abbreviatedKey)
   207  	// Increase s.height as necessary.
   208  	for ; s.height < height; s.height++ {
   209  		spl[s.height].next = s.tail
   210  		spl[s.height].prev = s.head
   211  	}
   212  
   213  	// We always insert from the base level and up. After you add a node in base
   214  	// level, we cannot create a node in the level above because it would have
   215  	// discovered the node in the base level.
   216  	for i := uint32(0); i < height; i++ {
   217  		next := spl[i].next
   218  		prev := spl[i].prev
   219  		s.setNext(nd, i, next)
   220  		s.setPrev(nd, i, prev)
   221  		s.setNext(prev, i, nd)
   222  		s.setPrev(next, i, nd)
   223  	}
   224  
   225  	return nil
   226  }
   227  
   228  // NewIter returns a new Iterator object. The lower and upper bound parameters
   229  // control the range of keys the iterator will return. Specifying for nil for
   230  // lower or upper bound disables the check for that boundary. Note that lower
   231  // bound is not checked on {SeekGE,First} and upper bound is not check on
   232  // {SeekLT,Last}. The user is expected to perform that check. Note that it is
   233  // safe for an iterator to be copied by value.
   234  func (s *Skiplist) NewIter(lower, upper []byte) Iterator {
   235  	return Iterator{list: s, lower: lower, upper: upper}
   236  }
   237  
   238  func (s *Skiplist) newNode(height, key uint32, abbreviatedKey uint64) uint32 {
   239  	if height < 1 || height > maxHeight {
   240  		panic("height cannot be less than one or greater than the max height")
   241  	}
   242  
   243  	unusedSize := (maxHeight - int(height)) * linksSize
   244  	offset := s.alloc(uint32(maxNodeSize - unusedSize))
   245  	nd := s.node(offset)
   246  
   247  	nd.key = key
   248  	nd.abbreviatedKey = abbreviatedKey
   249  	return offset
   250  }
   251  
   252  func (s *Skiplist) alloc(size uint32) uint32 {
   253  	offset := uint32(len(s.nodes))
   254  	newSize := offset + size
   255  	if cap(s.nodes) < int(newSize) {
   256  		allocSize := uint32(cap(s.nodes) * 2)
   257  		if allocSize < newSize {
   258  			allocSize = newSize
   259  		}
   260  		tmp := make([]byte, len(s.nodes), allocSize)
   261  		copy(tmp, s.nodes)
   262  		s.nodes = tmp
   263  	}
   264  
   265  	s.nodes = s.nodes[:newSize]
   266  	return offset
   267  }
   268  
   269  func (s *Skiplist) node(offset uint32) *node {
   270  	return (*node)(unsafe.Pointer(&s.nodes[offset]))
   271  }
   272  
   273  func (s *Skiplist) randomHeight() uint32 {
   274  	rnd := uint32(s.rand.Uint64())
   275  	h := uint32(1)
   276  	for h < maxHeight && rnd <= probabilities[h] {
   277  		h++
   278  	}
   279  	return h
   280  }
   281  
   282  func (s *Skiplist) findSplice(
   283  	key []byte, abbreviatedKey uint64, spl *[maxHeight]splice,
   284  ) (found bool) {
   285  	var prev, next uint32
   286  	prev = s.head
   287  
   288  	for level := s.height - 1; ; level-- {
   289  		prev, next, found = s.findSpliceForLevel(key, abbreviatedKey, level, prev)
   290  		spl[level].init(prev, next)
   291  		if level == 0 {
   292  			break
   293  		}
   294  	}
   295  
   296  	return
   297  }
   298  
   299  func (s *Skiplist) findSpliceForLevel(
   300  	key []byte, abbreviatedKey uint64, level, start uint32,
   301  ) (prev, next uint32, found bool) {
   302  	prev = start
   303  
   304  	for {
   305  		// Assume prev.key < key.
   306  		next = s.getNext(prev, level)
   307  		if next == s.tail {
   308  			// Tail node, so done.
   309  			break
   310  		}
   311  
   312  		nextAbbreviatedKey := s.getAbbreviatedKey(next)
   313  		if abbreviatedKey < nextAbbreviatedKey {
   314  			// We are done for this level, since prev.key < key < next.key.
   315  			break
   316  		}
   317  		if abbreviatedKey == nextAbbreviatedKey {
   318  			cmp := s.storage.Compare(key, s.getKey(next))
   319  			if cmp == 0 {
   320  				// Equality case.
   321  				found = true
   322  				break
   323  			}
   324  			if cmp < 0 {
   325  				// We are done for this level, since prev.key < key < next.key.
   326  				break
   327  			}
   328  		}
   329  
   330  		// Keep moving right on this level.
   331  		prev = next
   332  	}
   333  
   334  	return
   335  }
   336  
   337  func (s *Skiplist) getKey(nd uint32) uint32 {
   338  	return s.node(nd).key
   339  }
   340  
   341  func (s *Skiplist) getAbbreviatedKey(nd uint32) uint64 {
   342  	return s.node(nd).abbreviatedKey
   343  }
   344  
   345  func (s *Skiplist) getNext(nd, h uint32) uint32 {
   346  	return s.node(nd).links[h].next
   347  }
   348  
   349  func (s *Skiplist) getPrev(nd, h uint32) uint32 {
   350  	return s.node(nd).links[h].prev
   351  }
   352  
   353  func (s *Skiplist) setNext(nd, h, next uint32) {
   354  	s.node(nd).links[h].next = next
   355  }
   356  
   357  func (s *Skiplist) setPrev(nd, h, prev uint32) {
   358  	s.node(nd).links[h].prev = prev
   359  }
   360  
   361  func (s *Skiplist) debug() string {
   362  	var buf bytes.Buffer
   363  	for level := uint32(0); level < s.height; level++ {
   364  		var count int
   365  		for nd := s.head; nd != s.tail; nd = s.getNext(nd, level) {
   366  			count++
   367  		}
   368  		fmt.Fprintf(&buf, "%d: %d\n", level, count)
   369  	}
   370  	return buf.String()
   371  }
   372  
   373  // Silence unused warning.
   374  var _ = (*Skiplist).debug