github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/internal/batchskl/skl.go (about) 1 /* 2 * Copyright 2017 Dgraph Labs, Inc. and Contributors 3 * Modifications copyright (C) 2017 Andy Kimball and Contributors 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License") 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 18 /* 19 Adapted from RocksDB inline skiplist. 20 21 Key differences: 22 - No optimization for sequential inserts (no "prev"). 23 - No custom comparator. 24 - Support overwrites. This requires care when we see the same key when inserting. 25 For RocksDB or LevelDB, overwrites are implemented as a newer sequence number in the key, so 26 there is no need for values. We don't intend to support versioning. In-place updates of values 27 would be more efficient. 28 - We discard all non-concurrent code. 29 - We do not support Splices. This simplifies the code a lot. 30 - No AllocateNode or other pointer arithmetic. 31 - We combine the findLessThan, findGreaterOrEqual, etc into one function. 32 */ 33 34 /* 35 Further adapted from Badger: https://github.com/dgraph-io/badger. 36 37 Key differences: 38 - Support for previous pointers - doubly linked lists. Note that it's up to higher 39 level code to deal with the intermediate state that occurs during insertion, 40 where node A is linked to node B, but node B is not yet linked back to node A. 41 - Iterator includes mutator functions. 42 */ 43 44 /* 45 Further adapted from arenaskl: https://github.com/andy-kimball/arenaskl 46 47 Key differences: 48 - Removed support for deletion. 49 - Removed support for concurrency. 50 - External storage of keys. 51 - Node storage grows to an arbitrary size. 52 */ 53 54 package batchskl // import "github.com/petermattis/pebble/internal/batchskl" 55 56 import ( 57 "bytes" 58 "errors" 59 "fmt" 60 "math" 61 "time" 62 "unsafe" 63 64 "github.com/petermattis/pebble/internal/base" 65 "golang.org/x/exp/rand" 66 ) 67 68 const ( 69 maxHeight = 20 70 maxNodeSize = int(unsafe.Sizeof(node{})) 71 linksSize = int(unsafe.Sizeof(links{})) 72 ) 73 74 var ErrExists = errors.New("record with this key already exists") 75 76 type links struct { 77 next uint32 78 prev uint32 79 } 80 81 type node struct { 82 // The offset of the key in storage. See Storage.Get. 83 key uint32 84 // A fixed 8-byte abbreviation of the key, used to avoid retrieval of the key 85 // during seek operations. The key retrieval can be expensive purely due to 86 // cache misses while the abbreviatedKey stored here will be in the same 87 // cache line as the key and the links making accessing and comparing against 88 // it almost free. 89 abbreviatedKey uint64 90 // Most nodes do not need to use the full height of the link tower, since the 91 // probability of each successive level decreases exponentially. Because 92 // these elements are never accessed, they do not need to be allocated. 93 // Therefore, when a node is allocated, its memory footprint is deliberately 94 // truncated to not include unneeded link elements. 95 links [maxHeight]links 96 } 97 98 // Storage defines the storage interface for retrieval and comparison of keys. 99 type Storage interface { 100 // Get returns the key stored at the specified offset. 101 Get(offset uint32) base.InternalKey 102 103 // AbbreviatedKey returns a fixed length prefix of the specified key such 104 // that AbbreviatedKey(a) < AbbreviatedKey(b) iff a < b and AbbreviatedKey(a) 105 // > AbbreviatedKey(b) iff a > b. If AbbreviatedKey(a) == AbbreviatedKey(b) 106 // an additional comparison is required to determine if the two keys are 107 // actually equal. 108 AbbreviatedKey(key []byte) uint64 109 110 // Compare returns -1, 0, or +1 depending on whether a is 'less than', 'equal 111 // to', or 'greater than' the key stored at b. 112 Compare(a []byte, b uint32) int 113 } 114 115 // Skiplist is a fast, non-cocnurrent skiplist implementation that supports 116 // forward and backward iteration. See arenaskl.Skiplist for a concurrent 117 // skiplist. Keys and values are stored externally from the skiplist via the 118 // Storage interface. Deletion is not supported. Instead, higher-level code is 119 // expected to perform deletion via tombstones and needs to process those 120 // tombstones appropriately during retrieval operations. 121 type Skiplist struct { 122 storage Storage 123 nodes []byte 124 head uint32 125 tail uint32 126 height uint32 // Current height: 1 <= height <= maxHeight 127 rand rand.PCGSource 128 } 129 130 var ( 131 probabilities [maxHeight]uint32 132 ) 133 134 func init() { 135 const pValue = 1 / math.E 136 137 // Precompute the skiplist probabilities so that only a single random number 138 // needs to be generated and so that the optimal pvalue can be used (inverse 139 // of Euler's number). 140 p := float64(1.0) 141 for i := 0; i < maxHeight; i++ { 142 probabilities[i] = uint32(float64(math.MaxUint32) * p) 143 p *= pValue 144 } 145 } 146 147 // NewSkiplist constructs and initializes a new, empty skiplist. 148 func NewSkiplist(storage Storage, initBufSize int) *Skiplist { 149 if initBufSize < 256 { 150 initBufSize = 256 151 } 152 s := &Skiplist{ 153 storage: storage, 154 nodes: make([]byte, 0, initBufSize), 155 height: 1, 156 } 157 s.rand.Seed(uint64(time.Now().UnixNano())) 158 159 // Allocate head and tail nodes. 160 s.head = s.newNode(maxHeight, 0, 0) 161 s.tail = s.newNode(maxHeight, 0, 0) 162 163 // Link all head/tail levels together. 164 for i := uint32(0); i < maxHeight; i++ { 165 s.setNext(s.head, i, s.tail) 166 s.setPrev(s.tail, i, s.head) 167 } 168 169 return s 170 } 171 172 // Reset the skiplist to empty and re-initialize. 173 func (s *Skiplist) Reset(storage Storage, initBufSize int) { 174 if initBufSize < 256 { 175 initBufSize = 256 176 } 177 *s = Skiplist{ 178 storage: storage, 179 nodes: make([]byte, 0, initBufSize), 180 height: 1, 181 } 182 183 // Allocate head and tail nodes. 184 s.head = s.newNode(maxHeight, 0, 0) 185 s.tail = s.newNode(maxHeight, 0, 0) 186 187 // Link all head/tail levels together. 188 for i := uint32(0); i < maxHeight; i++ { 189 s.setNext(s.head, i, s.tail) 190 s.setPrev(s.tail, i, s.head) 191 } 192 } 193 194 // Add adds a new key to the skiplist if it does not yet exist. If the record 195 // already exists, then Add returns ErrRecordExists. 196 func (s *Skiplist) Add(keyOffset uint32) error { 197 key := s.storage.Get(keyOffset) 198 abbreviatedKey := s.storage.AbbreviatedKey(key.UserKey) 199 200 var spl [maxHeight]splice 201 if s.findSplice(key.UserKey, abbreviatedKey, &spl) { 202 return ErrExists 203 } 204 205 height := s.randomHeight() 206 nd := s.newNode(height, keyOffset, abbreviatedKey) 207 // Increase s.height as necessary. 208 for ; s.height < height; s.height++ { 209 spl[s.height].next = s.tail 210 spl[s.height].prev = s.head 211 } 212 213 // We always insert from the base level and up. After you add a node in base 214 // level, we cannot create a node in the level above because it would have 215 // discovered the node in the base level. 216 for i := uint32(0); i < height; i++ { 217 next := spl[i].next 218 prev := spl[i].prev 219 s.setNext(nd, i, next) 220 s.setPrev(nd, i, prev) 221 s.setNext(prev, i, nd) 222 s.setPrev(next, i, nd) 223 } 224 225 return nil 226 } 227 228 // NewIter returns a new Iterator object. The lower and upper bound parameters 229 // control the range of keys the iterator will return. Specifying for nil for 230 // lower or upper bound disables the check for that boundary. Note that lower 231 // bound is not checked on {SeekGE,First} and upper bound is not check on 232 // {SeekLT,Last}. The user is expected to perform that check. Note that it is 233 // safe for an iterator to be copied by value. 234 func (s *Skiplist) NewIter(lower, upper []byte) Iterator { 235 return Iterator{list: s, lower: lower, upper: upper} 236 } 237 238 func (s *Skiplist) newNode(height, key uint32, abbreviatedKey uint64) uint32 { 239 if height < 1 || height > maxHeight { 240 panic("height cannot be less than one or greater than the max height") 241 } 242 243 unusedSize := (maxHeight - int(height)) * linksSize 244 offset := s.alloc(uint32(maxNodeSize - unusedSize)) 245 nd := s.node(offset) 246 247 nd.key = key 248 nd.abbreviatedKey = abbreviatedKey 249 return offset 250 } 251 252 func (s *Skiplist) alloc(size uint32) uint32 { 253 offset := uint32(len(s.nodes)) 254 newSize := offset + size 255 if cap(s.nodes) < int(newSize) { 256 allocSize := uint32(cap(s.nodes) * 2) 257 if allocSize < newSize { 258 allocSize = newSize 259 } 260 tmp := make([]byte, len(s.nodes), allocSize) 261 copy(tmp, s.nodes) 262 s.nodes = tmp 263 } 264 265 s.nodes = s.nodes[:newSize] 266 return offset 267 } 268 269 func (s *Skiplist) node(offset uint32) *node { 270 return (*node)(unsafe.Pointer(&s.nodes[offset])) 271 } 272 273 func (s *Skiplist) randomHeight() uint32 { 274 rnd := uint32(s.rand.Uint64()) 275 h := uint32(1) 276 for h < maxHeight && rnd <= probabilities[h] { 277 h++ 278 } 279 return h 280 } 281 282 func (s *Skiplist) findSplice( 283 key []byte, abbreviatedKey uint64, spl *[maxHeight]splice, 284 ) (found bool) { 285 var prev, next uint32 286 prev = s.head 287 288 for level := s.height - 1; ; level-- { 289 prev, next, found = s.findSpliceForLevel(key, abbreviatedKey, level, prev) 290 spl[level].init(prev, next) 291 if level == 0 { 292 break 293 } 294 } 295 296 return 297 } 298 299 func (s *Skiplist) findSpliceForLevel( 300 key []byte, abbreviatedKey uint64, level, start uint32, 301 ) (prev, next uint32, found bool) { 302 prev = start 303 304 for { 305 // Assume prev.key < key. 306 next = s.getNext(prev, level) 307 if next == s.tail { 308 // Tail node, so done. 309 break 310 } 311 312 nextAbbreviatedKey := s.getAbbreviatedKey(next) 313 if abbreviatedKey < nextAbbreviatedKey { 314 // We are done for this level, since prev.key < key < next.key. 315 break 316 } 317 if abbreviatedKey == nextAbbreviatedKey { 318 cmp := s.storage.Compare(key, s.getKey(next)) 319 if cmp == 0 { 320 // Equality case. 321 found = true 322 break 323 } 324 if cmp < 0 { 325 // We are done for this level, since prev.key < key < next.key. 326 break 327 } 328 } 329 330 // Keep moving right on this level. 331 prev = next 332 } 333 334 return 335 } 336 337 func (s *Skiplist) getKey(nd uint32) uint32 { 338 return s.node(nd).key 339 } 340 341 func (s *Skiplist) getAbbreviatedKey(nd uint32) uint64 { 342 return s.node(nd).abbreviatedKey 343 } 344 345 func (s *Skiplist) getNext(nd, h uint32) uint32 { 346 return s.node(nd).links[h].next 347 } 348 349 func (s *Skiplist) getPrev(nd, h uint32) uint32 { 350 return s.node(nd).links[h].prev 351 } 352 353 func (s *Skiplist) setNext(nd, h, next uint32) { 354 s.node(nd).links[h].next = next 355 } 356 357 func (s *Skiplist) setPrev(nd, h, prev uint32) { 358 s.node(nd).links[h].prev = prev 359 } 360 361 func (s *Skiplist) debug() string { 362 var buf bytes.Buffer 363 for level := uint32(0); level < s.height; level++ { 364 var count int 365 for nd := s.head; nd != s.tail; nd = s.getNext(nd, level) { 366 count++ 367 } 368 fmt.Fprintf(&buf, "%d: %d\n", level, count) 369 } 370 return buf.String() 371 } 372 373 // Silence unused warning. 374 var _ = (*Skiplist).debug