github.com/pingcap/badger@v1.5.1-0.20230103063557-828f39b09b6d/table/memtable/skl.go (about) 1 /* 2 * Copyright 2017 Dgraph Labs, Inc. and Contributors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 /* 18 Adapted from RocksDB inline skiplist. 19 20 Key differences: 21 - No optimization for sequential inserts (no "prev"). 22 - No custom comparator. 23 - Support overwrites. This requires care when we see the same key when inserting. 24 For RocksDB or LevelDB, overwrites are implemented as a newer sequence number in the key, so 25 there is no need for values. We don't intend to support versioning. In-place updates of values 26 would be more efficient. 27 - We discard all non-concurrent code. 28 - We do not support Splices. This simplifies the code a lot. 29 - No AllocateNode or other pointer arithmetic. 30 - We combine the findLessThan, findGreaterOrEqual, etc into one function. 31 */ 32 33 package memtable 34 35 import ( 36 "bytes" 37 "math" 38 "sync/atomic" 39 "unsafe" 40 41 "github.com/coocood/rtutil" 42 "github.com/pingcap/badger/y" 43 ) 44 45 const ( 46 maxHeight = 20 47 heightIncrease = math.MaxUint32 / 3 48 ) 49 50 // MaxNodeSize is the memory footprint of a node of maximum height. 51 const ( 52 MaxNodeSize = int(unsafe.Sizeof(node{})) 53 EstimateNodeSize = MaxNodeSize + nodeAlign 54 ) 55 56 type node struct { 57 // Multiple parts of the value are encoded as a single uint64 so that it 58 // can be atomically loaded and stored: 59 // value offset: uint32 (bits 0-31) 60 // value size : uint16 (bits 32-47) 61 valueAddr uint64 62 63 // A byte slice is 24 bytes. We are trying to save space here. 64 keyOffset uint32 // Immutable. No need to lock to access key. 65 keySize uint16 // Immutable. No need to lock to access key. 66 67 // Height of the tower. 68 height uint16 69 70 // Most nodes do not need to use the full height of the tower, since the 71 // probability of each successive level decreases exponentially. Because 72 // these elements are never accessed, they do not need to be allocated. 73 // Therefore, when a node is allocated in the arena, its memory footprint 74 // is deliberately truncated to not include unneeded tower elements. 75 // 76 // All accesses to elements should use CAS operations, with no need to lock. 77 tower [maxHeight]uint32 78 } 79 80 // skiplist maps keys to values (in memory) 81 type skiplist struct { 82 height int32 // Current height. 1 <= height <= kMaxHeight. CAS. 83 head *node 84 arena *arena 85 } 86 87 // DecrRef decrements the refcount, deallocating the Skiplist when done using it 88 func (s *skiplist) Delete() { 89 s.arena.reset() 90 // Indicate we are closed. Good for testing. Also, lets GC reclaim memory. Race condition 91 // here would suggest we are accessing skiplist when we are supposed to have no reference! 92 s.arena = nil 93 s.head = nil 94 } 95 96 func (s *skiplist) valid() bool { return s.arena != nil } 97 98 func newNode(a *arena, key []byte, v y.ValueStruct, height int) *node { 99 // The base level is already allocated in the node struct. 100 offset := a.putNode(height) 101 node := a.getNode(offset) 102 node.keyOffset = a.putKey(key) 103 node.keySize = uint16(len(key)) 104 node.height = uint16(height) 105 node.valueAddr = encodeValueAddr(a.putVal(v), v.EncodedSize()) 106 return node 107 } 108 109 func encodeValueAddr(valOffset uint32, valSize uint32) uint64 { 110 return uint64(valSize)<<32 | uint64(valOffset) 111 } 112 113 func decodeValueAddr(value uint64) (valOffset uint32, valSize uint32) { 114 return uint32(value), uint32(value >> 32) 115 } 116 117 // newSkiplist makes a new empty skiplist, with a given arena size 118 func newSkiplist(arenaSize int64) *skiplist { 119 arena := newArena(arenaSize) 120 head := newNode(arena, nil, y.ValueStruct{}, maxHeight) 121 return &skiplist{ 122 height: 1, 123 head: head, 124 arena: arena, 125 } 126 } 127 128 func (n *node) getValueAddr() (uint32, uint32) { 129 value := atomic.LoadUint64(&n.valueAddr) 130 return decodeValueAddr(value) 131 } 132 133 func (n *node) key(arena *arena) []byte { 134 return arena.getKey(n.keyOffset, n.keySize) 135 } 136 137 func (n *node) setValue(arena *arena, v y.ValueStruct) { 138 for { 139 oldValueAddr := atomic.LoadUint64(&n.valueAddr) 140 oldValOff, size := n.getValueAddr() 141 if size == 0 { 142 vn := arena.getValueNode(oldValOff) 143 oldValOff, size = decodeValueAddr(vn.valAddr) 144 } 145 oldV := arena.getVal(oldValOff, size) 146 if v.Version <= oldV.Version { 147 // Only happens in Restore backup, do nothing. 148 return 149 } 150 newValueOff := arena.putVal(v) 151 newValueAddr := encodeValueAddr(newValueOff, v.EncodedSize()) 152 vn := valueNode{ 153 valAddr: newValueAddr, 154 nextValAddr: oldValueAddr, 155 } 156 valueNodeOff := arena.putValueNode(vn) 157 // value node has fixed size, so we can use 0 size to represent a value node. 158 valueNodeAddr := encodeValueAddr(valueNodeOff, 0) 159 if !atomic.CompareAndSwapUint64(&n.valueAddr, oldValueAddr, valueNodeAddr) { 160 continue 161 } 162 break 163 } 164 } 165 166 func (n *node) getNextOffset(h int) uint32 { 167 return atomic.LoadUint32(&n.tower[h]) 168 } 169 170 func (n *node) casNextOffset(h int, old, val uint32) bool { 171 return atomic.CompareAndSwapUint32(&n.tower[h], old, val) 172 } 173 174 // Returns true if key is strictly > n.key. 175 // If n is nil, this is an "end" marker and we return false. 176 //func (s *Skiplist) keyIsAfterNode(key []byte, n *node) bool { 177 // y.Assert(n != s.head) 178 // return n != nil && y.CompareKeysWithVer(key, n.key) > 0 179 //} 180 181 func (s *skiplist) randomHeight() int { 182 h := 1 183 for h < maxHeight && rtutil.FastRand() <= heightIncrease { 184 h++ 185 } 186 return h 187 } 188 189 func (s *skiplist) getNext(nd *node, height int) *node { 190 return s.arena.getNode(nd.getNextOffset(height)) 191 } 192 193 // findNear finds the node near to key. 194 // If less=true, it finds rightmost node such that node.key < key (if allowEqual=false) or 195 // node.key <= key (if allowEqual=true). 196 // If less=false, it finds leftmost node such that node.key > key (if allowEqual=false) or 197 // node.key >= key (if allowEqual=true). 198 // Returns the node found. The bool returned is true if the node has key equal to given key. 199 200 func (s *skiplist) findNear(key []byte, less bool, allowEqual bool) (*node, bool) { 201 x := s.head 202 level := int(s.getHeight() - 1) 203 var afterNode *node 204 for { 205 // Assume x.key < key. 206 next := s.getNext(x, level) 207 if next == nil { 208 // x.key < key < END OF LIST 209 if level > 0 { 210 // Can descend further to iterate closer to the end. 211 level-- 212 continue 213 } 214 // Level=0. Cannot descend further. Let's return something that makes sense. 215 if !less { 216 return nil, false 217 } 218 // Try to return x. Make sure it is not a head node. 219 if x == s.head { 220 return nil, false 221 } 222 return x, false 223 } 224 var cmp int 225 if next == afterNode { 226 // We compared the same node on the upper level, no need to compare again. 227 cmp = -1 228 } else { 229 nextKey := next.key(s.arena) 230 cmp = bytes.Compare(key, nextKey) 231 } 232 if cmp > 0 { 233 // x.key < next.key < key. We can continue to move right. 234 x = next 235 continue 236 } 237 if cmp == 0 { 238 // x.key < key == next.key. 239 if allowEqual { 240 return next, true 241 } 242 if !less { 243 // We want >, so go to base level to grab the next bigger note. 244 return s.getNext(next, 0), false 245 } 246 // We want <. If not base level, we should go closer in the next level. 247 if level > 0 { 248 level-- 249 continue 250 } 251 // On base level. Return x. 252 if x == s.head { 253 return nil, false 254 } 255 return x, false 256 } 257 // cmp < 0. In other words, x.key < key < next. 258 if level > 0 { 259 afterNode = next 260 level-- 261 continue 262 } 263 // At base level. Need to return something. 264 if !less { 265 return next, false 266 } 267 // Try to return x. Make sure it is not a head node. 268 if x == s.head { 269 return nil, false 270 } 271 return x, false 272 } 273 } 274 275 // findSpliceForLevel returns (outBefore, outAfter, match) with outBefore.key < key <= outAfter.key. 276 // The input "before" tells us where to start looking. 277 // If we found a node with the same key, then we return match = true. 278 // Otherwise, outBefore.key < key < outAfter.key. 279 func (s *skiplist) findSpliceForLevel(key []byte, before *node, level int) (*node, *node, bool) { 280 for { 281 // Assume before.key < key. 282 next := s.getNext(before, level) 283 if next == nil { 284 return before, next, false 285 } 286 nextKey := next.key(s.arena) 287 cmp := bytes.Compare(key, nextKey) 288 if cmp <= 0 { 289 return before, next, cmp == 0 290 } 291 before = next // Keep moving right on this level. 292 } 293 } 294 295 func (s *skiplist) getHeight() int32 { 296 return atomic.LoadInt32(&s.height) 297 } 298 299 // Put inserts the key-value pair. 300 func (s *skiplist) Put(key []byte, v y.ValueStruct) { 301 s.PutWithHint(key, v, nil) 302 } 303 304 // Hint is used to speed up sequential write. 305 type hint struct { 306 height int32 307 308 // hitHeight is used to reduce cost of calculateRecomputeHeight. 309 // For random workload, comparing hint keys from bottom up is wasted work. 310 // So we record the hit height of the last operation, only grow recompute height from near that height. 311 hitHeight int32 312 prev [maxHeight + 1]*node 313 next [maxHeight + 1]*node 314 } 315 316 func (s *skiplist) calculateRecomputeHeight(key []byte, h *hint, listHeight int32) int32 { 317 if h.height < listHeight { 318 // Either splice is never used or list height has grown, we recompute all. 319 h.prev[listHeight] = s.head 320 h.next[listHeight] = nil 321 h.height = int32(listHeight) 322 h.hitHeight = h.height 323 return listHeight 324 } 325 recomputeHeight := h.hitHeight - 2 326 if recomputeHeight < 0 { 327 recomputeHeight = 0 328 } 329 for recomputeHeight < listHeight { 330 prevNode := h.prev[recomputeHeight] 331 nextNode := h.next[recomputeHeight] 332 prevNext := s.getNext(prevNode, int(recomputeHeight)) 333 if prevNext != nextNode { 334 recomputeHeight++ 335 continue 336 } 337 if prevNode != s.head && 338 prevNode != nil && 339 bytes.Compare(key, prevNode.key(s.arena)) <= 0 { 340 // Key is before splice. 341 for prevNode == h.prev[recomputeHeight] { 342 recomputeHeight++ 343 } 344 continue 345 } 346 if nextNode != nil && bytes.Compare(key, nextNode.key(s.arena)) > 0 { 347 // Key is after splice. 348 for nextNode == h.next[recomputeHeight] { 349 recomputeHeight++ 350 } 351 continue 352 } 353 break 354 } 355 h.hitHeight = recomputeHeight 356 return recomputeHeight 357 } 358 359 // PutWithHint inserts the key-value pair with Hint for better sequential write performance. 360 func (s *skiplist) PutWithHint(key []byte, v y.ValueStruct, h *hint) { 361 // Since we allow overwrite, we may not need to create a new node. We might not even need to 362 // increase the height. Let's defer these actions. 363 listHeight := s.getHeight() 364 height := s.randomHeight() 365 366 // Try to increase s.height via CAS. 367 for height > int(listHeight) { 368 if atomic.CompareAndSwapInt32(&s.height, listHeight, int32(height)) { 369 // Successfully increased skiplist.height. 370 listHeight = int32(height) 371 break 372 } 373 listHeight = s.getHeight() 374 } 375 spliceIsValid := h != nil 376 if h == nil { 377 h = new(hint) 378 } 379 recomputeHeight := s.calculateRecomputeHeight(key, h, listHeight) 380 if recomputeHeight > 0 { 381 for i := recomputeHeight - 1; i >= 0; i-- { 382 var match bool 383 h.prev[i], h.next[i], match = s.findSpliceForLevel(key, h.prev[i+1], int(i)) 384 if match { 385 // In place update. 386 h.next[i].setValue(s.arena, v) 387 for i > 0 { 388 h.prev[i-1] = h.prev[i] 389 h.next[i-1] = h.next[i] 390 i-- 391 } 392 return 393 } 394 } 395 } else { 396 // Even the recomputeHeight is 0, we still need to check match and do in place update to insert the new version. 397 if h.next[0] != nil && bytes.Equal(h.next[0].key(s.arena), key) { 398 h.next[0].setValue(s.arena, v) 399 return 400 } 401 } 402 403 // We do need to create a new node. 404 x := newNode(s.arena, key, v, height) 405 406 // We always insert from the base level and up. After you add a node in base level, we cannot 407 // create a node in the level above because it would have discovered the node in the base level. 408 for i := 0; i < height; i++ { 409 for { 410 nextOffset := s.arena.getNodeOffset(h.next[i]) 411 x.tower[i] = nextOffset 412 if h.prev[i].casNextOffset(i, nextOffset, s.arena.getNodeOffset(x)) { 413 // Managed to insert x between prev[i] and next[i]. Go to the next level. 414 break 415 } 416 // CAS failed. We need to recompute prev and next. 417 // It is unlikely to be helpful to try to use a different level as we redo the search, 418 // because it is unlikely that lots of nodes are inserted between prev[i] and next[i]. 419 h.prev[i], h.next[i], _ = s.findSpliceForLevel(key, h.prev[i], i) 420 if i > 0 { 421 spliceIsValid = false 422 } 423 } 424 } 425 if spliceIsValid { 426 for i := 0; i < height; i++ { 427 h.prev[i] = x 428 h.next[i] = s.getNext(x, i) 429 } 430 } else { 431 h.height = 0 432 } 433 } 434 435 func (s *skiplist) GetWithHint(key []byte, version uint64, h *hint) y.ValueStruct { 436 if h == nil { 437 h = new(hint) 438 } 439 listHeight := s.getHeight() 440 recomputeHeight := s.calculateRecomputeHeight(key, h, listHeight) 441 var n *node 442 if recomputeHeight > 0 { 443 for i := recomputeHeight - 1; i >= 0; i-- { 444 var match bool 445 h.prev[i], h.next[i], match = s.findSpliceForLevel(key, h.prev[i+1], int(i)) 446 if match { 447 n = h.next[i] 448 for j := i; j >= 0; j-- { 449 h.prev[j] = n 450 h.next[j] = s.getNext(n, int(j)) 451 } 452 break 453 } 454 } 455 } else { 456 n = h.next[0] 457 } 458 if n == nil { 459 return y.ValueStruct{} 460 } 461 nextKey := s.arena.getKey(n.keyOffset, n.keySize) 462 if !bytes.Equal(key, nextKey) { 463 return y.ValueStruct{} 464 } 465 valOffset, size := n.getValueAddr() 466 var v y.ValueStruct 467 for size == 0 { 468 vn := s.arena.getValueNode(valOffset) 469 valOffset, size = decodeValueAddr(vn.valAddr) 470 s.arena.fillVal(&v, valOffset, size) 471 if v.Version <= version { 472 return v 473 } 474 if vn.nextValAddr == 0 { 475 return y.ValueStruct{} 476 } 477 valOffset, size = decodeValueAddr(vn.nextValAddr) 478 } 479 vs := s.arena.getVal(valOffset, size) 480 return vs 481 } 482 483 // Empty returns if the Skiplist is empty. 484 func (s *skiplist) Empty() bool { 485 return s.findLast() == nil 486 } 487 488 // findLast returns the last element. If head (empty list), we return nil. All the find functions 489 // will NEVER return the head nodes. 490 func (s *skiplist) findLast() *node { 491 n := s.head 492 level := int(s.getHeight()) - 1 493 for { 494 next := s.getNext(n, level) 495 if next != nil { 496 n = next 497 continue 498 } 499 if level == 0 { 500 if n == s.head { 501 return nil 502 } 503 return n 504 } 505 level-- 506 } 507 } 508 509 // Get gets the value associated with the key. It returns a valid value if it finds equal or earlier 510 // version of the same key. 511 func (s *skiplist) Get(key []byte, version uint64) y.ValueStruct { 512 n, _ := s.findNear(key, false, true) // findGreaterOrEqual. 513 if n == nil { 514 return y.ValueStruct{} 515 } 516 517 nextKey := s.arena.getKey(n.keyOffset, n.keySize) 518 if !bytes.Equal(key, nextKey) { 519 return y.ValueStruct{} 520 } 521 valOffset, valSize := n.getValueAddr() 522 var v y.ValueStruct 523 for valSize == 0 { 524 vn := s.arena.getValueNode(valOffset) 525 valOffset, valSize = decodeValueAddr(vn.valAddr) 526 s.arena.fillVal(&v, valOffset, valSize) 527 if version >= v.Version { 528 return v 529 } 530 valOffset, valSize = decodeValueAddr(vn.nextValAddr) 531 } 532 s.arena.fillVal(&v, valOffset, valSize) 533 if version >= v.Version { 534 return v 535 } 536 return y.ValueStruct{} 537 } 538 539 // NewIterator returns a skiplist iterator. You have to Close() the iterator. 540 func (s *skiplist) NewIterator() *Iterator { 541 return &Iterator{list: s} 542 } 543 544 // MemSize returns the size of the Skiplist in terms of how much memory is used within its internal 545 // arena. 546 func (s *skiplist) MemSize() int64 { return s.arena.size() } 547 548 // Iterator is an iterator over skiplist object. For new objects, you just 549 // need to initialize Iterator.list. 550 type Iterator struct { 551 list *skiplist 552 n *node 553 554 uk []byte 555 v y.ValueStruct 556 valList []uint64 557 valListIdx int 558 } 559 560 // Valid returns true iff the iterator is positioned at a valid node. 561 func (s *Iterator) Valid() bool { return s.n != nil } 562 563 // Key returns the key at the current position. 564 func (s *Iterator) Key() y.Key { 565 return y.KeyWithTs(s.uk, s.v.Version) 566 } 567 568 // Value returns value. 569 func (s *Iterator) Value() y.ValueStruct { 570 return s.v 571 } 572 573 // FillValue fills value. 574 func (s *Iterator) FillValue(vs *y.ValueStruct) { 575 *vs = s.v 576 } 577 578 // Next advances to the next position. 579 func (s *Iterator) Next() { 580 y.Assert(s.Valid()) 581 s.n = s.list.getNext(s.n, 0) 582 s.loadNode() 583 } 584 585 func (s *Iterator) NextVersion() bool { 586 if s.valListIdx+1 < len(s.valList) { 587 s.setValueListIdx(s.valListIdx + 1) 588 return true 589 } 590 return false 591 } 592 593 // Prev advances to the previous position. 594 func (s *Iterator) Prev() { 595 y.Assert(s.Valid()) 596 s.n, _ = s.list.findNear(s.uk, true, false) // find <. No equality allowed. 597 s.loadNode() 598 } 599 600 // Seek advances to the first entry with a key >= target. 601 func (s *Iterator) Seek(target []byte) { 602 s.n, _ = s.list.findNear(target, false, true) // find >=. 603 s.loadNode() 604 } 605 606 func (s *Iterator) loadNode() { 607 if s.n == nil { 608 return 609 } 610 if len(s.valList) > 0 { 611 s.valList = s.valList[:0] 612 s.valListIdx = 0 613 } 614 s.uk = s.n.key(s.list.arena) 615 off, size := s.n.getValueAddr() 616 if size > 0 { 617 s.list.arena.fillVal(&s.v, off, size) 618 return 619 } 620 for { 621 vn := s.list.arena.getValueNode(off) 622 s.valList = append(s.valList, vn.valAddr) 623 off, size = decodeValueAddr(vn.nextValAddr) 624 if size != 0 { 625 s.valList = append(s.valList, vn.nextValAddr) 626 break 627 } 628 } 629 s.setValueListIdx(0) 630 } 631 632 func (s *Iterator) setValueListIdx(idx int) { 633 s.valListIdx = idx 634 off, size := decodeValueAddr(s.valList[idx]) 635 s.list.arena.fillVal(&s.v, off, size) 636 } 637 638 // SeekForPrev finds an entry with key <= target. 639 func (s *Iterator) SeekForPrev(target []byte) { 640 s.n, _ = s.list.findNear(target, true, true) // find <=. 641 s.loadNode() 642 } 643 644 // SeekToFirst seeks position at the first entry in list. 645 // Final state of iterator is Valid() iff list is not empty. 646 func (s *Iterator) SeekToFirst() { 647 s.n = s.list.getNext(s.list.head, 0) 648 s.loadNode() 649 } 650 651 // SeekToLast seeks position at the last entry in list. 652 // Final state of iterator is Valid() iff list is not empty. 653 func (s *Iterator) SeekToLast() { 654 s.n = s.list.findLast() 655 s.loadNode() 656 } 657 658 func (s *Iterator) Close() error { 659 return nil 660 } 661 662 // UniIterator is a unidirectional memtable iterator. It is a thin wrapper around 663 // Iterator. We like to keep Iterator as before, because it is more powerful and 664 // we might support bidirectional iterators in the future. 665 type UniIterator struct { 666 iter *Iterator 667 reversed bool 668 } 669 670 // NewUniIterator returns a UniIterator. 671 func (s *skiplist) NewUniIterator(reversed bool) *UniIterator { 672 return &UniIterator{ 673 iter: s.NewIterator(), 674 reversed: reversed, 675 } 676 } 677 678 // Next implements y.Interface 679 func (s *UniIterator) Next() { 680 if !s.reversed { 681 s.iter.Next() 682 } else { 683 s.iter.Prev() 684 } 685 } 686 687 func (s *UniIterator) NextVersion() bool { 688 return s.iter.NextVersion() 689 } 690 691 // Rewind implements y.Interface 692 func (s *UniIterator) Rewind() { 693 if !s.reversed { 694 s.iter.SeekToFirst() 695 } else { 696 s.iter.SeekToLast() 697 } 698 } 699 700 // Seek implements y.Interface 701 func (s *UniIterator) Seek(key []byte) { 702 if !s.reversed { 703 s.iter.Seek(key) 704 } else { 705 s.iter.SeekForPrev(key) 706 } 707 } 708 709 // Key implements y.Interface 710 func (s *UniIterator) Key() y.Key { return s.iter.Key() } 711 712 // Value implements y.Interface 713 func (s *UniIterator) Value() y.ValueStruct { return s.iter.Value() } 714 715 // FillValue implements y.Interface 716 func (s *UniIterator) FillValue(vs *y.ValueStruct) { s.iter.FillValue(vs) } 717 718 // Valid implements y.Interface 719 func (s *UniIterator) Valid() bool { return s.iter.Valid() } 720 721 func (s *UniIterator) Close() error { return s.iter.Close() }