github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/internal/arenaskl/skl.go

/*
 * Copyright 2017 Dgraph Labs, Inc. and Contributors
 * Modifications copyright (C) 2017 Andy Kimball and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
Adapted from RocksDB inline skiplist.

Key differences:
- No optimization for sequential inserts (no "prev").
- No custom comparator.
- Support overwrites. This requires care when we see the same key when inserting.
  For RocksDB or LevelDB, overwrites are implemented as a newer sequence number in the key, so
  there is no need for values. We don't intend to support versioning. In-place updates of values
  would be more efficient.
- We discard all non-concurrent code.
- We do not support Splices. This simplifies the code a lot.
- No AllocateNode or other pointer arithmetic.
- We combine the findLessThan, findGreaterOrEqual, etc. into one function.
*/

/*
Further adapted from Badger: https://github.com/dgraph-io/badger.

Key differences:
- Support for previous pointers - doubly linked lists. Note that it's up to higher
  level code to deal with the intermediate state that occurs during insertion,
  where node A is linked to node B, but node B is not yet linked back to node A.
- Iterator includes mutator functions.
*/

package arenaskl // import "github.com/zuoyebang/bitalostable/internal/arenaskl"

import (
	"encoding/binary"
	"math"
	"runtime"
	"sync/atomic"
	"unsafe"

	"github.com/cockroachdb/errors"
	"github.com/zuoyebang/bitalostable/internal/base"
	"github.com/zuoyebang/bitalostable/internal/fastrand"
)

const (
	maxHeight   = 20
	maxNodeSize = int(unsafe.Sizeof(node{}))
	linksSize   = int(unsafe.Sizeof(links{}))
	pValue      = 1 / math.E
)

// ErrRecordExists indicates that an entry with the specified key already
// exists in the skiplist. Duplicate entries are not directly supported and
// instead must be handled by the user by appending a unique version suffix to
// keys.
var ErrRecordExists = errors.New("record with this key already exists")

// Skiplist is a fast, concurrent skiplist implementation that supports forward
// and backward iteration. See batchskl.Skiplist for a non-concurrent
// skiplist. Keys and values are immutable once added to the skiplist and
// deletion is not supported. Instead, higher-level code is expected to add new
// entries that shadow existing entries and perform deletion via tombstones. It
// is up to the user to process these shadow entries and tombstones
// appropriately during retrieval.
type Skiplist struct {
	arena  *Arena
	cmp    base.Compare
	head   *node
	tail   *node
	height uint32 // Current height. 1 <= height <= maxHeight. CAS.

	// If set to true by tests, then extra delays are added to make it easier to
	// detect unusual race conditions.
	testing bool
}
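
// The sketch below is illustrative only: it shows how this type is intended
// to be used (construct over an arena, Add internal keys, then iterate). The
// NewArena constructor, base.MakeInternalKey helper, base.DefaultComparer and
// the iterator method signatures are assumptions borrowed from the
// surrounding module; check arena.go, iterator.go and internal/base before
// copying any of this.
//
//	// Hypothetical usage sketch, not part of the package.
//	arena := NewArena(1 << 20) // assumed constructor; see arena.go for the exact signature
//	skl := NewSkiplist(arena, base.DefaultComparer.Compare)
//	key := base.MakeInternalKey([]byte("a"), 1, base.InternalKeyKindSet)
//	if err := skl.Add(key, []byte("value")); err != nil {
//		// err is ErrRecordExists or ErrArenaFull.
//	}
//	it := skl.NewIter(nil, nil)
//	for k, _ := it.First(); k != nil; k, _ = it.Next() {
//		// Keys are visited in internal-key order.
//	}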

// Inserter TODO(peter)
type Inserter struct {
	spl    [maxHeight]splice
	height uint32
}

// Add TODO(peter)
func (ins *Inserter) Add(list *Skiplist, key base.InternalKey, value []byte) error {
	return list.addInternal(key, value, ins)
}

var (
	probabilities [maxHeight]uint32
)

func init() {
	// Precompute the skiplist probabilities so that only a single random number
	// needs to be generated and so that the optimal pvalue can be used (inverse
	// of Euler's number).
	p := float64(1.0)
	for i := 0; i < maxHeight; i++ {
		probabilities[i] = uint32(float64(math.MaxUint32) * p)
		p *= pValue
	}
}

// NewSkiplist constructs and initializes a new, empty skiplist. All nodes, keys,
// and values in the skiplist will be allocated from the given arena.
func NewSkiplist(arena *Arena, cmp base.Compare) *Skiplist {
	skl := &Skiplist{}
	skl.Reset(arena, cmp)
	return skl
}

// Reset the skiplist to empty and re-initialize.
func (s *Skiplist) Reset(arena *Arena, cmp base.Compare) {
	// Allocate head and tail nodes.
	head, err := newRawNode(arena, maxHeight, 0, 0)
	if err != nil {
		panic("arenaSize is not large enough to hold the head node")
	}
	head.keyOffset = 0
	head.skipToFirst = 0
	head.skipToLast = 0

	tail, err := newRawNode(arena, maxHeight, 0, 0)
	if err != nil {
		panic("arenaSize is not large enough to hold the tail node")
	}
	tail.keyOffset = 0
	tail.skipToFirst = 0
	tail.skipToLast = 0

	// Link all head/tail levels together.
	headOffset := arena.getPointerOffset(unsafe.Pointer(head))
	tailOffset := arena.getPointerOffset(unsafe.Pointer(tail))
	for i := 0; i < maxHeight; i++ {
		head.tower[i].nextOffset = tailOffset
		tail.tower[i].prevOffset = headOffset
	}

	*s = Skiplist{
		arena:  arena,
		cmp:    cmp,
		head:   head,
		tail:   tail,
		height: 1,
	}
}

// Height returns the height of the highest tower within any of the nodes that
// have ever been allocated as part of this skiplist.
func (s *Skiplist) Height() uint32 { return atomic.LoadUint32(&s.height) }

// Arena returns the arena backing this skiplist.
func (s *Skiplist) Arena() *Arena { return s.arena }

// Size returns the number of bytes that have been allocated from the arena.
func (s *Skiplist) Size() uint32 { return s.arena.Size() }

// Add adds a new key if it does not yet exist. If the key already exists, then
// Add returns ErrRecordExists. If there isn't enough room in the arena, then
// Add returns ErrArenaFull.
func (s *Skiplist) Add(key base.InternalKey, value []byte) error {
	var ins Inserter
	return s.addInternal(key, value, &ins)
}
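
// Note that Add above constructs a fresh Inserter per call. When a caller
// performs a run of roughly ascending inserts, reusing a single Inserter lets
// addInternal reuse the cached splice instead of re-searching from the top of
// the list each time. A minimal sketch, where skl and kvs are hypothetical:
//
//	var ins Inserter
//	for _, kv := range kvs {
//		if err := ins.Add(skl, kv.key, kv.value); err != nil {
//			// handle ErrRecordExists / ErrArenaFull
//		}
//	}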

func (s *Skiplist) addInternal(key base.InternalKey, value []byte, ins *Inserter) error {
	if s.findSplice(key, ins) {
		// Found a matching node, but handle case where it's been deleted.
		return ErrRecordExists
	}

	if s.testing {
		// Add delay to make it easier to test race between this thread
		// and another thread that sees the intermediate state between
		// finding the splice and using it.
		runtime.Gosched()
	}

	nd, height, err := s.newNode(key, value)
	if err != nil {
		return err
	}

	ndOffset := s.arena.getPointerOffset(unsafe.Pointer(nd))

	// We always insert from the base level and up. Once a node has been added
	// at the base level, concurrent searches at higher levels will already
	// discover it via the base level, so the upper links can be added
	// afterwards.
	var found bool
	var invalidateSplice bool
	for i := 0; i < int(height); i++ {
		prev := ins.spl[i].prev
		next := ins.spl[i].next

		if prev == nil {
			// New node increased the height of the skiplist, so assume that the
			// new level has not yet been populated.
			if next != nil {
				panic("next is expected to be nil, since prev is nil")
			}

			prev = s.head
			next = s.tail
		}

		// +----------------+     +------------+     +----------------+
		// |      prev      |     |     nd     |     |      next      |
		// | prevNextOffset |---->|            |     |                |
		// |                |<----| prevOffset |     |                |
		// |                |     | nextOffset |---->|                |
		// |                |     |            |<----| nextPrevOffset |
		// +----------------+     +------------+     +----------------+
		//
		// 1. Initialize prevOffset and nextOffset to point to prev and next.
		// 2. CAS prevNextOffset to repoint from next to nd.
		// 3. CAS nextPrevOffset to repoint from prev to nd.
		for {
			prevOffset := s.arena.getPointerOffset(unsafe.Pointer(prev))
			nextOffset := s.arena.getPointerOffset(unsafe.Pointer(next))
			nd.tower[i].init(prevOffset, nextOffset)

			// Check whether next has an updated link to prev. If it does not,
			// that can mean one of two things:
			//   1. The thread that added the next node hasn't yet had a chance
			//      to add the prev link (but will shortly).
			//   2. Another thread has added a new node between prev and next.
			nextPrevOffset := next.prevOffset(i)
			if nextPrevOffset != prevOffset {
				// Determine whether #1 or #2 is true by checking whether prev
				// is still pointing to next. As long as the atomic operations
				// have at least acquire/release semantics (no need for
				// sequential consistency), this works, as it is equivalent to
				// the "publication safety" pattern.
				prevNextOffset := prev.nextOffset(i)
				if prevNextOffset == nextOffset {
					// Ok, case #1 is true, so help the other thread along by
					// updating the next node's prev link.
					next.casPrevOffset(i, nextPrevOffset, prevOffset)
				}
			}

			if prev.casNextOffset(i, nextOffset, ndOffset) {
				// Managed to insert nd between prev and next, so update the next
				// node's prev link and go to the next level.
				if s.testing {
					// Add delay to make it easier to test race between this thread
					// and another thread that sees the intermediate state between
					// setting next and setting prev.
					runtime.Gosched()
				}

				next.casPrevOffset(i, prevOffset, ndOffset)
				break
			}

			// CAS failed. We need to recompute prev and next. It is unlikely to
			// be helpful to try to use a different level as we redo the search,
			// because it is unlikely that lots of nodes are inserted between prev
			// and next.
			prev, next, found = s.findSpliceForLevel(key, i, prev)
			if found {
				if i != 0 {
					panic("how can another thread have inserted a node at a non-base level?")
				}

				return ErrRecordExists
			}
			invalidateSplice = true
		}
	}

	s.setNodeSkipOffset(nd, ndOffset, key)

	// If we had to recompute the splice for a level, invalidate the entire
	// cached splice.
	if invalidateSplice {
		ins.height = 0
	} else {
		// The splice was valid. We inserted a node between spl[i].prev and
		// spl[i].next. Optimistically update spl[i].prev for use in a subsequent
		// call to add.
		for i := uint32(0); i < height; i++ {
			ins.spl[i].prev = nd
		}
	}

	return nil
}
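
// The inner loop above is the core lock-free insert: links are plain uint32
// arena offsets updated with compare-and-swap, and readers only ever follow
// forward links that are already consistent. A stripped-down sketch of the
// same three-step pattern on a single level, using hypothetical atomic.Uint32
// fields and ignoring the arena, retries, and the helping logic:
//
//	// 1. nd's own links are initialized before nd is published.
//	nd.next.Store(nextOff)
//	nd.prev.Store(prevOff)
//	// 2. Publish nd by swinging prev.next from next to nd. This is the
//	//    linearization point; a failed CAS means the neighborhood changed
//	//    and prev/next must be recomputed (findSpliceForLevel above).
//	if prev.next.CompareAndSwap(nextOff, ndOff) {
//		// 3. Fix the back pointer. Readers tolerate the window in which
//		//    next.prev still points at prev (see the Badger notes at the
//		//    top of this file).
//		next.prev.CompareAndSwap(prevOff, ndOff)
//	}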

func (s *Skiplist) setNodeSkipOffset(nd *node, ndOffset uint32, key base.InternalKey) {
	nextNd := s.getNext(nd, 0)
	if nextNd == s.tail {
		return
	}

	offset, size := nextNd.keyOffset, nextNd.keySize
	nextKey := s.arena.buf[offset : offset+size]
	n := int32(size) - 8
	if n < 0 || s.cmp(key.UserKey, nextKey[:n]) != 0 || key.Trailer <= binary.LittleEndian.Uint64(nextKey[n:]) {
		return
	}

	skipToFirstOffset := nextNd.skipToFirstOffset()
	if skipToFirstOffset > 0 {
		nd.setSkipToFirstOffset(skipToFirstOffset)

		skipToFirstNd := (*node)(s.arena.getPointer(skipToFirstOffset))
		if skipToFirstNd == s.tail {
			return
		}

		skipToFirstNd.setSkipToLastOffset(ndOffset)
	} else {
		nextNdOffset := s.arena.getPointerOffset(unsafe.Pointer(nextNd))
		nd.setSkipToFirstOffset(nextNdOffset)
	}
}

// NewIter returns a new Iterator object. The lower and upper bound parameters
// control the range of keys the iterator will return. Specifying nil for the
// lower or upper bound disables the check for that boundary. Note that the
// lower bound is not checked on {SeekGE,First} and the upper bound is not
// checked on {SeekLT,Last}. The user is expected to perform that check. Note
// that it is safe for an iterator to be copied by value.
func (s *Skiplist) NewIter(lower, upper []byte) *Iterator {
	it := iterPool.Get().(*Iterator)
	*it = Iterator{list: s, nd: s.head, lower: lower, upper: upper}
	return it
}

// NewFlushIter returns a new flushIterator, which is similar to an Iterator
// but also tracks the number of bytes that have been iterated through.
func (s *Skiplist) NewFlushIter(bytesFlushed *uint64) base.InternalIterator {
	return &flushIterator{
		Iterator:      Iterator{list: s, nd: s.head},
		bytesIterated: bytesFlushed,
	}
}

func (s *Skiplist) newNode(
	key base.InternalKey, value []byte,
) (nd *node, height uint32, err error) {
	height = s.randomHeight()
	nd, err = newNode(s.arena, height, key, value)
	if err != nil {
		return
	}

	// Try to increase s.height via CAS.
	listHeight := s.Height()
	for height > listHeight {
		if atomic.CompareAndSwapUint32(&s.height, listHeight, height) {
			// Successfully increased skiplist.height.
			break
		}

		listHeight = s.Height()
	}

	return
}

func (s *Skiplist) randomHeight() uint32 {
	rnd := fastrand.Uint32()

	h := uint32(1)
	for h < maxHeight && rnd <= probabilities[h] {
		h++
	}

	return h
}
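
// For intuition: probabilities[h] is roughly MaxUint32 * pValue^h, so the loop
// in randomHeight keeps growing the tower while a single uniform uint32 draw
// stays below the next threshold, i.e. P(height > h) ~ pValue^h with
// pValue = 1/e. A quick, test-style way to sanity-check the distribution
// (illustrative sketch only, not part of this file):
//
//	counts := make([]int, maxHeight+1)
//	for i := 0; i < 1_000_000; i++ {
//		counts[s.randomHeight()]++
//	}
//	// Expect counts[h+1]/counts[h] ~ 1/e for small h.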

func (s *Skiplist) findSplice(key base.InternalKey, ins *Inserter) (found bool) {
	listHeight := s.Height()
	var level int

	prev := s.head
	if ins.height < listHeight {
		// Our cached height is less than the list height, which means there were
		// inserts that increased the height of the list. Recompute the splice from
		// scratch.
		ins.height = listHeight
		level = int(ins.height)
	} else {
		// Our cached height is equal to the list height.
		for ; level < int(listHeight); level++ {
			spl := &ins.spl[level]
			if s.getNext(spl.prev, level) != spl.next {
				// One or more nodes have been inserted between the splice at this
				// level.
				continue
			}
			if spl.prev != s.head && !s.keyIsAfterNode(spl.prev, key) {
				// Key lies before splice.
				level = int(listHeight)
				break
			}
			if spl.next != s.tail && s.keyIsAfterNode(spl.next, key) {
				// Key lies after splice.
				level = int(listHeight)
				break
			}
			// The splice brackets the key!
			prev = spl.prev
			break
		}
	}

	for level = level - 1; level >= 0; level-- {
		var next *node
		prev, next, found = s.findSpliceForLevel(key, level, prev)
		if next == nil {
			next = s.tail
		}
		ins.spl[level].init(prev, next)
	}

	return
}

func (s *Skiplist) findSpliceForLevel(
	key base.InternalKey, level int, start *node,
) (prev, next *node, found bool) {
	prev = start

	for {
		// Assume prev.key < key.
		next = s.getNext(prev, level)
		if next == s.tail {
			// Tail node, so done.
			break
		}

		offset, size := next.keyOffset, next.keySize
		nextKey := s.arena.buf[offset : offset+size]
		n := int32(size) - 8
		cmp := s.cmp(key.UserKey, nextKey[:n])
		if cmp < 0 {
			// We are done for this level, since prev.key < key < next.key.
			break
		}
		if cmp == 0 {
			// User-key equality.
			var nextTrailer uint64
			if n >= 0 {
				nextTrailer = binary.LittleEndian.Uint64(nextKey[n:])
			} else {
				nextTrailer = uint64(base.InternalKeyKindInvalid)
			}
			if key.Trailer == nextTrailer {
				// Internal key equality.
				found = true
				break
			}
			if key.Trailer > nextTrailer {
				// We are done for this level, since prev.key < key < next.key.
				break
			}
		}

		// Keep moving right on this level.
		prev = next
	}

	return
}

func (s *Skiplist) keyIsAfterNode(nd *node, key base.InternalKey) bool {
	ndKey := s.arena.buf[nd.keyOffset : nd.keyOffset+nd.keySize]
	n := int32(nd.keySize) - 8
	cmp := s.cmp(ndKey[:n], key.UserKey)
	if cmp < 0 {
		return true
	}
	if cmp > 0 {
		return false
	}
	// User-key equality.
	var ndTrailer uint64
	if n >= 0 {
		ndTrailer = binary.LittleEndian.Uint64(ndKey[n:])
	} else {
		ndTrailer = uint64(base.InternalKeyKindInvalid)
	}
	if key.Trailer == ndTrailer {
		// Internal key equality.
		return false
	}
	return key.Trailer < ndTrailer
}

func (s *Skiplist) getNext(nd *node, h int) *node {
	offset := atomic.LoadUint32(&nd.tower[h].nextOffset)
	return (*node)(s.arena.getPointer(offset))
}

func (s *Skiplist) getPrev(nd *node, h int) *node {
	offset := atomic.LoadUint32(&nd.tower[h].prevOffset)
	return (*node)(s.arena.getPointer(offset))
}

func (s *Skiplist) getSkipNext(nd *node) *node {
	var nextNd *node
	skipToFirstOffset := nd.skipToFirstOffset()
	if skipToFirstOffset > 0 {
		nextNd = (*node)(s.arena.getPointer(skipToFirstOffset))
	} else {
		offset := atomic.LoadUint32(&nd.tower[0].nextOffset)
		nextNd = (*node)(s.arena.getPointer(offset))
	}
	return nextNd
}

func (s *Skiplist) getSkipPrev(nd *node) *node {
	var prevNd *node
	skipToLastOffset := nd.skipToLastOffset()
	if skipToLastOffset > 0 {
		prevNd = (*node)(s.arena.getPointer(skipToLastOffset))
	} else {
		offset := atomic.LoadUint32(&nd.tower[0].prevOffset)
		prevNd = (*node)(s.arena.getPointer(offset))
	}
	return prevNd
}
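
// The comparison helpers above rely on the stored key layout: a node's key
// bytes are the user key followed by an 8-byte little-endian trailer (the
// sequence number and kind packed together). A minimal decoding sketch that
// mirrors the n := keySize - 8 logic in findSpliceForLevel and keyIsAfterNode
// (hypothetical helper, not part of this file):
//
//	func splitStoredKey(buf []byte) (userKey []byte, trailer uint64) {
//		n := len(buf) - 8
//		if n < 0 {
//			// Degenerate key with no trailer; callers above treat it as
//			// base.InternalKeyKindInvalid.
//			return buf, uint64(base.InternalKeyKindInvalid)
//		}
//		return buf[:n], binary.LittleEndian.Uint64(buf[n:])
//	}
//
// Keys with equal user keys are ordered by descending trailer, which is why
// key.Trailer > nextTrailer above means the new key sorts before next.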