github.com/matrixorigin/matrixone@v1.2.0/pkg/common/arenaskl/skl.go (about) 1 /* 2 * Copyright 2017 Dgraph Labs, Inc. and Contributors 3 * Modifications copyright (C) 2017 Andy Kimball and Contributors 4 * and copyright (C) 2024 MatrixOrigin Inc. 5 * 6 * Licensed under the Apache License, Version 2.0 (the "License"); 7 * you may not use this file except in compliance with the License. 8 * You may obtain a copy of the License at 9 * 10 * http://www.apache.org/licenses/LICENSE-2.0 11 * 12 * Unless required by applicable law or agreed to in writing, software 13 * distributed under the License is distributed on an "AS IS" BASIS, 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 * See the License for the specific language governing permissions and 16 * limitations under the License. 17 */ 18 19 /* 20 Adapted from RocksDB inline skiplist. 21 22 Key differences: 23 - No optimization for sequential inserts (no "prev"). 24 - No custom comparator. 25 - Support overwrites. This requires care when we see the same key when inserting. 26 For RocksDB or LevelDB, overwrites are implemented as a newer sequence number in the key, so 27 there is no need for values. We don't intend to support versioning. In-place updates of values 28 would be more efficient. 29 - We discard all non-concurrent code. 30 - We do not support Splices. This simplifies the code a lot. 31 - No AllocateNode or other pointer arithmetic. 32 - We combine the findLessThan, findGreaterOrEqual, etc into one function. 33 */ 34 35 /* 36 Further adapted from Badger: https://github.com/dgraph-io/badger. 37 38 Key differences: 39 - Support for previous pointers - doubly linked lists. Note that it's up to higher 40 level code to deal with the intermediate state that occurs during insertion, 41 where node A is linked to node B, but node B is not yet linked back to node A. 42 - Iterator includes mutator functions. 43 */ 44 45 package arenaskl 46 47 import ( 48 "math" 49 "runtime" 50 "sync/atomic" 51 "unsafe" 52 53 "github.com/matrixorigin/matrixone/pkg/common/fastrand" 54 "github.com/matrixorigin/matrixone/pkg/common/moerr" 55 ) 56 57 const ( 58 maxHeight = 20 59 maxNodeSize = int(unsafe.Sizeof(node{})) 60 linksSize = int(unsafe.Sizeof(links{})) 61 pValue = 1 / math.E 62 ) 63 64 // Compare is a comparison function for keys. 65 type Compare func(a, b []byte) int 66 67 // ErrRecordExists indicates that an entry with the specified key already 68 // exists in the skiplist. Duplicate entries are not directly supported and 69 // instead must be handled by the user by appending a unique version suffix to 70 // keys. 71 var ErrRecordExists = moerr.NewKeyAlreadyExistsNoCtx() 72 73 // Skiplist is a fast, concurrent skiplist implementation that supports forward 74 // and backward iteration. See batchskl.Skiplist for a non-concurrent 75 // skiplist. Keys and values are immutable once added to the skiplist and 76 // deletion is not supported. Instead, higher-level code is expected to add new 77 // entries that shadow existing entries and perform deletion via tombstones. It 78 // is up to the user to process these shadow entries and tombstones 79 // appropriately during retrieval. 80 type Skiplist struct { 81 arena *Arena 82 cmp Compare 83 head *node 84 tail *node 85 height atomic.Uint32 // Current height. 1 <= height <= maxHeight. CAS. 86 87 // If set to true by tests, then extra delays are added to make it easier to 88 // detect unusual race conditions. 89 testing bool 90 } 91 92 // Inserter TODO(peter) 93 type Inserter struct { 94 spl [maxHeight]splice 95 height uint32 96 } 97 98 // Add TODO(peter) 99 func (ins *Inserter) Add(list *Skiplist, key, value []byte) error { 100 return list.addInternal(key, value, ins) 101 } 102 103 var ( 104 probabilities [maxHeight]uint32 105 ) 106 107 func init() { 108 // Precompute the skiplist probabilities so that only a single random number 109 // needs to be generated and so that the optimal pvalue can be used (inverse 110 // of Euler's number). 111 p := float64(1.0) 112 for i := 0; i < maxHeight; i++ { 113 probabilities[i] = uint32(float64(math.MaxUint32) * p) 114 p *= pValue 115 } 116 } 117 118 // NewSkiplist constructs and initializes a new, empty skiplist. All nodes, keys, 119 // and values in the skiplist will be allocated from the given arena. 120 func NewSkiplist(arena *Arena, cmp Compare) *Skiplist { 121 skl := &Skiplist{} 122 skl.Reset(arena, cmp) 123 return skl 124 } 125 126 // Reset the skiplist to empty and re-initialize. 127 func (s *Skiplist) Reset(arena *Arena, cmp Compare) { 128 // Allocate head and tail nodes. 129 head, err := newRawNode(arena, maxHeight, 0, 0) 130 if err != nil { 131 panic("arenaSize is not large enough to hold the head node") 132 } 133 head.keyOffset = 0 134 135 tail, err := newRawNode(arena, maxHeight, 0, 0) 136 if err != nil { 137 panic("arenaSize is not large enough to hold the tail node") 138 } 139 tail.keyOffset = 0 140 141 // Link all head/tail levels together. 142 headOffset := arena.getPointerOffset(unsafe.Pointer(head)) 143 tailOffset := arena.getPointerOffset(unsafe.Pointer(tail)) 144 for i := 0; i < maxHeight; i++ { 145 head.tower[i].nextOffset.Store(tailOffset) 146 tail.tower[i].prevOffset.Store(headOffset) 147 } 148 149 *s = Skiplist{ 150 arena: arena, 151 cmp: cmp, 152 head: head, 153 tail: tail, 154 } 155 s.height.Store(1) 156 } 157 158 // Height returns the height of the highest tower within any of the nodes that 159 // have ever been allocated as part of this skiplist. 160 func (s *Skiplist) Height() uint32 { return s.height.Load() } 161 162 // Arena returns the arena backing this skiplist. 163 func (s *Skiplist) Arena() *Arena { return s.arena } 164 165 // Size returns the number of bytes that have allocated from the arena. 166 func (s *Skiplist) Size() uint32 { return s.arena.Size() } 167 168 // Add adds a new key if it does not yet exist. If the key already exists, then 169 // Add returns ErrRecordExists. If there isn't enough room in the arena, then 170 // Add returns ErrArenaFull. 171 func (s *Skiplist) Add(key, value []byte) error { 172 var ins Inserter 173 return s.addInternal(key, value, &ins) 174 } 175 176 func (s *Skiplist) addInternal(key, value []byte, ins *Inserter) error { 177 if s.findSplice(key, ins) { 178 // Found a matching node, but handle case where it's been deleted. 179 return ErrRecordExists 180 } 181 182 if s.testing { 183 // Add delay to make it easier to test race between this thread 184 // and another thread that sees the intermediate state between 185 // finding the splice and using it. 186 runtime.Gosched() 187 } 188 189 nd, height, err := s.newNode(key, value) 190 if err != nil { 191 return err 192 } 193 194 ndOffset := s.arena.getPointerOffset(unsafe.Pointer(nd)) 195 196 // We always insert from the base level and up. After you add a node in base 197 // level, we cannot create a node in the level above because it would have 198 // discovered the node in the base level. 199 var found bool 200 var invalidateSplice bool 201 for i := 0; i < int(height); i++ { 202 prev := ins.spl[i].prev 203 next := ins.spl[i].next 204 205 if prev == nil { 206 // New node increased the height of the skiplist, so assume that the 207 // new level has not yet been populated. 208 if next != nil { 209 panic("next is expected to be nil, since prev is nil") 210 } 211 212 prev = s.head 213 next = s.tail 214 } 215 216 // +----------------+ +------------+ +----------------+ 217 // | prev | | nd | | next | 218 // | prevNextOffset |---->| | | | 219 // | |<----| prevOffset | | | 220 // | | | nextOffset |---->| | 221 // | | | |<----| nextPrevOffset | 222 // +----------------+ +------------+ +----------------+ 223 // 224 // 1. Initialize prevOffset and nextOffset to point to prev and next. 225 // 2. CAS prevNextOffset to repoint from next to nd. 226 // 3. CAS nextPrevOffset to repoint from prev to nd. 227 for { 228 prevOffset := s.arena.getPointerOffset(unsafe.Pointer(prev)) 229 nextOffset := s.arena.getPointerOffset(unsafe.Pointer(next)) 230 nd.tower[i].init(prevOffset, nextOffset) 231 232 // Check whether next has an updated link to prev. If it does not, 233 // that can mean one of two things: 234 // 1. The thread that added the next node hasn't yet had a chance 235 // to add the prev link (but will shortly). 236 // 2. Another thread has added a new node between prev and next. 237 nextPrevOffset := next.prevOffset(i) 238 if nextPrevOffset != prevOffset { 239 // Determine whether #1 or #2 is true by checking whether prev 240 // is still pointing to next. As long as the atomic operations 241 // have at least acquire/release semantics (no need for 242 // sequential consistency), this works, as it is equivalent to 243 // the "publication safety" pattern. 244 prevNextOffset := prev.nextOffset(i) 245 if prevNextOffset == nextOffset { 246 // Ok, case #1 is true, so help the other thread along by 247 // updating the next node's prev link. 248 next.casPrevOffset(i, nextPrevOffset, prevOffset) 249 } 250 } 251 252 if prev.casNextOffset(i, nextOffset, ndOffset) { 253 // Managed to insert nd between prev and next, so update the next 254 // node's prev link and go to the next level. 255 if s.testing { 256 // Add delay to make it easier to test race between this thread 257 // and another thread that sees the intermediate state between 258 // setting next and setting prev. 259 runtime.Gosched() 260 } 261 262 next.casPrevOffset(i, prevOffset, ndOffset) 263 break 264 } 265 266 // CAS failed. We need to recompute prev and next. It is unlikely to 267 // be helpful to try to use a different level as we redo the search, 268 // because it is unlikely that lots of nodes are inserted between prev 269 // and next. 270 prev, next, found = s.findSpliceForLevel(key, i, prev) 271 if found { 272 if i != 0 { 273 panic("how can another thread have inserted a node at a non-base level?") 274 } 275 276 return ErrRecordExists 277 } 278 invalidateSplice = true 279 } 280 } 281 282 // If we had to recompute the splice for a level, invalidate the entire 283 // cached splice. 284 if invalidateSplice { 285 ins.height = 0 286 } else { 287 // The splice was valid. We inserted a node between spl[i].prev and 288 // spl[i].next. Optimistically update spl[i].prev for use in a subsequent 289 // call to add. 290 for i := uint32(0); i < height; i++ { 291 ins.spl[i].prev = nd 292 } 293 } 294 295 return nil 296 } 297 298 // NewIter returns a new Iterator object. The lower and upper bound parameters 299 // control the range of keys the iterator will return. Specifying for nil for 300 // lower or upper bound disables the check for that boundary. Note that lower 301 // bound is not checked on {SeekGE,First} and upper bound is not check on 302 // {SeekLT,Last}. The user is expected to perform that check. Note that it is 303 // safe for an iterator to be copied by value. 304 func (s *Skiplist) NewIter(lower, upper []byte) *Iterator { 305 it := iterPool.Get().(*Iterator) 306 *it = Iterator{list: s, nd: s.head, lower: lower, upper: upper} 307 return it 308 } 309 310 func (s *Skiplist) newNode( 311 key, value []byte, 312 ) (nd *node, height uint32, err error) { 313 height = s.randomHeight() 314 nd, err = newNode(s.arena, height, key, value) 315 if err != nil { 316 return 317 } 318 319 // Try to increase s.height via CAS. 320 listHeight := s.Height() 321 for height > listHeight { 322 if s.height.CompareAndSwap(listHeight, height) { 323 // Successfully increased skiplist.height. 324 break 325 } 326 327 listHeight = s.Height() 328 } 329 330 return 331 } 332 333 func (s *Skiplist) randomHeight() uint32 { 334 rnd := fastrand.Uint32() 335 336 h := uint32(1) 337 for h < maxHeight && rnd <= probabilities[h] { 338 h++ 339 } 340 341 return h 342 } 343 344 func (s *Skiplist) findSplice(key []byte, ins *Inserter) (found bool) { 345 listHeight := s.Height() 346 var level int 347 348 prev := s.head 349 if ins.height < listHeight { 350 // Our cached height is less than the list height, which means there were 351 // inserts that increased the height of the list. Recompute the splice from 352 // scratch. 353 ins.height = listHeight 354 level = int(ins.height) 355 } else { 356 // Our cached height is equal to the list height. 357 for ; level < int(listHeight); level++ { 358 spl := &ins.spl[level] 359 if s.getNext(spl.prev, level) != spl.next { 360 // One or more nodes have been inserted between the splice at this 361 // level. 362 continue 363 } 364 if spl.prev != s.head && !s.keyIsAfterNode(spl.prev, key) { 365 // Key lies before splice. 366 level = int(listHeight) 367 break 368 } 369 if spl.next != s.tail && s.keyIsAfterNode(spl.next, key) { 370 // Key lies after splice. 371 level = int(listHeight) 372 break 373 } 374 // The splice brackets the key! 375 prev = spl.prev 376 break 377 } 378 } 379 380 for level = level - 1; level >= 0; level-- { 381 var next *node 382 prev, next, found = s.findSpliceForLevel(key, level, prev) 383 if next == nil { 384 next = s.tail 385 } 386 ins.spl[level].init(prev, next) 387 } 388 389 return 390 } 391 392 func (s *Skiplist) findSpliceForLevel( 393 key []byte, level int, start *node, 394 ) (prev, next *node, found bool) { 395 prev = start 396 397 for { 398 // Assume prev.key < key. 399 next = s.getNext(prev, level) 400 if next == s.tail { 401 // Tail node, so done. 402 break 403 } 404 405 offset, size := next.keyOffset, next.keySize 406 nextKey := s.arena.buf[offset : offset+size] 407 cmp := s.cmp(key, nextKey) 408 if cmp < 0 { 409 // We are done for this level, since prev.key < key < next.key. 410 break 411 } 412 if cmp == 0 { 413 found = true 414 break 415 } 416 417 // Keep moving right on this level. 418 prev = next 419 } 420 421 return 422 } 423 424 func (s *Skiplist) keyIsAfterNode(nd *node, key []byte) bool { 425 ndKey := s.arena.buf[nd.keyOffset : nd.keyOffset+nd.keySize] 426 return s.cmp(ndKey, key) < 0 427 } 428 429 func (s *Skiplist) getNext(nd *node, h int) *node { 430 offset := nd.tower[h].nextOffset.Load() 431 return (*node)(s.arena.getPointer(offset)) 432 } 433 434 func (s *Skiplist) getPrev(nd *node, h int) *node { 435 offset := nd.tower[h].prevOffset.Load() 436 return (*node)(s.arena.getPointer(offset)) 437 }