github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/internal/batchskl/skl.go (about) 1 /* 2 * Copyright 2017 Dgraph Labs, Inc. and Contributors 3 * Modifications copyright (C) 2017 Andy Kimball and Contributors 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License") 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 18 /* 19 Adapted from RocksDB inline skiplist. 20 21 Key differences: 22 - No optimization for sequential inserts (no "prev"). 23 - No custom comparator. 24 - Support overwrites. This requires care when we see the same key when inserting. 25 For RocksDB or LevelDB, overwrites are implemented as a newer sequence number in the key, so 26 there is no need for values. We don't intend to support versioning. In-place updates of values 27 would be more efficient. 28 - We discard all non-concurrent code. 29 - We do not support Splices. This simplifies the code a lot. 30 - No AllocateNode or other pointer arithmetic. 31 - We combine the findLessThan, findGreaterOrEqual, etc into one function. 32 */ 33 34 /* 35 Further adapted from Badger: https://github.com/dgraph-io/badger. 36 37 Key differences: 38 - Support for previous pointers - doubly linked lists. Note that it's up to higher 39 level code to deal with the intermediate state that occurs during insertion, 40 where node A is linked to node B, but node B is not yet linked back to node A. 41 - Iterator includes mutator functions. 42 */ 43 44 /* 45 Further adapted from arenaskl: https://github.com/andy-kimball/arenaskl 46 47 Key differences: 48 - Removed support for deletion. 49 - Removed support for concurrency. 50 - External storage of keys. 51 - Node storage grows to an arbitrary size. 52 */ 53 54 package batchskl // import "github.com/zuoyebang/bitalostable/internal/batchskl" 55 56 import ( 57 "bytes" 58 "encoding/binary" 59 "fmt" 60 "math" 61 "time" 62 "unsafe" 63 64 "github.com/cockroachdb/errors" 65 "github.com/zuoyebang/bitalostable/internal/base" 66 "golang.org/x/exp/rand" 67 ) 68 69 const ( 70 maxHeight = 20 71 maxNodeSize = int(unsafe.Sizeof(node{})) 72 linksSize = int(unsafe.Sizeof(links{})) 73 maxNodesSize = math.MaxUint32 74 ) 75 76 var ( 77 // ErrExists indicates that a duplicate record was inserted. This should never 78 // happen for normal usage of batchskl as every key should have a unique 79 // sequence number. 80 ErrExists = errors.New("record with this key already exists") 81 82 // ErrTooManyRecords is a sentinel error returned when the size of the raw 83 // nodes slice exceeds the maximum allowed size (currently 1 << 32 - 1). This 84 // corresponds to ~117 M skiplist entries. 85 ErrTooManyRecords = errors.New("too many records") 86 ) 87 88 type links struct { 89 next uint32 90 prev uint32 91 } 92 93 type node struct { 94 // The offset of the start of the record in the storage. 95 offset uint32 96 // The offset of the start and end of the key in storage. 97 keyStart uint32 98 keyEnd uint32 99 // A fixed 8-byte abbreviation of the key, used to avoid retrieval of the key 100 // during seek operations. The key retrieval can be expensive purely due to 101 // cache misses while the abbreviatedKey stored here will be in the same 102 // cache line as the key and the links making accessing and comparing against 103 // it almost free. 104 abbreviatedKey uint64 105 // Most nodes do not need to use the full height of the link tower, since the 106 // probability of each successive level decreases exponentially. Because 107 // these elements are never accessed, they do not need to be allocated. 108 // Therefore, when a node is allocated, its memory footprint is deliberately 109 // truncated to not include unneeded link elements. 110 links [maxHeight]links 111 } 112 113 // Skiplist is a fast, non-cocnurrent skiplist implementation that supports 114 // forward and backward iteration. See arenaskl.Skiplist for a concurrent 115 // skiplist. Keys and values are stored externally from the skiplist via the 116 // Storage interface. Deletion is not supported. Instead, higher-level code is 117 // expected to perform deletion via tombstones and needs to process those 118 // tombstones appropriately during retrieval operations. 119 type Skiplist struct { 120 storage *[]byte 121 cmp base.Compare 122 abbreviatedKey base.AbbreviatedKey 123 nodes []byte 124 head uint32 125 tail uint32 126 height uint32 // Current height: 1 <= height <= maxHeight 127 rand rand.PCGSource 128 } 129 130 var ( 131 probabilities [maxHeight]uint32 132 ) 133 134 func init() { 135 const pValue = 1 / math.E 136 137 // Precompute the skiplist probabilities so that only a single random number 138 // needs to be generated and so that the optimal pvalue can be used (inverse 139 // of Euler's number). 140 p := float64(1.0) 141 for i := 0; i < maxHeight; i++ { 142 probabilities[i] = uint32(float64(math.MaxUint32) * p) 143 p *= pValue 144 } 145 } 146 147 // NewSkiplist constructs and initializes a new, empty skiplist. 148 func NewSkiplist(storage *[]byte, cmp base.Compare, abbreviatedKey base.AbbreviatedKey) *Skiplist { 149 s := &Skiplist{} 150 s.Init(storage, cmp, abbreviatedKey) 151 return s 152 } 153 154 // Reset the fields in the skiplist for reuse. 155 func (s *Skiplist) Reset() { 156 *s = Skiplist{ 157 nodes: s.nodes[:0], 158 height: 1, 159 } 160 const batchMaxRetainedSize = 1 << 20 // 1 MB 161 if cap(s.nodes) > batchMaxRetainedSize { 162 s.nodes = nil 163 } 164 } 165 166 // Init the skiplist to empty and re-initialize. 167 func (s *Skiplist) Init(storage *[]byte, cmp base.Compare, abbreviatedKey base.AbbreviatedKey) { 168 *s = Skiplist{ 169 storage: storage, 170 cmp: cmp, 171 abbreviatedKey: abbreviatedKey, 172 nodes: s.nodes[:0], 173 height: 1, 174 } 175 s.rand.Seed(uint64(time.Now().UnixNano())) 176 177 const initBufSize = 256 178 if cap(s.nodes) < initBufSize { 179 s.nodes = make([]byte, 0, initBufSize) 180 } 181 182 // Allocate head and tail nodes. While allocating a new node can fail, in the 183 // context of initializing the skiplist we consider it unrecoverable. 184 var err error 185 s.head, err = s.newNode(maxHeight, 0, 0, 0, 0) 186 if err != nil { 187 panic(err) 188 } 189 s.tail, err = s.newNode(maxHeight, 0, 0, 0, 0) 190 if err != nil { 191 panic(err) 192 } 193 194 // Link all head/tail levels together. 195 headNode := s.node(s.head) 196 tailNode := s.node(s.tail) 197 for i := uint32(0); i < maxHeight; i++ { 198 headNode.links[i].next = s.tail 199 tailNode.links[i].prev = s.head 200 } 201 } 202 203 // Add adds a new key to the skiplist if it does not yet exist. If the record 204 // already exists, then Add returns ErrRecordExists. 205 func (s *Skiplist) Add(keyOffset uint32) error { 206 data := (*s.storage)[keyOffset+1:] 207 v, n := binary.Uvarint(data) 208 if n <= 0 { 209 return errors.Errorf("corrupted batch entry: %d", errors.Safe(keyOffset)) 210 } 211 data = data[n:] 212 if v > uint64(len(data)) { 213 return errors.Errorf("corrupted batch entry: %d", errors.Safe(keyOffset)) 214 } 215 keyStart := 1 + keyOffset + uint32(n) 216 keyEnd := keyStart + uint32(v) 217 key := data[:v] 218 abbreviatedKey := s.abbreviatedKey(key) 219 220 // spl holds the list of next and previous links for each level in the 221 // skiplist indicating where the new node will be inserted. 222 var spl [maxHeight]splice 223 224 // Fast-path for in-order insertion of keys: compare the new key against the 225 // last key. 226 prev := s.getPrev(s.tail, 0) 227 if prevNode := s.node(prev); prev == s.head || 228 abbreviatedKey > prevNode.abbreviatedKey || 229 (abbreviatedKey == prevNode.abbreviatedKey && 230 s.cmp(key, (*s.storage)[prevNode.keyStart:prevNode.keyEnd]) > 0) { 231 for level := uint32(0); level < s.height; level++ { 232 spl[level].prev = s.getPrev(s.tail, level) 233 spl[level].next = s.tail 234 } 235 } else { 236 s.findSplice(key, abbreviatedKey, &spl) 237 } 238 239 height := s.randomHeight() 240 // Increase s.height as necessary. 241 for ; s.height < height; s.height++ { 242 spl[s.height].next = s.tail 243 spl[s.height].prev = s.head 244 } 245 246 // We always insert from the base level and up. After you add a node in base 247 // level, we cannot create a node in the level above because it would have 248 // discovered the node in the base level. 249 nd, err := s.newNode(height, keyOffset, keyStart, keyEnd, abbreviatedKey) 250 if err != nil { 251 return err 252 } 253 newNode := s.node(nd) 254 for level := uint32(0); level < height; level++ { 255 next := spl[level].next 256 prev := spl[level].prev 257 newNode.links[level].next = next 258 newNode.links[level].prev = prev 259 s.node(next).links[level].prev = nd 260 s.node(prev).links[level].next = nd 261 } 262 263 return nil 264 } 265 266 // NewIter returns a new Iterator object. The lower and upper bound parameters 267 // control the range of keys the iterator will return. Specifying for nil for 268 // lower or upper bound disables the check for that boundary. Note that lower 269 // bound is not checked on {SeekGE,First} and upper bound is not check on 270 // {SeekLT,Last}. The user is expected to perform that check. Note that it is 271 // safe for an iterator to be copied by value. 272 func (s *Skiplist) NewIter(lower, upper []byte) Iterator { 273 return Iterator{list: s, lower: lower, upper: upper} 274 } 275 276 func (s *Skiplist) newNode( 277 height, 278 offset, keyStart, keyEnd uint32, abbreviatedKey uint64, 279 ) (uint32, error) { 280 if height < 1 || height > maxHeight { 281 panic("height cannot be less than one or greater than the max height") 282 } 283 284 unusedSize := (maxHeight - int(height)) * linksSize 285 nodeOffset, err := s.alloc(uint32(maxNodeSize - unusedSize)) 286 if err != nil { 287 return 0, err 288 } 289 nd := s.node(nodeOffset) 290 291 nd.offset = offset 292 nd.keyStart = keyStart 293 nd.keyEnd = keyEnd 294 nd.abbreviatedKey = abbreviatedKey 295 return nodeOffset, nil 296 } 297 298 func (s *Skiplist) alloc(size uint32) (uint32, error) { 299 offset := len(s.nodes) 300 301 // We only have a need for memory up to offset + size, but we never want 302 // to allocate a node whose tail points into unallocated memory. 303 minAllocSize := offset + maxNodeSize 304 if cap(s.nodes) < minAllocSize { 305 allocSize := cap(s.nodes) * 2 306 if allocSize < minAllocSize { 307 allocSize = minAllocSize 308 } 309 // Cap the allocation at the max allowed size to avoid wasted capacity. 310 if allocSize > maxNodesSize { 311 // The new record may still not fit within the allocation, in which case 312 // we return early with an error. This avoids the panic below when we 313 // resize the slice. It also avoids the allocation and copy. 314 if uint64(offset)+uint64(size) > maxNodesSize { 315 return 0, errors.Wrapf(ErrTooManyRecords, 316 "alloc of new record (size=%d) would overflow uint32 (current size=%d)", 317 uint64(offset)+uint64(size), offset, 318 ) 319 } 320 allocSize = maxNodesSize 321 } 322 tmp := make([]byte, len(s.nodes), allocSize) 323 copy(tmp, s.nodes) 324 s.nodes = tmp 325 } 326 327 newSize := uint32(offset) + size 328 s.nodes = s.nodes[:newSize] 329 return uint32(offset), nil 330 } 331 332 func (s *Skiplist) node(offset uint32) *node { 333 return (*node)(unsafe.Pointer(&s.nodes[offset])) 334 } 335 336 func (s *Skiplist) randomHeight() uint32 { 337 rnd := uint32(s.rand.Uint64()) 338 h := uint32(1) 339 for h < maxHeight && rnd <= probabilities[h] { 340 h++ 341 } 342 return h 343 } 344 345 func (s *Skiplist) findSplice(key []byte, abbreviatedKey uint64, spl *[maxHeight]splice) { 346 prev := s.head 347 348 for level := s.height - 1; ; level-- { 349 // The code in this loop is the same as findSpliceForLevel(). For some 350 // reason, calling findSpliceForLevel() here is much much slower than the 351 // inlined code below. The excess time is also caught up in the final 352 // return statement which makes little sense. Revisit when in go1.14 or 353 // later if inlining improves. 354 355 next := s.getNext(prev, level) 356 for next != s.tail { 357 // Assume prev.key < key. 358 nextNode := s.node(next) 359 nextAbbreviatedKey := nextNode.abbreviatedKey 360 if abbreviatedKey < nextAbbreviatedKey { 361 // We are done for this level, since prev.key < key < next.key. 362 break 363 } 364 if abbreviatedKey == nextAbbreviatedKey { 365 if s.cmp(key, (*s.storage)[nextNode.keyStart:nextNode.keyEnd]) <= 0 { 366 // We are done for this level, since prev.key < key <= next.key. 367 break 368 } 369 } 370 371 // Keep moving right on this level. 372 prev = next 373 next = nextNode.links[level].next 374 } 375 376 spl[level].prev = prev 377 spl[level].next = next 378 if level == 0 { 379 break 380 } 381 } 382 } 383 384 func (s *Skiplist) findSpliceForLevel( 385 key []byte, abbreviatedKey uint64, level, start uint32, 386 ) (prev, next uint32) { 387 prev = start 388 next = s.getNext(prev, level) 389 390 for next != s.tail { 391 // Assume prev.key < key. 392 nextNode := s.node(next) 393 nextAbbreviatedKey := nextNode.abbreviatedKey 394 if abbreviatedKey < nextAbbreviatedKey { 395 // We are done for this level, since prev.key < key < next.key. 396 break 397 } 398 if abbreviatedKey == nextAbbreviatedKey { 399 if s.cmp(key, (*s.storage)[nextNode.keyStart:nextNode.keyEnd]) <= 0 { 400 // We are done for this level, since prev.key < key < next.key. 401 break 402 } 403 } 404 405 // Keep moving right on this level. 406 prev = next 407 next = nextNode.links[level].next 408 } 409 410 return 411 } 412 413 func (s *Skiplist) getKey(nd uint32) base.InternalKey { 414 n := s.node(nd) 415 kind := base.InternalKeyKind((*s.storage)[n.offset]) 416 key := (*s.storage)[n.keyStart:n.keyEnd] 417 return base.MakeInternalKey(key, uint64(n.offset)|base.InternalKeySeqNumBatch, kind) 418 } 419 420 func (s *Skiplist) getNext(nd, h uint32) uint32 { 421 return s.node(nd).links[h].next 422 } 423 424 func (s *Skiplist) getPrev(nd, h uint32) uint32 { 425 return s.node(nd).links[h].prev 426 } 427 428 func (s *Skiplist) debug() string { 429 var buf bytes.Buffer 430 for level := uint32(0); level < s.height; level++ { 431 var count int 432 for nd := s.head; nd != s.tail; nd = s.getNext(nd, level) { 433 count++ 434 } 435 fmt.Fprintf(&buf, "%d: %d\n", level, count) 436 } 437 return buf.String() 438 } 439 440 // Silence unused warning. 441 var _ = (*Skiplist).debug