github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/internal/batchskl/skl.go

/*
 * Copyright 2017 Dgraph Labs, Inc. and Contributors
 * Modifications copyright (C) 2017 Andy Kimball and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License")
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
Adapted from RocksDB inline skiplist.

Key differences:
- No optimization for sequential inserts (no "prev").
- No custom comparator.
- Support overwrites. This requires care when we see the same key when inserting.
  For RocksDB or LevelDB, overwrites are implemented as a newer sequence number in the key, so
  there is no need for values. We don't intend to support versioning. In-place updates of values
  would be more efficient.
- We discard all non-concurrent code.
- We do not support Splices. This simplifies the code a lot.
- No AllocateNode or other pointer arithmetic.
- We combine the findLessThan, findGreaterOrEqual, etc into one function.
*/

/*
Further adapted from Badger: https://github.com/dgraph-io/badger.

Key differences:
- Support for previous pointers - doubly linked lists. Note that it's up to higher
  level code to deal with the intermediate state that occurs during insertion,
  where node A is linked to node B, but node B is not yet linked back to node A.
- Iterator includes mutator functions.
*/

/*
Further adapted from arenaskl: https://github.com/andy-kimball/arenaskl

Key differences:
- Removed support for deletion.
- Removed support for concurrency.
- External storage of keys.
- Node storage grows to an arbitrary size.
*/

package batchskl // import "github.com/cockroachdb/pebble/internal/batchskl"

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"math"
	"time"
	"unsafe"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/constants"
	"golang.org/x/exp/rand"
)

const (
	maxHeight    = 20
	maxNodeSize  = uint64(unsafe.Sizeof(node{}))
	linksSize    = uint64(unsafe.Sizeof(links{}))
	maxNodesSize = constants.MaxUint32OrInt
)

var (
	// ErrExists indicates that a duplicate record was inserted. This should never
	// happen for normal usage of batchskl as every key should have a unique
	// sequence number.
	ErrExists = errors.New("record with this key already exists")

	// ErrTooManyRecords is a sentinel error returned when the size of the raw
	// nodes slice exceeds the maximum allowed size (currently 1<<32 - 1). This
	// corresponds to ~117 M skiplist entries.
	ErrTooManyRecords = errors.New("too many records")
)

type links struct {
	next uint32
	prev uint32
}

type node struct {
	// The offset of the start of the record in the storage.
	offset uint32
	// The offsets of the start and end of the key in storage.
	keyStart uint32
	keyEnd   uint32
	// A fixed 8-byte abbreviation of the key, used to avoid retrieval of the key
	// during seek operations. The key retrieval can be expensive purely due to
	// cache misses while the abbreviatedKey stored here will be in the same
	// cache line as the key and the links, making accessing and comparing against
	// it almost free.
	abbreviatedKey uint64
	// Most nodes do not need to use the full height of the link tower, since the
	// probability of each successive level decreases exponentially. Because
	// these elements are never accessed, they do not need to be allocated.
	// Therefore, when a node is allocated, its memory footprint is deliberately
	// truncated to not include unneeded link elements.
	links [maxHeight]links
}
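// An illustrative sketch (hypothetical, not part of this file) of an
// abbreviated-key function consistent with bytes.Compare: the first 8 bytes
// of the key are packed big-endian into a uint64, zero-padded for short keys.
// The invariant the skiplist relies on is that abbrev(a) < abbrev(b) implies
// a < b; when abbreviations are equal, the seek code below falls back to a
// full key comparison.
func abbreviatedBytewiseKey(key []byte) uint64 {
	var v uint64
	for i := 0; i < 8; i++ {
		v <<= 8
		if i < len(key) {
			v |= uint64(key[i])
		}
	}
	return v
}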
// Skiplist is a fast, non-concurrent skiplist implementation that supports
// forward and backward iteration. See arenaskl.Skiplist for a concurrent
// skiplist. Keys and values are stored externally from the skiplist via the
// Storage interface. Deletion is not supported. Instead, higher-level code is
// expected to perform deletion via tombstones and needs to process those
// tombstones appropriately during retrieval operations.
type Skiplist struct {
	storage        *[]byte
	cmp            base.Compare
	abbreviatedKey base.AbbreviatedKey
	nodes          []byte
	head           uint32
	tail           uint32
	height         uint32 // Current height: 1 <= height <= maxHeight
	rand           rand.PCGSource
}

var (
	probabilities [maxHeight]uint32
)

func init() {
	const pValue = 1 / math.E

	// Precompute the skiplist probabilities so that only a single random number
	// needs to be generated and so that the optimal p-value can be used (the
	// inverse of Euler's number).
	p := float64(1.0)
	for i := 0; i < maxHeight; i++ {
		probabilities[i] = uint32(float64(math.MaxUint32) * p)
		p *= pValue
	}
}

// NewSkiplist constructs and initializes a new, empty skiplist.
func NewSkiplist(storage *[]byte, cmp base.Compare, abbreviatedKey base.AbbreviatedKey) *Skiplist {
	s := &Skiplist{}
	s.Init(storage, cmp, abbreviatedKey)
	return s
}

// Reset the fields in the skiplist for reuse.
func (s *Skiplist) Reset() {
	*s = Skiplist{
		nodes:  s.nodes[:0],
		height: 1,
	}
	const batchMaxRetainedSize = 1 << 20 // 1 MB
	if cap(s.nodes) > batchMaxRetainedSize {
		s.nodes = nil
	}
}

// Init the skiplist to empty and re-initialize.
func (s *Skiplist) Init(storage *[]byte, cmp base.Compare, abbreviatedKey base.AbbreviatedKey) {
	*s = Skiplist{
		storage:        storage,
		cmp:            cmp,
		abbreviatedKey: abbreviatedKey,
		nodes:          s.nodes[:0],
		height:         1,
	}
	s.rand.Seed(uint64(time.Now().UnixNano()))

	const initBufSize = 256
	if cap(s.nodes) < initBufSize {
		s.nodes = make([]byte, 0, initBufSize)
	}

	// Allocate head and tail nodes. While allocating a new node can fail, in the
	// context of initializing the skiplist we consider it unrecoverable.
	var err error
	s.head, err = s.newNode(maxHeight, 0, 0, 0, 0)
	if err != nil {
		panic(err)
	}
	s.tail, err = s.newNode(maxHeight, 0, 0, 0, 0)
	if err != nil {
		panic(err)
	}

	// Link all head/tail levels together.
	headNode := s.node(s.head)
	tailNode := s.node(s.tail)
	for i := uint32(0); i < maxHeight; i++ {
		headNode.links[i].next = s.tail
		tailNode.links[i].prev = s.head
	}
}
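// An illustrative reuse pattern (hypothetical helper, not an API of this
// package): code that recycles a batch can call Reset to drop all entries
// while retaining up to 1 MB of the node buffer, and later rebind the
// skiplist to fresh external storage via Init.
func resetAndRebind(s *Skiplist, storage *[]byte, cmp base.Compare, abbrev base.AbbreviatedKey) {
	// Reset retains s.nodes (capacity permitting) so the subsequent Init can
	// reuse the buffer instead of allocating a new one.
	s.Reset()
	s.Init(storage, cmp, abbrev)
}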
// Add adds a new key to the skiplist if it does not yet exist. If the record
// already exists, then Add returns ErrExists.
func (s *Skiplist) Add(keyOffset uint32) error {
	data := (*s.storage)[keyOffset+1:]
	v, n := binary.Uvarint(data)
	if n <= 0 {
		return errors.Errorf("corrupted batch entry: %d", errors.Safe(keyOffset))
	}
	data = data[n:]
	if v > uint64(len(data)) {
		return errors.Errorf("corrupted batch entry: %d", errors.Safe(keyOffset))
	}
	keyStart := 1 + keyOffset + uint32(n)
	keyEnd := keyStart + uint32(v)
	key := data[:v]
	abbreviatedKey := s.abbreviatedKey(key)

	// spl holds the list of next and previous links for each level in the
	// skiplist indicating where the new node will be inserted.
	var spl [maxHeight]splice

	// Fast-path for in-order insertion of keys: compare the new key against the
	// last key.
	prev := s.getPrev(s.tail, 0)
	if prevNode := s.node(prev); prev == s.head ||
		abbreviatedKey > prevNode.abbreviatedKey ||
		(abbreviatedKey == prevNode.abbreviatedKey &&
			s.cmp(key, (*s.storage)[prevNode.keyStart:prevNode.keyEnd]) > 0) {
		for level := uint32(0); level < s.height; level++ {
			spl[level].prev = s.getPrev(s.tail, level)
			spl[level].next = s.tail
		}
	} else {
		s.findSplice(key, abbreviatedKey, &spl)
	}

	height := s.randomHeight()
	// Increase s.height as necessary.
	for ; s.height < height; s.height++ {
		spl[s.height].next = s.tail
		spl[s.height].prev = s.head
	}

	// We always insert from the base level and up. After we add a node at the
	// base level, we cannot create a node in the level above because it would
	// have discovered the node in the base level.
	nd, err := s.newNode(height, keyOffset, keyStart, keyEnd, abbreviatedKey)
	if err != nil {
		return err
	}
	newNode := s.node(nd)
	for level := uint32(0); level < height; level++ {
		next := spl[level].next
		prev := spl[level].prev
		newNode.links[level].next = next
		newNode.links[level].prev = prev
		s.node(next).links[level].prev = nd
		s.node(prev).links[level].next = nd
	}

	return nil
}

// NewIter returns a new Iterator object. The lower and upper bound parameters
// control the range of keys the iterator will return. Specifying nil for the
// lower or upper bound disables the check for that boundary. Note that the
// lower bound is not checked on {SeekGE,First} and the upper bound is not
// checked on {SeekLT,Last}. The user is expected to perform that check. Note
// that it is safe for an iterator to be copied by value.
func (s *Skiplist) NewIter(lower, upper []byte) Iterator {
	return Iterator{list: s, lower: lower, upper: upper}
}
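// An illustrative end-to-end sketch (hypothetical, not part of this file).
// The entry encoding below (a kind byte, a uvarint key length, then the key
// bytes) mirrors the layout that Add decodes, and base.DefaultComparer
// supplies Compare and AbbreviatedKey implementations. It assumes Iterator's
// First/Next return a *base.InternalKey that is nil once the iterator is
// exhausted.
func exampleUsage() {
	var storage []byte
	appendEntry := func(key []byte) uint32 {
		offset := uint32(len(storage))
		storage = append(storage, byte(base.InternalKeyKindSet))
		var tmp [binary.MaxVarintLen64]byte
		n := binary.PutUvarint(tmp[:], uint64(len(key)))
		storage = append(storage, tmp[:n]...)
		storage = append(storage, key...)
		return offset
	}

	s := NewSkiplist(&storage, base.DefaultComparer.Compare, base.DefaultComparer.AbbreviatedKey)
	// Insert out of order to exercise the splice search rather than the
	// in-order fast path.
	for _, k := range []string{"banana", "apple", "cherry"} {
		if err := s.Add(appendEntry([]byte(k))); err != nil {
			panic(err)
		}
	}

	// Iterate in key order: apple, banana, cherry.
	it := s.NewIter(nil, nil)
	for k := it.First(); k != nil; k = it.Next() {
		fmt.Println(string(k.UserKey))
	}
}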
func (s *Skiplist) newNode(
	height, offset, keyStart, keyEnd uint32, abbreviatedKey uint64,
) (uint32, error) {
	if height < 1 || height > maxHeight {
		panic("height cannot be less than one or greater than the max height")
	}

	unusedSize := uint64(maxHeight-int(height)) * linksSize
	nodeOffset, err := s.alloc(uint32(maxNodeSize - unusedSize))
	if err != nil {
		return 0, err
	}
	nd := s.node(nodeOffset)

	nd.offset = offset
	nd.keyStart = keyStart
	nd.keyEnd = keyEnd
	nd.abbreviatedKey = abbreviatedKey
	return nodeOffset, nil
}

func (s *Skiplist) alloc(size uint32) (uint32, error) {
	offset := uint64(len(s.nodes))

	// We only have a need for memory up to offset + size, but we never want
	// to allocate a node whose tail points into unallocated memory.
	minAllocSize := offset + maxNodeSize
	if uint64(cap(s.nodes)) < minAllocSize {
		allocSize := uint64(cap(s.nodes)) * 2
		if allocSize < minAllocSize {
			allocSize = minAllocSize
		}
		// Cap the allocation at the max allowed size to avoid wasted capacity.
		if allocSize > maxNodesSize {
			// The new record may still not fit within the allocation, in which case
			// we return early with an error. This avoids the panic below when we
			// resize the slice. It also avoids the allocation and copy.
			if offset+uint64(size) > maxNodesSize {
				return 0, errors.Wrapf(ErrTooManyRecords,
					"alloc of new record (size=%d) would overflow uint32 (current size=%d)",
					offset+uint64(size), offset,
				)
			}
			allocSize = maxNodesSize
		}
		tmp := make([]byte, len(s.nodes), allocSize)
		copy(tmp, s.nodes)
		s.nodes = tmp
	}

	newSize := uint32(offset) + size
	s.nodes = s.nodes[:newSize]
	return uint32(offset), nil
}

func (s *Skiplist) node(offset uint32) *node {
	return (*node)(unsafe.Pointer(&s.nodes[offset]))
}

func (s *Skiplist) randomHeight() uint32 {
	rnd := uint32(s.rand.Uint64())
	h := uint32(1)
	for h < maxHeight && rnd <= probabilities[h] {
		h++
	}
	return h
}
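// An illustrative check (hypothetical, not part of this file) of the
// distribution randomHeight produces: with p = 1/e, roughly a 1/e fraction
// of nodes should reach each successive level, so the per-height counts
// printed below should fall off by a factor of about e.
func exampleHeightDistribution() {
	var s Skiplist
	s.rand.Seed(1)
	counts := make([]int, maxHeight+1)
	for i := 0; i < 1000000; i++ {
		counts[s.randomHeight()]++
	}
	for h := 1; h <= maxHeight; h++ {
		fmt.Printf("height %2d: %d\n", h, counts[h])
	}
}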
func (s *Skiplist) findSplice(key []byte, abbreviatedKey uint64, spl *[maxHeight]splice) {
	prev := s.head

	for level := s.height - 1; ; level-- {
		// The code in this loop is the same as findSpliceForLevel(). For some
		// reason, calling findSpliceForLevel() here is much, much slower than the
		// inlined code below. The excess time is also caught up in the final
		// return statement, which makes little sense. Revisit in go1.14 or later
		// if inlining improves.

		next := s.getNext(prev, level)
		for next != s.tail {
			// Assume prev.key < key.
			nextNode := s.node(next)
			nextAbbreviatedKey := nextNode.abbreviatedKey
			if abbreviatedKey < nextAbbreviatedKey {
				// We are done for this level, since prev.key < key < next.key.
				break
			}
			if abbreviatedKey == nextAbbreviatedKey {
				if s.cmp(key, (*s.storage)[nextNode.keyStart:nextNode.keyEnd]) <= 0 {
					// We are done for this level, since prev.key < key <= next.key.
					break
				}
			}

			// Keep moving right on this level.
			prev = next
			next = nextNode.links[level].next
		}

		spl[level].prev = prev
		spl[level].next = next
		if level == 0 {
			break
		}
	}
}

func (s *Skiplist) findSpliceForLevel(
	key []byte, abbreviatedKey uint64, level, start uint32,
) (prev, next uint32) {
	prev = start
	next = s.getNext(prev, level)

	for next != s.tail {
		// Assume prev.key < key.
		nextNode := s.node(next)
		nextAbbreviatedKey := nextNode.abbreviatedKey
		if abbreviatedKey < nextAbbreviatedKey {
			// We are done for this level, since prev.key < key < next.key.
			break
		}
		if abbreviatedKey == nextAbbreviatedKey {
			if s.cmp(key, (*s.storage)[nextNode.keyStart:nextNode.keyEnd]) <= 0 {
				// We are done for this level, since prev.key < key <= next.key.
				break
			}
		}

		// Keep moving right on this level.
		prev = next
		next = nextNode.links[level].next
	}

	return
}

func (s *Skiplist) getKey(nd uint32) base.InternalKey {
	n := s.node(nd)
	kind := base.InternalKeyKind((*s.storage)[n.offset])
	key := (*s.storage)[n.keyStart:n.keyEnd]
	return base.MakeInternalKey(key, uint64(n.offset)|base.InternalKeySeqNumBatch, kind)
}

func (s *Skiplist) getNext(nd, h uint32) uint32 {
	return s.node(nd).links[h].next
}

func (s *Skiplist) getPrev(nd, h uint32) uint32 {
	return s.node(nd).links[h].prev
}

func (s *Skiplist) debug() string {
	var buf bytes.Buffer
	for level := uint32(0); level < s.height; level++ {
		var count int
		for nd := s.head; nd != s.tail; nd = s.getNext(nd, level) {
			count++
		}
		fmt.Fprintf(&buf, "%d: %d\n", level, count)
	}
	return buf.String()
}

// Silence unused warning.
var _ = (*Skiplist).debug