github.com/dgraph-io/ristretto@v0.1.2-0.20240116140435-c67e07994f91/z/btree.go (about) 1 /* 2 * Copyright 2020 Dgraph Labs, Inc. and Contributors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package z 18 19 import ( 20 "fmt" 21 "math" 22 "os" 23 "reflect" 24 "strings" 25 "unsafe" 26 27 "github.com/dgraph-io/ristretto/z/simd" 28 ) 29 30 var ( 31 pageSize = os.Getpagesize() 32 maxKeys = (pageSize / 16) - 1 33 //nolint:unused 34 oneThird = int(float64(maxKeys) / 3) 35 ) 36 37 const ( 38 absoluteMax = uint64(math.MaxUint64 - 1) 39 minSize = 1 << 20 40 ) 41 42 // Tree represents the structure for custom mmaped B+ tree. 43 // It supports keys in range [1, math.MaxUint64-1] and values [1, math.Uint64]. 44 type Tree struct { 45 buffer *Buffer 46 data []byte 47 nextPage uint64 48 freePage uint64 49 stats TreeStats 50 } 51 52 func (t *Tree) initRootNode() { 53 // This is the root node. 54 t.newNode(0) 55 // This acts as the rightmost pointer (all the keys are <= this key). 56 t.Set(absoluteMax, 0) 57 } 58 59 // NewTree returns an in-memory B+ tree. 60 func NewTree(tag string) *Tree { 61 const defaultTag = "tree" 62 if tag == "" { 63 tag = defaultTag 64 } 65 t := &Tree{buffer: NewBuffer(minSize, tag)} 66 t.Reset() 67 return t 68 } 69 70 // NewTree returns a persistent on-disk B+ tree. 71 func NewTreePersistent(path string) (*Tree, error) { 72 t := &Tree{} 73 var err error 74 75 // Open the buffer from disk and set it to the maximum allocated size. 76 t.buffer, err = NewBufferPersistent(path, minSize) 77 if err != nil { 78 return nil, err 79 } 80 t.buffer.offset = uint64(len(t.buffer.buf)) 81 t.data = t.buffer.Bytes() 82 83 // pageID can never be 0 if the tree has been initialized. 84 root := t.node(1) 85 isInitialized := root.pageID() != 0 86 87 if !isInitialized { 88 t.nextPage = 1 89 t.freePage = 0 90 t.initRootNode() 91 } else { 92 t.reinit() 93 } 94 95 return t, nil 96 } 97 98 // reinit sets the internal variables of a Tree, which are normally stored 99 // in-memory, but are lost when loading from disk. 100 func (t *Tree) reinit() { 101 // Calculate t.nextPage by finding the first node whose pageID is not set. 102 t.nextPage = 1 103 for int(t.nextPage)*pageSize < len(t.data) { 104 n := t.node(t.nextPage) 105 if n.pageID() == 0 { 106 break 107 } 108 t.nextPage++ 109 } 110 maxPageId := t.nextPage - 1 111 112 // Calculate t.freePage by finding the page to which no other page points. 113 // This would be the head of the page linked list. 114 // tailPages[i] is true if pageId i+1 is not the head of the list. 115 tailPages := make([]bool, maxPageId) 116 // Mark all pages containing nodes as tail pages. 117 t.Iterate(func(n node) { 118 i := n.pageID() - 1 119 tailPages[i] = true 120 // If this is a leaf node, increment the stats. 121 if n.isLeaf() { 122 t.stats.NumLeafKeys += n.numKeys() 123 } 124 }) 125 // pointedPages is a list of page IDs that the tail pages point to. 126 pointedPages := make([]uint64, 0) 127 for i, isTail := range tailPages { 128 if !isTail { 129 pageId := uint64(i) + 1 130 // Skip if nextPageId = 0, as that is equivalent to null page. 131 if nextPageId := t.node(pageId).uint64(0); nextPageId != 0 { 132 pointedPages = append(pointedPages, nextPageId) 133 } 134 t.stats.NumPagesFree++ 135 } 136 } 137 138 // Mark all pages being pointed to as tail pages. 139 for _, pageId := range pointedPages { 140 i := pageId - 1 141 tailPages[i] = true 142 } 143 // There should only be one head page left. 144 for i, isTail := range tailPages { 145 if !isTail { 146 pageId := uint64(i) + 1 147 t.freePage = pageId 148 break 149 } 150 } 151 } 152 153 // Reset resets the tree and truncates it to maxSz. 154 func (t *Tree) Reset() { 155 // Tree relies on uninitialized data being zeroed out, so we need to Memclr 156 // the data before using it again. 157 Memclr(t.buffer.buf) 158 t.buffer.Reset() 159 t.buffer.AllocateOffset(minSize) 160 t.data = t.buffer.Bytes() 161 t.stats = TreeStats{} 162 t.nextPage = 1 163 t.freePage = 0 164 t.initRootNode() 165 } 166 167 // Close releases the memory used by the tree. 168 func (t *Tree) Close() error { 169 if t == nil { 170 return nil 171 } 172 return t.buffer.Release() 173 } 174 175 type TreeStats struct { 176 Allocated int // Derived. 177 Bytes int // Derived. 178 NumLeafKeys int // Calculated. 179 NumPages int // Derived. 180 NumPagesFree int // Calculated. 181 Occupancy float64 // Derived. 182 PageSize int // Derived. 183 } 184 185 // Stats returns stats about the tree. 186 func (t *Tree) Stats() TreeStats { 187 numPages := int(t.nextPage - 1) 188 out := TreeStats{ 189 Bytes: numPages * pageSize, 190 Allocated: len(t.data), 191 NumLeafKeys: t.stats.NumLeafKeys, 192 NumPages: numPages, 193 NumPagesFree: t.stats.NumPagesFree, 194 PageSize: pageSize, 195 } 196 out.Occupancy = 100.0 * float64(out.NumLeafKeys) / float64(maxKeys*numPages) 197 return out 198 } 199 200 // BytesToUint64Slice converts a byte slice to a uint64 slice. 201 func BytesToUint64Slice(b []byte) []uint64 { 202 if len(b) == 0 { 203 return nil 204 } 205 var u64s []uint64 206 hdr := (*reflect.SliceHeader)(unsafe.Pointer(&u64s)) 207 hdr.Len = len(b) / 8 208 hdr.Cap = hdr.Len 209 hdr.Data = uintptr(unsafe.Pointer(&b[0])) 210 return u64s 211 } 212 213 func (t *Tree) newNode(bit uint64) node { 214 var pageId uint64 215 if t.freePage > 0 { 216 pageId = t.freePage 217 t.stats.NumPagesFree-- 218 } else { 219 pageId = t.nextPage 220 t.nextPage++ 221 offset := int(pageId) * pageSize 222 reqSize := offset + pageSize 223 if reqSize > len(t.data) { 224 t.buffer.AllocateOffset(reqSize - len(t.data)) 225 t.data = t.buffer.Bytes() 226 } 227 } 228 n := t.node(pageId) 229 if t.freePage > 0 { 230 t.freePage = n.uint64(0) 231 } 232 zeroOut(n) 233 n.setBit(bit) 234 n.setAt(keyOffset(maxKeys), pageId) 235 return n 236 } 237 238 func getNode(data []byte) node { 239 return node(BytesToUint64Slice(data)) 240 } 241 242 func zeroOut(data []uint64) { 243 for i := 0; i < len(data); i++ { 244 data[i] = 0 245 } 246 } 247 248 func (t *Tree) node(pid uint64) node { 249 // page does not exist 250 if pid == 0 { 251 return nil 252 } 253 start := pageSize * int(pid) 254 return getNode(t.data[start : start+pageSize]) 255 } 256 257 // Set sets the key-value pair in the tree. 258 func (t *Tree) Set(k, v uint64) { 259 if k == math.MaxUint64 || k == 0 { 260 panic("Error setting zero or MaxUint64") 261 } 262 root := t.set(1, k, v) 263 if root.isFull() { 264 right := t.split(1) 265 left := t.newNode(root.bits()) 266 // Re-read the root as the underlying buffer for tree might have changed during split. 267 root = t.node(1) 268 copy(left[:keyOffset(maxKeys)], root) 269 left.setNumKeys(root.numKeys()) 270 271 // reset the root node. 272 zeroOut(root[:keyOffset(maxKeys)]) 273 root.setNumKeys(0) 274 275 // set the pointers for left and right child in the root node. 276 root.set(left.maxKey(), left.pageID()) 277 root.set(right.maxKey(), right.pageID()) 278 } 279 } 280 281 // For internal nodes, they contain <key, ptr>. 282 // where all entries <= key are stored in the corresponding ptr. 283 func (t *Tree) set(pid, k, v uint64) node { 284 n := t.node(pid) 285 if n.isLeaf() { 286 t.stats.NumLeafKeys += n.set(k, v) 287 return n 288 } 289 290 // This is an internal node. 291 idx := n.search(k) 292 if idx >= maxKeys { 293 panic("search returned index >= maxKeys") 294 } 295 // If no key at idx. 296 if n.key(idx) == 0 { 297 n.setAt(keyOffset(idx), k) 298 n.setNumKeys(n.numKeys() + 1) 299 } 300 child := t.node(n.val(idx)) 301 if child == nil { 302 child = t.newNode(bitLeaf) 303 n = t.node(pid) 304 n.setAt(valOffset(idx), child.pageID()) 305 } 306 child = t.set(child.pageID(), k, v) 307 // Re-read n as the underlying buffer for tree might have changed during set. 308 n = t.node(pid) 309 if child.isFull() { 310 // Just consider the left sibling for simplicity. 311 // if t.shareWithSibling(n, idx) { 312 // return n 313 // } 314 315 nn := t.split(child.pageID()) 316 // Re-read n and child as the underlying buffer for tree might have changed during split. 317 n = t.node(pid) 318 child = t.node(n.uint64(valOffset(idx))) 319 // Set child pointers in the node n. 320 // Note that key for right node (nn) already exist in node n, but the 321 // pointer is updated. 322 n.set(child.maxKey(), child.pageID()) 323 n.set(nn.maxKey(), nn.pageID()) 324 } 325 return n 326 } 327 328 // Get looks for key and returns the corresponding value. 329 // If key is not found, 0 is returned. 330 func (t *Tree) Get(k uint64) uint64 { 331 if k == math.MaxUint64 || k == 0 { 332 panic("Does not support getting MaxUint64/Zero") 333 } 334 root := t.node(1) 335 return t.get(root, k) 336 } 337 338 func (t *Tree) get(n node, k uint64) uint64 { 339 if n.isLeaf() { 340 return n.get(k) 341 } 342 // This is internal node 343 idx := n.search(k) 344 if idx == n.numKeys() || n.key(idx) == 0 { 345 return 0 346 } 347 child := t.node(n.uint64(valOffset(idx))) 348 assert(child != nil) 349 return t.get(child, k) 350 } 351 352 // DeleteBelow deletes all keys with value under ts. 353 func (t *Tree) DeleteBelow(ts uint64) { 354 root := t.node(1) 355 t.stats.NumLeafKeys = 0 356 t.compact(root, ts) 357 assert(root.numKeys() >= 1) 358 } 359 360 func (t *Tree) compact(n node, ts uint64) int { 361 if n.isLeaf() { 362 numKeys := n.compact(ts) 363 t.stats.NumLeafKeys += n.numKeys() 364 return numKeys 365 } 366 // Not leaf. 367 N := n.numKeys() 368 for i := 0; i < N; i++ { 369 assert(n.key(i) > 0) 370 childID := n.uint64(valOffset(i)) 371 child := t.node(childID) 372 if rem := t.compact(child, ts); rem == 0 && i < N-1 { 373 // If no valid key is remaining we can drop this child. However, don't do that if this 374 // is the max key. 375 t.stats.NumLeafKeys -= child.numKeys() 376 child.setAt(0, t.freePage) 377 t.freePage = childID 378 n.setAt(valOffset(i), 0) 379 t.stats.NumPagesFree++ 380 } 381 } 382 // We use ts=1 here because we want to delete all the keys whose value is 0, which means they no 383 // longer have a valid page for that key. 384 return n.compact(1) 385 } 386 387 func (t *Tree) iterate(n node, fn func(node)) { 388 fn(n) 389 if n.isLeaf() { 390 return 391 } 392 // Explore children. 393 for i := 0; i < maxKeys; i++ { 394 if n.key(i) == 0 { 395 return 396 } 397 childID := n.uint64(valOffset(i)) 398 assert(childID > 0) 399 400 child := t.node(childID) 401 t.iterate(child, fn) 402 } 403 } 404 405 // Iterate iterates over the tree and executes the fn on each node. 406 func (t *Tree) Iterate(fn func(node)) { 407 root := t.node(1) 408 t.iterate(root, fn) 409 } 410 411 // IterateKV iterates through all keys and values in the tree. 412 // If newVal is non-zero, it will be set in the tree. 413 func (t *Tree) IterateKV(f func(key, val uint64) (newVal uint64)) { 414 t.Iterate(func(n node) { 415 // Only leaf nodes contain keys. 416 if !n.isLeaf() { 417 return 418 } 419 420 for i := 0; i < n.numKeys(); i++ { 421 key := n.key(i) 422 val := n.val(i) 423 424 // A zero value here means that this is a bogus entry. 425 if val == 0 { 426 continue 427 } 428 429 newVal := f(key, val) 430 if newVal != 0 { 431 n.setAt(valOffset(i), newVal) 432 } 433 } 434 }) 435 } 436 437 func (t *Tree) print(n node, parentID uint64) { 438 n.print(parentID) 439 if n.isLeaf() { 440 return 441 } 442 pid := n.pageID() 443 for i := 0; i < maxKeys; i++ { 444 if n.key(i) == 0 { 445 return 446 } 447 childID := n.uint64(valOffset(i)) 448 child := t.node(childID) 449 t.print(child, pid) 450 } 451 } 452 453 // Print iterates over the tree and prints all valid KVs. 454 func (t *Tree) Print() { 455 root := t.node(1) 456 t.print(root, 0) 457 } 458 459 // Splits the node into two. It moves right half of the keys from the original node to a newly 460 // created right node. It returns the right node. 461 func (t *Tree) split(pid uint64) node { 462 n := t.node(pid) 463 if !n.isFull() { 464 panic("This should be called only when n is full") 465 } 466 467 // Create a new node nn, copy over half the keys from n, and set the parent to n's parent. 468 nn := t.newNode(n.bits()) 469 // Re-read n as the underlying buffer for tree might have changed during newNode. 470 n = t.node(pid) 471 rightHalf := n[keyOffset(maxKeys/2):keyOffset(maxKeys)] 472 copy(nn, rightHalf) 473 nn.setNumKeys(maxKeys - maxKeys/2) 474 475 // Remove entries from node n. 476 zeroOut(rightHalf) 477 n.setNumKeys(maxKeys / 2) 478 return nn 479 } 480 481 // shareWithSiblingXXX is unused for now. The idea is to move some keys to 482 // sibling when a node is full. But, I don't see any special benefits in our 483 // access pattern. It doesn't result in better occupancy ratios. 484 // 485 //nolint:unused 486 func (t *Tree) shareWithSiblingXXX(n node, idx int) bool { 487 if idx == 0 { 488 return false 489 } 490 left := t.node(n.val(idx - 1)) 491 ns := left.numKeys() 492 if ns >= maxKeys/2 { 493 // Sibling is already getting full. 494 return false 495 } 496 497 right := t.node(n.val(idx)) 498 // Copy over keys from right child to left child. 499 copied := copy(left[keyOffset(ns):], right[:keyOffset(oneThird)]) 500 copied /= 2 // Considering that key-val constitute one key. 501 left.setNumKeys(ns + copied) 502 503 // Update the max key in parent node n for the left sibling. 504 n.setAt(keyOffset(idx-1), left.maxKey()) 505 506 // Now move keys to left for the right sibling. 507 until := copy(right, right[keyOffset(oneThird):keyOffset(maxKeys)]) 508 right.setNumKeys(until / 2) 509 zeroOut(right[until:keyOffset(maxKeys)]) 510 return true 511 } 512 513 // Each node in the node is of size pageSize. Two kinds of nodes. Leaf nodes and internal nodes. 514 // Leaf nodes only contain the data. Internal nodes would contain the key and the offset to the 515 // child node. 516 // Internal node would have first entry as 517 // <0 offset to child>, <1000 offset>, <5000 offset>, and so on... 518 // Leaf nodes would just have: <key, value>, <key, value>, and so on... 519 // Last 16 bytes of the node are off limits. 520 // | pageID (8 bytes) | metaBits (1 byte) | 3 free bytes | numKeys (4 bytes) | 521 type node []uint64 522 523 func (n node) uint64(start int) uint64 { return n[start] } 524 525 // func (n node) uint32(start int) uint32 { return *(*uint32)(unsafe.Pointer(&n[start])) } 526 527 func keyOffset(i int) int { return 2 * i } 528 func valOffset(i int) int { return 2*i + 1 } 529 func (n node) numKeys() int { return int(n.uint64(valOffset(maxKeys)) & 0xFFFFFFFF) } 530 func (n node) pageID() uint64 { return n.uint64(keyOffset(maxKeys)) } 531 func (n node) key(i int) uint64 { return n.uint64(keyOffset(i)) } 532 func (n node) val(i int) uint64 { return n.uint64(valOffset(i)) } 533 func (n node) data(i int) []uint64 { return n[keyOffset(i):keyOffset(i+1)] } 534 535 func (n node) setAt(start int, k uint64) { 536 n[start] = k 537 } 538 539 func (n node) setNumKeys(num int) { 540 idx := valOffset(maxKeys) 541 val := n[idx] 542 val &= 0xFFFFFFFF00000000 543 val |= uint64(num) 544 n[idx] = val 545 } 546 547 func (n node) moveRight(lo int) { 548 hi := n.numKeys() 549 assert(hi != maxKeys) 550 // copy works despite of overlap in src and dst. 551 // See https://golang.org/pkg/builtin/#copy 552 copy(n[keyOffset(lo+1):keyOffset(hi+1)], n[keyOffset(lo):keyOffset(hi)]) 553 } 554 555 const ( 556 bitLeaf = uint64(1 << 63) 557 ) 558 559 func (n node) setBit(b uint64) { 560 vo := valOffset(maxKeys) 561 val := n[vo] 562 val &= 0xFFFFFFFF 563 val |= b 564 n[vo] = val 565 } 566 func (n node) bits() uint64 { 567 return n.val(maxKeys) & 0xFF00000000000000 568 } 569 func (n node) isLeaf() bool { 570 return n.bits()&bitLeaf > 0 571 } 572 573 // isFull checks that the node is already full. 574 func (n node) isFull() bool { 575 return n.numKeys() == maxKeys 576 } 577 578 // Search returns the index of a smallest key >= k in a node. 579 func (n node) search(k uint64) int { 580 N := n.numKeys() 581 if N < 4 { 582 for i := 0; i < N; i++ { 583 if ki := n.key(i); ki >= k { 584 return i 585 } 586 } 587 return N 588 } 589 return int(simd.Search(n[:2*N], k)) 590 // lo, hi := 0, N 591 // // Reduce the search space using binary seach and then do linear search. 592 // for hi-lo > 32 { 593 // mid := (hi + lo) / 2 594 // km := n.key(mid) 595 // if k == km { 596 // return mid 597 // } 598 // if k > km { 599 // // key is greater than the key at mid, so move right. 600 // lo = mid + 1 601 // } else { 602 // // else move left. 603 // hi = mid 604 // } 605 // } 606 // for i := lo; i <= hi; i++ { 607 // if ki := n.key(i); ki >= k { 608 // return i 609 // } 610 // } 611 // return N 612 } 613 func (n node) maxKey() uint64 { 614 idx := n.numKeys() 615 // idx points to the first key which is zero. 616 if idx > 0 { 617 idx-- 618 } 619 return n.key(idx) 620 } 621 622 // compacts the node i.e., remove all the kvs with value < lo. It returns the remaining number of 623 // keys. 624 func (n node) compact(lo uint64) int { 625 N := n.numKeys() 626 mk := n.maxKey() 627 var left, right int 628 for right = 0; right < N; right++ { 629 if n.val(right) < lo && n.key(right) < mk { 630 // Skip over this key. Don't copy it. 631 continue 632 } 633 // Valid data. Copy it from right to left. Advance left. 634 if left != right { 635 copy(n.data(left), n.data(right)) 636 } 637 left++ 638 } 639 // zero out rest of the kv pairs. 640 zeroOut(n[keyOffset(left):keyOffset(right)]) 641 n.setNumKeys(left) 642 643 // If the only key we have is the max key, and its value is less than lo, then we can indicate 644 // to the caller by returning a zero that it's OK to drop the node. 645 if left == 1 && n.key(0) == mk && n.val(0) < lo { 646 return 0 647 } 648 return left 649 } 650 651 func (n node) get(k uint64) uint64 { 652 idx := n.search(k) 653 // key is not found 654 if idx == n.numKeys() { 655 return 0 656 } 657 if ki := n.key(idx); ki == k { 658 return n.val(idx) 659 } 660 return 0 661 } 662 663 // set returns true if it added a new key. 664 func (n node) set(k, v uint64) (numAdded int) { 665 idx := n.search(k) 666 ki := n.key(idx) 667 if n.numKeys() == maxKeys { 668 // This happens during split of non-root node, when we are updating the child pointer of 669 // right node. Hence, the key should already exist. 670 assert(ki == k) 671 } 672 if ki > k { 673 // Found the first entry which is greater than k. So, we need to fit k 674 // just before it. For that, we should move the rest of the data in the 675 // node to the right to make space for k. 676 n.moveRight(idx) 677 } 678 // If the k does not exist already, increment the number of keys. 679 if ki != k { 680 n.setNumKeys(n.numKeys() + 1) 681 numAdded = 1 682 } 683 if ki == 0 || ki >= k { 684 n.setAt(keyOffset(idx), k) 685 n.setAt(valOffset(idx), v) 686 return 687 } 688 panic("shouldn't reach here") 689 } 690 691 func (n node) iterate(fn func(node, int)) { 692 for i := 0; i < maxKeys; i++ { 693 if k := n.key(i); k > 0 { 694 fn(n, i) 695 } else { 696 break 697 } 698 } 699 } 700 701 func (n node) print(parentID uint64) { 702 var keys []string 703 n.iterate(func(n node, i int) { 704 keys = append(keys, fmt.Sprintf("%d", n.key(i))) 705 }) 706 if len(keys) > 8 { 707 copy(keys[4:], keys[len(keys)-4:]) 708 keys[3] = "..." 709 keys = keys[:8] 710 } 711 fmt.Printf("%d Child of: %d num keys: %d keys: %s\n", 712 n.pageID(), parentID, n.numKeys(), strings.Join(keys, " ")) 713 }