github.com/outcaste-io/ristretto@v0.2.3/z/btree.go (about) 1 /* 2 * Copyright 2020 Dgraph Labs, Inc. and Contributors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package z 18 19 import ( 20 "fmt" 21 "math" 22 "os" 23 "reflect" 24 "strings" 25 "unsafe" 26 27 "github.com/outcaste-io/ristretto/z/simd" 28 ) 29 30 var ( 31 pageSize = os.Getpagesize() 32 maxKeys = (pageSize / 16) - 1 33 oneThird = int(float64(maxKeys) / 3) 34 ) 35 36 const ( 37 absoluteMax = uint64(math.MaxUint64 - 1) 38 minSize = 1 << 20 39 ) 40 41 // Tree represents the structure for custom mmaped B+ tree. 42 // It supports keys in range [1, math.MaxUint64-1] and values [1, math.Uint64]. 43 type Tree struct { 44 buffer *Buffer 45 data []byte 46 nextPage uint64 47 freePage uint64 48 stats TreeStats 49 } 50 51 func (t *Tree) initRootNode() { 52 // This is the root node. 53 t.newNode(0) 54 // This acts as the rightmost pointer (all the keys are <= this key). 55 t.Set(absoluteMax, 0) 56 } 57 58 // NewTree returns an in-memory B+ tree. 59 func NewTree(tag string) *Tree { 60 const defaultTag = "tree" 61 if tag == "" { 62 tag = defaultTag 63 } 64 t := &Tree{buffer: NewBuffer(minSize, tag)} 65 t.Reset() 66 return t 67 } 68 69 // NewTree returns a persistent on-disk B+ tree. 70 func NewTreePersistent(path string) (*Tree, error) { 71 t := &Tree{} 72 var err error 73 74 // Open the buffer from disk and set it to the maximum allocated size. 75 t.buffer, err = NewBufferPersistent(path, minSize) 76 if err != nil { 77 return nil, err 78 } 79 t.buffer.offset = uint64(len(t.buffer.buf)) 80 t.data = t.buffer.Bytes() 81 82 // pageID can never be 0 if the tree has been initialized. 83 root := t.node(1) 84 isInitialized := root.pageID() != 0 85 86 if !isInitialized { 87 t.nextPage = 1 88 t.freePage = 0 89 t.initRootNode() 90 } else { 91 t.reinit() 92 } 93 94 return t, nil 95 } 96 97 // reinit sets the internal variables of a Tree, which are normally stored 98 // in-memory, but are lost when loading from disk. 99 func (t *Tree) reinit() { 100 // Calculate t.nextPage by finding the first node whose pageID is not set. 101 t.nextPage = 1 102 for int(t.nextPage)*pageSize < len(t.data) { 103 n := t.node(t.nextPage) 104 if n.pageID() == 0 { 105 break 106 } 107 t.nextPage++ 108 } 109 maxPageId := t.nextPage - 1 110 111 // Calculate t.freePage by finding the page to which no other page points. 112 // This would be the head of the page linked list. 113 // tailPages[i] is true if pageId i+1 is not the head of the list. 114 tailPages := make([]bool, maxPageId) 115 // Mark all pages containing nodes as tail pages. 116 t.Iterate(func(n node) { 117 i := n.pageID() - 1 118 tailPages[i] = true 119 // If this is a leaf node, increment the stats. 120 if n.isLeaf() { 121 t.stats.NumLeafKeys += n.numKeys() 122 } 123 }) 124 // pointedPages is a list of page IDs that the tail pages point to. 125 pointedPages := make([]uint64, 0) 126 for i, isTail := range tailPages { 127 if !isTail { 128 pageId := uint64(i) + 1 129 // Skip if nextPageId = 0, as that is equivalent to null page. 130 if nextPageId := t.node(pageId).uint64(0); nextPageId != 0 { 131 pointedPages = append(pointedPages, nextPageId) 132 } 133 t.stats.NumPagesFree++ 134 } 135 } 136 137 // Mark all pages being pointed to as tail pages. 138 for _, pageId := range pointedPages { 139 i := pageId - 1 140 tailPages[i] = true 141 } 142 // There should only be one head page left. 143 for i, isTail := range tailPages { 144 if !isTail { 145 pageId := uint64(i) + 1 146 t.freePage = pageId 147 break 148 } 149 } 150 } 151 152 // Reset resets the tree and truncates it to maxSz. 153 func (t *Tree) Reset() { 154 // Tree relies on uninitialized data being zeroed out, so we need to Memclr 155 // the data before using it again. 156 Memclr(t.buffer.buf) 157 t.buffer.Reset() 158 t.buffer.AllocateOffset(minSize) 159 t.data = t.buffer.Bytes() 160 t.stats = TreeStats{} 161 t.nextPage = 1 162 t.freePage = 0 163 t.initRootNode() 164 } 165 166 // Close releases the memory used by the tree. 167 func (t *Tree) Close() error { 168 if t == nil { 169 return nil 170 } 171 return t.buffer.Release() 172 } 173 174 type TreeStats struct { 175 Allocated int // Derived. 176 Bytes int // Derived. 177 NumLeafKeys int // Calculated. 178 NumPages int // Derived. 179 NumPagesFree int // Calculated. 180 Occupancy float64 // Derived. 181 PageSize int // Derived. 182 } 183 184 // Stats returns stats about the tree. 185 func (t *Tree) Stats() TreeStats { 186 numPages := int(t.nextPage - 1) 187 out := TreeStats{ 188 Bytes: numPages * pageSize, 189 Allocated: len(t.data), 190 NumLeafKeys: t.stats.NumLeafKeys, 191 NumPages: numPages, 192 NumPagesFree: t.stats.NumPagesFree, 193 PageSize: pageSize, 194 } 195 out.Occupancy = 100.0 * float64(out.NumLeafKeys) / float64(maxKeys*numPages) 196 return out 197 } 198 199 // BytesToUint64Slice converts a byte slice to a uint64 slice. 200 func BytesToUint64Slice(b []byte) []uint64 { 201 if len(b) == 0 { 202 return nil 203 } 204 var u64s []uint64 205 hdr := (*reflect.SliceHeader)(unsafe.Pointer(&u64s)) 206 hdr.Len = len(b) / 8 207 hdr.Cap = hdr.Len 208 hdr.Data = uintptr(unsafe.Pointer(&b[0])) 209 return u64s 210 } 211 212 func (t *Tree) newNode(bit uint64) node { 213 var pageId uint64 214 if t.freePage > 0 { 215 pageId = t.freePage 216 t.stats.NumPagesFree-- 217 } else { 218 pageId = t.nextPage 219 t.nextPage++ 220 offset := int(pageId) * pageSize 221 reqSize := offset + pageSize 222 if reqSize > len(t.data) { 223 t.buffer.AllocateOffset(reqSize - len(t.data)) 224 t.data = t.buffer.Bytes() 225 } 226 } 227 n := t.node(pageId) 228 if t.freePage > 0 { 229 t.freePage = n.uint64(0) 230 } 231 zeroOut(n) 232 n.setBit(bit) 233 n.setAt(keyOffset(maxKeys), pageId) 234 return n 235 } 236 237 func getNode(data []byte) node { 238 return node(BytesToUint64Slice(data)) 239 } 240 241 func zeroOut(data []uint64) { 242 for i := 0; i < len(data); i++ { 243 data[i] = 0 244 } 245 } 246 247 func (t *Tree) node(pid uint64) node { 248 // page does not exist 249 if pid == 0 { 250 return nil 251 } 252 start := pageSize * int(pid) 253 return getNode(t.data[start : start+pageSize]) 254 } 255 256 // Set sets the key-value pair in the tree. 257 func (t *Tree) Set(k, v uint64) { 258 if k == math.MaxUint64 || k == 0 { 259 panic("Error setting zero or MaxUint64") 260 } 261 root := t.set(1, k, v) 262 if root.isFull() { 263 right := t.split(1) 264 left := t.newNode(root.bits()) 265 // Re-read the root as the underlying buffer for tree might have changed during split. 266 root = t.node(1) 267 copy(left[:keyOffset(maxKeys)], root) 268 left.setNumKeys(root.numKeys()) 269 270 // reset the root node. 271 zeroOut(root[:keyOffset(maxKeys)]) 272 root.setNumKeys(0) 273 274 // set the pointers for left and right child in the root node. 275 root.set(left.maxKey(), left.pageID()) 276 root.set(right.maxKey(), right.pageID()) 277 } 278 } 279 280 // For internal nodes, they contain <key, ptr>. 281 // where all entries <= key are stored in the corresponding ptr. 282 func (t *Tree) set(pid, k, v uint64) node { 283 n := t.node(pid) 284 if n.isLeaf() { 285 t.stats.NumLeafKeys += n.set(k, v) 286 return n 287 } 288 289 // This is an internal node. 290 idx := n.search(k) 291 if idx >= maxKeys { 292 panic("search returned index >= maxKeys") 293 } 294 // If no key at idx. 295 if n.key(idx) == 0 { 296 n.setAt(keyOffset(idx), k) 297 n.setNumKeys(n.numKeys() + 1) 298 } 299 child := t.node(n.val(idx)) 300 if child == nil { 301 child = t.newNode(bitLeaf) 302 n = t.node(pid) 303 n.setAt(valOffset(idx), child.pageID()) 304 } 305 child = t.set(child.pageID(), k, v) 306 // Re-read n as the underlying buffer for tree might have changed during set. 307 n = t.node(pid) 308 if child.isFull() { 309 // Just consider the left sibling for simplicity. 310 // if t.shareWithSibling(n, idx) { 311 // return n 312 // } 313 314 nn := t.split(child.pageID()) 315 // Re-read n and child as the underlying buffer for tree might have changed during split. 316 n = t.node(pid) 317 child = t.node(n.uint64(valOffset(idx))) 318 // Set child pointers in the node n. 319 // Note that key for right node (nn) already exist in node n, but the 320 // pointer is updated. 321 n.set(child.maxKey(), child.pageID()) 322 n.set(nn.maxKey(), nn.pageID()) 323 } 324 return n 325 } 326 327 // Get looks for key and returns the corresponding value. 328 // If key is not found, 0 is returned. 329 func (t *Tree) Get(k uint64) uint64 { 330 if k == math.MaxUint64 || k == 0 { 331 panic("Does not support getting MaxUint64/Zero") 332 } 333 root := t.node(1) 334 return t.get(root, k) 335 } 336 337 func (t *Tree) get(n node, k uint64) uint64 { 338 if n.isLeaf() { 339 return n.get(k) 340 } 341 // This is internal node 342 idx := n.search(k) 343 if idx == n.numKeys() || n.key(idx) == 0 { 344 return 0 345 } 346 child := t.node(n.uint64(valOffset(idx))) 347 assert(child != nil) 348 return t.get(child, k) 349 } 350 351 // DeleteBelow deletes all keys with value under ts. 352 func (t *Tree) DeleteBelow(ts uint64) { 353 root := t.node(1) 354 t.stats.NumLeafKeys = 0 355 t.compact(root, ts) 356 assert(root.numKeys() >= 1) 357 } 358 359 func (t *Tree) compact(n node, ts uint64) int { 360 if n.isLeaf() { 361 numKeys := n.compact(ts) 362 t.stats.NumLeafKeys += n.numKeys() 363 return numKeys 364 } 365 // Not leaf. 366 N := n.numKeys() 367 for i := 0; i < N; i++ { 368 assert(n.key(i) > 0) 369 childID := n.uint64(valOffset(i)) 370 child := t.node(childID) 371 if rem := t.compact(child, ts); rem == 0 && i < N-1 { 372 // If no valid key is remaining we can drop this child. However, don't do that if this 373 // is the max key. 374 t.stats.NumLeafKeys -= child.numKeys() 375 child.setAt(0, t.freePage) 376 t.freePage = childID 377 n.setAt(valOffset(i), 0) 378 t.stats.NumPagesFree++ 379 } 380 } 381 // We use ts=1 here because we want to delete all the keys whose value is 0, which means they no 382 // longer have a valid page for that key. 383 return n.compact(1) 384 } 385 386 func (t *Tree) iterate(n node, fn func(node)) { 387 fn(n) 388 if n.isLeaf() { 389 return 390 } 391 // Explore children. 392 for i := 0; i < maxKeys; i++ { 393 if n.key(i) == 0 { 394 return 395 } 396 childID := n.uint64(valOffset(i)) 397 assert(childID > 0) 398 399 child := t.node(childID) 400 t.iterate(child, fn) 401 } 402 } 403 404 // Iterate iterates over the tree and executes the fn on each node. 405 func (t *Tree) Iterate(fn func(node)) { 406 root := t.node(1) 407 t.iterate(root, fn) 408 } 409 410 // IterateKV iterates through all keys and values in the tree. 411 // If newVal is non-zero, it will be set in the tree. 412 func (t *Tree) IterateKV(f func(key, val uint64) (newVal uint64)) { 413 t.Iterate(func(n node) { 414 // Only leaf nodes contain keys. 415 if !n.isLeaf() { 416 return 417 } 418 419 for i := 0; i < n.numKeys(); i++ { 420 key := n.key(i) 421 val := n.val(i) 422 423 // A zero value here means that this is a bogus entry. 424 if val == 0 { 425 continue 426 } 427 428 newVal := f(key, val) 429 if newVal != 0 { 430 n.setAt(valOffset(i), newVal) 431 } 432 } 433 }) 434 } 435 436 func (t *Tree) print(n node, parentID uint64) { 437 n.print(parentID) 438 if n.isLeaf() { 439 return 440 } 441 pid := n.pageID() 442 for i := 0; i < maxKeys; i++ { 443 if n.key(i) == 0 { 444 return 445 } 446 childID := n.uint64(valOffset(i)) 447 child := t.node(childID) 448 t.print(child, pid) 449 } 450 } 451 452 // Print iterates over the tree and prints all valid KVs. 453 func (t *Tree) Print() { 454 root := t.node(1) 455 t.print(root, 0) 456 } 457 458 // Splits the node into two. It moves right half of the keys from the original node to a newly 459 // created right node. It returns the right node. 460 func (t *Tree) split(pid uint64) node { 461 n := t.node(pid) 462 if !n.isFull() { 463 panic("This should be called only when n is full") 464 } 465 466 // Create a new node nn, copy over half the keys from n, and set the parent to n's parent. 467 nn := t.newNode(n.bits()) 468 // Re-read n as the underlying buffer for tree might have changed during newNode. 469 n = t.node(pid) 470 rightHalf := n[keyOffset(maxKeys/2):keyOffset(maxKeys)] 471 copy(nn, rightHalf) 472 nn.setNumKeys(maxKeys - maxKeys/2) 473 474 // Remove entries from node n. 475 zeroOut(rightHalf) 476 n.setNumKeys(maxKeys / 2) 477 return nn 478 } 479 480 // shareWithSiblingXXX is unused for now. The idea is to move some keys to 481 // sibling when a node is full. But, I don't see any special benefits in our 482 // access pattern. It doesn't result in better occupancy ratios. 483 func (t *Tree) shareWithSiblingXXX(n node, idx int) bool { 484 if idx == 0 { 485 return false 486 } 487 left := t.node(n.val(idx - 1)) 488 ns := left.numKeys() 489 if ns >= maxKeys/2 { 490 // Sibling is already getting full. 491 return false 492 } 493 494 right := t.node(n.val(idx)) 495 // Copy over keys from right child to left child. 496 copied := copy(left[keyOffset(ns):], right[:keyOffset(oneThird)]) 497 copied /= 2 // Considering that key-val constitute one key. 498 left.setNumKeys(ns + copied) 499 500 // Update the max key in parent node n for the left sibling. 501 n.setAt(keyOffset(idx-1), left.maxKey()) 502 503 // Now move keys to left for the right sibling. 504 until := copy(right, right[keyOffset(oneThird):keyOffset(maxKeys)]) 505 right.setNumKeys(until / 2) 506 zeroOut(right[until:keyOffset(maxKeys)]) 507 return true 508 } 509 510 // Each node in the node is of size pageSize. Two kinds of nodes. Leaf nodes and internal nodes. 511 // Leaf nodes only contain the data. Internal nodes would contain the key and the offset to the 512 // child node. 513 // Internal node would have first entry as 514 // <0 offset to child>, <1000 offset>, <5000 offset>, and so on... 515 // Leaf nodes would just have: <key, value>, <key, value>, and so on... 516 // Last 16 bytes of the node are off limits. 517 // | pageID (8 bytes) | metaBits (1 byte) | 3 free bytes | numKeys (4 bytes) | 518 type node []uint64 519 520 func (n node) uint64(start int) uint64 { return n[start] } 521 522 // func (n node) uint32(start int) uint32 { return *(*uint32)(unsafe.Pointer(&n[start])) } 523 524 func keyOffset(i int) int { return 2 * i } 525 func valOffset(i int) int { return 2*i + 1 } 526 func (n node) numKeys() int { return int(n.uint64(valOffset(maxKeys)) & 0xFFFFFFFF) } 527 func (n node) pageID() uint64 { return n.uint64(keyOffset(maxKeys)) } 528 func (n node) key(i int) uint64 { return n.uint64(keyOffset(i)) } 529 func (n node) val(i int) uint64 { return n.uint64(valOffset(i)) } 530 func (n node) data(i int) []uint64 { return n[keyOffset(i):keyOffset(i+1)] } 531 532 func (n node) setAt(start int, k uint64) { 533 n[start] = k 534 } 535 536 func (n node) setNumKeys(num int) { 537 idx := valOffset(maxKeys) 538 val := n[idx] 539 val &= 0xFFFFFFFF00000000 540 val |= uint64(num) 541 n[idx] = val 542 } 543 544 func (n node) moveRight(lo int) { 545 hi := n.numKeys() 546 assert(hi != maxKeys) 547 // copy works despite of overlap in src and dst. 548 // See https://golang.org/pkg/builtin/#copy 549 copy(n[keyOffset(lo+1):keyOffset(hi+1)], n[keyOffset(lo):keyOffset(hi)]) 550 } 551 552 const ( 553 bitLeaf = uint64(1 << 63) 554 ) 555 556 func (n node) setBit(b uint64) { 557 vo := valOffset(maxKeys) 558 val := n[vo] 559 val &= 0xFFFFFFFF 560 val |= b 561 n[vo] = val 562 } 563 func (n node) bits() uint64 { 564 return n.val(maxKeys) & 0xFF00000000000000 565 } 566 func (n node) isLeaf() bool { 567 return n.bits()&bitLeaf > 0 568 } 569 570 // isFull checks that the node is already full. 571 func (n node) isFull() bool { 572 return n.numKeys() == maxKeys 573 } 574 575 // Search returns the index of a smallest key >= k in a node. 576 func (n node) search(k uint64) int { 577 N := n.numKeys() 578 if N < 4 { 579 for i := 0; i < N; i++ { 580 if ki := n.key(i); ki >= k { 581 return i 582 } 583 } 584 return N 585 } 586 return int(simd.Search(n[:2*N], k)) 587 // lo, hi := 0, N 588 // // Reduce the search space using binary seach and then do linear search. 589 // for hi-lo > 32 { 590 // mid := (hi + lo) / 2 591 // km := n.key(mid) 592 // if k == km { 593 // return mid 594 // } 595 // if k > km { 596 // // key is greater than the key at mid, so move right. 597 // lo = mid + 1 598 // } else { 599 // // else move left. 600 // hi = mid 601 // } 602 // } 603 // for i := lo; i <= hi; i++ { 604 // if ki := n.key(i); ki >= k { 605 // return i 606 // } 607 // } 608 // return N 609 } 610 func (n node) maxKey() uint64 { 611 idx := n.numKeys() 612 // idx points to the first key which is zero. 613 if idx > 0 { 614 idx-- 615 } 616 return n.key(idx) 617 } 618 619 // compacts the node i.e., remove all the kvs with value < lo. It returns the remaining number of 620 // keys. 621 func (n node) compact(lo uint64) int { 622 N := n.numKeys() 623 mk := n.maxKey() 624 var left, right int 625 for right = 0; right < N; right++ { 626 if n.val(right) < lo && n.key(right) < mk { 627 // Skip over this key. Don't copy it. 628 continue 629 } 630 // Valid data. Copy it from right to left. Advance left. 631 if left != right { 632 copy(n.data(left), n.data(right)) 633 } 634 left++ 635 } 636 // zero out rest of the kv pairs. 637 zeroOut(n[keyOffset(left):keyOffset(right)]) 638 n.setNumKeys(left) 639 640 // If the only key we have is the max key, and its value is less than lo, then we can indicate 641 // to the caller by returning a zero that it's OK to drop the node. 642 if left == 1 && n.key(0) == mk && n.val(0) < lo { 643 return 0 644 } 645 return left 646 } 647 648 func (n node) get(k uint64) uint64 { 649 idx := n.search(k) 650 // key is not found 651 if idx == n.numKeys() { 652 return 0 653 } 654 if ki := n.key(idx); ki == k { 655 return n.val(idx) 656 } 657 return 0 658 } 659 660 // set returns true if it added a new key. 661 func (n node) set(k, v uint64) (numAdded int) { 662 idx := n.search(k) 663 ki := n.key(idx) 664 if n.numKeys() == maxKeys { 665 // This happens during split of non-root node, when we are updating the child pointer of 666 // right node. Hence, the key should already exist. 667 assert(ki == k) 668 } 669 if ki > k { 670 // Found the first entry which is greater than k. So, we need to fit k 671 // just before it. For that, we should move the rest of the data in the 672 // node to the right to make space for k. 673 n.moveRight(idx) 674 } 675 // If the k does not exist already, increment the number of keys. 676 if ki != k { 677 n.setNumKeys(n.numKeys() + 1) 678 numAdded = 1 679 } 680 if ki == 0 || ki >= k { 681 n.setAt(keyOffset(idx), k) 682 n.setAt(valOffset(idx), v) 683 return 684 } 685 panic("shouldn't reach here") 686 } 687 688 func (n node) iterate(fn func(node, int)) { 689 for i := 0; i < maxKeys; i++ { 690 if k := n.key(i); k > 0 { 691 fn(n, i) 692 } else { 693 break 694 } 695 } 696 } 697 698 func (n node) print(parentID uint64) { 699 var keys []string 700 n.iterate(func(n node, i int) { 701 keys = append(keys, fmt.Sprintf("%d", n.key(i))) 702 }) 703 if len(keys) > 8 { 704 copy(keys[4:], keys[len(keys)-4:]) 705 keys[3] = "..." 706 keys = keys[:8] 707 } 708 fmt.Printf("%d Child of: %d num keys: %d keys: %s\n", 709 n.pageID(), parentID, n.numKeys(), strings.Join(keys, " ")) 710 }