github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/internal/manifest/btree.go (about) 1 // Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package manifest 6 7 import ( 8 "bytes" 9 stdcmp "cmp" 10 "fmt" 11 "strings" 12 "sync/atomic" 13 "unsafe" 14 15 "github.com/cockroachdb/errors" 16 "github.com/cockroachdb/pebble/internal/invariants" 17 ) 18 19 // The Annotator type defined below is used by other packages to lazily 20 // compute a value over a B-Tree. Each node of the B-Tree stores one 21 // `annotation` per annotator, containing the result of the computation over 22 // the node's subtree. 23 // 24 // An annotation is marked as valid if it's current with the current subtree 25 // state. Annotations are marked as invalid whenever a node will be mutated 26 // (in mut). Annotators may also return `false` from `Accumulate` to signal 27 // that a computation for a file is not stable and may change in the future. 28 // Annotations that include these unstable values are also marked as invalid 29 // on the node, ensuring that future queries for the annotation will recompute 30 // the value. 31 32 // An Annotator defines a computation over a level's FileMetadata. If the 33 // computation is stable and uses inputs that are fixed for the lifetime of 34 // a FileMetadata, the LevelMetadata's internal data structures are annotated 35 // with the intermediary computations. This allows the computation to be 36 // computed incrementally as edits are applied to a level. 37 type Annotator interface { 38 // Zero returns the zero value of an annotation. This value is returned 39 // when a LevelMetadata is empty. The dst argument, if non-nil, is an 40 // obsolete value previously returned by this Annotator and may be 41 // overwritten and reused to avoid a memory allocation. 
42 Zero(dst interface{}) (v interface{}) 43 44 // Accumulate computes the annotation for a single file in a level's 45 // metadata. It merges the file's value into dst and returns a bool flag 46 // indicating whether or not the value is stable and okay to cache as an 47 // annotation. If the file's value may change over the life of the file, 48 // the annotator must return false. 49 // 50 // Implementations may modify dst and return it to avoid an allocation. 51 Accumulate(m *FileMetadata, dst interface{}) (v interface{}, cacheOK bool) 52 53 // Merge combines two values src and dst, returning the result. 54 // Implementations may modify dst and return it to avoid an allocation. 55 Merge(src interface{}, dst interface{}) interface{} 56 } 57 58 type btreeCmp func(*FileMetadata, *FileMetadata) int 59 60 func btreeCmpSeqNum(a, b *FileMetadata) int { 61 return a.cmpSeqNum(b) 62 } 63 64 func btreeCmpSmallestKey(cmp Compare) btreeCmp { 65 return func(a, b *FileMetadata) int { 66 return a.cmpSmallestKey(b, cmp) 67 } 68 } 69 70 // btreeCmpSpecificOrder is used in tests to construct a B-Tree with a 71 // specific ordering of FileMetadata within the tree. It's typically used to 72 // test consistency checking code that needs to construct a malformed B-Tree. 73 func btreeCmpSpecificOrder(files []*FileMetadata) btreeCmp { 74 m := map[*FileMetadata]int{} 75 for i, f := range files { 76 m[f] = i 77 } 78 return func(a, b *FileMetadata) int { 79 ai, aok := m[a] 80 bi, bok := m[b] 81 if !aok || !bok { 82 panic("btreeCmpSliceOrder called with unknown files") 83 } 84 return stdcmp.Compare(ai, bi) 85 } 86 } 87 88 const ( 89 degree = 16 90 maxItems = 2*degree - 1 91 minItems = degree - 1 92 ) 93 94 type annotation struct { 95 annotator Annotator 96 // v is an annotation value, the output of either 97 // annotator.Value or annotator.Merge. 98 v interface{} 99 // valid indicates whether future reads of the annotation may use v as-is. 100 // If false, v will be zeroed and recalculated. 
101 valid bool 102 } 103 104 type leafNode struct { 105 ref atomic.Int32 106 count int16 107 leaf bool 108 // subtreeCount holds the count of files in the entire subtree formed by 109 // this node. For leaf nodes, subtreeCount is always equal to count. For 110 // non-leaf nodes, it's the sum of count plus all the children's 111 // subtreeCounts. 112 // 113 // NB: We could move this field to the end of the node struct, since leaf => 114 // count=subtreeCount, however the unsafe casting [leafToNode] performs make 115 // it risky and cumbersome. 116 subtreeCount int 117 items [maxItems]*FileMetadata 118 // annot contains one annotation per annotator, merged over the entire 119 // node's files (and all descendants for non-leaf nodes). 120 annot []annotation 121 } 122 123 type node struct { 124 leafNode 125 children [maxItems + 1]*node 126 } 127 128 //go:nocheckptr casts a ptr to a smaller struct to a ptr to a larger struct. 129 func leafToNode(ln *leafNode) *node { 130 return (*node)(unsafe.Pointer(ln)) 131 } 132 133 func newLeafNode() *node { 134 n := leafToNode(new(leafNode)) 135 n.leaf = true 136 n.ref.Store(1) 137 return n 138 } 139 140 func newNode() *node { 141 n := new(node) 142 n.ref.Store(1) 143 return n 144 } 145 146 // mut creates and returns a mutable node reference. If the node is not shared 147 // with any other trees then it can be modified in place. Otherwise, it must be 148 // cloned to ensure unique ownership. In this way, we enforce a copy-on-write 149 // policy which transparently incorporates the idea of local mutations, like 150 // Clojure's transients or Haskell's ST monad, where nodes are only copied 151 // during the first time that they are modified between Clone operations. 152 // 153 // When a node is cloned, the provided pointer will be redirected to the new 154 // mutable node. 155 func mut(n **node) *node { 156 if (*n).ref.Load() == 1 { 157 // Exclusive ownership. Can mutate in place. 
158 159 // Whenever a node will be mutated, reset its annotations to be marked 160 // as uncached. This ensures any future calls to (*node).annotation 161 // will recompute annotations on the modified subtree. 162 for i := range (*n).annot { 163 (*n).annot[i].valid = false 164 } 165 return *n 166 } 167 // If we do not have unique ownership over the node then we 168 // clone it to gain unique ownership. After doing so, we can 169 // release our reference to the old node. We pass recursive 170 // as true because even though we just observed the node's 171 // reference count to be greater than 1, we might be racing 172 // with another call to decRef on this node. 173 c := (*n).clone() 174 (*n).decRef(true /* contentsToo */, nil) 175 *n = c 176 // NB: We don't need to clear annotations, because (*node).clone does not 177 // copy them. 178 return *n 179 } 180 181 // incRef acquires a reference to the node. 182 func (n *node) incRef() { 183 n.ref.Add(1) 184 } 185 186 // decRef releases a reference to the node. If requested, the method will unref 187 // its items and recurse into child nodes and decrease their refcounts as well. 188 // Some internal codepaths that manually copy the node's items or children to 189 // new nodes pass contentsToo=false to preserve existing reference counts during 190 // operations that should yield a net-zero change to descendant refcounts. 191 // When a node is released, its contained files are dereferenced. 192 func (n *node) decRef(contentsToo bool, obsolete *[]*FileBacking) { 193 if n.ref.Add(-1) > 0 { 194 // Other references remain. Can't free. 195 return 196 } 197 198 // Dereference the node's metadata and release child references if 199 // requested. Some internal callers may not want to propagate the deref 200 // because they're manually copying the filemetadata and children to other 201 // nodes, and they want to preserve the existing reference count. 
202 if contentsToo { 203 for _, f := range n.items[:n.count] { 204 if f.Unref() == 0 { 205 // There are two sources of node dereferences: tree mutations 206 // and Version dereferences. Files should only be made obsolete 207 // during Version dereferences, during which `obsolete` will be 208 // non-nil. 209 if obsolete == nil { 210 panic(fmt.Sprintf("file metadata %s dereferenced to zero during tree mutation", f.FileNum)) 211 } 212 // Reference counting is performed on the FileBacking. In the case 213 // of a virtual sstable, this reference counting is performed on 214 // a FileBacking which is shared by every single virtual sstable 215 // with the same backing sstable. If the reference count hits 0, 216 // then we know that the FileBacking won't be required by any 217 // sstable in Pebble, and that the backing sstable can be deleted. 218 *obsolete = append(*obsolete, f.FileBacking) 219 } 220 } 221 if !n.leaf { 222 for i := int16(0); i <= n.count; i++ { 223 n.children[i].decRef(true /* contentsToo */, obsolete) 224 } 225 } 226 } 227 } 228 229 // clone creates a clone of the receiver with a single reference count. 230 func (n *node) clone() *node { 231 var c *node 232 if n.leaf { 233 c = newLeafNode() 234 } else { 235 c = newNode() 236 } 237 // NB: copy field-by-field without touching n.ref to avoid 238 // triggering the race detector and looking like a data race. 239 c.count = n.count 240 c.items = n.items 241 c.subtreeCount = n.subtreeCount 242 // Increase the refcount of each contained item. 243 for _, f := range n.items[:n.count] { 244 f.Ref() 245 } 246 if !c.leaf { 247 // Copy children and increase each refcount. 248 c.children = n.children 249 for i := int16(0); i <= c.count; i++ { 250 c.children[i].incRef() 251 } 252 } 253 return c 254 } 255 256 // insertAt inserts the provided file and node at the provided index. This 257 // function is for use only as a helper function for internal B-Tree code. 258 // Clients should not invoke it directly. 
func (n *node) insertAt(index int, item *FileMetadata, nd *node) {
	// When inserting before the end, shift the tail one slot to the right to
	// open a gap at index. For interior nodes the children to the right of
	// the new item shift as well.
	if index < int(n.count) {
		copy(n.items[index+1:n.count+1], n.items[index:n.count])
		if !n.leaf {
			copy(n.children[index+2:n.count+2], n.children[index+1:n.count+1])
		}
	}
	n.items[index] = item
	if !n.leaf {
		// nd becomes the child immediately to the right of the new item.
		n.children[index+1] = nd
	}
	n.count++
}

// pushBack inserts the provided file and node at the tail of the node's items.
// This function is for use only as a helper function for internal B-Tree code.
// Clients should not invoke it directly.
//
// For interior nodes, nd becomes the new last child. No capacity check is
// performed, and subtreeCount is left for the caller to adjust.
func (n *node) pushBack(item *FileMetadata, nd *node) {
	n.items[n.count] = item
	if !n.leaf {
		n.children[n.count+1] = nd
	}
	n.count++
}

// pushFront inserts the provided file and node at the head of the
// node's items. This function is for use only as a helper function for internal B-Tree
// code. Clients should not invoke it directly.
//
// For interior nodes, nd becomes the new first child. No capacity check is
// performed, and subtreeCount is left for the caller to adjust.
func (n *node) pushFront(item *FileMetadata, nd *node) {
	if !n.leaf {
		// Shift every child one slot to the right to free children[0].
		copy(n.children[1:n.count+2], n.children[:n.count+1])
		n.children[0] = nd
	}
	// Shift every item one slot to the right to free items[0].
	copy(n.items[1:n.count+1], n.items[:n.count])
	n.items[0] = item
	n.count++
}

// removeAt removes a value at a given index, pulling all subsequent values
// back. This function is for use only as a helper function for internal B-Tree
// code. Clients should not invoke it directly.
//
// It returns the removed item and, for interior nodes, the child that sat
// immediately to the right of it (nil for leaves). subtreeCount is left for
// the caller to adjust.
func (n *node) removeAt(index int) (*FileMetadata, *node) {
	var child *node
	if !n.leaf {
		child = n.children[index+1]
		copy(n.children[index+1:n.count], n.children[index+2:n.count+1])
		// Clear the vacated last child slot so it does not retain a pointer.
		n.children[n.count] = nil
	}
	n.count--
	out := n.items[index]
	copy(n.items[index:n.count], n.items[index+1:n.count+1])
	// Clear the vacated last item slot so it does not retain a pointer.
	n.items[n.count] = nil
	return out, child
}

// popBack removes and returns the last element in the list. This function is
// for use only as a helper function for internal B-Tree code. Clients should
// not invoke it directly.
//
// For interior nodes the last child is removed and returned as well; for
// leaves the returned node is nil.
func (n *node) popBack() (*FileMetadata, *node) {
	n.count--
	out := n.items[n.count]
	n.items[n.count] = nil
	if n.leaf {
		return out, nil
	}
	child := n.children[n.count+1]
	n.children[n.count+1] = nil
	return out, child
}

// popFront removes and returns the first element in the list. This function is
// for use only as a helper function for internal B-Tree code. Clients should
// not invoke it directly.
//
// For interior nodes the first child is removed and returned as well; for
// leaves the returned node is nil.
func (n *node) popFront() (*FileMetadata, *node) {
	n.count--
	var child *node
	if !n.leaf {
		child = n.children[0]
		// Shift the remaining children left and clear the vacated slot.
		copy(n.children[:n.count+1], n.children[1:n.count+2])
		n.children[n.count+1] = nil
	}
	out := n.items[0]
	// Shift the remaining items left and clear the vacated slot.
	copy(n.items[:n.count], n.items[1:n.count+1])
	n.items[n.count] = nil
	return out, child
}

// find returns the index where the given item should be inserted into this
// list. 'found' is true if the item already exists in the list at the given
// index.
//
// This function is for use only as a helper function for internal B-Tree code.
// Clients should not invoke it directly.
func (n *node) find(cmp btreeCmp, item *FileMetadata) (index int, found bool) {
	// Logic copied from sort.Search. Inlining this gave
	// an 11% speedup on BenchmarkBTreeDeleteInsert.
	i, j := 0, int(n.count)
	for i < j {
		h := int(uint(i+j) >> 1) // avoid overflow when computing h
		// i ≤ h < j
		v := cmp(item, n.items[h])
		if v == 0 {
			return h, true
		} else if v > 0 {
			// item sorts after items[h]; continue in the upper half.
			i = h + 1
		} else {
			// item sorts before items[h]; continue in the lower half.
			j = h
		}
	}
	return i, false
}

// split splits the given node at the given index. The current node shrinks,
// and this function returns the item that existed at that index and a new
// node containing all items/children after it.
//
// split is called when we want to perform a transformation like the one
// depicted in the following diagram.
//
//	Before:
//	                       +-----------+
//	             n *node   |   x y z   |
//	                       +--/-/-\-\--+
//
//	After:
//	                       +-----------+
//	                       |     y     |  n's parent
//	                       +----/-\----+
//	                           /   \
//	                          v     v
//	              +-----------+     +-----------+
//	      n *node |         x |     | z         | next *node
//	              +-----------+     +-----------+
//
// split does not perform the complete transformation; the caller is responsible
// for updating the parent appropriately. split splits `n` into two nodes, `n`
// and `next`, returning `next` and the file that separates them. In the diagram
// above, `n.split` removes y and z from `n`, returning y in the first return
// value and `next` in the second return value. The caller is responsible for
// updating n's parent to now contain `y` as the separator between nodes `n` and
// `next`.
//
// This function is for use only as a helper function for internal B-Tree code.
// Clients should not invoke it directly.
func (n *node) split(i int) (*FileMetadata, *node) {
	out := n.items[i]
	// The new sibling is of the same kind (leaf or interior) as n.
	var next *node
	if n.leaf {
		next = newLeafNode()
	} else {
		next = newNode()
	}
	// Move the items after index i into next, then clear items[i:] in n
	// (including the separator that is being returned).
	next.count = n.count - int16(i+1)
	copy(next.items[:], n.items[i+1:n.count])
	for j := int16(i); j < n.count; j++ {
		n.items[j] = nil
	}
	if !n.leaf {
		// Move the children to the right of the separator into next and
		// transfer the file counts of those subtrees along with them.
		copy(next.children[:], n.children[i+1:n.count+1])
		descendantsMoved := 0
		for j := int16(i + 1); j <= n.count; j++ {
			descendantsMoved += n.children[j].subtreeCount
			n.children[j] = nil
		}
		n.subtreeCount -= descendantsMoved
		next.subtreeCount += descendantsMoved
	}
	n.count = int16(i)
	// NB: We subtract one more than `next.count` from n's subtreeCount because
	// the item at index `i` was removed from `n.items`. We'll return the item
	// at index `i`, and the caller is responsible for updating the subtree
	// count of whichever node adopts it.
	n.subtreeCount -= int(next.count) + 1
	next.subtreeCount += int(next.count)
	return out, next
}

// Insert inserts a item into the subtree rooted at this node, making sure no
// nodes in the subtree exceed maxItems items.
//
// It returns an error, without inserting, if cmp reports the item as equal to
// an item already present on the search path. On success the subtreeCount of
// every node along the path is incremented.
func (n *node) Insert(cmp btreeCmp, item *FileMetadata) error {
	i, found := n.find(cmp, item)
	if found {
		// cmp provides a total ordering of the files within a level.
		// If we're inserting a metadata that's equal to an existing item
		// in the tree, we're inserting a file into a level twice.
		return errors.Errorf("files %s and %s collided on sort keys",
			errors.Safe(item.FileNum), errors.Safe(n.items[i].FileNum))
	}
	if n.leaf {
		n.insertAt(i, item, nil)
		n.subtreeCount++
		return nil
	}
	if n.children[i].count >= maxItems {
		// The child we would descend into is full. Split it first and adopt
		// the separator (and the new right-hand sibling) at position i.
		splitLa, splitNode := mut(&n.children[i]).split(maxItems / 2)
		n.insertAt(i, splitLa, splitNode)

		// Decide which half of the split child the item belongs to by
		// comparing it against the separator now stored at items[i].
		switch cmp := cmp(item, n.items[i]); {
		case cmp < 0:
			// no change, we want first split node
		case cmp > 0:
			i++ // we want second split node
		default:
			// cmp provides a total ordering of the files within a level.
			// If we're inserting a metadata that's equal to an existing item
			// in the tree, we're inserting a file into a level twice.
			return errors.Errorf("files %s and %s collided on sort keys",
				errors.Safe(item.FileNum), errors.Safe(n.items[i].FileNum))
		}
	}

	err := mut(&n.children[i]).Insert(cmp, item)
	if err == nil {
		// Only count the item once it has actually been inserted below.
		n.subtreeCount++
	}
	return err
}

// removeMax removes and returns the maximum item from the subtree rooted at
// this node. This function is for use only as a helper function for internal
// B-Tree code. Clients should not invoke it directly.
func (n *node) removeMax() *FileMetadata {
	if n.leaf {
		// The maximum of a leaf is its last item.
		n.count--
		n.subtreeCount--
		out := n.items[n.count]
		n.items[n.count] = nil
		return out
	}
	// The maximum of an interior node lives in its last child's subtree.
	child := mut(&n.children[n.count])
	if child.count <= minItems {
		// The last child is too small to remove from. Grow it, then retry
		// from this node since the rebalance or merge may have changed
		// n.count and which child is last.
		n.rebalanceOrMerge(int(n.count))
		return n.removeMax()
	}
	n.subtreeCount--
	return child.removeMax()
}

// Remove removes a item from the subtree rooted at this node. Returns
// the item that was removed or nil if no matching item was found.
func (n *node) Remove(cmp btreeCmp, item *FileMetadata) (out *FileMetadata) {
	i, found := n.find(cmp, item)
	if n.leaf {
		if found {
			out, _ = n.removeAt(i)
			n.subtreeCount--
			return out
		}
		return nil
	}
	if n.children[i].count <= minItems {
		// Child not large enough to remove from.
		// Grow it and retry from this node, because the rebalance or merge
		// may have moved items between this node and its children.
		n.rebalanceOrMerge(i)
		return n.Remove(cmp, item)
	}
	child := mut(&n.children[i])
	if found {
		// Replace the item being removed with the max item in our left child.
		out = n.items[i]
		n.items[i] = child.removeMax()
		n.subtreeCount--
		return out
	}
	// File is not in this node and child is large enough to remove from.
	out = child.Remove(cmp, item)
	if out != nil {
		// Only adjust the count if something was actually removed below.
		n.subtreeCount--
	}
	return out
}

// rebalanceOrMerge grows child 'i' to ensure it has sufficient room to remove a
// item from it while keeping it at or above minItems. This function is for use
// only as a helper function for internal B-Tree code. Clients should not invoke
// it directly.
//
// It borrows an item from the left sibling if that sibling has more than
// minItems items, otherwise from the right sibling under the same condition,
// and otherwise merges child 'i' with a sibling. n's own subtreeCount is not
// modified by any of the three cases.
func (n *node) rebalanceOrMerge(i int) {
	switch {
	case i > 0 && n.children[i-1].count > minItems:
		// Rebalance from left sibling.
		//
		//          +-----------+
		//          |     y     |
		//          +----/-\----+
		//              /   \
		//             v     v
		// +-----------+     +-----------+
		// |         x |     |           |
		// +----------\+     +-----------+
		//             \
		//              v
		//              a
		//
		// After:
		//
		//          +-----------+
		//          |     x     |
		//          +----/-\----+
		//              /   \
		//             v     v
		// +-----------+     +-----------+
		// |           |     | y         |
		// +-----------+     +/----------+
		//                   /
		//                  v
		//                  a
		//
		left := mut(&n.children[i-1])
		child := mut(&n.children[i])
		xLa, grandChild := left.popBack()
		yLa := n.items[i-1]
		child.pushFront(yLa, grandChild)
		n.items[i-1] = xLa
		// One item moved into child and one moved out of left, plus the
		// files beneath the grandchild that changed parents (if any).
		child.subtreeCount++
		left.subtreeCount--
		if grandChild != nil {
			child.subtreeCount += grandChild.subtreeCount
			left.subtreeCount -= grandChild.subtreeCount
		}

	case i < int(n.count) && n.children[i+1].count > minItems:
		// Rebalance from right sibling.
		//
		//          +-----------+
		//          |     y     |
		//          +----/-\----+
		//              /   \
		//             v     v
		// +-----------+     +-----------+
		// |           |     | x         |
		// +-----------+     +/----------+
		//                   /
		//                  v
		//                  a
		//
		// After:
		//
		//          +-----------+
		//          |     x     |
		//          +----/-\----+
		//              /   \
		//             v     v
		// +-----------+     +-----------+
		// |         y |     |           |
		// +----------\+     +-----------+
		//             \
		//              v
		//              a
		//
		right := mut(&n.children[i+1])
		child := mut(&n.children[i])
		xLa, grandChild := right.popFront()
		yLa := n.items[i]
		child.pushBack(yLa, grandChild)
		// One item moved into child and one moved out of right, plus the
		// files beneath the grandchild that changed parents (if any).
		child.subtreeCount++
		right.subtreeCount--
		if grandChild != nil {
			child.subtreeCount += grandChild.subtreeCount
			right.subtreeCount -= grandChild.subtreeCount
		}
		n.items[i] = xLa

	default:
		// Merge with either the left or right sibling.
		//
		//          +-----------+
		//          |   u y v   |
		//          +----/-\----+
		//              /   \
		//             v     v
		// +-----------+     +-----------+
		// |         x |     | z         |
		// +-----------+     +-----------+
		//
		// After:
		//
		//          +-----------+
		//          |    u v    |
		//          +-----|-----+
		//                |
		//                v
		//          +-----------+
		//          |   x y z   |
		//          +-----------+
		//
		// If 'i' is the last child there is no right sibling, so step back
		// by one and merge the original child into its left sibling.
		if i >= int(n.count) {
			i = int(n.count - 1)
		}
		child := mut(&n.children[i])
		// Make mergeChild mutable, bumping the refcounts on its children if necessary.
		_ = mut(&n.children[i+1])
		// Pull the separator and the right-hand node out of n, then append
		// the separator followed by the right-hand node's contents to child.
		mergeLa, mergeChild := n.removeAt(i)
		child.items[child.count] = mergeLa
		copy(child.items[child.count+1:], mergeChild.items[:mergeChild.count])
		if !child.leaf {
			copy(child.children[child.count+1:], mergeChild.children[:mergeChild.count+1])
		}
		child.count += mergeChild.count + 1
		child.subtreeCount += mergeChild.subtreeCount + 1

		// mergeChild's items and children now belong to child, so release the
		// node itself without touching the reference counts of its contents.
		mergeChild.decRef(false /* contentsToo */, nil)
	}
}

// InvalidateAnnotation removes any existing cached annotations for the provided
// annotator from this node's subtree.
func (n *node) InvalidateAnnotation(a Annotator) {
	// Find this annotator's annotation on this node.
	var annot *annotation
	for i := range n.annot {
		if n.annot[i].annotator == a {
			annot = &n.annot[i]
		}
	}

	// Only a currently valid annotation needs to be reset; its old value is
	// handed to Zero so the annotator may reuse it.
	if annot != nil && annot.valid {
		annot.valid = false
		annot.v = a.Zero(annot.v)
	}
	// Descend into every child regardless of this node's own state.
	if !n.leaf {
		for i := int16(0); i <= n.count; i++ {
			n.children[i].InvalidateAnnotation(a)
		}
	}
}

// Annotation retrieves, computing if not already computed, the provided
// annotator's annotation of this node. The second return value indicates
// whether the future reads of this annotation may use the first return value
// as-is. If false, the annotation is not stable and may change on a subsequent
// computation.
689 func (n *node) Annotation(a Annotator) (interface{}, bool) { 690 // Find this annotator's annotation on this node. 691 var annot *annotation 692 for i := range n.annot { 693 if n.annot[i].annotator == a { 694 annot = &n.annot[i] 695 } 696 } 697 698 // If it exists and is marked as valid, we can return it without 699 // recomputing anything. 700 if annot != nil && annot.valid { 701 return annot.v, true 702 } 703 704 if annot == nil { 705 // This is n's first time being annotated by a. 706 // Create a new zeroed annotation. 707 n.annot = append(n.annot, annotation{ 708 annotator: a, 709 v: a.Zero(nil), 710 }) 711 annot = &n.annot[len(n.annot)-1] 712 } else { 713 // There's an existing annotation that must be recomputed. 714 // Zero its value. 715 annot.v = a.Zero(annot.v) 716 } 717 718 annot.valid = true 719 for i := int16(0); i <= n.count; i++ { 720 if !n.leaf { 721 v, ok := n.children[i].Annotation(a) 722 annot.v = a.Merge(v, annot.v) 723 annot.valid = annot.valid && ok 724 } 725 if i < n.count { 726 v, ok := a.Accumulate(n.items[i], annot.v) 727 annot.v = v 728 annot.valid = annot.valid && ok 729 } 730 } 731 return annot.v, annot.valid 732 } 733 734 func (n *node) verifyInvariants() { 735 recomputedSubtreeCount := int(n.count) 736 if !n.leaf { 737 for i := int16(0); i <= n.count; i++ { 738 n.children[i].verifyInvariants() 739 recomputedSubtreeCount += n.children[i].subtreeCount 740 } 741 } 742 if recomputedSubtreeCount != n.subtreeCount { 743 panic(fmt.Sprintf("recomputed subtree count (%d) ≠ n.subtreeCount (%d)", 744 recomputedSubtreeCount, n.subtreeCount)) 745 } 746 } 747 748 // btree is an implementation of a B-Tree. 749 // 750 // btree stores FileMetadata in an ordered structure, allowing easy insertion, 751 // removal, and iteration. The B-Tree stores items in order based on cmp. The 752 // first level of the LSM uses a cmp function that compares sequence numbers. 753 // All other levels compare using the FileMetadata.Smallest. 
754 // 755 // Write operations are not safe for concurrent mutation by multiple 756 // goroutines, but Read operations are. 757 type btree struct { 758 root *node 759 cmp btreeCmp 760 } 761 762 // Release dereferences and clears the root node of the btree, removing all 763 // items from the btree. In doing so, it decrements contained file counts. 764 // It returns a slice of newly obsolete backing files, if any. 765 func (t *btree) Release() (obsolete []*FileBacking) { 766 if t.root != nil { 767 t.root.decRef(true /* contentsToo */, &obsolete) 768 t.root = nil 769 } 770 return obsolete 771 } 772 773 // Clone clones the btree, lazily. It does so in constant time. 774 func (t *btree) Clone() btree { 775 c := *t 776 if c.root != nil { 777 // Incrementing the reference count on the root node is sufficient to 778 // ensure that no node in the cloned tree can be mutated by an actor 779 // holding a reference to the original tree and vice versa. This 780 // property is upheld because the root node in the receiver btree and 781 // the returned btree will both necessarily have a reference count of at 782 // least 2 when this method returns. All tree mutations recursively 783 // acquire mutable node references (see mut) as they traverse down the 784 // tree. The act of acquiring a mutable node reference performs a clone 785 // if a node's reference count is greater than one. Cloning a node (see 786 // clone) increases the reference count on each of its children, 787 // ensuring that they have a reference count of at least 2. This, in 788 // turn, ensures that any of the child nodes that are modified will also 789 // be copied-on-write, recursively ensuring the immutability property 790 // over the entire tree. 791 c.root.incRef() 792 } 793 return c 794 } 795 796 // Delete removes the provided file from the tree. 797 // It returns true if the file now has a zero reference count. 
798 func (t *btree) Delete(item *FileMetadata) (obsolete bool) { 799 if t.root == nil || t.root.count == 0 { 800 return false 801 } 802 if out := mut(&t.root).Remove(t.cmp, item); out != nil { 803 obsolete = out.Unref() == 0 804 } 805 if invariants.Enabled { 806 t.root.verifyInvariants() 807 } 808 if t.root.count == 0 { 809 old := t.root 810 if t.root.leaf { 811 t.root = nil 812 } else { 813 t.root = t.root.children[0] 814 } 815 old.decRef(false /* contentsToo */, nil) 816 } 817 return obsolete 818 } 819 820 // Insert adds the given item to the tree. If a item in the tree already 821 // equals the given one, Insert panics. 822 func (t *btree) Insert(item *FileMetadata) error { 823 if t.root == nil { 824 t.root = newLeafNode() 825 } else if t.root.count >= maxItems { 826 splitLa, splitNode := mut(&t.root).split(maxItems / 2) 827 newRoot := newNode() 828 newRoot.count = 1 829 newRoot.items[0] = splitLa 830 newRoot.children[0] = t.root 831 newRoot.children[1] = splitNode 832 newRoot.subtreeCount = t.root.subtreeCount + splitNode.subtreeCount + 1 833 t.root = newRoot 834 } 835 item.Ref() 836 err := mut(&t.root).Insert(t.cmp, item) 837 if invariants.Enabled { 838 t.root.verifyInvariants() 839 } 840 return err 841 } 842 843 // Iter returns a new iterator object. It is not safe to continue using an 844 // iterator after modifications are made to the tree. If modifications are made, 845 // create a new iterator. 846 func (t *btree) Iter() iterator { 847 return iterator{r: t.root, pos: -1, cmp: t.cmp} 848 } 849 850 // Count returns the number of files contained within the B-Tree. 851 func (t *btree) Count() int { 852 if t.root == nil { 853 return 0 854 } 855 return t.root.subtreeCount 856 } 857 858 // String returns a string description of the tree. The format is 859 // similar to the https://en.wikipedia.org/wiki/Newick_format. 
860 func (t *btree) String() string { 861 if t.Count() == 0 { 862 return ";" 863 } 864 var b strings.Builder 865 t.root.writeString(&b) 866 return b.String() 867 } 868 869 func (n *node) writeString(b *strings.Builder) { 870 if n.leaf { 871 for i := int16(0); i < n.count; i++ { 872 if i != 0 { 873 b.WriteString(",") 874 } 875 b.WriteString(n.items[i].String()) 876 } 877 return 878 } 879 for i := int16(0); i <= n.count; i++ { 880 b.WriteString("(") 881 n.children[i].writeString(b) 882 b.WriteString(")") 883 if i < n.count { 884 b.WriteString(n.items[i].String()) 885 } 886 } 887 } 888 889 // iterStack represents a stack of (node, pos) tuples, which captures 890 // iteration state as an iterator descends a btree. 891 type iterStack struct { 892 // a contains aLen stack frames when an iterator stack is short enough. 893 // If the iterator stack overflows the capacity of iterStackArr, the stack 894 // is moved to s and aLen is set to -1. 895 a iterStackArr 896 aLen int16 // -1 when using s 897 s []iterFrame 898 } 899 900 // Used to avoid allocations for stacks below a certain size. 901 type iterStackArr [3]iterFrame 902 903 type iterFrame struct { 904 n *node 905 pos int16 906 } 907 908 func (is *iterStack) push(f iterFrame) { 909 if is.aLen == -1 { 910 is.s = append(is.s, f) 911 } else if int(is.aLen) == len(is.a) { 912 is.s = make([]iterFrame, int(is.aLen)+1, 2*int(is.aLen)) 913 copy(is.s, is.a[:]) 914 is.s[int(is.aLen)] = f 915 is.aLen = -1 916 } else { 917 is.a[is.aLen] = f 918 is.aLen++ 919 } 920 } 921 922 func (is *iterStack) pop() iterFrame { 923 if is.aLen == -1 { 924 f := is.s[len(is.s)-1] 925 is.s = is.s[:len(is.s)-1] 926 return f 927 } 928 is.aLen-- 929 return is.a[is.aLen] 930 } 931 932 func (is *iterStack) len() int { 933 if is.aLen == -1 { 934 return len(is.s) 935 } 936 return int(is.aLen) 937 } 938 939 func (is *iterStack) clone() iterStack { 940 // If the iterator is using the embedded iterStackArr, we only need to 941 // copy the struct itself. 
942 if is.s == nil { 943 return *is 944 } 945 clone := *is 946 clone.s = make([]iterFrame, len(is.s)) 947 copy(clone.s, is.s) 948 return clone 949 } 950 951 func (is *iterStack) nth(n int) (f iterFrame, ok bool) { 952 if is.aLen == -1 { 953 if n >= len(is.s) { 954 return f, false 955 } 956 return is.s[n], true 957 } 958 if int16(n) >= is.aLen { 959 return f, false 960 } 961 return is.a[n], true 962 } 963 964 func (is *iterStack) reset() { 965 if is.aLen == -1 { 966 is.s = is.s[:0] 967 } else { 968 is.aLen = 0 969 } 970 } 971 972 // iterator is responsible for search and traversal within a btree. 973 type iterator struct { 974 // the root node of the B-Tree. 975 r *node 976 // n and pos make up the current position of the iterator. 977 // If valid, n.items[pos] is the current value of the iterator. 978 // 979 // n may be nil iff i.r is nil. 980 n *node 981 pos int16 982 // cmp dictates the ordering of the FileMetadata. 983 cmp func(*FileMetadata, *FileMetadata) int 984 // a stack of n's ancestors within the B-Tree, alongside the position 985 // taken to arrive at n. If non-empty, the bottommost frame of the stack 986 // will always contain the B-Tree root. 987 s iterStack 988 } 989 990 // countLeft returns the count of files that are to the left of the current 991 // iterator position. 992 func (i *iterator) countLeft() int { 993 if i.r == nil { 994 return 0 995 } 996 997 // Each iterator has a stack of frames marking the path from the root node 998 // to the current iterator position. All files (n.items) and all subtrees 999 // (n.children) with indexes less than [pos] are to the left of the current 1000 // iterator position. 
1001 // 1002 // +------------------------+ - 1003 // | Root pos:5 | | 1004 // +------------------------+ | stack 1005 // | Root/5 pos:3 | | frames 1006 // +------------------------+ | [i.s] 1007 // | Root/5/3 pos:9 | | 1008 // +========================+ - 1009 // | | 1010 // | i.n: Root/5/3/9 i.pos:2| 1011 // +------------------------+ 1012 // 1013 var count int 1014 // Walk all the ancestors in the iterator stack [i.s], tallying up all the 1015 // files and subtrees to the left of the stack frame's position. 1016 f, ok := i.s.nth(0) 1017 for fi := 0; ok; fi++ { 1018 // There are [f.pos] files contained within [f.n.items] that sort to the 1019 // left of the subtree the iterator has descended. 1020 count += int(f.pos) 1021 // Any subtrees that fall before the stack frame's position are entirely 1022 // to the left of the iterator's current position. 1023 for j := int16(0); j < f.pos; j++ { 1024 count += f.n.children[j].subtreeCount 1025 } 1026 f, ok = i.s.nth(fi + 1) 1027 } 1028 1029 // The bottommost stack frame is inlined within the iterator struct. Again, 1030 // [i.pos] files fall to the left of the current iterator position. 1031 count += int(i.pos) 1032 if !i.n.leaf { 1033 // NB: Unlike above, we use a `<= i.pos` comparison. The iterator is 1034 // positioned at item `i.n.items[i.pos]`, which sorts after everything 1035 // in the subtree at `i.n.children[i.pos]`. 
1036 for j := int16(0); j <= i.pos; j++ { 1037 count += i.n.children[j].subtreeCount 1038 } 1039 } 1040 return count 1041 } 1042 1043 func (i *iterator) clone() iterator { 1044 c := *i 1045 c.s = i.s.clone() 1046 return c 1047 } 1048 1049 func (i *iterator) reset() { 1050 i.n = i.r 1051 i.pos = -1 1052 i.s.reset() 1053 } 1054 1055 func (i iterator) String() string { 1056 var buf bytes.Buffer 1057 for n := 0; ; n++ { 1058 f, ok := i.s.nth(n) 1059 if !ok { 1060 break 1061 } 1062 fmt.Fprintf(&buf, "%p: %02d/%02d\n", f.n, f.pos, f.n.count) 1063 } 1064 if i.r == nil { 1065 fmt.Fprintf(&buf, "<nil>: %02d", i.pos) 1066 } else { 1067 fmt.Fprintf(&buf, "%p: %02d/%02d", i.n, i.pos, i.n.count) 1068 } 1069 return buf.String() 1070 } 1071 1072 func cmpIter(a, b iterator) int { 1073 if a.r != b.r { 1074 panic("compared iterators from different btrees") 1075 } 1076 1077 // Each iterator has a stack of frames marking the path from the root node 1078 // to the current iterator position. We walk both paths formed by the 1079 // iterators' stacks simultaneously, descending from the shared root node, 1080 // always comparing nodes at the same level in the tree. 1081 // 1082 // If the iterators' paths ever diverge and point to different nodes, the 1083 // iterators are not equal and we use the node positions to evaluate the 1084 // comparison. 1085 // 1086 // If an iterator's stack ends, we stop descending and use its current 1087 // node and position for the final comparison. One iterator's stack may 1088 // end before another's if one iterator is positioned deeper in the tree. 
1089 // 1090 // a b 1091 // +------------------------+ +--------------------------+ - 1092 // | Root pos:5 | = | Root pos:5 | | 1093 // +------------------------+ +--------------------------+ | stack 1094 // | Root/5 pos:3 | = | Root/5 pos:3 | | frames 1095 // +------------------------+ +--------------------------+ | 1096 // | Root/5/3 pos:9 | > | Root/5/3 pos:1 | | 1097 // +========================+ +==========================+ - 1098 // | | | | 1099 // | a.n: Root/5/3/9 a.pos:2| | b.n: Root/5/3/1, b.pos:5 | 1100 // +------------------------+ +--------------------------+ 1101 1102 // Initialize with the iterator's current node and position. These are 1103 // conceptually the most-recent/current frame of the iterator stack. 1104 an, apos := a.n, a.pos 1105 bn, bpos := b.n, b.pos 1106 1107 // aok, bok are set while traversing the iterator's path down the B-Tree. 1108 // They're declared in the outer scope because they help distinguish the 1109 // sentinel case when both iterators' first frame points to the last child 1110 // of the root. If an iterator has no other frames in its stack, it's the 1111 // end sentinel state which sorts after everything else. 1112 var aok, bok bool 1113 for i := 0; ; i++ { 1114 var af, bf iterFrame 1115 af, aok = a.s.nth(i) 1116 bf, bok = b.s.nth(i) 1117 if !aok || !bok { 1118 if aok { 1119 // Iterator a, unlike iterator b, still has a frame. Set an, 1120 // apos so we compare using the frame from the stack. 1121 an, apos = af.n, af.pos 1122 } 1123 if bok { 1124 // Iterator b, unlike iterator a, still has a frame. Set bn, 1125 // bpos so we compare using the frame from the stack. 1126 bn, bpos = bf.n, bf.pos 1127 } 1128 break 1129 } 1130 1131 // aok && bok 1132 if af.n != bf.n { 1133 panic("nonmatching nodes during btree iterator comparison") 1134 } 1135 if v := stdcmp.Compare(af.pos, bf.pos); v != 0 { 1136 return v 1137 } 1138 // Otherwise continue up both iterators' stacks (equivalently, down the 1139 // B-Tree away from the root). 
1140 } 1141 1142 if aok && bok { 1143 panic("expected one or more stacks to have been exhausted") 1144 } 1145 if an != bn { 1146 panic("nonmatching nodes during btree iterator comparison") 1147 } 1148 if v := stdcmp.Compare(apos, bpos); v != 0 { 1149 return v 1150 } 1151 switch { 1152 case aok: 1153 // a is positioned at a leaf child at this position and b is at an 1154 // end sentinel state. 1155 return -1 1156 case bok: 1157 // b is positioned at a leaf child at this position and a is at an 1158 // end sentinel state. 1159 return +1 1160 default: 1161 return 0 1162 } 1163 } 1164 1165 func (i *iterator) descend(n *node, pos int16) { 1166 i.s.push(iterFrame{n: n, pos: pos}) 1167 i.n = n.children[pos] 1168 i.pos = 0 1169 } 1170 1171 // ascend ascends up to the current node's parent and resets the position 1172 // to the one previously set for this parent node. 1173 func (i *iterator) ascend() { 1174 f := i.s.pop() 1175 i.n = f.n 1176 i.pos = f.pos 1177 } 1178 1179 // seek repositions the iterator over the first file for which fn returns 1180 // true, mirroring the semantics of the standard library's sort.Search 1181 // function. Like sort.Search, seek requires the iterator's B-Tree to be 1182 // ordered such that fn returns false for some (possibly empty) prefix of the 1183 // tree's files, and then true for the (possibly empty) remainder. 1184 func (i *iterator) seek(fn func(*FileMetadata) bool) { 1185 i.reset() 1186 if i.r == nil { 1187 return 1188 } 1189 1190 for { 1191 // Logic copied from sort.Search. 
1192 j, k := 0, int(i.n.count) 1193 for j < k { 1194 h := int(uint(j+k) >> 1) // avoid overflow when computing h 1195 1196 // j ≤ h < k 1197 if !fn(i.n.items[h]) { 1198 j = h + 1 // preserves f(j-1) == false 1199 } else { 1200 k = h // preserves f(k) == true 1201 } 1202 } 1203 1204 i.pos = int16(j) 1205 if i.n.leaf { 1206 if i.pos == i.n.count { 1207 i.next() 1208 } 1209 return 1210 } 1211 i.descend(i.n, i.pos) 1212 } 1213 } 1214 1215 // first seeks to the first item in the btree. 1216 func (i *iterator) first() { 1217 i.reset() 1218 if i.r == nil { 1219 return 1220 } 1221 for !i.n.leaf { 1222 i.descend(i.n, 0) 1223 } 1224 i.pos = 0 1225 } 1226 1227 // last seeks to the last item in the btree. 1228 func (i *iterator) last() { 1229 i.reset() 1230 if i.r == nil { 1231 return 1232 } 1233 for !i.n.leaf { 1234 i.descend(i.n, i.n.count) 1235 } 1236 i.pos = i.n.count - 1 1237 } 1238 1239 // next positions the iterator to the item immediately following 1240 // its current position. 1241 func (i *iterator) next() { 1242 if i.r == nil { 1243 return 1244 } 1245 1246 if i.n.leaf { 1247 if i.pos < i.n.count { 1248 i.pos++ 1249 } 1250 if i.pos < i.n.count { 1251 return 1252 } 1253 for i.s.len() > 0 && i.pos >= i.n.count { 1254 i.ascend() 1255 } 1256 return 1257 } 1258 1259 i.descend(i.n, i.pos+1) 1260 for !i.n.leaf { 1261 i.descend(i.n, 0) 1262 } 1263 i.pos = 0 1264 } 1265 1266 // prev positions the iterator to the item immediately preceding 1267 // its current position. 1268 func (i *iterator) prev() { 1269 if i.r == nil { 1270 return 1271 } 1272 1273 if i.n.leaf { 1274 i.pos-- 1275 if i.pos >= 0 { 1276 return 1277 } 1278 for i.s.len() > 0 && i.pos < 0 { 1279 i.ascend() 1280 i.pos-- 1281 } 1282 return 1283 } 1284 1285 i.descend(i.n, i.pos) 1286 for !i.n.leaf { 1287 i.descend(i.n, i.n.count) 1288 } 1289 i.pos = i.n.count - 1 1290 } 1291 1292 // valid returns whether the iterator is positioned at a valid position. 
1293 func (i *iterator) valid() bool { 1294 return i.r != nil && i.pos >= 0 && i.pos < i.n.count 1295 } 1296 1297 // cur returns the item at the iterator's current position. It is illegal 1298 // to call cur if the iterator is not valid. 1299 func (i *iterator) cur() *FileMetadata { 1300 if invariants.Enabled && !i.valid() { 1301 panic("btree iterator.cur invoked on invalid iterator") 1302 } 1303 return i.n.items[i.pos] 1304 }