github.com/cockroachdb/pebble@v1.1.2/internal/manifest/btree.go (about) 1 // Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package manifest 6 7 import ( 8 "bytes" 9 "fmt" 10 "strings" 11 "sync/atomic" 12 "unsafe" 13 14 "github.com/cockroachdb/errors" 15 "github.com/cockroachdb/pebble/internal/invariants" 16 ) 17 18 // The Annotator type defined below is used by other packages to lazily 19 // compute a value over a B-Tree. Each node of the B-Tree stores one 20 // `annotation` per annotator, containing the result of the computation over 21 // the node's subtree. 22 // 23 // An annotation is marked as valid if it's current with the current subtree 24 // state. Annotations are marked as invalid whenever a node will be mutated 25 // (in mut). Annotators may also return `false` from `Accumulate` to signal 26 // that a computation for a file is not stable and may change in the future. 27 // Annotations that include these unstable values are also marked as invalid 28 // on the node, ensuring that future queries for the annotation will recompute 29 // the value. 30 31 // An Annotator defines a computation over a level's FileMetadata. If the 32 // computation is stable and uses inputs that are fixed for the lifetime of 33 // a FileMetadata, the LevelMetadata's internal data structures are annotated 34 // with the intermediary computations. This allows the computation to be 35 // computed incrementally as edits are applied to a level. 36 type Annotator interface { 37 // Zero returns the zero value of an annotation. This value is returned 38 // when a LevelMetadata is empty. The dst argument, if non-nil, is an 39 // obsolete value previously returned by this Annotator and may be 40 // overwritten and reused to avoid a memory allocation. 41 Zero(dst interface{}) (v interface{}) 42 43 // Accumulate computes the annotation for a single file in a level's 44 // metadata. It merges the file's value into dst and returns a bool flag 45 // indicating whether or not the value is stable and okay to cache as an 46 // annotation. If the file's value may change over the life of the file, 47 // the annotator must return false. 48 // 49 // Implementations may modify dst and return it to avoid an allocation. 50 Accumulate(m *FileMetadata, dst interface{}) (v interface{}, cacheOK bool) 51 52 // Merge combines two values src and dst, returning the result. 53 // Implementations may modify dst and return it to avoid an allocation. 54 Merge(src interface{}, dst interface{}) interface{} 55 } 56 57 type btreeCmp func(*FileMetadata, *FileMetadata) int 58 59 func btreeCmpSeqNum(a, b *FileMetadata) int { 60 return a.cmpSeqNum(b) 61 } 62 63 func btreeCmpSmallestKey(cmp Compare) btreeCmp { 64 return func(a, b *FileMetadata) int { 65 return a.cmpSmallestKey(b, cmp) 66 } 67 } 68 69 // btreeCmpSpecificOrder is used in tests to construct a B-Tree with a 70 // specific ordering of FileMetadata within the tree. It's typically used to 71 // test consistency checking code that needs to construct a malformed B-Tree. 72 func btreeCmpSpecificOrder(files []*FileMetadata) btreeCmp { 73 m := map[*FileMetadata]int{} 74 for i, f := range files { 75 m[f] = i 76 } 77 return func(a, b *FileMetadata) int { 78 ai, aok := m[a] 79 bi, bok := m[b] 80 if !aok || !bok { 81 panic("btreeCmpSliceOrder called with unknown files") 82 } 83 switch { 84 case ai < bi: 85 return -1 86 case ai > bi: 87 return +1 88 default: 89 return 0 90 } 91 } 92 } 93 94 const ( 95 degree = 16 96 maxItems = 2*degree - 1 97 minItems = degree - 1 98 ) 99 100 type annotation struct { 101 annotator Annotator 102 // v is an annotation value, the output of either 103 // annotator.Value or annotator.Merge. 104 v interface{} 105 // valid indicates whether future reads of the annotation may use v as-is. 106 // If false, v will be zeroed and recalculated. 107 valid bool 108 } 109 110 type leafNode struct { 111 ref atomic.Int32 112 count int16 113 leaf bool 114 // subtreeCount holds the count of files in the entire subtree formed by 115 // this node. For leaf nodes, subtreeCount is always equal to count. For 116 // non-leaf nodes, it's the sum of count plus all the children's 117 // subtreeCounts. 118 // 119 // NB: We could move this field to the end of the node struct, since leaf => 120 // count=subtreeCount, however the unsafe casting [leafToNode] performs make 121 // it risky and cumbersome. 122 subtreeCount int 123 items [maxItems]*FileMetadata 124 // annot contains one annotation per annotator, merged over the entire 125 // node's files (and all descendants for non-leaf nodes). 126 annot []annotation 127 } 128 129 type node struct { 130 leafNode 131 children [maxItems + 1]*node 132 } 133 134 //go:nocheckptr casts a ptr to a smaller struct to a ptr to a larger struct. 135 func leafToNode(ln *leafNode) *node { 136 return (*node)(unsafe.Pointer(ln)) 137 } 138 139 func newLeafNode() *node { 140 n := leafToNode(new(leafNode)) 141 n.leaf = true 142 n.ref.Store(1) 143 return n 144 } 145 146 func newNode() *node { 147 n := new(node) 148 n.ref.Store(1) 149 return n 150 } 151 152 // mut creates and returns a mutable node reference. If the node is not shared 153 // with any other trees then it can be modified in place. Otherwise, it must be 154 // cloned to ensure unique ownership. In this way, we enforce a copy-on-write 155 // policy which transparently incorporates the idea of local mutations, like 156 // Clojure's transients or Haskell's ST monad, where nodes are only copied 157 // during the first time that they are modified between Clone operations. 158 // 159 // When a node is cloned, the provided pointer will be redirected to the new 160 // mutable node. 161 func mut(n **node) *node { 162 if (*n).ref.Load() == 1 { 163 // Exclusive ownership. Can mutate in place. 164 165 // Whenever a node will be mutated, reset its annotations to be marked 166 // as uncached. This ensures any future calls to (*node).annotation 167 // will recompute annotations on the modified subtree. 168 for i := range (*n).annot { 169 (*n).annot[i].valid = false 170 } 171 return *n 172 } 173 // If we do not have unique ownership over the node then we 174 // clone it to gain unique ownership. After doing so, we can 175 // release our reference to the old node. We pass recursive 176 // as true because even though we just observed the node's 177 // reference count to be greater than 1, we might be racing 178 // with another call to decRef on this node. 179 c := (*n).clone() 180 (*n).decRef(true /* contentsToo */, nil) 181 *n = c 182 // NB: We don't need to clear annotations, because (*node).clone does not 183 // copy them. 184 return *n 185 } 186 187 // incRef acquires a reference to the node. 188 func (n *node) incRef() { 189 n.ref.Add(1) 190 } 191 192 // decRef releases a reference to the node. If requested, the method will unref 193 // its items and recurse into child nodes and decrease their refcounts as well. 194 // Some internal codepaths that manually copy the node's items or children to 195 // new nodes pass contentsToo=false to preserve existing reference counts during 196 // operations that should yield a net-zero change to descendant refcounts. 197 // When a node is released, its contained files are dereferenced. 198 func (n *node) decRef(contentsToo bool, obsolete *[]*FileBacking) { 199 if n.ref.Add(-1) > 0 { 200 // Other references remain. Can't free. 201 return 202 } 203 204 // Dereference the node's metadata and release child references if 205 // requested. Some internal callers may not want to propagate the deref 206 // because they're manually copying the filemetadata and children to other 207 // nodes, and they want to preserve the existing reference count. 208 if contentsToo { 209 for _, f := range n.items[:n.count] { 210 if f.Unref() == 0 { 211 // There are two sources of node dereferences: tree mutations 212 // and Version dereferences. Files should only be made obsolete 213 // during Version dereferences, during which `obsolete` will be 214 // non-nil. 215 if obsolete == nil { 216 panic(fmt.Sprintf("file metadata %s dereferenced to zero during tree mutation", f.FileNum)) 217 } 218 // Reference counting is performed on the FileBacking. In the case 219 // of a virtual sstable, this reference counting is performed on 220 // a FileBacking which is shared by every single virtual sstable 221 // with the same backing sstable. If the reference count hits 0, 222 // then we know that the FileBacking won't be required by any 223 // sstable in Pebble, and that the backing sstable can be deleted. 224 *obsolete = append(*obsolete, f.FileBacking) 225 } 226 } 227 if !n.leaf { 228 for i := int16(0); i <= n.count; i++ { 229 n.children[i].decRef(true /* contentsToo */, obsolete) 230 } 231 } 232 } 233 } 234 235 // clone creates a clone of the receiver with a single reference count. 236 func (n *node) clone() *node { 237 var c *node 238 if n.leaf { 239 c = newLeafNode() 240 } else { 241 c = newNode() 242 } 243 // NB: copy field-by-field without touching n.ref to avoid 244 // triggering the race detector and looking like a data race. 245 c.count = n.count 246 c.items = n.items 247 c.subtreeCount = n.subtreeCount 248 // Increase the refcount of each contained item. 249 for _, f := range n.items[:n.count] { 250 f.Ref() 251 } 252 if !c.leaf { 253 // Copy children and increase each refcount. 254 c.children = n.children 255 for i := int16(0); i <= c.count; i++ { 256 c.children[i].incRef() 257 } 258 } 259 return c 260 } 261 262 // insertAt inserts the provided file and node at the provided index. This 263 // function is for use only as a helper function for internal B-Tree code. 264 // Clients should not invoke it directly. 265 func (n *node) insertAt(index int, item *FileMetadata, nd *node) { 266 if index < int(n.count) { 267 copy(n.items[index+1:n.count+1], n.items[index:n.count]) 268 if !n.leaf { 269 copy(n.children[index+2:n.count+2], n.children[index+1:n.count+1]) 270 } 271 } 272 n.items[index] = item 273 if !n.leaf { 274 n.children[index+1] = nd 275 } 276 n.count++ 277 } 278 279 // pushBack inserts the provided file and node at the tail of the node's items. 280 // This function is for use only as a helper function for internal B-Tree code. 281 // Clients should not invoke it directly. 282 func (n *node) pushBack(item *FileMetadata, nd *node) { 283 n.items[n.count] = item 284 if !n.leaf { 285 n.children[n.count+1] = nd 286 } 287 n.count++ 288 } 289 290 // pushFront inserts the provided file and node at the head of the 291 // node's items. This function is for use only as a helper function for internal B-Tree 292 // code. Clients should not invoke it directly. 293 func (n *node) pushFront(item *FileMetadata, nd *node) { 294 if !n.leaf { 295 copy(n.children[1:n.count+2], n.children[:n.count+1]) 296 n.children[0] = nd 297 } 298 copy(n.items[1:n.count+1], n.items[:n.count]) 299 n.items[0] = item 300 n.count++ 301 } 302 303 // removeAt removes a value at a given index, pulling all subsequent values 304 // back. This function is for use only as a helper function for internal B-Tree 305 // code. Clients should not invoke it directly. 306 func (n *node) removeAt(index int) (*FileMetadata, *node) { 307 var child *node 308 if !n.leaf { 309 child = n.children[index+1] 310 copy(n.children[index+1:n.count], n.children[index+2:n.count+1]) 311 n.children[n.count] = nil 312 } 313 n.count-- 314 out := n.items[index] 315 copy(n.items[index:n.count], n.items[index+1:n.count+1]) 316 n.items[n.count] = nil 317 return out, child 318 } 319 320 // popBack removes and returns the last element in the list. This function is 321 // for use only as a helper function for internal B-Tree code. Clients should 322 // not invoke it directly. 323 func (n *node) popBack() (*FileMetadata, *node) { 324 n.count-- 325 out := n.items[n.count] 326 n.items[n.count] = nil 327 if n.leaf { 328 return out, nil 329 } 330 child := n.children[n.count+1] 331 n.children[n.count+1] = nil 332 return out, child 333 } 334 335 // popFront removes and returns the first element in the list. This function is 336 // for use only as a helper function for internal B-Tree code. Clients should 337 // not invoke it directly. 338 func (n *node) popFront() (*FileMetadata, *node) { 339 n.count-- 340 var child *node 341 if !n.leaf { 342 child = n.children[0] 343 copy(n.children[:n.count+1], n.children[1:n.count+2]) 344 n.children[n.count+1] = nil 345 } 346 out := n.items[0] 347 copy(n.items[:n.count], n.items[1:n.count+1]) 348 n.items[n.count] = nil 349 return out, child 350 } 351 352 // find returns the index where the given item should be inserted into this 353 // list. 'found' is true if the item already exists in the list at the given 354 // index. 355 // 356 // This function is for use only as a helper function for internal B-Tree code. 357 // Clients should not invoke it directly. 358 func (n *node) find(cmp btreeCmp, item *FileMetadata) (index int, found bool) { 359 // Logic copied from sort.Search. Inlining this gave 360 // an 11% speedup on BenchmarkBTreeDeleteInsert. 361 i, j := 0, int(n.count) 362 for i < j { 363 h := int(uint(i+j) >> 1) // avoid overflow when computing h 364 // i ≤ h < j 365 v := cmp(item, n.items[h]) 366 if v == 0 { 367 return h, true 368 } else if v > 0 { 369 i = h + 1 370 } else { 371 j = h 372 } 373 } 374 return i, false 375 } 376 377 // split splits the given node at the given index. The current node shrinks, 378 // and this function returns the item that existed at that index and a new 379 // node containing all items/children after it. 380 // 381 // split is called when we want to perform a transformation like the one 382 // depicted in the following diagram. 383 // 384 // Before: 385 // +-----------+ 386 // n *node | x y z | 387 // +--/-/-\-\--+ 388 // 389 // After: 390 // +-----------+ 391 // | y | n's parent 392 // +----/-\----+ 393 // / \ 394 // v v 395 // +-----------+ +-----------+ 396 // n *node | x | | z | next *node 397 // +-----------+ +-----------+ 398 // 399 // split does not perform the complete transformation; the caller is responsible 400 // for updating the parent appropriately. split splits `n` into two nodes, `n` 401 // and `next`, returning `next` and the file that separates them. In the diagram 402 // above, `n.split` removes y and z from `n`, returning y in the first return 403 // value and `next` in the second return value. The caller is responsible for 404 // updating n's parent to now contain `y` as the separator between nodes `n` and 405 // `next`. 406 // 407 // This function is for use only as a helper function for internal B-Tree code. 408 // Clients should not invoke it directly. 409 func (n *node) split(i int) (*FileMetadata, *node) { 410 out := n.items[i] 411 var next *node 412 if n.leaf { 413 next = newLeafNode() 414 } else { 415 next = newNode() 416 } 417 next.count = n.count - int16(i+1) 418 copy(next.items[:], n.items[i+1:n.count]) 419 for j := int16(i); j < n.count; j++ { 420 n.items[j] = nil 421 } 422 if !n.leaf { 423 copy(next.children[:], n.children[i+1:n.count+1]) 424 descendantsMoved := 0 425 for j := int16(i + 1); j <= n.count; j++ { 426 descendantsMoved += n.children[j].subtreeCount 427 n.children[j] = nil 428 } 429 n.subtreeCount -= descendantsMoved 430 next.subtreeCount += descendantsMoved 431 } 432 n.count = int16(i) 433 // NB: We subtract one more than `next.count` from n's subtreeCount because 434 // the item at index `i` was removed from `n.items`. We'll return the item 435 // at index `i`, and the caller is responsible for updating the subtree 436 // count of whichever node adopts it. 437 n.subtreeCount -= int(next.count) + 1 438 next.subtreeCount += int(next.count) 439 return out, next 440 } 441 442 // Insert inserts a item into the subtree rooted at this node, making sure no 443 // nodes in the subtree exceed maxItems items. 444 func (n *node) Insert(cmp btreeCmp, item *FileMetadata) error { 445 i, found := n.find(cmp, item) 446 if found { 447 // cmp provides a total ordering of the files within a level. 448 // If we're inserting a metadata that's equal to an existing item 449 // in the tree, we're inserting a file into a level twice. 450 return errors.Errorf("files %s and %s collided on sort keys", 451 errors.Safe(item.FileNum), errors.Safe(n.items[i].FileNum)) 452 } 453 if n.leaf { 454 n.insertAt(i, item, nil) 455 n.subtreeCount++ 456 return nil 457 } 458 if n.children[i].count >= maxItems { 459 splitLa, splitNode := mut(&n.children[i]).split(maxItems / 2) 460 n.insertAt(i, splitLa, splitNode) 461 462 switch cmp := cmp(item, n.items[i]); { 463 case cmp < 0: 464 // no change, we want first split node 465 case cmp > 0: 466 i++ // we want second split node 467 default: 468 // cmp provides a total ordering of the files within a level. 469 // If we're inserting a metadata that's equal to an existing item 470 // in the tree, we're inserting a file into a level twice. 471 return errors.Errorf("files %s and %s collided on sort keys", 472 errors.Safe(item.FileNum), errors.Safe(n.items[i].FileNum)) 473 } 474 } 475 476 err := mut(&n.children[i]).Insert(cmp, item) 477 if err == nil { 478 n.subtreeCount++ 479 } 480 return err 481 } 482 483 // removeMax removes and returns the maximum item from the subtree rooted at 484 // this node. This function is for use only as a helper function for internal 485 // B-Tree code. Clients should not invoke it directly. 486 func (n *node) removeMax() *FileMetadata { 487 if n.leaf { 488 n.count-- 489 n.subtreeCount-- 490 out := n.items[n.count] 491 n.items[n.count] = nil 492 return out 493 } 494 child := mut(&n.children[n.count]) 495 if child.count <= minItems { 496 n.rebalanceOrMerge(int(n.count)) 497 return n.removeMax() 498 } 499 n.subtreeCount-- 500 return child.removeMax() 501 } 502 503 // Remove removes a item from the subtree rooted at this node. Returns 504 // the item that was removed or nil if no matching item was found. 505 func (n *node) Remove(cmp btreeCmp, item *FileMetadata) (out *FileMetadata) { 506 i, found := n.find(cmp, item) 507 if n.leaf { 508 if found { 509 out, _ = n.removeAt(i) 510 n.subtreeCount-- 511 return out 512 } 513 return nil 514 } 515 if n.children[i].count <= minItems { 516 // Child not large enough to remove from. 517 n.rebalanceOrMerge(i) 518 return n.Remove(cmp, item) 519 } 520 child := mut(&n.children[i]) 521 if found { 522 // Replace the item being removed with the max item in our left child. 523 out = n.items[i] 524 n.items[i] = child.removeMax() 525 n.subtreeCount-- 526 return out 527 } 528 // File is not in this node and child is large enough to remove from. 529 out = child.Remove(cmp, item) 530 if out != nil { 531 n.subtreeCount-- 532 } 533 return out 534 } 535 536 // rebalanceOrMerge grows child 'i' to ensure it has sufficient room to remove a 537 // item from it while keeping it at or above minItems. This function is for use 538 // only as a helper function for internal B-Tree code. Clients should not invoke 539 // it directly. 540 func (n *node) rebalanceOrMerge(i int) { 541 switch { 542 case i > 0 && n.children[i-1].count > minItems: 543 // Rebalance from left sibling. 544 // 545 // +-----------+ 546 // | y | 547 // +----/-\----+ 548 // / \ 549 // v v 550 // +-----------+ +-----------+ 551 // | x | | | 552 // +----------\+ +-----------+ 553 // \ 554 // v 555 // a 556 // 557 // After: 558 // 559 // +-----------+ 560 // | x | 561 // +----/-\----+ 562 // / \ 563 // v v 564 // +-----------+ +-----------+ 565 // | | | y | 566 // +-----------+ +/----------+ 567 // / 568 // v 569 // a 570 // 571 left := mut(&n.children[i-1]) 572 child := mut(&n.children[i]) 573 xLa, grandChild := left.popBack() 574 yLa := n.items[i-1] 575 child.pushFront(yLa, grandChild) 576 n.items[i-1] = xLa 577 child.subtreeCount++ 578 left.subtreeCount-- 579 if grandChild != nil { 580 child.subtreeCount += grandChild.subtreeCount 581 left.subtreeCount -= grandChild.subtreeCount 582 } 583 584 case i < int(n.count) && n.children[i+1].count > minItems: 585 // Rebalance from right sibling. 586 // 587 // +-----------+ 588 // | y | 589 // +----/-\----+ 590 // / \ 591 // v v 592 // +-----------+ +-----------+ 593 // | | | x | 594 // +-----------+ +/----------+ 595 // / 596 // v 597 // a 598 // 599 // After: 600 // 601 // +-----------+ 602 // | x | 603 // +----/-\----+ 604 // / \ 605 // v v 606 // +-----------+ +-----------+ 607 // | y | | | 608 // +----------\+ +-----------+ 609 // \ 610 // v 611 // a 612 // 613 right := mut(&n.children[i+1]) 614 child := mut(&n.children[i]) 615 xLa, grandChild := right.popFront() 616 yLa := n.items[i] 617 child.pushBack(yLa, grandChild) 618 child.subtreeCount++ 619 right.subtreeCount-- 620 if grandChild != nil { 621 child.subtreeCount += grandChild.subtreeCount 622 right.subtreeCount -= grandChild.subtreeCount 623 } 624 n.items[i] = xLa 625 626 default: 627 // Merge with either the left or right sibling. 628 // 629 // +-----------+ 630 // | u y v | 631 // +----/-\----+ 632 // / \ 633 // v v 634 // +-----------+ +-----------+ 635 // | x | | z | 636 // +-----------+ +-----------+ 637 // 638 // After: 639 // 640 // +-----------+ 641 // | u v | 642 // +-----|-----+ 643 // | 644 // v 645 // +-----------+ 646 // | x y z | 647 // +-----------+ 648 // 649 if i >= int(n.count) { 650 i = int(n.count - 1) 651 } 652 child := mut(&n.children[i]) 653 // Make mergeChild mutable, bumping the refcounts on its children if necessary. 654 _ = mut(&n.children[i+1]) 655 mergeLa, mergeChild := n.removeAt(i) 656 child.items[child.count] = mergeLa 657 copy(child.items[child.count+1:], mergeChild.items[:mergeChild.count]) 658 if !child.leaf { 659 copy(child.children[child.count+1:], mergeChild.children[:mergeChild.count+1]) 660 } 661 child.count += mergeChild.count + 1 662 child.subtreeCount += mergeChild.subtreeCount + 1 663 664 mergeChild.decRef(false /* contentsToo */, nil) 665 } 666 } 667 668 // InvalidateAnnotation removes any existing cached annotations for the provided 669 // annotator from this node's subtree. 670 func (n *node) InvalidateAnnotation(a Annotator) { 671 // Find this annotator's annotation on this node. 672 var annot *annotation 673 for i := range n.annot { 674 if n.annot[i].annotator == a { 675 annot = &n.annot[i] 676 } 677 } 678 679 if annot != nil && annot.valid { 680 annot.valid = false 681 annot.v = a.Zero(annot.v) 682 } 683 if !n.leaf { 684 for i := int16(0); i <= n.count; i++ { 685 n.children[i].InvalidateAnnotation(a) 686 } 687 } 688 } 689 690 // Annotation retrieves, computing if not already computed, the provided 691 // annotator's annotation of this node. The second return value indicates 692 // whether the future reads of this annotation may use the first return value 693 // as-is. If false, the annotation is not stable and may change on a subsequent 694 // computation. 695 func (n *node) Annotation(a Annotator) (interface{}, bool) { 696 // Find this annotator's annotation on this node. 697 var annot *annotation 698 for i := range n.annot { 699 if n.annot[i].annotator == a { 700 annot = &n.annot[i] 701 } 702 } 703 704 // If it exists and is marked as valid, we can return it without 705 // recomputing anything. 706 if annot != nil && annot.valid { 707 return annot.v, true 708 } 709 710 if annot == nil { 711 // This is n's first time being annotated by a. 712 // Create a new zeroed annotation. 713 n.annot = append(n.annot, annotation{ 714 annotator: a, 715 v: a.Zero(nil), 716 }) 717 annot = &n.annot[len(n.annot)-1] 718 } else { 719 // There's an existing annotation that must be recomputed. 720 // Zero its value. 721 annot.v = a.Zero(annot.v) 722 } 723 724 annot.valid = true 725 for i := int16(0); i <= n.count; i++ { 726 if !n.leaf { 727 v, ok := n.children[i].Annotation(a) 728 annot.v = a.Merge(v, annot.v) 729 annot.valid = annot.valid && ok 730 } 731 if i < n.count { 732 v, ok := a.Accumulate(n.items[i], annot.v) 733 annot.v = v 734 annot.valid = annot.valid && ok 735 } 736 } 737 return annot.v, annot.valid 738 } 739 740 func (n *node) verifyInvariants() { 741 recomputedSubtreeCount := int(n.count) 742 if !n.leaf { 743 for i := int16(0); i <= n.count; i++ { 744 n.children[i].verifyInvariants() 745 recomputedSubtreeCount += n.children[i].subtreeCount 746 } 747 } 748 if recomputedSubtreeCount != n.subtreeCount { 749 panic(fmt.Sprintf("recomputed subtree count (%d) ≠ n.subtreeCount (%d)", 750 recomputedSubtreeCount, n.subtreeCount)) 751 } 752 } 753 754 // btree is an implementation of a B-Tree. 755 // 756 // btree stores FileMetadata in an ordered structure, allowing easy insertion, 757 // removal, and iteration. The B-Tree stores items in order based on cmp. The 758 // first level of the LSM uses a cmp function that compares sequence numbers. 759 // All other levels compare using the FileMetadata.Smallest. 760 // 761 // Write operations are not safe for concurrent mutation by multiple 762 // goroutines, but Read operations are. 763 type btree struct { 764 root *node 765 cmp btreeCmp 766 } 767 768 // Release dereferences and clears the root node of the btree, removing all 769 // items from the btree. In doing so, it decrements contained file counts. 770 // It returns a slice of newly obsolete backing files, if any. 771 func (t *btree) Release() (obsolete []*FileBacking) { 772 if t.root != nil { 773 t.root.decRef(true /* contentsToo */, &obsolete) 774 t.root = nil 775 } 776 return obsolete 777 } 778 779 // Clone clones the btree, lazily. It does so in constant time. 780 func (t *btree) Clone() btree { 781 c := *t 782 if c.root != nil { 783 // Incrementing the reference count on the root node is sufficient to 784 // ensure that no node in the cloned tree can be mutated by an actor 785 // holding a reference to the original tree and vice versa. This 786 // property is upheld because the root node in the receiver btree and 787 // the returned btree will both necessarily have a reference count of at 788 // least 2 when this method returns. All tree mutations recursively 789 // acquire mutable node references (see mut) as they traverse down the 790 // tree. The act of acquiring a mutable node reference performs a clone 791 // if a node's reference count is greater than one. Cloning a node (see 792 // clone) increases the reference count on each of its children, 793 // ensuring that they have a reference count of at least 2. This, in 794 // turn, ensures that any of the child nodes that are modified will also 795 // be copied-on-write, recursively ensuring the immutability property 796 // over the entire tree. 797 c.root.incRef() 798 } 799 return c 800 } 801 802 // Delete removes the provided file from the tree. 803 // It returns true if the file now has a zero reference count. 804 func (t *btree) Delete(item *FileMetadata) (obsolete bool) { 805 if t.root == nil || t.root.count == 0 { 806 return false 807 } 808 if out := mut(&t.root).Remove(t.cmp, item); out != nil { 809 obsolete = out.Unref() == 0 810 } 811 if invariants.Enabled { 812 t.root.verifyInvariants() 813 } 814 if t.root.count == 0 { 815 old := t.root 816 if t.root.leaf { 817 t.root = nil 818 } else { 819 t.root = t.root.children[0] 820 } 821 old.decRef(false /* contentsToo */, nil) 822 } 823 return obsolete 824 } 825 826 // Insert adds the given item to the tree. If a item in the tree already 827 // equals the given one, Insert panics. 828 func (t *btree) Insert(item *FileMetadata) error { 829 if t.root == nil { 830 t.root = newLeafNode() 831 } else if t.root.count >= maxItems { 832 splitLa, splitNode := mut(&t.root).split(maxItems / 2) 833 newRoot := newNode() 834 newRoot.count = 1 835 newRoot.items[0] = splitLa 836 newRoot.children[0] = t.root 837 newRoot.children[1] = splitNode 838 newRoot.subtreeCount = t.root.subtreeCount + splitNode.subtreeCount + 1 839 t.root = newRoot 840 } 841 item.Ref() 842 err := mut(&t.root).Insert(t.cmp, item) 843 if invariants.Enabled { 844 t.root.verifyInvariants() 845 } 846 return err 847 } 848 849 // Iter returns a new iterator object. It is not safe to continue using an 850 // iterator after modifications are made to the tree. If modifications are made, 851 // create a new iterator. 852 func (t *btree) Iter() iterator { 853 return iterator{r: t.root, pos: -1, cmp: t.cmp} 854 } 855 856 // Count returns the number of files contained within the B-Tree. 857 func (t *btree) Count() int { 858 if t.root == nil { 859 return 0 860 } 861 return t.root.subtreeCount 862 } 863 864 // String returns a string description of the tree. The format is 865 // similar to the https://en.wikipedia.org/wiki/Newick_format. 866 func (t *btree) String() string { 867 if t.Count() == 0 { 868 return ";" 869 } 870 var b strings.Builder 871 t.root.writeString(&b) 872 return b.String() 873 } 874 875 func (n *node) writeString(b *strings.Builder) { 876 if n.leaf { 877 for i := int16(0); i < n.count; i++ { 878 if i != 0 { 879 b.WriteString(",") 880 } 881 b.WriteString(n.items[i].String()) 882 } 883 return 884 } 885 for i := int16(0); i <= n.count; i++ { 886 b.WriteString("(") 887 n.children[i].writeString(b) 888 b.WriteString(")") 889 if i < n.count { 890 b.WriteString(n.items[i].String()) 891 } 892 } 893 } 894 895 // iterStack represents a stack of (node, pos) tuples, which captures 896 // iteration state as an iterator descends a btree. 897 type iterStack struct { 898 // a contains aLen stack frames when an iterator stack is short enough. 899 // If the iterator stack overflows the capacity of iterStackArr, the stack 900 // is moved to s and aLen is set to -1. 901 a iterStackArr 902 aLen int16 // -1 when using s 903 s []iterFrame 904 } 905 906 // Used to avoid allocations for stacks below a certain size. 907 type iterStackArr [3]iterFrame 908 909 type iterFrame struct { 910 n *node 911 pos int16 912 } 913 914 func (is *iterStack) push(f iterFrame) { 915 if is.aLen == -1 { 916 is.s = append(is.s, f) 917 } else if int(is.aLen) == len(is.a) { 918 is.s = make([]iterFrame, int(is.aLen)+1, 2*int(is.aLen)) 919 copy(is.s, is.a[:]) 920 is.s[int(is.aLen)] = f 921 is.aLen = -1 922 } else { 923 is.a[is.aLen] = f 924 is.aLen++ 925 } 926 } 927 928 func (is *iterStack) pop() iterFrame { 929 if is.aLen == -1 { 930 f := is.s[len(is.s)-1] 931 is.s = is.s[:len(is.s)-1] 932 return f 933 } 934 is.aLen-- 935 return is.a[is.aLen] 936 } 937 938 func (is *iterStack) len() int { 939 if is.aLen == -1 { 940 return len(is.s) 941 } 942 return int(is.aLen) 943 } 944 945 func (is *iterStack) clone() iterStack { 946 // If the iterator is using the embedded iterStackArr, we only need to 947 // copy the struct itself. 948 if is.s == nil { 949 return *is 950 } 951 clone := *is 952 clone.s = make([]iterFrame, len(is.s)) 953 copy(clone.s, is.s) 954 return clone 955 } 956 957 func (is *iterStack) nth(n int) (f iterFrame, ok bool) { 958 if is.aLen == -1 { 959 if n >= len(is.s) { 960 return f, false 961 } 962 return is.s[n], true 963 } 964 if int16(n) >= is.aLen { 965 return f, false 966 } 967 return is.a[n], true 968 } 969 970 func (is *iterStack) reset() { 971 if is.aLen == -1 { 972 is.s = is.s[:0] 973 } else { 974 is.aLen = 0 975 } 976 } 977 978 // iterator is responsible for search and traversal within a btree. 979 type iterator struct { 980 // the root node of the B-Tree. 981 r *node 982 // n and pos make up the current position of the iterator. 983 // If valid, n.items[pos] is the current value of the iterator. 984 // 985 // n may be nil iff i.r is nil. 986 n *node 987 pos int16 988 // cmp dictates the ordering of the FileMetadata. 989 cmp func(*FileMetadata, *FileMetadata) int 990 // a stack of n's ancestors within the B-Tree, alongside the position 991 // taken to arrive at n. If non-empty, the bottommost frame of the stack 992 // will always contain the B-Tree root. 993 s iterStack 994 } 995 996 // countLeft returns the count of files that are to the left of the current 997 // iterator position. 998 func (i *iterator) countLeft() int { 999 if i.r == nil { 1000 return 0 1001 } 1002 1003 // Each iterator has a stack of frames marking the path from the root node 1004 // to the current iterator position. All files (n.items) and all subtrees 1005 // (n.children) with indexes less than [pos] are to the left of the current 1006 // iterator position. 1007 // 1008 // +------------------------+ - 1009 // | Root pos:5 | | 1010 // +------------------------+ | stack 1011 // | Root/5 pos:3 | | frames 1012 // +------------------------+ | [i.s] 1013 // | Root/5/3 pos:9 | | 1014 // +========================+ - 1015 // | | 1016 // | i.n: Root/5/3/9 i.pos:2| 1017 // +------------------------+ 1018 // 1019 var count int 1020 // Walk all the ancestors in the iterator stack [i.s], tallying up all the 1021 // files and subtrees to the left of the stack frame's position. 1022 f, ok := i.s.nth(0) 1023 for fi := 0; ok; fi++ { 1024 // There are [f.pos] files contained within [f.n.items] that sort to the 1025 // left of the subtree the iterator has descended. 1026 count += int(f.pos) 1027 // Any subtrees that fall before the stack frame's position are entirely 1028 // to the left of the iterator's current position. 1029 for j := int16(0); j < f.pos; j++ { 1030 count += f.n.children[j].subtreeCount 1031 } 1032 f, ok = i.s.nth(fi + 1) 1033 } 1034 1035 // The bottommost stack frame is inlined within the iterator struct. Again, 1036 // [i.pos] files fall to the left of the current iterator position. 1037 count += int(i.pos) 1038 if !i.n.leaf { 1039 // NB: Unlike above, we use a `<= i.pos` comparison. The iterator is 1040 // positioned at item `i.n.items[i.pos]`, which sorts after everything 1041 // in the subtree at `i.n.children[i.pos]`. 1042 for j := int16(0); j <= i.pos; j++ { 1043 count += i.n.children[j].subtreeCount 1044 } 1045 } 1046 return count 1047 } 1048 1049 func (i *iterator) clone() iterator { 1050 c := *i 1051 c.s = i.s.clone() 1052 return c 1053 } 1054 1055 func (i *iterator) reset() { 1056 i.n = i.r 1057 i.pos = -1 1058 i.s.reset() 1059 } 1060 1061 func (i iterator) String() string { 1062 var buf bytes.Buffer 1063 for n := 0; ; n++ { 1064 f, ok := i.s.nth(n) 1065 if !ok { 1066 break 1067 } 1068 fmt.Fprintf(&buf, "%p: %02d/%02d\n", f.n, f.pos, f.n.count) 1069 } 1070 if i.r == nil { 1071 fmt.Fprintf(&buf, "<nil>: %02d", i.pos) 1072 } else { 1073 fmt.Fprintf(&buf, "%p: %02d/%02d", i.n, i.pos, i.n.count) 1074 } 1075 return buf.String() 1076 } 1077 1078 func cmpIter(a, b iterator) int { 1079 if a.r != b.r { 1080 panic("compared iterators from different btrees") 1081 } 1082 1083 // Each iterator has a stack of frames marking the path from the root node 1084 // to the current iterator position. We walk both paths formed by the 1085 // iterators' stacks simultaneously, descending from the shared root node, 1086 // always comparing nodes at the same level in the tree. 1087 // 1088 // If the iterators' paths ever diverge and point to different nodes, the 1089 // iterators are not equal and we use the node positions to evaluate the 1090 // comparison. 1091 // 1092 // If an iterator's stack ends, we stop descending and use its current 1093 // node and position for the final comparison. One iterator's stack may 1094 // end before another's if one iterator is positioned deeper in the tree. 1095 // 1096 // a b 1097 // +------------------------+ +--------------------------+ - 1098 // | Root pos:5 | = | Root pos:5 | | 1099 // +------------------------+ +--------------------------+ | stack 1100 // | Root/5 pos:3 | = | Root/5 pos:3 | | frames 1101 // +------------------------+ +--------------------------+ | 1102 // | Root/5/3 pos:9 | > | Root/5/3 pos:1 | | 1103 // +========================+ +==========================+ - 1104 // | | | | 1105 // | a.n: Root/5/3/9 a.pos:2| | b.n: Root/5/3/1, b.pos:5 | 1106 // +------------------------+ +--------------------------+ 1107 1108 // Initialize with the iterator's current node and position. These are 1109 // conceptually the most-recent/current frame of the iterator stack. 1110 an, apos := a.n, a.pos 1111 bn, bpos := b.n, b.pos 1112 1113 // aok, bok are set while traversing the iterator's path down the B-Tree. 1114 // They're declared in the outer scope because they help distinguish the 1115 // sentinel case when both iterators' first frame points to the last child 1116 // of the root. If an iterator has no other frames in its stack, it's the 1117 // end sentinel state which sorts after everything else. 1118 var aok, bok bool 1119 for i := 0; ; i++ { 1120 var af, bf iterFrame 1121 af, aok = a.s.nth(i) 1122 bf, bok = b.s.nth(i) 1123 if !aok || !bok { 1124 if aok { 1125 // Iterator a, unlike iterator b, still has a frame. Set an, 1126 // apos so we compare using the frame from the stack. 1127 an, apos = af.n, af.pos 1128 } 1129 if bok { 1130 // Iterator b, unlike iterator a, still has a frame. Set bn, 1131 // bpos so we compare using the frame from the stack. 1132 bn, bpos = bf.n, bf.pos 1133 } 1134 break 1135 } 1136 1137 // aok && bok 1138 if af.n != bf.n { 1139 panic("nonmatching nodes during btree iterator comparison") 1140 } 1141 switch { 1142 case af.pos < bf.pos: 1143 return -1 1144 case af.pos > bf.pos: 1145 return +1 1146 default: 1147 // Continue up both iterators' stacks (equivalently, down the 1148 // B-Tree away from the root). 1149 } 1150 } 1151 1152 if aok && bok { 1153 panic("expected one or more stacks to have been exhausted") 1154 } 1155 if an != bn { 1156 panic("nonmatching nodes during btree iterator comparison") 1157 } 1158 switch { 1159 case apos < bpos: 1160 return -1 1161 case apos > bpos: 1162 return +1 1163 default: 1164 switch { 1165 case aok: 1166 // a is positioned at a leaf child at this position and b is at an 1167 // end sentinel state. 1168 return -1 1169 case bok: 1170 // b is positioned at a leaf child at this position and a is at an 1171 // end sentinel state. 1172 return +1 1173 default: 1174 return 0 1175 } 1176 } 1177 } 1178 1179 func (i *iterator) descend(n *node, pos int16) { 1180 i.s.push(iterFrame{n: n, pos: pos}) 1181 i.n = n.children[pos] 1182 i.pos = 0 1183 } 1184 1185 // ascend ascends up to the current node's parent and resets the position 1186 // to the one previously set for this parent node. 1187 func (i *iterator) ascend() { 1188 f := i.s.pop() 1189 i.n = f.n 1190 i.pos = f.pos 1191 } 1192 1193 // seek repositions the iterator over the first file for which fn returns 1194 // true, mirroring the semantics of the standard library's sort.Search 1195 // function. Like sort.Search, seek requires the iterator's B-Tree to be 1196 // ordered such that fn returns false for some (possibly empty) prefix of the 1197 // tree's files, and then true for the (possibly empty) remainder. 1198 func (i *iterator) seek(fn func(*FileMetadata) bool) { 1199 i.reset() 1200 if i.r == nil { 1201 return 1202 } 1203 1204 for { 1205 // Logic copied from sort.Search. 1206 j, k := 0, int(i.n.count) 1207 for j < k { 1208 h := int(uint(j+k) >> 1) // avoid overflow when computing h 1209 1210 // j ≤ h < k 1211 if !fn(i.n.items[h]) { 1212 j = h + 1 // preserves f(j-1) == false 1213 } else { 1214 k = h // preserves f(k) == true 1215 } 1216 } 1217 1218 i.pos = int16(j) 1219 if i.n.leaf { 1220 if i.pos == i.n.count { 1221 i.next() 1222 } 1223 return 1224 } 1225 i.descend(i.n, i.pos) 1226 } 1227 } 1228 1229 // first seeks to the first item in the btree. 1230 func (i *iterator) first() { 1231 i.reset() 1232 if i.r == nil { 1233 return 1234 } 1235 for !i.n.leaf { 1236 i.descend(i.n, 0) 1237 } 1238 i.pos = 0 1239 } 1240 1241 // last seeks to the last item in the btree. 1242 func (i *iterator) last() { 1243 i.reset() 1244 if i.r == nil { 1245 return 1246 } 1247 for !i.n.leaf { 1248 i.descend(i.n, i.n.count) 1249 } 1250 i.pos = i.n.count - 1 1251 } 1252 1253 // next positions the iterator to the item immediately following 1254 // its current position. 1255 func (i *iterator) next() { 1256 if i.r == nil { 1257 return 1258 } 1259 1260 if i.n.leaf { 1261 if i.pos < i.n.count { 1262 i.pos++ 1263 } 1264 if i.pos < i.n.count { 1265 return 1266 } 1267 for i.s.len() > 0 && i.pos >= i.n.count { 1268 i.ascend() 1269 } 1270 return 1271 } 1272 1273 i.descend(i.n, i.pos+1) 1274 for !i.n.leaf { 1275 i.descend(i.n, 0) 1276 } 1277 i.pos = 0 1278 } 1279 1280 // prev positions the iterator to the item immediately preceding 1281 // its current position. 1282 func (i *iterator) prev() { 1283 if i.r == nil { 1284 return 1285 } 1286 1287 if i.n.leaf { 1288 i.pos-- 1289 if i.pos >= 0 { 1290 return 1291 } 1292 for i.s.len() > 0 && i.pos < 0 { 1293 i.ascend() 1294 i.pos-- 1295 } 1296 return 1297 } 1298 1299 i.descend(i.n, i.pos) 1300 for !i.n.leaf { 1301 i.descend(i.n, i.n.count) 1302 } 1303 i.pos = i.n.count - 1 1304 } 1305 1306 // valid returns whether the iterator is positioned at a valid position. 1307 func (i *iterator) valid() bool { 1308 return i.r != nil && i.pos >= 0 && i.pos < i.n.count 1309 } 1310 1311 // cur returns the item at the iterator's current position. It is illegal 1312 // to call cur if the iterator is not valid. 1313 func (i *iterator) cur() *FileMetadata { 1314 if invariants.Enabled && !i.valid() { 1315 panic("btree iterator.cur invoked on invalid iterator") 1316 } 1317 return i.n.items[i.pos] 1318 }