github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/internal/manifest/btree.go (about)

     1  // Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package manifest
     6  
     7  import (
     8  	"bytes"
     9  	"fmt"
    10  	"strings"
    11  	"sync/atomic"
    12  	"unsafe"
    13  
    14  	"github.com/cockroachdb/errors"
    15  )
    16  
    17  // The Annotator type defined below is used by other packages to lazily
    18  // compute a value over a B-Tree. Each node of the B-Tree stores one
    19  // `annotation` per annotator, containing the result of the computation over
    20  // the node's subtree.
    21  //
    22  // An annotation is marked as valid if it's current with the current subtree
    23  // state. Annotations are marked as invalid whenever a node will be mutated
    24  // (in mut).  Annotators may also return `false` from `Accumulate` to signal
    25  // that a computation for a file is not stable and may change in the future.
    26  // Annotations that include these unstable values are also marked as invalid
    27  // on the node, ensuring that future queries for the annotation will recompute
    28  // the value.
    29  
// An Annotator defines a computation over a level's FileMetadata. If the
// computation is stable and uses inputs that are fixed for the lifetime of
// a FileMetadata, the LevelMetadata's internal data structures are annotated
// with the intermediary computations. This allows the computation to be
// computed incrementally as edits are applied to a level.
//
// Within this file, (*node).annotation builds a node's value by starting from
// Zero, folding in each child subtree's value with Merge and each of the
// node's own files with Accumulate.
type Annotator interface {
	// Zero returns the zero value of an annotation. This value is returned
	// when a LevelMetadata is empty. The dst argument, if non-nil, is an
	// obsolete value previously returned by this Annotator and may be
	// overwritten and reused to avoid a memory allocation.
	Zero(dst interface{}) (v interface{})

	// Accumulate computes the annotation for a single file in a level's
	// metadata. It merges the file's value into dst and returns a bool flag
	// indicating whether or not the value is stable and okay to cache as an
	// annotation. If the file's value may change over the life of the file,
	// the annotator must return false.
	//
	// Implementations may modify dst and return it to avoid an allocation.
	Accumulate(m *FileMetadata, dst interface{}) (v interface{}, cacheOK bool)

	// Merge combines two values src and dst, returning the result.
	// Implementations may modify dst and return it to avoid an allocation.
	//
	// (*node).annotation passes a child subtree's annotation as src and the
	// parent's running value as dst.
	Merge(src interface{}, dst interface{}) interface{}
}
    55  
// btreeCmp compares two FileMetadata and defines the ordering of items within
// a btree. It returns a negative value if the first argument sorts before the
// second, zero if the two are considered equal, and a positive value if the
// first sorts after the second.
type btreeCmp func(*FileMetadata, *FileMetadata) int
    57  
    58  func btreeCmpSeqNum(a, b *FileMetadata) int {
    59  	return a.cmpSeqNum(b)
    60  }
    61  
    62  func btreeCmpSmallestKey(cmp Compare) btreeCmp {
    63  	return func(a, b *FileMetadata) int {
    64  		return a.cmpSmallestKey(b, cmp)
    65  	}
    66  }
    67  
    68  // btreeCmpSpecificOrder is used in tests to construct a B-Tree with a
    69  // specific ordering of FileMetadata within the tree. It's typically used to
    70  // test consistency checking code that needs to construct a malformed B-Tree.
    71  func btreeCmpSpecificOrder(files []*FileMetadata) btreeCmp {
    72  	m := map[*FileMetadata]int{}
    73  	for i, f := range files {
    74  		m[f] = i
    75  	}
    76  	return func(a, b *FileMetadata) int {
    77  		ai, aok := m[a]
    78  		bi, bok := m[b]
    79  		if !aok || !bok {
    80  			panic("btreeCmpSliceOrder called with unknown files")
    81  		}
    82  		switch {
    83  		case ai < bi:
    84  			return -1
    85  		case ai > bi:
    86  			return +1
    87  		default:
    88  			return 0
    89  		}
    90  	}
    91  }
    92  
const (
	// degree is the base constant from which the node capacity bounds below
	// are derived.
	degree = 16
	// maxItems is the capacity of a node's items array. A node whose count
	// has reached maxItems is split before an insertion descends into it.
	maxItems = 2*degree - 1
	// minItems is the removal threshold: a child whose count is at or below
	// minItems is rebalanced or merged (see rebalanceOrMerge) before an item
	// is removed from it.
	minItems = degree - 1
)
    98  
// annotation is a cached per-node result of a single Annotator's computation
// over the node's subtree.
type annotation struct {
	// annotator is the Annotator that produced v; it is used as the lookup
	// key when searching a node's annot slice.
	annotator Annotator
	// v is an annotation value, the output of either
	// annotator.Accumulate or annotator.Merge.
	v interface{}
	// valid indicates whether future reads of the annotation may use v as-is.
	// If false, v will be zeroed and recalculated.
	valid bool
}
   108  
// leafNode holds the state shared by leaf and interior nodes. It is embedded
// as the first field of node, and newLeafNode allocates only a leafNode and
// reinterprets the pointer as a *node (see leafToNode). The field layout must
// therefore remain a prefix of node's layout.
type leafNode struct {
	// ref is the node's reference count. It is read and modified with
	// sync/atomic operations.
	ref int32
	// count is the number of entries of items currently in use.
	count int16
	// leaf is true for leaf nodes, which have no children.
	leaf bool
	// items holds the node's files in sorted order in items[:count].
	items [maxItems]*FileMetadata
	// annot contains one annotation per annotator, merged over the entire
	// node's files (and all descendants for non-leaf nodes).
	annot []annotation
}
   118  
// node is a B-Tree node. Interior nodes use children[:count+1]; child i holds
// the subtree positioned before items[i], and children[count] the subtree
// after the last item. Leaf nodes are created from a bare leafNode (see
// newLeafNode), so children must only be accessed when leaf is false.
type node struct {
	leafNode
	children [maxItems + 1]*node
}
   123  
// leafToNode reinterprets a *leafNode as a *node through unsafe.Pointer. The
// result aliases ln's memory. When ln was allocated as a bare leafNode (as
// newLeafNode does), the node's children array lies beyond the allocation and
// must never be read or written through the returned pointer.
//
//go:nocheckptr casts a ptr to a smaller struct to a ptr to a larger struct.
func leafToNode(ln *leafNode) *node {
	return (*node)(unsafe.Pointer(ln))
}
   128  
   129  func newLeafNode() *node {
   130  	n := leafToNode(new(leafNode))
   131  	n.leaf = true
   132  	n.ref = 1
   133  	return n
   134  }
   135  
   136  func newNode() *node {
   137  	n := new(node)
   138  	n.ref = 1
   139  	return n
   140  }
   141  
   142  // mut creates and returns a mutable node reference. If the node is not shared
   143  // with any other trees then it can be modified in place. Otherwise, it must be
   144  // cloned to ensure unique ownership. In this way, we enforce a copy-on-write
   145  // policy which transparently incorporates the idea of local mutations, like
   146  // Clojure's transients or Haskell's ST monad, where nodes are only copied
   147  // during the first time that they are modified between Clone operations.
   148  //
   149  // When a node is cloned, the provided pointer will be redirected to the new
   150  // mutable node.
   151  func mut(n **node) *node {
   152  	if atomic.LoadInt32(&(*n).ref) == 1 {
   153  		// Exclusive ownership. Can mutate in place.
   154  
   155  		// Whenever a node will be mutated, reset its annotations to be marked
   156  		// as uncached. This ensures any future calls to (*node).annotation
   157  		// will recompute annotations on the modified subtree.
   158  		for i := range (*n).annot {
   159  			(*n).annot[i].valid = false
   160  		}
   161  		return *n
   162  	}
   163  	// If we do not have unique ownership over the node then we
   164  	// clone it to gain unique ownership. After doing so, we can
   165  	// release our reference to the old node. We pass recursive
   166  	// as true because even though we just observed the node's
   167  	// reference count to be greater than 1, we might be racing
   168  	// with another call to decRef on this node.
   169  	c := (*n).clone()
   170  	(*n).decRef(true /* recursive */, nil)
   171  	*n = c
   172  	// NB: We don't need to clear annotations, because (*node).clone does not
   173  	// copy them.
   174  	return *n
   175  }
   176  
   177  // incRef acquires a reference to the node.
   178  func (n *node) incRef() {
   179  	atomic.AddInt32(&n.ref, 1)
   180  }
   181  
   182  // decRef releases a reference to the node. If requested, the method
   183  // will recurse into child nodes and decrease their refcounts as well.
   184  // When a node is released, its contained files are dereferenced.
   185  func (n *node) decRef(recursive bool, obsolete *[]*FileMetadata) {
   186  	if atomic.AddInt32(&n.ref, -1) > 0 {
   187  		// Other references remain. Can't free.
   188  		return
   189  	}
   190  
   191  	// Dereference the node's metadata and release child references.
   192  	if recursive {
   193  		for _, f := range n.items[:n.count] {
   194  			if atomic.AddInt32(&f.refs, -1) == 0 {
   195  				// There are two sources of node dereferences: tree mutations
   196  				// and Version dereferences. Files should only be made obsolete
   197  				// during Version dereferences, during which `obsolete` will be
   198  				// non-nil.
   199  				if obsolete == nil {
   200  					panic(fmt.Sprintf("file metadata %s dereferenced to zero during tree mutation", f.FileNum))
   201  				}
   202  				*obsolete = append(*obsolete, f)
   203  			}
   204  		}
   205  		if !n.leaf {
   206  			for i := int16(0); i <= n.count; i++ {
   207  				n.children[i].decRef(true /* recursive */, obsolete)
   208  			}
   209  		}
   210  	}
   211  }
   212  
   213  // clone creates a clone of the receiver with a single reference count.
   214  func (n *node) clone() *node {
   215  	var c *node
   216  	if n.leaf {
   217  		c = newLeafNode()
   218  	} else {
   219  		c = newNode()
   220  	}
   221  	// NB: copy field-by-field without touching n.ref to avoid
   222  	// triggering the race detector and looking like a data race.
   223  	c.count = n.count
   224  	c.items = n.items
   225  	// Increase the refcount of each contained item.
   226  	for _, f := range n.items[:n.count] {
   227  		atomic.AddInt32(&f.refs, 1)
   228  	}
   229  	if !c.leaf {
   230  		// Copy children and increase each refcount.
   231  		c.children = n.children
   232  		for i := int16(0); i <= c.count; i++ {
   233  			c.children[i].incRef()
   234  		}
   235  	}
   236  	return c
   237  }
   238  
   239  func (n *node) insertAt(index int, item *FileMetadata, nd *node) {
   240  	if index < int(n.count) {
   241  		copy(n.items[index+1:n.count+1], n.items[index:n.count])
   242  		if !n.leaf {
   243  			copy(n.children[index+2:n.count+2], n.children[index+1:n.count+1])
   244  		}
   245  	}
   246  	n.items[index] = item
   247  	if !n.leaf {
   248  		n.children[index+1] = nd
   249  	}
   250  	n.count++
   251  }
   252  
   253  func (n *node) pushBack(item *FileMetadata, nd *node) {
   254  	n.items[n.count] = item
   255  	if !n.leaf {
   256  		n.children[n.count+1] = nd
   257  	}
   258  	n.count++
   259  }
   260  
   261  func (n *node) pushFront(item *FileMetadata, nd *node) {
   262  	if !n.leaf {
   263  		copy(n.children[1:n.count+2], n.children[:n.count+1])
   264  		n.children[0] = nd
   265  	}
   266  	copy(n.items[1:n.count+1], n.items[:n.count])
   267  	n.items[0] = item
   268  	n.count++
   269  }
   270  
   271  // removeAt removes a value at a given index, pulling all subsequent values
   272  // back.
   273  func (n *node) removeAt(index int) (*FileMetadata, *node) {
   274  	var child *node
   275  	if !n.leaf {
   276  		child = n.children[index+1]
   277  		copy(n.children[index+1:n.count], n.children[index+2:n.count+1])
   278  		n.children[n.count] = nil
   279  	}
   280  	n.count--
   281  	out := n.items[index]
   282  	copy(n.items[index:n.count], n.items[index+1:n.count+1])
   283  	n.items[n.count] = nil
   284  	return out, child
   285  }
   286  
   287  // popBack removes and returns the last element in the list.
   288  func (n *node) popBack() (*FileMetadata, *node) {
   289  	n.count--
   290  	out := n.items[n.count]
   291  	n.items[n.count] = nil
   292  	if n.leaf {
   293  		return out, nil
   294  	}
   295  	child := n.children[n.count+1]
   296  	n.children[n.count+1] = nil
   297  	return out, child
   298  }
   299  
   300  // popFront removes and returns the first element in the list.
   301  func (n *node) popFront() (*FileMetadata, *node) {
   302  	n.count--
   303  	var child *node
   304  	if !n.leaf {
   305  		child = n.children[0]
   306  		copy(n.children[:n.count+1], n.children[1:n.count+2])
   307  		n.children[n.count+1] = nil
   308  	}
   309  	out := n.items[0]
   310  	copy(n.items[:n.count], n.items[1:n.count+1])
   311  	n.items[n.count] = nil
   312  	return out, child
   313  }
   314  
   315  // find returns the index where the given item should be inserted into this
   316  // list. 'found' is true if the item already exists in the list at the given
   317  // index.
   318  func (n *node) find(cmp btreeCmp, item *FileMetadata) (index int, found bool) {
   319  	// Logic copied from sort.Search. Inlining this gave
   320  	// an 11% speedup on BenchmarkBTreeDeleteInsert.
   321  	i, j := 0, int(n.count)
   322  	for i < j {
   323  		h := int(uint(i+j) >> 1) // avoid overflow when computing h
   324  		// i ≤ h < j
   325  		v := cmp(item, n.items[h])
   326  		if v == 0 {
   327  			return h, true
   328  		} else if v > 0 {
   329  			i = h + 1
   330  		} else {
   331  			j = h
   332  		}
   333  	}
   334  	return i, false
   335  }
   336  
   337  // split splits the given node at the given index. The current node shrinks,
   338  // and this function returns the item that existed at that index and a new
   339  // node containing all items/children after it.
   340  //
   341  // Before:
   342  //
   343  //	+-----------+
   344  //	|   x y z   |
   345  //	+--/-/-\-\--+
   346  //
   347  // After:
   348  //
   349  //	+-----------+
   350  //	|     y     |
   351  //	+----/-\----+
   352  //	    /   \
   353  //	   v     v
   354  //
   355  // +-----------+     +-----------+
   356  // |         x |     | z         |
   357  // +-----------+     +-----------+
   358  func (n *node) split(i int) (*FileMetadata, *node) {
   359  	out := n.items[i]
   360  	var next *node
   361  	if n.leaf {
   362  		next = newLeafNode()
   363  	} else {
   364  		next = newNode()
   365  	}
   366  	next.count = n.count - int16(i+1)
   367  	copy(next.items[:], n.items[i+1:n.count])
   368  	for j := int16(i); j < n.count; j++ {
   369  		n.items[j] = nil
   370  	}
   371  	if !n.leaf {
   372  		copy(next.children[:], n.children[i+1:n.count+1])
   373  		for j := int16(i + 1); j <= n.count; j++ {
   374  			n.children[j] = nil
   375  		}
   376  	}
   377  	n.count = int16(i)
   378  	return out, next
   379  }
   380  
   381  // insert inserts a item into the subtree rooted at this node, making sure no
   382  // nodes in the subtree exceed maxItems items.
   383  func (n *node) insert(cmp btreeCmp, item *FileMetadata) error {
   384  	i, found := n.find(cmp, item)
   385  	if found {
   386  		// cmp provides a total ordering of the files within a level.
   387  		// If we're inserting a metadata that's equal to an existing item
   388  		// in the tree, we're inserting a file into a level twice.
   389  		return errors.Errorf("files %s and %s collided on sort keys",
   390  			errors.Safe(item.FileNum), errors.Safe(n.items[i].FileNum))
   391  	}
   392  	if n.leaf {
   393  		n.insertAt(i, item, nil)
   394  		return nil
   395  	}
   396  	if n.children[i].count >= maxItems {
   397  		splitLa, splitNode := mut(&n.children[i]).split(maxItems / 2)
   398  		n.insertAt(i, splitLa, splitNode)
   399  
   400  		switch cmp := cmp(item, n.items[i]); {
   401  		case cmp < 0:
   402  			// no change, we want first split node
   403  		case cmp > 0:
   404  			i++ // we want second split node
   405  		default:
   406  			// cmp provides a total ordering of the files within a level.
   407  			// If we're inserting a metadata that's equal to an existing item
   408  			// in the tree, we're inserting a file into a level twice.
   409  			return errors.Errorf("files %s and %s collided on sort keys",
   410  				errors.Safe(item.FileNum), errors.Safe(n.items[i].FileNum))
   411  		}
   412  	}
   413  	return mut(&n.children[i]).insert(cmp, item)
   414  }
   415  
   416  // removeMax removes and returns the maximum item from the subtree rooted
   417  // at this node.
   418  func (n *node) removeMax() *FileMetadata {
   419  	if n.leaf {
   420  		n.count--
   421  		out := n.items[n.count]
   422  		n.items[n.count] = nil
   423  		return out
   424  	}
   425  	child := mut(&n.children[n.count])
   426  	if child.count <= minItems {
   427  		n.rebalanceOrMerge(int(n.count))
   428  		return n.removeMax()
   429  	}
   430  	return child.removeMax()
   431  }
   432  
   433  // remove removes a item from the subtree rooted at this node. Returns
   434  // the item that was removed or nil if no matching item was found.
   435  func (n *node) remove(cmp btreeCmp, item *FileMetadata) (out *FileMetadata) {
   436  	i, found := n.find(cmp, item)
   437  	if n.leaf {
   438  		if found {
   439  			out, _ = n.removeAt(i)
   440  			return out
   441  		}
   442  		return nil
   443  	}
   444  	if n.children[i].count <= minItems {
   445  		// Child not large enough to remove from.
   446  		n.rebalanceOrMerge(i)
   447  		return n.remove(cmp, item)
   448  	}
   449  	child := mut(&n.children[i])
   450  	if found {
   451  		// Replace the item being removed with the max item in our left child.
   452  		out = n.items[i]
   453  		n.items[i] = child.removeMax()
   454  		return out
   455  	}
   456  	// Latch is not in this node and child is large enough to remove from.
   457  	out = child.remove(cmp, item)
   458  	return out
   459  }
   460  
   461  // rebalanceOrMerge grows child 'i' to ensure it has sufficient room to remove
   462  // a item from it while keeping it at or above minItems.
   463  func (n *node) rebalanceOrMerge(i int) {
   464  	switch {
   465  	case i > 0 && n.children[i-1].count > minItems:
   466  		// Rebalance from left sibling.
   467  		//
   468  		//          +-----------+
   469  		//          |     y     |
   470  		//          +----/-\----+
   471  		//              /   \
   472  		//             v     v
   473  		// +-----------+     +-----------+
   474  		// |         x |     |           |
   475  		// +----------\+     +-----------+
   476  		//             \
   477  		//              v
   478  		//              a
   479  		//
   480  		// After:
   481  		//
   482  		//          +-----------+
   483  		//          |     x     |
   484  		//          +----/-\----+
   485  		//              /   \
   486  		//             v     v
   487  		// +-----------+     +-----------+
   488  		// |           |     | y         |
   489  		// +-----------+     +/----------+
   490  		//                   /
   491  		//                  v
   492  		//                  a
   493  		//
   494  		left := mut(&n.children[i-1])
   495  		child := mut(&n.children[i])
   496  		xLa, grandChild := left.popBack()
   497  		yLa := n.items[i-1]
   498  		child.pushFront(yLa, grandChild)
   499  		n.items[i-1] = xLa
   500  
   501  	case i < int(n.count) && n.children[i+1].count > minItems:
   502  		// Rebalance from right sibling.
   503  		//
   504  		//          +-----------+
   505  		//          |     y     |
   506  		//          +----/-\----+
   507  		//              /   \
   508  		//             v     v
   509  		// +-----------+     +-----------+
   510  		// |           |     | x         |
   511  		// +-----------+     +/----------+
   512  		//                   /
   513  		//                  v
   514  		//                  a
   515  		//
   516  		// After:
   517  		//
   518  		//          +-----------+
   519  		//          |     x     |
   520  		//          +----/-\----+
   521  		//              /   \
   522  		//             v     v
   523  		// +-----------+     +-----------+
   524  		// |         y |     |           |
   525  		// +----------\+     +-----------+
   526  		//             \
   527  		//              v
   528  		//              a
   529  		//
   530  		right := mut(&n.children[i+1])
   531  		child := mut(&n.children[i])
   532  		xLa, grandChild := right.popFront()
   533  		yLa := n.items[i]
   534  		child.pushBack(yLa, grandChild)
   535  		n.items[i] = xLa
   536  
   537  	default:
   538  		// Merge with either the left or right sibling.
   539  		//
   540  		//          +-----------+
   541  		//          |   u y v   |
   542  		//          +----/-\----+
   543  		//              /   \
   544  		//             v     v
   545  		// +-----------+     +-----------+
   546  		// |         x |     | z         |
   547  		// +-----------+     +-----------+
   548  		//
   549  		// After:
   550  		//
   551  		//          +-----------+
   552  		//          |    u v    |
   553  		//          +-----|-----+
   554  		//                |
   555  		//                v
   556  		//          +-----------+
   557  		//          |   x y z   |
   558  		//          +-----------+
   559  		//
   560  		if i >= int(n.count) {
   561  			i = int(n.count - 1)
   562  		}
   563  		child := mut(&n.children[i])
   564  		// Make mergeChild mutable, bumping the refcounts on its children if necessary.
   565  		_ = mut(&n.children[i+1])
   566  		mergeLa, mergeChild := n.removeAt(i)
   567  		child.items[child.count] = mergeLa
   568  		copy(child.items[child.count+1:], mergeChild.items[:mergeChild.count])
   569  		if !child.leaf {
   570  			copy(child.children[child.count+1:], mergeChild.children[:mergeChild.count+1])
   571  		}
   572  		child.count += mergeChild.count + 1
   573  
   574  		mergeChild.decRef(false /* recursive */, nil)
   575  	}
   576  }
   577  
   578  func (n *node) invalidateAnnotation(a Annotator) {
   579  	// Find this annotator's annotation on this node.
   580  	var annot *annotation
   581  	for i := range n.annot {
   582  		if n.annot[i].annotator == a {
   583  			annot = &n.annot[i]
   584  		}
   585  	}
   586  
   587  	if annot != nil && annot.valid {
   588  		annot.valid = false
   589  		annot.v = a.Zero(annot.v)
   590  	}
   591  	if !n.leaf {
   592  		for i := int16(0); i <= n.count; i++ {
   593  			n.children[i].invalidateAnnotation(a)
   594  		}
   595  	}
   596  }
   597  
   598  func (n *node) annotation(a Annotator) (interface{}, bool) {
   599  	// Find this annotator's annotation on this node.
   600  	var annot *annotation
   601  	for i := range n.annot {
   602  		if n.annot[i].annotator == a {
   603  			annot = &n.annot[i]
   604  		}
   605  	}
   606  
   607  	// If it exists and is marked as valid, we can return it without
   608  	// recomputing anything.
   609  	if annot != nil && annot.valid {
   610  		return annot.v, true
   611  	}
   612  
   613  	if annot == nil {
   614  		// This is n's first time being annotated by a.
   615  		// Create a new zeroed annotation.
   616  		n.annot = append(n.annot, annotation{
   617  			annotator: a,
   618  			v:         a.Zero(nil),
   619  		})
   620  		annot = &n.annot[len(n.annot)-1]
   621  	} else {
   622  		// There's an existing annotation that must be recomputed.
   623  		// Zero its value.
   624  		annot.v = a.Zero(annot.v)
   625  	}
   626  
   627  	annot.valid = true
   628  	for i := int16(0); i <= n.count; i++ {
   629  		if !n.leaf {
   630  			v, ok := n.children[i].annotation(a)
   631  			annot.v = a.Merge(v, annot.v)
   632  			annot.valid = annot.valid && ok
   633  		}
   634  		if i < n.count {
   635  			v, ok := a.Accumulate(n.items[i], annot.v)
   636  			annot.v = v
   637  			annot.valid = annot.valid && ok
   638  		}
   639  	}
   640  	return annot.v, annot.valid
   641  }
   642  
// btree is an implementation of a B-Tree.
//
// btree stores FileMetadata in an ordered structure, allowing easy insertion,
// removal, and iteration. The B-Tree stores items in order based on cmp. The
// first level of the LSM uses a cmp function that compares sequence numbers.
// All other levels compare using the FileMetadata.Smallest.
//
// Write operations are not safe for concurrent mutation by multiple
// goroutines, but Read operations are.
type btree struct {
	// root is the tree's root node. It is nil for a tree that has never held
	// an item, after release, and after the last item of a leaf root is
	// deleted.
	root *node
	// length is the number of items in the tree, maintained by insert,
	// delete and release.
	length int
	// cmp defines the ordering of items within the tree.
	cmp btreeCmp
}
   657  
   658  // release dereferences and clears the root node of the btree, removing all
   659  // items from the btree. In doing so, it decrements contained file counts.
   660  // It returns a slice of newly obsolete files, if any.
   661  func (t *btree) release() (obsolete []*FileMetadata) {
   662  	if t.root != nil {
   663  		t.root.decRef(true /* recursive */, &obsolete)
   664  		t.root = nil
   665  	}
   666  	t.length = 0
   667  	return obsolete
   668  }
   669  
   670  // clone clones the btree, lazily. It does so in constant time.
   671  func (t *btree) clone() btree {
   672  	c := *t
   673  	if c.root != nil {
   674  		// Incrementing the reference count on the root node is sufficient to
   675  		// ensure that no node in the cloned tree can be mutated by an actor
   676  		// holding a reference to the original tree and vice versa. This
   677  		// property is upheld because the root node in the receiver btree and
   678  		// the returned btree will both necessarily have a reference count of at
   679  		// least 2 when this method returns. All tree mutations recursively
   680  		// acquire mutable node references (see mut) as they traverse down the
   681  		// tree. The act of acquiring a mutable node reference performs a clone
   682  		// if a node's reference count is greater than one. Cloning a node (see
   683  		// clone) increases the reference count on each of its children,
   684  		// ensuring that they have a reference count of at least 2. This, in
   685  		// turn, ensures that any of the child nodes that are modified will also
   686  		// be copied-on-write, recursively ensuring the immutability property
   687  		// over the entire tree.
   688  		c.root.incRef()
   689  	}
   690  	return c
   691  }
   692  
   693  // delete removes the provided file from the tree.
   694  // It returns true if the file now has a zero reference count.
   695  func (t *btree) delete(item *FileMetadata) (obsolete bool) {
   696  	if t.root == nil || t.root.count == 0 {
   697  		return false
   698  	}
   699  	if out := mut(&t.root).remove(t.cmp, item); out != nil {
   700  		t.length--
   701  		obsolete = atomic.AddInt32(&out.refs, -1) == 0
   702  	}
   703  	if t.root.count == 0 {
   704  		old := t.root
   705  		if t.root.leaf {
   706  			t.root = nil
   707  		} else {
   708  			t.root = t.root.children[0]
   709  		}
   710  		old.decRef(false /* recursive */, nil)
   711  	}
   712  	return obsolete
   713  }
   714  
   715  // insert adds the given item to the tree. If a item in the tree already
   716  // equals the given one, insert panics.
   717  func (t *btree) insert(item *FileMetadata) error {
   718  	if t.root == nil {
   719  		t.root = newLeafNode()
   720  	} else if t.root.count >= maxItems {
   721  		splitLa, splitNode := mut(&t.root).split(maxItems / 2)
   722  		newRoot := newNode()
   723  		newRoot.count = 1
   724  		newRoot.items[0] = splitLa
   725  		newRoot.children[0] = t.root
   726  		newRoot.children[1] = splitNode
   727  		t.root = newRoot
   728  	}
   729  	atomic.AddInt32(&item.refs, 1)
   730  	err := mut(&t.root).insert(t.cmp, item)
   731  	t.length++
   732  	return err
   733  }
   734  
   735  // iter returns a new iterator object. It is not safe to continue using an
   736  // iterator after modifications are made to the tree. If modifications are made,
   737  // create a new iterator.
   738  func (t *btree) iter() iterator {
   739  	return iterator{r: t.root, pos: -1, cmp: t.cmp}
   740  }
   741  
   742  // height returns the height of the tree.
   743  func (t *btree) height() int {
   744  	if t.root == nil {
   745  		return 0
   746  	}
   747  	h := 1
   748  	n := t.root
   749  	for !n.leaf {
   750  		n = n.children[0]
   751  		h++
   752  	}
   753  	return h
   754  }
   755  
   756  // String returns a string description of the tree. The format is
   757  // similar to the https://en.wikipedia.org/wiki/Newick_format.
   758  func (t *btree) String() string {
   759  	if t.length == 0 {
   760  		return ";"
   761  	}
   762  	var b strings.Builder
   763  	t.root.writeString(&b)
   764  	return b.String()
   765  }
   766  
   767  func (n *node) writeString(b *strings.Builder) {
   768  	if n.leaf {
   769  		for i := int16(0); i < n.count; i++ {
   770  			if i != 0 {
   771  				b.WriteString(",")
   772  			}
   773  			b.WriteString(n.items[i].String())
   774  		}
   775  		return
   776  	}
   777  	for i := int16(0); i <= n.count; i++ {
   778  		b.WriteString("(")
   779  		n.children[i].writeString(b)
   780  		b.WriteString(")")
   781  		if i < n.count {
   782  			b.WriteString(n.items[i].String())
   783  		}
   784  	}
   785  }
   786  
// iterStack represents a stack of (node, pos) tuples, which captures
// iteration state as an iterator descends a btree.
type iterStack struct {
	// a contains aLen stack frames when an iterator stack is short enough.
	// If the iterator stack overflows the capacity of iterStackArr, the stack
	// is moved to s and aLen is set to -1.
	a iterStackArr
	// aLen is the number of frames stored in a, or -1 once the stack has
	// spilled over into s. It never returns from -1 to array mode; reset
	// only truncates s.
	aLen int16 // -1 when using s
	// s holds the frames when aLen is -1 and is unused otherwise.
	s []iterFrame
}
   797  
// iterStackArr is the fixed-size inline storage of an iterStack. It is used
// to avoid allocations for stacks below a certain size (at most three
// frames).
type iterStackArr [3]iterFrame
   800  
// iterFrame is a single entry of an iterStack: a node together with a
// position within that node.
type iterFrame struct {
	// n is the node this frame refers to.
	n *node
	// pos is the position within n recorded for this frame.
	pos int16
}
   805  
   806  func (is *iterStack) push(f iterFrame) {
   807  	if is.aLen == -1 {
   808  		is.s = append(is.s, f)
   809  	} else if int(is.aLen) == len(is.a) {
   810  		is.s = make([]iterFrame, int(is.aLen)+1, 2*int(is.aLen))
   811  		copy(is.s, is.a[:])
   812  		is.s[int(is.aLen)] = f
   813  		is.aLen = -1
   814  	} else {
   815  		is.a[is.aLen] = f
   816  		is.aLen++
   817  	}
   818  }
   819  
   820  func (is *iterStack) pop() iterFrame {
   821  	if is.aLen == -1 {
   822  		f := is.s[len(is.s)-1]
   823  		is.s = is.s[:len(is.s)-1]
   824  		return f
   825  	}
   826  	is.aLen--
   827  	return is.a[is.aLen]
   828  }
   829  
   830  func (is *iterStack) len() int {
   831  	if is.aLen == -1 {
   832  		return len(is.s)
   833  	}
   834  	return int(is.aLen)
   835  }
   836  
   837  func (is *iterStack) clone() iterStack {
   838  	// If the iterator is using the embedded iterStackArr, we only need to
   839  	// copy the struct itself.
   840  	if is.s == nil {
   841  		return *is
   842  	}
   843  	clone := *is
   844  	clone.s = make([]iterFrame, len(is.s))
   845  	copy(clone.s, is.s)
   846  	return clone
   847  }
   848  
   849  func (is *iterStack) nth(n int) (f iterFrame, ok bool) {
   850  	if is.aLen == -1 {
   851  		if n >= len(is.s) {
   852  			return f, false
   853  		}
   854  		return is.s[n], true
   855  	}
   856  	if int16(n) >= is.aLen {
   857  		return f, false
   858  	}
   859  	return is.a[n], true
   860  }
   861  
   862  func (is *iterStack) reset() {
   863  	if is.aLen == -1 {
   864  		is.s = is.s[:0]
   865  	} else {
   866  		is.aLen = 0
   867  	}
   868  }
   869  
// iterator is responsible for search and traversal within a btree.
//
// An iterator is created by (*btree).iter and must not be used after the
// tree has been modified.
type iterator struct {
	// the root node of the B-Tree.
	r *node
	// n and pos make up the current position of the iterator.
	// If valid, n.items[pos] is the current value of the iterator.
	// A freshly created or reset iterator has pos == -1.
	n   *node
	pos int16
	// cmp dictates the ordering of the FileMetadata.
	cmp func(*FileMetadata, *FileMetadata) int
	// a stack of n's ancestors within the B-Tree, alongside the position
	// taken to arrive at n. If non-empty, the bottommost frame of the stack
	// will always contain the B-Tree root.
	s iterStack
}
   885  
   886  func (i *iterator) clone() iterator {
   887  	c := *i
   888  	c.s = i.s.clone()
   889  	return c
   890  }
   891  
   892  func (i *iterator) reset() {
   893  	i.n = i.r
   894  	i.pos = -1
   895  	i.s.reset()
   896  }
   897  
   898  func (i iterator) String() string {
   899  	var buf bytes.Buffer
   900  	for n := 0; ; n++ {
   901  		f, ok := i.s.nth(n)
   902  		if !ok {
   903  			break
   904  		}
   905  		fmt.Fprintf(&buf, "%p: %02d/%02d\n", f.n, f.pos, f.n.count)
   906  	}
   907  	if i.n == nil {
   908  		fmt.Fprintf(&buf, "<nil>: %02d", i.pos)
   909  	} else {
   910  		fmt.Fprintf(&buf, "%p: %02d/%02d", i.n, i.pos, i.n.count)
   911  	}
   912  	return buf.String()
   913  }
   914  
   915  func cmpIter(a, b iterator) int {
   916  	if a.r != b.r {
   917  		panic("compared iterators from different btrees")
   918  	}
   919  
   920  	// Each iterator has a stack of frames marking the path from the root node
   921  	// to the current iterator position. We walk both paths formed by the
   922  	// iterators' stacks simultaneously, descending from the shared root node,
   923  	// always comparing nodes at the same level in the tree.
   924  	//
   925  	// If the iterators' paths ever diverge and point to different nodes, the
   926  	// iterators are not equal and we use the node positions to evaluate the
   927  	// comparison.
   928  	//
   929  	// If an iterator's stack ends, we stop descending and use its current
   930  	// node and position for the final comparison. One iterator's stack may
   931  	// end before another's if one iterator is positioned deeper in the tree.
   932  	//
   933  	// a                                b
   934  	// +------------------------+      +--------------------------+ -
   935  	// |  Root            pos:5 |   =  |  Root              pos:5 |  |
   936  	// +------------------------+      +--------------------------+  | stack
   937  	// |  Root/5          pos:3 |   =  |  Root/5            pos:3 |  | frames
   938  	// +------------------------+      +--------------------------+  |
   939  	// |  Root/5/3        pos:9 |   >  |  Root/5/3          pos:1 |  |
   940  	// +========================+      +==========================+ -
   941  	// |                        |      |                          |
   942  	// | a.n: Root/5/3/9 a.pos:2|      | b.n: Root/5/3/1, b.pos:5 |
   943  	// +------------------------+      +--------------------------+
   944  
   945  	// Initialize with the iterator's current node and position. These are
   946  	// conceptually the most-recent/current frame of the iterator stack.
   947  	an, apos := a.n, a.pos
   948  	bn, bpos := b.n, b.pos
   949  
   950  	// aok, bok are set while traversing the iterator's path down the B-Tree.
   951  	// They're declared in the outer scope because they help distinguish the
   952  	// sentinel case when both iterators' first frame points to the last child
   953  	// of the root. If an iterator has no other frames in its stack, it's the
   954  	// end sentinel state which sorts after everything else.
   955  	var aok, bok bool
   956  	for i := 0; ; i++ {
   957  		var af, bf iterFrame
   958  		af, aok = a.s.nth(i)
   959  		bf, bok = b.s.nth(i)
   960  		if !aok || !bok {
   961  			if aok {
   962  				// Iterator a, unlike iterator b, still has a frame. Set an,
   963  				// apos so we compare using the frame from the stack.
   964  				an, apos = af.n, af.pos
   965  			}
   966  			if bok {
   967  				// Iterator b, unlike iterator a, still has a frame. Set bn,
   968  				// bpos so we compare using the frame from the stack.
   969  				bn, bpos = bf.n, bf.pos
   970  			}
   971  			break
   972  		}
   973  
   974  		// aok && bok
   975  		if af.n != bf.n {
   976  			panic("nonmatching nodes during btree iterator comparison")
   977  		}
   978  		switch {
   979  		case af.pos < bf.pos:
   980  			return -1
   981  		case af.pos > bf.pos:
   982  			return +1
   983  		default:
   984  			// Continue up both iterators' stacks (equivalently, down the
   985  			// B-Tree away from the root).
   986  		}
   987  	}
   988  
   989  	if aok && bok {
   990  		panic("expected one or more stacks to have been exhausted")
   991  	}
   992  	if an != bn {
   993  		panic("nonmatching nodes during btree iterator comparison")
   994  	}
   995  	switch {
   996  	case apos < bpos:
   997  		return -1
   998  	case apos > bpos:
   999  		return +1
  1000  	default:
  1001  		switch {
  1002  		case aok:
  1003  			// a is positioned at a leaf child at this position and b is at an
  1004  			// end sentinel state.
  1005  			return -1
  1006  		case bok:
  1007  			// b is positioned at a leaf child at this position and a is at an
  1008  			// end sentinel state.
  1009  			return +1
  1010  		default:
  1011  			return 0
  1012  		}
  1013  	}
  1014  }
  1015  
  1016  func (i *iterator) descend(n *node, pos int16) {
  1017  	i.s.push(iterFrame{n: n, pos: pos})
  1018  	i.n = n.children[pos]
  1019  	i.pos = 0
  1020  }
  1021  
  1022  // ascend ascends up to the current node's parent and resets the position
  1023  // to the one previously set for this parent node.
  1024  func (i *iterator) ascend() {
  1025  	f := i.s.pop()
  1026  	i.n = f.n
  1027  	i.pos = f.pos
  1028  }
  1029  
  1030  // seek repositions the iterator over the first file for which fn returns
  1031  // true, mirroring the semantics of the standard library's sort.Search
  1032  // function.  Like sort.Search, seek requires the iterator's B-Tree to be
  1033  // ordered such that fn returns false for some (possibly empty) prefix of the
  1034  // tree's files, and then true for the (possibly empty) remainder.
  1035  func (i *iterator) seek(fn func(*FileMetadata) bool) {
  1036  	i.reset()
  1037  	if i.n == nil {
  1038  		return
  1039  	}
  1040  
  1041  	for {
  1042  		// Logic copied from sort.Search.
  1043  		j, k := 0, int(i.n.count)
  1044  		for j < k {
  1045  			h := int(uint(j+k) >> 1) // avoid overflow when computing h
  1046  
  1047  			// j ≤ h < k
  1048  			if !fn(i.n.items[h]) {
  1049  				j = h + 1 // preserves f(j-1) == false
  1050  			} else {
  1051  				k = h // preserves f(k) == true
  1052  			}
  1053  		}
  1054  
  1055  		i.pos = int16(j)
  1056  		if i.n.leaf {
  1057  			if i.pos == i.n.count {
  1058  				i.next()
  1059  			}
  1060  			return
  1061  		}
  1062  		i.descend(i.n, i.pos)
  1063  	}
  1064  }
  1065  
  1066  // first seeks to the first item in the btree.
  1067  func (i *iterator) first() {
  1068  	i.reset()
  1069  	if i.n == nil {
  1070  		return
  1071  	}
  1072  	for !i.n.leaf {
  1073  		i.descend(i.n, 0)
  1074  	}
  1075  	i.pos = 0
  1076  }
  1077  
  1078  // last seeks to the last item in the btree.
  1079  func (i *iterator) last() {
  1080  	i.reset()
  1081  	if i.n == nil {
  1082  		return
  1083  	}
  1084  	for !i.n.leaf {
  1085  		i.descend(i.n, i.n.count)
  1086  	}
  1087  	i.pos = i.n.count - 1
  1088  }
  1089  
  1090  // next positions the iterator to the item immediately following
  1091  // its current position.
  1092  func (i *iterator) next() {
  1093  	if i.n == nil {
  1094  		return
  1095  	}
  1096  
  1097  	if i.n.leaf {
  1098  		if i.pos < i.n.count {
  1099  			i.pos++
  1100  		}
  1101  		if i.pos < i.n.count {
  1102  			return
  1103  		}
  1104  		for i.s.len() > 0 && i.pos >= i.n.count {
  1105  			i.ascend()
  1106  		}
  1107  		return
  1108  	}
  1109  
  1110  	i.descend(i.n, i.pos+1)
  1111  	for !i.n.leaf {
  1112  		i.descend(i.n, 0)
  1113  	}
  1114  	i.pos = 0
  1115  }
  1116  
  1117  // prev positions the iterator to the item immediately preceding
  1118  // its current position.
  1119  func (i *iterator) prev() {
  1120  	if i.n == nil {
  1121  		return
  1122  	}
  1123  
  1124  	if i.n.leaf {
  1125  		i.pos--
  1126  		if i.pos >= 0 {
  1127  			return
  1128  		}
  1129  		for i.s.len() > 0 && i.pos < 0 {
  1130  			i.ascend()
  1131  			i.pos--
  1132  		}
  1133  		return
  1134  	}
  1135  
  1136  	i.descend(i.n, i.pos)
  1137  	for !i.n.leaf {
  1138  		i.descend(i.n, i.n.count)
  1139  	}
  1140  	i.pos = i.n.count - 1
  1141  }
  1142  
  1143  // valid returns whether the iterator is positioned at a valid position.
  1144  func (i *iterator) valid() bool {
  1145  	return i.r != nil && i.pos >= 0 && i.pos < i.n.count
  1146  }
  1147  
  1148  // cur returns the item at the iterator's current position. It is illegal
  1149  // to call cur if the iterator is not valid.
  1150  func (i *iterator) cur() *FileMetadata {
  1151  	return i.n.items[i.pos]
  1152  }