github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/internal/manifest/btree.go (about)

     1  // Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package manifest
     6  
     7  import (
     8  	"bytes"
     9  	stdcmp "cmp"
    10  	"fmt"
    11  	"strings"
    12  	"sync/atomic"
    13  	"unsafe"
    14  
    15  	"github.com/cockroachdb/errors"
    16  	"github.com/cockroachdb/pebble/internal/invariants"
    17  )
    18  
    19  // The Annotator type defined below is used by other packages to lazily
    20  // compute a value over a B-Tree. Each node of the B-Tree stores one
    21  // `annotation` per annotator, containing the result of the computation over
    22  // the node's subtree.
    23  //
    24  // An annotation is marked as valid if it's current with the current subtree
    25  // state. Annotations are marked as invalid whenever a node will be mutated
    26  // (in mut).  Annotators may also return `false` from `Accumulate` to signal
    27  // that a computation for a file is not stable and may change in the future.
    28  // Annotations that include these unstable values are also marked as invalid
    29  // on the node, ensuring that future queries for the annotation will recompute
    30  // the value.
    31  
// An Annotator defines a computation over a level's FileMetadata. If the
// computation is stable and uses inputs that are fixed for the lifetime of
// a FileMetadata, the LevelMetadata's internal data structures are annotated
// with the intermediary computations. This allows the computation to be
// computed incrementally as edits are applied to a level.
//
// Within this file, annotation values are compared by annotator identity
// (==) when looking up a node's cached annotation, so the same Annotator
// value must be passed on every call to benefit from caching.
type Annotator interface {
	// Zero returns the zero value of an annotation. This value is returned
	// when a LevelMetadata is empty. The dst argument, if non-nil, is an
	// obsolete value previously returned by this Annotator and may be
	// overwritten and reused to avoid a memory allocation.
	Zero(dst interface{}) (v interface{})

	// Accumulate computes the annotation for a single file in a level's
	// metadata. It merges the file's value into dst and returns a bool flag
	// indicating whether or not the value is stable and okay to cache as an
	// annotation. If the file's value may change over the life of the file,
	// the annotator must return false.
	//
	// Implementations may modify dst and return it to avoid an allocation.
	Accumulate(m *FileMetadata, dst interface{}) (v interface{}, cacheOK bool)

	// Merge combines two values src and dst, returning the result.
	// Implementations may modify dst and return it to avoid an allocation.
	// The B-Tree calls Merge with a child subtree's annotation as src and
	// the parent's running annotation as dst.
	Merge(src interface{}, dst interface{}) interface{}
}
    57  
// btreeCmp is the ordering function used to position files within the
// B-Tree. It returns a negative value if the first file sorts before the
// second, zero if the two are considered equal, and a positive value if the
// first sorts after the second.
type btreeCmp func(*FileMetadata, *FileMetadata) int
    59  
// btreeCmpSeqNum is a btreeCmp that orders files by delegating to
// FileMetadata.cmpSeqNum.
func btreeCmpSeqNum(a, b *FileMetadata) int {
	return a.cmpSeqNum(b)
}
    63  
// btreeCmpSmallestKey returns a btreeCmp that orders files by delegating to
// FileMetadata.cmpSmallestKey, passing along the provided key comparator.
func btreeCmpSmallestKey(cmp Compare) btreeCmp {
	return func(a, b *FileMetadata) int {
		return a.cmpSmallestKey(b, cmp)
	}
}
    69  
    70  // btreeCmpSpecificOrder is used in tests to construct a B-Tree with a
    71  // specific ordering of FileMetadata within the tree. It's typically used to
    72  // test consistency checking code that needs to construct a malformed B-Tree.
    73  func btreeCmpSpecificOrder(files []*FileMetadata) btreeCmp {
    74  	m := map[*FileMetadata]int{}
    75  	for i, f := range files {
    76  		m[f] = i
    77  	}
    78  	return func(a, b *FileMetadata) int {
    79  		ai, aok := m[a]
    80  		bi, bok := m[b]
    81  		if !aok || !bok {
    82  			panic("btreeCmpSliceOrder called with unknown files")
    83  		}
    84  		return stdcmp.Compare(ai, bi)
    85  	}
    86  }
    87  
    88  const (
    89  	degree   = 16
    90  	maxItems = 2*degree - 1
    91  	minItems = degree - 1
    92  )
    93  
// annotation is one cached Annotator result stored on a B-Tree node.
type annotation struct {
	// annotator is the Annotator that produced v; it is the key used to
	// look the annotation up on a node.
	annotator Annotator
	// v is an annotation value, the output of annotator.Zero,
	// annotator.Accumulate or annotator.Merge.
	v interface{}
	// valid indicates whether future reads of the annotation may use v as-is.
	// If false, v will be zeroed and recalculated.
	valid bool
}
   103  
// leafNode holds the fields common to every B-Tree node. Leaf nodes are
// allocated as a bare leafNode (see newLeafNode) and then accessed through a
// *node obtained from leafToNode; interior nodes embed leafNode at the start
// of node.
//
// ref is the node's reference count (see incRef, decRef and mut), count is
// the number of entries of items that are in use, and leaf is true for nodes
// that have no children.
type leafNode struct {
	ref   atomic.Int32
	count int16
	leaf  bool
	// subtreeCount holds the count of files in the entire subtree formed by
	// this node. For leaf nodes, subtreeCount is always equal to count. For
	// non-leaf nodes, it's the sum of count plus all the children's
	// subtreeCounts.
	//
	// NB: We could move this field to the end of the node struct, since leaf =>
	// count=subtreeCount, however the unsafe casting [leafToNode] performs make
	// it risky and cumbersome.
	subtreeCount int
	items        [maxItems]*FileMetadata
	// annot contains one annotation per annotator, merged over the entire
	// node's files (and all descendants for non-leaf nodes).
	annot []annotation
}
   122  
// node is a B-Tree node. The embedded leafNode must remain the first field:
// leafToNode reinterprets a *leafNode as a *node. For a non-leaf node with
// count items, children[0] through children[count] are in use. The children
// field must only be accessed when leaf is false, because leaf nodes are
// allocated without it.
type node struct {
	leafNode
	children [maxItems + 1]*node
}
   127  
// leafToNode reinterprets ln as a *node without copying. When ln was
// allocated as a bare leafNode (as newLeafNode does), the returned node's
// children field does not refer to allocated memory and must not be touched;
// callers in this file guard every children access with a check of the leaf
// flag.
//
//go:nocheckptr casts a ptr to a smaller struct to a ptr to a larger struct.
func leafToNode(ln *leafNode) *node {
	return (*node)(unsafe.Pointer(ln))
}
   132  
   133  func newLeafNode() *node {
   134  	n := leafToNode(new(leafNode))
   135  	n.leaf = true
   136  	n.ref.Store(1)
   137  	return n
   138  }
   139  
   140  func newNode() *node {
   141  	n := new(node)
   142  	n.ref.Store(1)
   143  	return n
   144  }
   145  
   146  // mut creates and returns a mutable node reference. If the node is not shared
   147  // with any other trees then it can be modified in place. Otherwise, it must be
   148  // cloned to ensure unique ownership. In this way, we enforce a copy-on-write
   149  // policy which transparently incorporates the idea of local mutations, like
   150  // Clojure's transients or Haskell's ST monad, where nodes are only copied
   151  // during the first time that they are modified between Clone operations.
   152  //
   153  // When a node is cloned, the provided pointer will be redirected to the new
   154  // mutable node.
   155  func mut(n **node) *node {
   156  	if (*n).ref.Load() == 1 {
   157  		// Exclusive ownership. Can mutate in place.
   158  
   159  		// Whenever a node will be mutated, reset its annotations to be marked
   160  		// as uncached. This ensures any future calls to (*node).annotation
   161  		// will recompute annotations on the modified subtree.
   162  		for i := range (*n).annot {
   163  			(*n).annot[i].valid = false
   164  		}
   165  		return *n
   166  	}
   167  	// If we do not have unique ownership over the node then we
   168  	// clone it to gain unique ownership. After doing so, we can
   169  	// release our reference to the old node. We pass recursive
   170  	// as true because even though we just observed the node's
   171  	// reference count to be greater than 1, we might be racing
   172  	// with another call to decRef on this node.
   173  	c := (*n).clone()
   174  	(*n).decRef(true /* contentsToo */, nil)
   175  	*n = c
   176  	// NB: We don't need to clear annotations, because (*node).clone does not
   177  	// copy them.
   178  	return *n
   179  }
   180  
// incRef acquires a reference to the node by atomically incrementing its
// reference count.
func (n *node) incRef() {
	n.ref.Add(1)
}
   185  
   186  // decRef releases a reference to the node. If requested, the method will unref
   187  // its items and recurse into child nodes and decrease their refcounts as well.
   188  // Some internal codepaths that manually copy the node's items or children to
   189  // new nodes pass contentsToo=false to preserve existing reference counts during
   190  // operations that should yield a net-zero change to descendant refcounts.
   191  // When a node is released, its contained files are dereferenced.
   192  func (n *node) decRef(contentsToo bool, obsolete *[]*FileBacking) {
   193  	if n.ref.Add(-1) > 0 {
   194  		// Other references remain. Can't free.
   195  		return
   196  	}
   197  
   198  	// Dereference the node's metadata and release child references if
   199  	// requested. Some internal callers may not want to propagate the deref
   200  	// because they're manually copying the filemetadata and children to other
   201  	// nodes, and they want to preserve the existing reference count.
   202  	if contentsToo {
   203  		for _, f := range n.items[:n.count] {
   204  			if f.Unref() == 0 {
   205  				// There are two sources of node dereferences: tree mutations
   206  				// and Version dereferences. Files should only be made obsolete
   207  				// during Version dereferences, during which `obsolete` will be
   208  				// non-nil.
   209  				if obsolete == nil {
   210  					panic(fmt.Sprintf("file metadata %s dereferenced to zero during tree mutation", f.FileNum))
   211  				}
   212  				// Reference counting is performed on the FileBacking. In the case
   213  				// of a virtual sstable, this reference counting is performed on
   214  				// a FileBacking which is shared by every single virtual sstable
   215  				// with the same backing sstable. If the reference count hits 0,
   216  				// then we know that the FileBacking won't be required by any
   217  				// sstable in Pebble, and that the backing sstable can be deleted.
   218  				*obsolete = append(*obsolete, f.FileBacking)
   219  			}
   220  		}
   221  		if !n.leaf {
   222  			for i := int16(0); i <= n.count; i++ {
   223  				n.children[i].decRef(true /* contentsToo */, obsolete)
   224  			}
   225  		}
   226  	}
   227  }
   228  
   229  // clone creates a clone of the receiver with a single reference count.
   230  func (n *node) clone() *node {
   231  	var c *node
   232  	if n.leaf {
   233  		c = newLeafNode()
   234  	} else {
   235  		c = newNode()
   236  	}
   237  	// NB: copy field-by-field without touching n.ref to avoid
   238  	// triggering the race detector and looking like a data race.
   239  	c.count = n.count
   240  	c.items = n.items
   241  	c.subtreeCount = n.subtreeCount
   242  	// Increase the refcount of each contained item.
   243  	for _, f := range n.items[:n.count] {
   244  		f.Ref()
   245  	}
   246  	if !c.leaf {
   247  		// Copy children and increase each refcount.
   248  		c.children = n.children
   249  		for i := int16(0); i <= c.count; i++ {
   250  			c.children[i].incRef()
   251  		}
   252  	}
   253  	return c
   254  }
   255  
   256  // insertAt inserts the provided file and node at the provided index. This
   257  // function is for use only as a helper function for internal B-Tree code.
   258  // Clients should not invoke it directly.
   259  func (n *node) insertAt(index int, item *FileMetadata, nd *node) {
   260  	if index < int(n.count) {
   261  		copy(n.items[index+1:n.count+1], n.items[index:n.count])
   262  		if !n.leaf {
   263  			copy(n.children[index+2:n.count+2], n.children[index+1:n.count+1])
   264  		}
   265  	}
   266  	n.items[index] = item
   267  	if !n.leaf {
   268  		n.children[index+1] = nd
   269  	}
   270  	n.count++
   271  }
   272  
   273  // pushBack inserts the provided file and node at the tail of the node's items.
   274  // This function is for use only as a helper function for internal B-Tree code.
   275  // Clients should not invoke it directly.
   276  func (n *node) pushBack(item *FileMetadata, nd *node) {
   277  	n.items[n.count] = item
   278  	if !n.leaf {
   279  		n.children[n.count+1] = nd
   280  	}
   281  	n.count++
   282  }
   283  
   284  // pushFront inserts the provided file and node at the head of the
   285  // node's items. This function is for use only as a helper function for internal B-Tree
   286  // code. Clients should not invoke it directly.
   287  func (n *node) pushFront(item *FileMetadata, nd *node) {
   288  	if !n.leaf {
   289  		copy(n.children[1:n.count+2], n.children[:n.count+1])
   290  		n.children[0] = nd
   291  	}
   292  	copy(n.items[1:n.count+1], n.items[:n.count])
   293  	n.items[0] = item
   294  	n.count++
   295  }
   296  
   297  // removeAt removes a value at a given index, pulling all subsequent values
   298  // back. This function is for use only as a helper function for internal B-Tree
   299  // code. Clients should not invoke it directly.
   300  func (n *node) removeAt(index int) (*FileMetadata, *node) {
   301  	var child *node
   302  	if !n.leaf {
   303  		child = n.children[index+1]
   304  		copy(n.children[index+1:n.count], n.children[index+2:n.count+1])
   305  		n.children[n.count] = nil
   306  	}
   307  	n.count--
   308  	out := n.items[index]
   309  	copy(n.items[index:n.count], n.items[index+1:n.count+1])
   310  	n.items[n.count] = nil
   311  	return out, child
   312  }
   313  
   314  // popBack removes and returns the last element in the list. This function is
   315  // for use only as a helper function for internal B-Tree code. Clients should
   316  // not invoke it directly.
   317  func (n *node) popBack() (*FileMetadata, *node) {
   318  	n.count--
   319  	out := n.items[n.count]
   320  	n.items[n.count] = nil
   321  	if n.leaf {
   322  		return out, nil
   323  	}
   324  	child := n.children[n.count+1]
   325  	n.children[n.count+1] = nil
   326  	return out, child
   327  }
   328  
   329  // popFront removes and returns the first element in the list. This function is
   330  // for use only as a helper function for internal B-Tree code. Clients should
   331  // not invoke it directly.
   332  func (n *node) popFront() (*FileMetadata, *node) {
   333  	n.count--
   334  	var child *node
   335  	if !n.leaf {
   336  		child = n.children[0]
   337  		copy(n.children[:n.count+1], n.children[1:n.count+2])
   338  		n.children[n.count+1] = nil
   339  	}
   340  	out := n.items[0]
   341  	copy(n.items[:n.count], n.items[1:n.count+1])
   342  	n.items[n.count] = nil
   343  	return out, child
   344  }
   345  
   346  // find returns the index where the given item should be inserted into this
   347  // list. 'found' is true if the item already exists in the list at the given
   348  // index.
   349  //
   350  // This function is for use only as a helper function for internal B-Tree code.
   351  // Clients should not invoke it directly.
   352  func (n *node) find(cmp btreeCmp, item *FileMetadata) (index int, found bool) {
   353  	// Logic copied from sort.Search. Inlining this gave
   354  	// an 11% speedup on BenchmarkBTreeDeleteInsert.
   355  	i, j := 0, int(n.count)
   356  	for i < j {
   357  		h := int(uint(i+j) >> 1) // avoid overflow when computing h
   358  		// i ≤ h < j
   359  		v := cmp(item, n.items[h])
   360  		if v == 0 {
   361  			return h, true
   362  		} else if v > 0 {
   363  			i = h + 1
   364  		} else {
   365  			j = h
   366  		}
   367  	}
   368  	return i, false
   369  }
   370  
// split splits the given node at the given index. The current node shrinks,
// and this function returns the item that existed at that index and a new
// node containing all items/children after it.
//
// split is called when we want to perform a transformation like the one
// depicted in the following diagram.
//
//	Before:
//	                       +-----------+
//	             n *node   |   x y z   |
//	                       +--/-/-\-\--+
//
//	After:
//	                       +-----------+
//	                       |     y     |  n's parent
//	                       +----/-\----+
//	                           /   \
//	                          v     v
//	              +-----------+     +-----------+
//	      n *node |         x |     | z         | next *node
//	              +-----------+     +-----------+
//
// split does not perform the complete transformation; the caller is responsible
// for updating the parent appropriately. split splits `n` into two nodes, `n`
// and `next`, returning `next` and the file that separates them. In the diagram
// above, `n.split` removes y and z from `n`, returning y in the first return
// value and `next` in the second return value. The caller is responsible for
// updating n's parent to now contain `y` as the separator between nodes `n` and
// `next`.
//
// The subtreeCount fields of both `n` and `next` are updated to reflect the
// items and descendants each holds after the split.
//
// This function is for use only as a helper function for internal B-Tree code.
// Clients should not invoke it directly.
func (n *node) split(i int) (*FileMetadata, *node) {
	out := n.items[i]
	// The new sibling is of the same kind (leaf or interior) as n.
	var next *node
	if n.leaf {
		next = newLeafNode()
	} else {
		next = newNode()
	}
	// Move the items after index i into next and clear every slot of n from
	// index i onwards, including the separator being returned.
	next.count = n.count - int16(i+1)
	copy(next.items[:], n.items[i+1:n.count])
	for j := int16(i); j < n.count; j++ {
		n.items[j] = nil
	}
	if !n.leaf {
		// Move the children to the right of the separator into next,
		// transferring their subtree counts along with them.
		copy(next.children[:], n.children[i+1:n.count+1])
		descendantsMoved := 0
		for j := int16(i + 1); j <= n.count; j++ {
			descendantsMoved += n.children[j].subtreeCount
			n.children[j] = nil
		}
		n.subtreeCount -= descendantsMoved
		next.subtreeCount += descendantsMoved
	}
	n.count = int16(i)
	// NB: We subtract one more than `next.count` from n's subtreeCount because
	// the item at index `i` was removed from `n.items`. We'll return the item
	// at index `i`, and the caller is responsible for updating the subtree
	// count of whichever node adopts it.
	n.subtreeCount -= int(next.count) + 1
	next.subtreeCount += int(next.count)
	return out, next
}
   435  
   436  // Insert inserts a item into the subtree rooted at this node, making sure no
   437  // nodes in the subtree exceed maxItems items.
   438  func (n *node) Insert(cmp btreeCmp, item *FileMetadata) error {
   439  	i, found := n.find(cmp, item)
   440  	if found {
   441  		// cmp provides a total ordering of the files within a level.
   442  		// If we're inserting a metadata that's equal to an existing item
   443  		// in the tree, we're inserting a file into a level twice.
   444  		return errors.Errorf("files %s and %s collided on sort keys",
   445  			errors.Safe(item.FileNum), errors.Safe(n.items[i].FileNum))
   446  	}
   447  	if n.leaf {
   448  		n.insertAt(i, item, nil)
   449  		n.subtreeCount++
   450  		return nil
   451  	}
   452  	if n.children[i].count >= maxItems {
   453  		splitLa, splitNode := mut(&n.children[i]).split(maxItems / 2)
   454  		n.insertAt(i, splitLa, splitNode)
   455  
   456  		switch cmp := cmp(item, n.items[i]); {
   457  		case cmp < 0:
   458  			// no change, we want first split node
   459  		case cmp > 0:
   460  			i++ // we want second split node
   461  		default:
   462  			// cmp provides a total ordering of the files within a level.
   463  			// If we're inserting a metadata that's equal to an existing item
   464  			// in the tree, we're inserting a file into a level twice.
   465  			return errors.Errorf("files %s and %s collided on sort keys",
   466  				errors.Safe(item.FileNum), errors.Safe(n.items[i].FileNum))
   467  		}
   468  	}
   469  
   470  	err := mut(&n.children[i]).Insert(cmp, item)
   471  	if err == nil {
   472  		n.subtreeCount++
   473  	}
   474  	return err
   475  }
   476  
// removeMax removes and returns the maximum item from the subtree rooted at
// this node, decrementing subtreeCount along the path to it. This function is
// for use only as a helper function for internal B-Tree code. Clients should
// not invoke it directly.
func (n *node) removeMax() *FileMetadata {
	if n.leaf {
		// The maximum of a leaf is its last item.
		n.count--
		n.subtreeCount--
		out := n.items[n.count]
		n.items[n.count] = nil
		return out
	}
	// The maximum lives in the rightmost child's subtree.
	child := mut(&n.children[n.count])
	if child.count <= minItems {
		// The child cannot give up an item yet. Grow it first, then retry
		// from this node since the rebalance or merge may have changed
		// n.count and n.children.
		n.rebalanceOrMerge(int(n.count))
		return n.removeMax()
	}
	n.subtreeCount--
	return child.removeMax()
}
   496  
// Remove removes a item from the subtree rooted at this node. Returns
// the item that was removed or nil if no matching item was found.
//
// subtreeCount is decremented on this node only when an item was actually
// removed from its subtree. Children may be rebalanced or merged on the way
// down even if no matching item is ultimately found.
func (n *node) Remove(cmp btreeCmp, item *FileMetadata) (out *FileMetadata) {
	i, found := n.find(cmp, item)
	if n.leaf {
		if found {
			out, _ = n.removeAt(i)
			n.subtreeCount--
			return out
		}
		return nil
	}
	if n.children[i].count <= minItems {
		// Child not large enough to remove from.
		// Grow it, then retry from this node because the rebalance or merge
		// may have moved items between this node and its children.
		n.rebalanceOrMerge(i)
		return n.Remove(cmp, item)
	}
	child := mut(&n.children[i])
	if found {
		// Replace the item being removed with the max item in our left child.
		out = n.items[i]
		n.items[i] = child.removeMax()
		n.subtreeCount--
		return out
	}
	// File is not in this node and child is large enough to remove from.
	out = child.Remove(cmp, item)
	if out != nil {
		n.subtreeCount--
	}
	return out
}
   529  
// rebalanceOrMerge grows child 'i' to ensure it has sufficient room to remove a
// item from it while keeping it at or above minItems. This function is for use
// only as a helper function for internal B-Tree code. Clients should not invoke
// it directly.
//
// One of three strategies is used, in order of preference: borrow an item
// from the left sibling if it has more than minItems items, borrow from the
// right sibling if it has more than minItems items, or otherwise merge the
// child with a sibling. The subtreeCount of every modified child is kept up
// to date; the receiver's own subtreeCount is unaffected because items only
// move within its subtree.
func (n *node) rebalanceOrMerge(i int) {
	switch {
	case i > 0 && n.children[i-1].count > minItems:
		// Rebalance from left sibling.
		//
		//          +-----------+
		//          |     y     |
		//          +----/-\----+
		//              /   \
		//             v     v
		// +-----------+     +-----------+
		// |         x |     |           |
		// +----------\+     +-----------+
		//             \
		//              v
		//              a
		//
		// After:
		//
		//          +-----------+
		//          |     x     |
		//          +----/-\----+
		//              /   \
		//             v     v
		// +-----------+     +-----------+
		// |           |     | y         |
		// +-----------+     +/----------+
		//                   /
		//                  v
		//                  a
		//
		left := mut(&n.children[i-1])
		child := mut(&n.children[i])
		xLa, grandChild := left.popBack()
		yLa := n.items[i-1]
		child.pushFront(yLa, grandChild)
		n.items[i-1] = xLa
		// One item moved into child and one out of left; if a grandchild
		// moved as well, its whole subtree changes sides too.
		child.subtreeCount++
		left.subtreeCount--
		if grandChild != nil {
			child.subtreeCount += grandChild.subtreeCount
			left.subtreeCount -= grandChild.subtreeCount
		}

	case i < int(n.count) && n.children[i+1].count > minItems:
		// Rebalance from right sibling.
		//
		//          +-----------+
		//          |     y     |
		//          +----/-\----+
		//              /   \
		//             v     v
		// +-----------+     +-----------+
		// |           |     | x         |
		// +-----------+     +/----------+
		//                   /
		//                  v
		//                  a
		//
		// After:
		//
		//          +-----------+
		//          |     x     |
		//          +----/-\----+
		//              /   \
		//             v     v
		// +-----------+     +-----------+
		// |         y |     |           |
		// +----------\+     +-----------+
		//             \
		//              v
		//              a
		//
		right := mut(&n.children[i+1])
		child := mut(&n.children[i])
		xLa, grandChild := right.popFront()
		yLa := n.items[i]
		child.pushBack(yLa, grandChild)
		// One item moved into child and one out of right; if a grandchild
		// moved as well, its whole subtree changes sides too.
		child.subtreeCount++
		right.subtreeCount--
		if grandChild != nil {
			child.subtreeCount += grandChild.subtreeCount
			right.subtreeCount -= grandChild.subtreeCount
		}
		n.items[i] = xLa

	default:
		// Merge with either the left or right sibling.
		//
		//          +-----------+
		//          |   u y v   |
		//          +----/-\----+
		//              /   \
		//             v     v
		// +-----------+     +-----------+
		// |         x |     | z         |
		// +-----------+     +-----------+
		//
		// After:
		//
		//          +-----------+
		//          |    u v    |
		//          +-----|-----+
		//                |
		//                v
		//          +-----------+
		//          |   x y z   |
		//          +-----------+
		//
		// The merge always folds children[i+1] into children[i]. If the
		// target is the rightmost child it has no right sibling, so step
		// back one position and merge it into its left sibling instead.
		if i >= int(n.count) {
			i = int(n.count - 1)
		}
		child := mut(&n.children[i])
		// Make mergeChild mutable, bumping the refcounts on its children if necessary.
		_ = mut(&n.children[i+1])
		mergeLa, mergeChild := n.removeAt(i)
		// Append the separator and then all of mergeChild's items (and
		// children, for interior nodes) to child.
		child.items[child.count] = mergeLa
		copy(child.items[child.count+1:], mergeChild.items[:mergeChild.count])
		if !child.leaf {
			copy(child.children[child.count+1:], mergeChild.children[:mergeChild.count+1])
		}
		child.count += mergeChild.count + 1
		child.subtreeCount += mergeChild.subtreeCount + 1

		// mergeChild's files and children now belong to child, so release
		// only the node itself and leave their reference counts untouched.
		mergeChild.decRef(false /* contentsToo */, nil)
	}
}
   661  
   662  // InvalidateAnnotation removes any existing cached annotations for the provided
   663  // annotator from this node's subtree.
   664  func (n *node) InvalidateAnnotation(a Annotator) {
   665  	// Find this annotator's annotation on this node.
   666  	var annot *annotation
   667  	for i := range n.annot {
   668  		if n.annot[i].annotator == a {
   669  			annot = &n.annot[i]
   670  		}
   671  	}
   672  
   673  	if annot != nil && annot.valid {
   674  		annot.valid = false
   675  		annot.v = a.Zero(annot.v)
   676  	}
   677  	if !n.leaf {
   678  		for i := int16(0); i <= n.count; i++ {
   679  			n.children[i].InvalidateAnnotation(a)
   680  		}
   681  	}
   682  }
   683  
   684  // Annotation retrieves, computing if not already computed, the provided
   685  // annotator's annotation of this node. The second return value indicates
   686  // whether the future reads of this annotation may use the first return value
   687  // as-is. If false, the annotation is not stable and may change on a subsequent
   688  // computation.
   689  func (n *node) Annotation(a Annotator) (interface{}, bool) {
   690  	// Find this annotator's annotation on this node.
   691  	var annot *annotation
   692  	for i := range n.annot {
   693  		if n.annot[i].annotator == a {
   694  			annot = &n.annot[i]
   695  		}
   696  	}
   697  
   698  	// If it exists and is marked as valid, we can return it without
   699  	// recomputing anything.
   700  	if annot != nil && annot.valid {
   701  		return annot.v, true
   702  	}
   703  
   704  	if annot == nil {
   705  		// This is n's first time being annotated by a.
   706  		// Create a new zeroed annotation.
   707  		n.annot = append(n.annot, annotation{
   708  			annotator: a,
   709  			v:         a.Zero(nil),
   710  		})
   711  		annot = &n.annot[len(n.annot)-1]
   712  	} else {
   713  		// There's an existing annotation that must be recomputed.
   714  		// Zero its value.
   715  		annot.v = a.Zero(annot.v)
   716  	}
   717  
   718  	annot.valid = true
   719  	for i := int16(0); i <= n.count; i++ {
   720  		if !n.leaf {
   721  			v, ok := n.children[i].Annotation(a)
   722  			annot.v = a.Merge(v, annot.v)
   723  			annot.valid = annot.valid && ok
   724  		}
   725  		if i < n.count {
   726  			v, ok := a.Accumulate(n.items[i], annot.v)
   727  			annot.v = v
   728  			annot.valid = annot.valid && ok
   729  		}
   730  	}
   731  	return annot.v, annot.valid
   732  }
   733  
   734  func (n *node) verifyInvariants() {
   735  	recomputedSubtreeCount := int(n.count)
   736  	if !n.leaf {
   737  		for i := int16(0); i <= n.count; i++ {
   738  			n.children[i].verifyInvariants()
   739  			recomputedSubtreeCount += n.children[i].subtreeCount
   740  		}
   741  	}
   742  	if recomputedSubtreeCount != n.subtreeCount {
   743  		panic(fmt.Sprintf("recomputed subtree count (%d) ≠ n.subtreeCount (%d)",
   744  			recomputedSubtreeCount, n.subtreeCount))
   745  	}
   746  }
   747  
// btree is an implementation of a B-Tree.
//
// btree stores FileMetadata in an ordered structure, allowing easy insertion,
// removal, and iteration. The B-Tree stores items in order based on cmp. The
// first level of the LSM uses a cmp function that compares sequence numbers.
// All other levels compare using the FileMetadata.Smallest.
//
// Write operations are not safe for concurrent mutation by multiple
// goroutines, but Read operations are.
//
// root is nil when the tree is empty. Nodes are reference counted and may be
// shared between trees produced by Clone; see mut for how shared nodes are
// copied before being modified.
type btree struct {
	root *node
	cmp  btreeCmp
}
   761  
   762  // Release dereferences and clears the root node of the btree, removing all
   763  // items from the btree. In doing so, it decrements contained file counts.
   764  // It returns a slice of newly obsolete backing files, if any.
   765  func (t *btree) Release() (obsolete []*FileBacking) {
   766  	if t.root != nil {
   767  		t.root.decRef(true /* contentsToo */, &obsolete)
   768  		t.root = nil
   769  	}
   770  	return obsolete
   771  }
   772  
   773  // Clone clones the btree, lazily. It does so in constant time.
   774  func (t *btree) Clone() btree {
   775  	c := *t
   776  	if c.root != nil {
   777  		// Incrementing the reference count on the root node is sufficient to
   778  		// ensure that no node in the cloned tree can be mutated by an actor
   779  		// holding a reference to the original tree and vice versa. This
   780  		// property is upheld because the root node in the receiver btree and
   781  		// the returned btree will both necessarily have a reference count of at
   782  		// least 2 when this method returns. All tree mutations recursively
   783  		// acquire mutable node references (see mut) as they traverse down the
   784  		// tree. The act of acquiring a mutable node reference performs a clone
   785  		// if a node's reference count is greater than one. Cloning a node (see
   786  		// clone) increases the reference count on each of its children,
   787  		// ensuring that they have a reference count of at least 2. This, in
   788  		// turn, ensures that any of the child nodes that are modified will also
   789  		// be copied-on-write, recursively ensuring the immutability property
   790  		// over the entire tree.
   791  		c.root.incRef()
   792  	}
   793  	return c
   794  }
   795  
   796  // Delete removes the provided file from the tree.
   797  // It returns true if the file now has a zero reference count.
   798  func (t *btree) Delete(item *FileMetadata) (obsolete bool) {
   799  	if t.root == nil || t.root.count == 0 {
   800  		return false
   801  	}
   802  	if out := mut(&t.root).Remove(t.cmp, item); out != nil {
   803  		obsolete = out.Unref() == 0
   804  	}
   805  	if invariants.Enabled {
   806  		t.root.verifyInvariants()
   807  	}
   808  	if t.root.count == 0 {
   809  		old := t.root
   810  		if t.root.leaf {
   811  			t.root = nil
   812  		} else {
   813  			t.root = t.root.children[0]
   814  		}
   815  		old.decRef(false /* contentsToo */, nil)
   816  	}
   817  	return obsolete
   818  }
   819  
   820  // Insert adds the given item to the tree. If a item in the tree already
   821  // equals the given one, Insert panics.
   822  func (t *btree) Insert(item *FileMetadata) error {
   823  	if t.root == nil {
   824  		t.root = newLeafNode()
   825  	} else if t.root.count >= maxItems {
   826  		splitLa, splitNode := mut(&t.root).split(maxItems / 2)
   827  		newRoot := newNode()
   828  		newRoot.count = 1
   829  		newRoot.items[0] = splitLa
   830  		newRoot.children[0] = t.root
   831  		newRoot.children[1] = splitNode
   832  		newRoot.subtreeCount = t.root.subtreeCount + splitNode.subtreeCount + 1
   833  		t.root = newRoot
   834  	}
   835  	item.Ref()
   836  	err := mut(&t.root).Insert(t.cmp, item)
   837  	if invariants.Enabled {
   838  		t.root.verifyInvariants()
   839  	}
   840  	return err
   841  }
   842  
   843  // Iter returns a new iterator object. It is not safe to continue using an
   844  // iterator after modifications are made to the tree. If modifications are made,
   845  // create a new iterator.
   846  func (t *btree) Iter() iterator {
   847  	return iterator{r: t.root, pos: -1, cmp: t.cmp}
   848  }
   849  
   850  // Count returns the number of files contained within the B-Tree.
   851  func (t *btree) Count() int {
   852  	if t.root == nil {
   853  		return 0
   854  	}
   855  	return t.root.subtreeCount
   856  }
   857  
   858  // String returns a string description of the tree. The format is
   859  // similar to the https://en.wikipedia.org/wiki/Newick_format.
   860  func (t *btree) String() string {
   861  	if t.Count() == 0 {
   862  		return ";"
   863  	}
   864  	var b strings.Builder
   865  	t.root.writeString(&b)
   866  	return b.String()
   867  }
   868  
   869  func (n *node) writeString(b *strings.Builder) {
   870  	if n.leaf {
   871  		for i := int16(0); i < n.count; i++ {
   872  			if i != 0 {
   873  				b.WriteString(",")
   874  			}
   875  			b.WriteString(n.items[i].String())
   876  		}
   877  		return
   878  	}
   879  	for i := int16(0); i <= n.count; i++ {
   880  		b.WriteString("(")
   881  		n.children[i].writeString(b)
   882  		b.WriteString(")")
   883  		if i < n.count {
   884  			b.WriteString(n.items[i].String())
   885  		}
   886  	}
   887  }
   888  
// iterStack represents a stack of (node, pos) tuples, which captures
// iteration state as an iterator descends a btree.
type iterStack struct {
	// a contains aLen stack frames when an iterator stack is short enough.
	// If the iterator stack overflows the capacity of iterStackArr, the stack
	// is moved to s and aLen is set to -1. From then on the stack stays
	// slice-backed: reset only truncates s, and pop never restores aLen.
	a    iterStackArr
	aLen int16 // -1 when using s
	s    []iterFrame
}
   899  
// iterStackArr is the fixed-size inline storage embedded in iterStack. It is
// used to avoid allocations for stacks below a certain size: a slice is only
// allocated once a push finds all three slots occupied.
type iterStackArr [3]iterFrame
   902  
// iterFrame is a single entry of an iterStack: a node together with the
// position within that node that the iterator descended through (see
// iterator.descend) and returns to on ascend.
type iterFrame struct {
	n   *node
	pos int16
}
   907  
   908  func (is *iterStack) push(f iterFrame) {
   909  	if is.aLen == -1 {
   910  		is.s = append(is.s, f)
   911  	} else if int(is.aLen) == len(is.a) {
   912  		is.s = make([]iterFrame, int(is.aLen)+1, 2*int(is.aLen))
   913  		copy(is.s, is.a[:])
   914  		is.s[int(is.aLen)] = f
   915  		is.aLen = -1
   916  	} else {
   917  		is.a[is.aLen] = f
   918  		is.aLen++
   919  	}
   920  }
   921  
   922  func (is *iterStack) pop() iterFrame {
   923  	if is.aLen == -1 {
   924  		f := is.s[len(is.s)-1]
   925  		is.s = is.s[:len(is.s)-1]
   926  		return f
   927  	}
   928  	is.aLen--
   929  	return is.a[is.aLen]
   930  }
   931  
   932  func (is *iterStack) len() int {
   933  	if is.aLen == -1 {
   934  		return len(is.s)
   935  	}
   936  	return int(is.aLen)
   937  }
   938  
   939  func (is *iterStack) clone() iterStack {
   940  	// If the iterator is using the embedded iterStackArr, we only need to
   941  	// copy the struct itself.
   942  	if is.s == nil {
   943  		return *is
   944  	}
   945  	clone := *is
   946  	clone.s = make([]iterFrame, len(is.s))
   947  	copy(clone.s, is.s)
   948  	return clone
   949  }
   950  
   951  func (is *iterStack) nth(n int) (f iterFrame, ok bool) {
   952  	if is.aLen == -1 {
   953  		if n >= len(is.s) {
   954  			return f, false
   955  		}
   956  		return is.s[n], true
   957  	}
   958  	if int16(n) >= is.aLen {
   959  		return f, false
   960  	}
   961  	return is.a[n], true
   962  }
   963  
   964  func (is *iterStack) reset() {
   965  	if is.aLen == -1 {
   966  		is.s = is.s[:0]
   967  	} else {
   968  		is.aLen = 0
   969  	}
   970  }
   971  
// iterator is responsible for search and traversal within a btree.
type iterator struct {
	// the root node of the B-Tree.
	r *node
	// n and pos make up the current position of the iterator.
	// If valid, n.items[pos] is the current value of the iterator.
	//
	// n is nil whenever i.r is nil. NOTE(review): btree.Iter leaves n unset
	// even when the root is non-nil, so n is also nil on a freshly created
	// iterator until reset (called by first, last and seek) points it at r.
	n   *node
	pos int16
	// cmp dictates the ordering of the FileMetadata.
	cmp func(*FileMetadata, *FileMetadata) int
	// a stack of n's ancestors within the B-Tree, alongside the position
	// taken to arrive at n. If non-empty, the bottommost frame of the stack
	// will always contain the B-Tree root.
	s iterStack
}
   989  
   990  // countLeft returns the count of files that are to the left of the current
   991  // iterator position.
   992  func (i *iterator) countLeft() int {
   993  	if i.r == nil {
   994  		return 0
   995  	}
   996  
   997  	// Each iterator has a stack of frames marking the path from the root node
   998  	// to the current iterator position. All files (n.items) and all subtrees
   999  	// (n.children) with indexes less than [pos] are to the left of the current
  1000  	// iterator position.
  1001  	//
  1002  	//     +------------------------+  -
  1003  	//     |  Root            pos:5 |   |
  1004  	//     +------------------------+   | stack
  1005  	//     |  Root/5          pos:3 |   | frames
  1006  	//     +------------------------+   | [i.s]
  1007  	//     |  Root/5/3        pos:9 |   |
  1008  	//     +========================+  -
  1009  	//     |                        |
  1010  	//     | i.n: Root/5/3/9 i.pos:2|
  1011  	//     +------------------------+
  1012  	//
  1013  	var count int
  1014  	// Walk all the ancestors in the iterator stack [i.s], tallying up all the
  1015  	// files and subtrees to the left of the stack frame's position.
  1016  	f, ok := i.s.nth(0)
  1017  	for fi := 0; ok; fi++ {
  1018  		// There are [f.pos] files contained within [f.n.items] that sort to the
  1019  		// left of the subtree the iterator has descended.
  1020  		count += int(f.pos)
  1021  		// Any subtrees that fall before the stack frame's position are entirely
  1022  		// to the left of the iterator's current position.
  1023  		for j := int16(0); j < f.pos; j++ {
  1024  			count += f.n.children[j].subtreeCount
  1025  		}
  1026  		f, ok = i.s.nth(fi + 1)
  1027  	}
  1028  
  1029  	// The bottommost stack frame is inlined within the iterator struct. Again,
  1030  	// [i.pos] files fall to the left of the current iterator position.
  1031  	count += int(i.pos)
  1032  	if !i.n.leaf {
  1033  		// NB: Unlike above, we use a `<= i.pos` comparison. The iterator is
  1034  		// positioned at item `i.n.items[i.pos]`, which sorts after everything
  1035  		// in the subtree at `i.n.children[i.pos]`.
  1036  		for j := int16(0); j <= i.pos; j++ {
  1037  			count += i.n.children[j].subtreeCount
  1038  		}
  1039  	}
  1040  	return count
  1041  }
  1042  
  1043  func (i *iterator) clone() iterator {
  1044  	c := *i
  1045  	c.s = i.s.clone()
  1046  	return c
  1047  }
  1048  
  1049  func (i *iterator) reset() {
  1050  	i.n = i.r
  1051  	i.pos = -1
  1052  	i.s.reset()
  1053  }
  1054  
  1055  func (i iterator) String() string {
  1056  	var buf bytes.Buffer
  1057  	for n := 0; ; n++ {
  1058  		f, ok := i.s.nth(n)
  1059  		if !ok {
  1060  			break
  1061  		}
  1062  		fmt.Fprintf(&buf, "%p: %02d/%02d\n", f.n, f.pos, f.n.count)
  1063  	}
  1064  	if i.r == nil {
  1065  		fmt.Fprintf(&buf, "<nil>: %02d", i.pos)
  1066  	} else {
  1067  		fmt.Fprintf(&buf, "%p: %02d/%02d", i.n, i.pos, i.n.count)
  1068  	}
  1069  	return buf.String()
  1070  }
  1071  
// cmpIter orders two iterators over the same B-Tree by their position in the
// tree. It returns a negative value if a is positioned before b, zero if
// both are at the same position, and a positive value if a is positioned
// after b. It panics if the iterators do not share the same root node, or if
// the nodes along their paths are found not to match.
func cmpIter(a, b iterator) int {
	if a.r != b.r {
		panic("compared iterators from different btrees")
	}

	// Each iterator has a stack of frames marking the path from the root node
	// to the current iterator position. We walk both paths formed by the
	// iterators' stacks simultaneously, descending from the shared root node,
	// always comparing nodes at the same level in the tree.
	//
	// If the iterators' paths ever diverge and point to different nodes, the
	// iterators are not equal and we use the node positions to evaluate the
	// comparison.
	//
	// If an iterator's stack ends, we stop descending and use its current
	// node and position for the final comparison. One iterator's stack may
	// end before another's if one iterator is positioned deeper in the tree.
	//
	// a                                b
	// +------------------------+      +--------------------------+ -
	// |  Root            pos:5 |   =  |  Root              pos:5 |  |
	// +------------------------+      +--------------------------+  | stack
	// |  Root/5          pos:3 |   =  |  Root/5            pos:3 |  | frames
	// +------------------------+      +--------------------------+  |
	// |  Root/5/3        pos:9 |   >  |  Root/5/3          pos:1 |  |
	// +========================+      +==========================+ -
	// |                        |      |                          |
	// | a.n: Root/5/3/9 a.pos:2|      | b.n: Root/5/3/1, b.pos:5 |
	// +------------------------+      +--------------------------+

	// Initialize with the iterator's current node and position. These are
	// conceptually the most-recent/current frame of the iterator stack.
	an, apos := a.n, a.pos
	bn, bpos := b.n, b.pos

	// aok, bok are set while traversing the iterator's path down the B-Tree.
	// They're declared in the outer scope because they help distinguish the
	// sentinel case when both iterators' first frame points to the last child
	// of the root. If an iterator has no other frames in its stack, it's the
	// end sentinel state which sorts after everything else.
	var aok, bok bool
	for i := 0; ; i++ {
		var af, bf iterFrame
		af, aok = a.s.nth(i)
		bf, bok = b.s.nth(i)
		if !aok || !bok {
			if aok {
				// Iterator a, unlike iterator b, still has a frame. Set an,
				// apos so we compare using the frame from the stack.
				an, apos = af.n, af.pos
			}
			if bok {
				// Iterator b, unlike iterator a, still has a frame. Set bn,
				// bpos so we compare using the frame from the stack.
				bn, bpos = bf.n, bf.pos
			}
			break
		}

		// aok && bok
		if af.n != bf.n {
			panic("nonmatching nodes during btree iterator comparison")
		}
		if v := stdcmp.Compare(af.pos, bf.pos); v != 0 {
			return v
		}
		// Otherwise continue up both iterators' stacks (equivalently, down the
		// B-Tree away from the root).
	}

	// The loop above only breaks once at least one stack has run out of
	// frames, so this check guards against a logic error in the loop itself.
	if aok && bok {
		panic("expected one or more stacks to have been exhausted")
	}
	// At this point an/apos and bn/bpos describe both iterators at the same
	// depth, so they must refer to the same node.
	if an != bn {
		panic("nonmatching nodes during btree iterator comparison")
	}
	if v := stdcmp.Compare(apos, bpos); v != 0 {
		return v
	}
	// Same node and same position at this depth: break the tie by which
	// iterator, if any, continues deeper into the tree.
	switch {
	case aok:
		// a is positioned at a leaf child at this position and b is at an
		// end sentinel state.
		return -1
	case bok:
		// b is positioned at a leaf child at this position and a is at an
		// end sentinel state.
		return +1
	default:
		return 0
	}
}
  1164  
  1165  func (i *iterator) descend(n *node, pos int16) {
  1166  	i.s.push(iterFrame{n: n, pos: pos})
  1167  	i.n = n.children[pos]
  1168  	i.pos = 0
  1169  }
  1170  
  1171  // ascend ascends up to the current node's parent and resets the position
  1172  // to the one previously set for this parent node.
  1173  func (i *iterator) ascend() {
  1174  	f := i.s.pop()
  1175  	i.n = f.n
  1176  	i.pos = f.pos
  1177  }
  1178  
  1179  // seek repositions the iterator over the first file for which fn returns
  1180  // true, mirroring the semantics of the standard library's sort.Search
  1181  // function.  Like sort.Search, seek requires the iterator's B-Tree to be
  1182  // ordered such that fn returns false for some (possibly empty) prefix of the
  1183  // tree's files, and then true for the (possibly empty) remainder.
  1184  func (i *iterator) seek(fn func(*FileMetadata) bool) {
  1185  	i.reset()
  1186  	if i.r == nil {
  1187  		return
  1188  	}
  1189  
  1190  	for {
  1191  		// Logic copied from sort.Search.
  1192  		j, k := 0, int(i.n.count)
  1193  		for j < k {
  1194  			h := int(uint(j+k) >> 1) // avoid overflow when computing h
  1195  
  1196  			// j ≤ h < k
  1197  			if !fn(i.n.items[h]) {
  1198  				j = h + 1 // preserves f(j-1) == false
  1199  			} else {
  1200  				k = h // preserves f(k) == true
  1201  			}
  1202  		}
  1203  
  1204  		i.pos = int16(j)
  1205  		if i.n.leaf {
  1206  			if i.pos == i.n.count {
  1207  				i.next()
  1208  			}
  1209  			return
  1210  		}
  1211  		i.descend(i.n, i.pos)
  1212  	}
  1213  }
  1214  
  1215  // first seeks to the first item in the btree.
  1216  func (i *iterator) first() {
  1217  	i.reset()
  1218  	if i.r == nil {
  1219  		return
  1220  	}
  1221  	for !i.n.leaf {
  1222  		i.descend(i.n, 0)
  1223  	}
  1224  	i.pos = 0
  1225  }
  1226  
  1227  // last seeks to the last item in the btree.
  1228  func (i *iterator) last() {
  1229  	i.reset()
  1230  	if i.r == nil {
  1231  		return
  1232  	}
  1233  	for !i.n.leaf {
  1234  		i.descend(i.n, i.n.count)
  1235  	}
  1236  	i.pos = i.n.count - 1
  1237  }
  1238  
  1239  // next positions the iterator to the item immediately following
  1240  // its current position.
  1241  func (i *iterator) next() {
  1242  	if i.r == nil {
  1243  		return
  1244  	}
  1245  
  1246  	if i.n.leaf {
  1247  		if i.pos < i.n.count {
  1248  			i.pos++
  1249  		}
  1250  		if i.pos < i.n.count {
  1251  			return
  1252  		}
  1253  		for i.s.len() > 0 && i.pos >= i.n.count {
  1254  			i.ascend()
  1255  		}
  1256  		return
  1257  	}
  1258  
  1259  	i.descend(i.n, i.pos+1)
  1260  	for !i.n.leaf {
  1261  		i.descend(i.n, 0)
  1262  	}
  1263  	i.pos = 0
  1264  }
  1265  
  1266  // prev positions the iterator to the item immediately preceding
  1267  // its current position.
  1268  func (i *iterator) prev() {
  1269  	if i.r == nil {
  1270  		return
  1271  	}
  1272  
  1273  	if i.n.leaf {
  1274  		i.pos--
  1275  		if i.pos >= 0 {
  1276  			return
  1277  		}
  1278  		for i.s.len() > 0 && i.pos < 0 {
  1279  			i.ascend()
  1280  			i.pos--
  1281  		}
  1282  		return
  1283  	}
  1284  
  1285  	i.descend(i.n, i.pos)
  1286  	for !i.n.leaf {
  1287  		i.descend(i.n, i.n.count)
  1288  	}
  1289  	i.pos = i.n.count - 1
  1290  }
  1291  
  1292  // valid returns whether the iterator is positioned at a valid position.
  1293  func (i *iterator) valid() bool {
  1294  	return i.r != nil && i.pos >= 0 && i.pos < i.n.count
  1295  }
  1296  
  1297  // cur returns the item at the iterator's current position. It is illegal
  1298  // to call cur if the iterator is not valid.
  1299  func (i *iterator) cur() *FileMetadata {
  1300  	if invariants.Enabled && !i.valid() {
  1301  		panic("btree iterator.cur invoked on invalid iterator")
  1302  	}
  1303  	return i.n.items[i.pos]
  1304  }