github.com/cockroachdb/pebble@v1.1.2/internal/manifest/btree.go (about)

     1  // Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package manifest
     6  
     7  import (
     8  	"bytes"
     9  	"fmt"
    10  	"strings"
    11  	"sync/atomic"
    12  	"unsafe"
    13  
    14  	"github.com/cockroachdb/errors"
    15  	"github.com/cockroachdb/pebble/internal/invariants"
    16  )
    17  
    18  // The Annotator type defined below is used by other packages to lazily
    19  // compute a value over a B-Tree. Each node of the B-Tree stores one
    20  // `annotation` per annotator, containing the result of the computation over
    21  // the node's subtree.
    22  //
    23  // An annotation is marked as valid if it's current with the current subtree
    24  // state. Annotations are marked as invalid whenever a node will be mutated
    25  // (in mut).  Annotators may also return `false` from `Accumulate` to signal
    26  // that a computation for a file is not stable and may change in the future.
    27  // Annotations that include these unstable values are also marked as invalid
    28  // on the node, ensuring that future queries for the annotation will recompute
    29  // the value.
    30  
    31  // An Annotator defines a computation over a level's FileMetadata. If the
    32  // computation is stable and uses inputs that are fixed for the lifetime of
    33  // a FileMetadata, the LevelMetadata's internal data structures are annotated
    34  // with the intermediary computations. This allows the computation to be
    35  // computed incrementally as edits are applied to a level.
    36  type Annotator interface {
    37  	// Zero returns the zero value of an annotation. This value is returned
    38  	// when a LevelMetadata is empty. The dst argument, if non-nil, is an
    39  	// obsolete value previously returned by this Annotator and may be
    40  	// overwritten and reused to avoid a memory allocation.
    41  	Zero(dst interface{}) (v interface{})
    42  
    43  	// Accumulate computes the annotation for a single file in a level's
    44  	// metadata. It merges the file's value into dst and returns a bool flag
    45  	// indicating whether or not the value is stable and okay to cache as an
    46  	// annotation. If the file's value may change over the life of the file,
    47  	// the annotator must return false.
    48  	//
    49  	// Implementations may modify dst and return it to avoid an allocation.
    50  	Accumulate(m *FileMetadata, dst interface{}) (v interface{}, cacheOK bool)
    51  
    52  	// Merge combines two values src and dst, returning the result.
    53  	// Implementations may modify dst and return it to avoid an allocation.
    54  	Merge(src interface{}, dst interface{}) interface{}
    55  }
    56  
    57  type btreeCmp func(*FileMetadata, *FileMetadata) int
    58  
    59  func btreeCmpSeqNum(a, b *FileMetadata) int {
    60  	return a.cmpSeqNum(b)
    61  }
    62  
    63  func btreeCmpSmallestKey(cmp Compare) btreeCmp {
    64  	return func(a, b *FileMetadata) int {
    65  		return a.cmpSmallestKey(b, cmp)
    66  	}
    67  }
    68  
    69  // btreeCmpSpecificOrder is used in tests to construct a B-Tree with a
    70  // specific ordering of FileMetadata within the tree. It's typically used to
    71  // test consistency checking code that needs to construct a malformed B-Tree.
    72  func btreeCmpSpecificOrder(files []*FileMetadata) btreeCmp {
    73  	m := map[*FileMetadata]int{}
    74  	for i, f := range files {
    75  		m[f] = i
    76  	}
    77  	return func(a, b *FileMetadata) int {
    78  		ai, aok := m[a]
    79  		bi, bok := m[b]
    80  		if !aok || !bok {
    81  			panic("btreeCmpSliceOrder called with unknown files")
    82  		}
    83  		switch {
    84  		case ai < bi:
    85  			return -1
    86  		case ai > bi:
    87  			return +1
    88  		default:
    89  			return 0
    90  		}
    91  	}
    92  }
    93  
    94  const (
    95  	degree   = 16
    96  	maxItems = 2*degree - 1
    97  	minItems = degree - 1
    98  )
    99  
   100  type annotation struct {
   101  	annotator Annotator
   102  	// v is an annotation value, the output of either
   103  	// annotator.Value or annotator.Merge.
   104  	v interface{}
   105  	// valid indicates whether future reads of the annotation may use v as-is.
   106  	// If false, v will be zeroed and recalculated.
   107  	valid bool
   108  }
   109  
   110  type leafNode struct {
   111  	ref   atomic.Int32
   112  	count int16
   113  	leaf  bool
   114  	// subtreeCount holds the count of files in the entire subtree formed by
   115  	// this node. For leaf nodes, subtreeCount is always equal to count. For
   116  	// non-leaf nodes, it's the sum of count plus all the children's
   117  	// subtreeCounts.
   118  	//
   119  	// NB: We could move this field to the end of the node struct, since leaf =>
   120  	// count=subtreeCount, however the unsafe casting [leafToNode] performs make
   121  	// it risky and cumbersome.
   122  	subtreeCount int
   123  	items        [maxItems]*FileMetadata
   124  	// annot contains one annotation per annotator, merged over the entire
   125  	// node's files (and all descendants for non-leaf nodes).
   126  	annot []annotation
   127  }
   128  
   129  type node struct {
   130  	leafNode
   131  	children [maxItems + 1]*node
   132  }
   133  
   134  //go:nocheckptr casts a ptr to a smaller struct to a ptr to a larger struct.
   135  func leafToNode(ln *leafNode) *node {
   136  	return (*node)(unsafe.Pointer(ln))
   137  }
   138  
   139  func newLeafNode() *node {
   140  	n := leafToNode(new(leafNode))
   141  	n.leaf = true
   142  	n.ref.Store(1)
   143  	return n
   144  }
   145  
   146  func newNode() *node {
   147  	n := new(node)
   148  	n.ref.Store(1)
   149  	return n
   150  }
   151  
   152  // mut creates and returns a mutable node reference. If the node is not shared
   153  // with any other trees then it can be modified in place. Otherwise, it must be
   154  // cloned to ensure unique ownership. In this way, we enforce a copy-on-write
   155  // policy which transparently incorporates the idea of local mutations, like
   156  // Clojure's transients or Haskell's ST monad, where nodes are only copied
   157  // during the first time that they are modified between Clone operations.
   158  //
   159  // When a node is cloned, the provided pointer will be redirected to the new
   160  // mutable node.
   161  func mut(n **node) *node {
   162  	if (*n).ref.Load() == 1 {
   163  		// Exclusive ownership. Can mutate in place.
   164  
   165  		// Whenever a node will be mutated, reset its annotations to be marked
   166  		// as uncached. This ensures any future calls to (*node).annotation
   167  		// will recompute annotations on the modified subtree.
   168  		for i := range (*n).annot {
   169  			(*n).annot[i].valid = false
   170  		}
   171  		return *n
   172  	}
   173  	// If we do not have unique ownership over the node then we
   174  	// clone it to gain unique ownership. After doing so, we can
   175  	// release our reference to the old node. We pass recursive
   176  	// as true because even though we just observed the node's
   177  	// reference count to be greater than 1, we might be racing
   178  	// with another call to decRef on this node.
   179  	c := (*n).clone()
   180  	(*n).decRef(true /* contentsToo */, nil)
   181  	*n = c
   182  	// NB: We don't need to clear annotations, because (*node).clone does not
   183  	// copy them.
   184  	return *n
   185  }
   186  
   187  // incRef acquires a reference to the node.
   188  func (n *node) incRef() {
   189  	n.ref.Add(1)
   190  }
   191  
   192  // decRef releases a reference to the node. If requested, the method will unref
   193  // its items and recurse into child nodes and decrease their refcounts as well.
   194  // Some internal codepaths that manually copy the node's items or children to
   195  // new nodes pass contentsToo=false to preserve existing reference counts during
   196  // operations that should yield a net-zero change to descendant refcounts.
   197  // When a node is released, its contained files are dereferenced.
   198  func (n *node) decRef(contentsToo bool, obsolete *[]*FileBacking) {
   199  	if n.ref.Add(-1) > 0 {
   200  		// Other references remain. Can't free.
   201  		return
   202  	}
   203  
   204  	// Dereference the node's metadata and release child references if
   205  	// requested. Some internal callers may not want to propagate the deref
   206  	// because they're manually copying the filemetadata and children to other
   207  	// nodes, and they want to preserve the existing reference count.
   208  	if contentsToo {
   209  		for _, f := range n.items[:n.count] {
   210  			if f.Unref() == 0 {
   211  				// There are two sources of node dereferences: tree mutations
   212  				// and Version dereferences. Files should only be made obsolete
   213  				// during Version dereferences, during which `obsolete` will be
   214  				// non-nil.
   215  				if obsolete == nil {
   216  					panic(fmt.Sprintf("file metadata %s dereferenced to zero during tree mutation", f.FileNum))
   217  				}
   218  				// Reference counting is performed on the FileBacking. In the case
   219  				// of a virtual sstable, this reference counting is performed on
   220  				// a FileBacking which is shared by every single virtual sstable
   221  				// with the same backing sstable. If the reference count hits 0,
   222  				// then we know that the FileBacking won't be required by any
   223  				// sstable in Pebble, and that the backing sstable can be deleted.
   224  				*obsolete = append(*obsolete, f.FileBacking)
   225  			}
   226  		}
   227  		if !n.leaf {
   228  			for i := int16(0); i <= n.count; i++ {
   229  				n.children[i].decRef(true /* contentsToo */, obsolete)
   230  			}
   231  		}
   232  	}
   233  }
   234  
   235  // clone creates a clone of the receiver with a single reference count.
   236  func (n *node) clone() *node {
   237  	var c *node
   238  	if n.leaf {
   239  		c = newLeafNode()
   240  	} else {
   241  		c = newNode()
   242  	}
   243  	// NB: copy field-by-field without touching n.ref to avoid
   244  	// triggering the race detector and looking like a data race.
   245  	c.count = n.count
   246  	c.items = n.items
   247  	c.subtreeCount = n.subtreeCount
   248  	// Increase the refcount of each contained item.
   249  	for _, f := range n.items[:n.count] {
   250  		f.Ref()
   251  	}
   252  	if !c.leaf {
   253  		// Copy children and increase each refcount.
   254  		c.children = n.children
   255  		for i := int16(0); i <= c.count; i++ {
   256  			c.children[i].incRef()
   257  		}
   258  	}
   259  	return c
   260  }
   261  
   262  // insertAt inserts the provided file and node at the provided index. This
   263  // function is for use only as a helper function for internal B-Tree code.
   264  // Clients should not invoke it directly.
   265  func (n *node) insertAt(index int, item *FileMetadata, nd *node) {
   266  	if index < int(n.count) {
   267  		copy(n.items[index+1:n.count+1], n.items[index:n.count])
   268  		if !n.leaf {
   269  			copy(n.children[index+2:n.count+2], n.children[index+1:n.count+1])
   270  		}
   271  	}
   272  	n.items[index] = item
   273  	if !n.leaf {
   274  		n.children[index+1] = nd
   275  	}
   276  	n.count++
   277  }
   278  
   279  // pushBack inserts the provided file and node at the tail of the node's items.
   280  // This function is for use only as a helper function for internal B-Tree code.
   281  // Clients should not invoke it directly.
   282  func (n *node) pushBack(item *FileMetadata, nd *node) {
   283  	n.items[n.count] = item
   284  	if !n.leaf {
   285  		n.children[n.count+1] = nd
   286  	}
   287  	n.count++
   288  }
   289  
   290  // pushFront inserts the provided file and node at the head of the
   291  // node's items. This function is for use only as a helper function for internal B-Tree
   292  // code. Clients should not invoke it directly.
   293  func (n *node) pushFront(item *FileMetadata, nd *node) {
   294  	if !n.leaf {
   295  		copy(n.children[1:n.count+2], n.children[:n.count+1])
   296  		n.children[0] = nd
   297  	}
   298  	copy(n.items[1:n.count+1], n.items[:n.count])
   299  	n.items[0] = item
   300  	n.count++
   301  }
   302  
   303  // removeAt removes a value at a given index, pulling all subsequent values
   304  // back. This function is for use only as a helper function for internal B-Tree
   305  // code. Clients should not invoke it directly.
   306  func (n *node) removeAt(index int) (*FileMetadata, *node) {
   307  	var child *node
   308  	if !n.leaf {
   309  		child = n.children[index+1]
   310  		copy(n.children[index+1:n.count], n.children[index+2:n.count+1])
   311  		n.children[n.count] = nil
   312  	}
   313  	n.count--
   314  	out := n.items[index]
   315  	copy(n.items[index:n.count], n.items[index+1:n.count+1])
   316  	n.items[n.count] = nil
   317  	return out, child
   318  }
   319  
   320  // popBack removes and returns the last element in the list. This function is
   321  // for use only as a helper function for internal B-Tree code. Clients should
   322  // not invoke it directly.
   323  func (n *node) popBack() (*FileMetadata, *node) {
   324  	n.count--
   325  	out := n.items[n.count]
   326  	n.items[n.count] = nil
   327  	if n.leaf {
   328  		return out, nil
   329  	}
   330  	child := n.children[n.count+1]
   331  	n.children[n.count+1] = nil
   332  	return out, child
   333  }
   334  
   335  // popFront removes and returns the first element in the list. This function is
   336  // for use only as a helper function for internal B-Tree code. Clients should
   337  // not invoke it directly.
   338  func (n *node) popFront() (*FileMetadata, *node) {
   339  	n.count--
   340  	var child *node
   341  	if !n.leaf {
   342  		child = n.children[0]
   343  		copy(n.children[:n.count+1], n.children[1:n.count+2])
   344  		n.children[n.count+1] = nil
   345  	}
   346  	out := n.items[0]
   347  	copy(n.items[:n.count], n.items[1:n.count+1])
   348  	n.items[n.count] = nil
   349  	return out, child
   350  }
   351  
   352  // find returns the index where the given item should be inserted into this
   353  // list. 'found' is true if the item already exists in the list at the given
   354  // index.
   355  //
   356  // This function is for use only as a helper function for internal B-Tree code.
   357  // Clients should not invoke it directly.
   358  func (n *node) find(cmp btreeCmp, item *FileMetadata) (index int, found bool) {
   359  	// Logic copied from sort.Search. Inlining this gave
   360  	// an 11% speedup on BenchmarkBTreeDeleteInsert.
   361  	i, j := 0, int(n.count)
   362  	for i < j {
   363  		h := int(uint(i+j) >> 1) // avoid overflow when computing h
   364  		// i ≤ h < j
   365  		v := cmp(item, n.items[h])
   366  		if v == 0 {
   367  			return h, true
   368  		} else if v > 0 {
   369  			i = h + 1
   370  		} else {
   371  			j = h
   372  		}
   373  	}
   374  	return i, false
   375  }
   376  
   377  // split splits the given node at the given index. The current node shrinks,
   378  // and this function returns the item that existed at that index and a new
   379  // node containing all items/children after it.
   380  //
   381  // split is called when we want to perform a transformation like the one
   382  // depicted in the following diagram.
   383  //
   384  //	Before:
   385  //	                       +-----------+
   386  //	             n *node   |   x y z   |
   387  //	                       +--/-/-\-\--+
   388  //
   389  //	After:
   390  //	                       +-----------+
   391  //	                       |     y     |  n's parent
   392  //	                       +----/-\----+
   393  //	                           /   \
   394  //	                          v     v
   395  //	              +-----------+     +-----------+
   396  //	      n *node |         x |     | z         | next *node
   397  //	              +-----------+     +-----------+
   398  //
   399  // split does not perform the complete transformation; the caller is responsible
   400  // for updating the parent appropriately. split splits `n` into two nodes, `n`
   401  // and `next`, returning `next` and the file that separates them. In the diagram
   402  // above, `n.split` removes y and z from `n`, returning y in the first return
   403  // value and `next` in the second return value. The caller is responsible for
   404  // updating n's parent to now contain `y` as the separator between nodes `n` and
   405  // `next`.
   406  //
   407  // This function is for use only as a helper function for internal B-Tree code.
   408  // Clients should not invoke it directly.
   409  func (n *node) split(i int) (*FileMetadata, *node) {
   410  	out := n.items[i]
   411  	var next *node
   412  	if n.leaf {
   413  		next = newLeafNode()
   414  	} else {
   415  		next = newNode()
   416  	}
   417  	next.count = n.count - int16(i+1)
   418  	copy(next.items[:], n.items[i+1:n.count])
   419  	for j := int16(i); j < n.count; j++ {
   420  		n.items[j] = nil
   421  	}
   422  	if !n.leaf {
   423  		copy(next.children[:], n.children[i+1:n.count+1])
   424  		descendantsMoved := 0
   425  		for j := int16(i + 1); j <= n.count; j++ {
   426  			descendantsMoved += n.children[j].subtreeCount
   427  			n.children[j] = nil
   428  		}
   429  		n.subtreeCount -= descendantsMoved
   430  		next.subtreeCount += descendantsMoved
   431  	}
   432  	n.count = int16(i)
   433  	// NB: We subtract one more than `next.count` from n's subtreeCount because
   434  	// the item at index `i` was removed from `n.items`. We'll return the item
   435  	// at index `i`, and the caller is responsible for updating the subtree
   436  	// count of whichever node adopts it.
   437  	n.subtreeCount -= int(next.count) + 1
   438  	next.subtreeCount += int(next.count)
   439  	return out, next
   440  }
   441  
   442  // Insert inserts a item into the subtree rooted at this node, making sure no
   443  // nodes in the subtree exceed maxItems items.
   444  func (n *node) Insert(cmp btreeCmp, item *FileMetadata) error {
   445  	i, found := n.find(cmp, item)
   446  	if found {
   447  		// cmp provides a total ordering of the files within a level.
   448  		// If we're inserting a metadata that's equal to an existing item
   449  		// in the tree, we're inserting a file into a level twice.
   450  		return errors.Errorf("files %s and %s collided on sort keys",
   451  			errors.Safe(item.FileNum), errors.Safe(n.items[i].FileNum))
   452  	}
   453  	if n.leaf {
   454  		n.insertAt(i, item, nil)
   455  		n.subtreeCount++
   456  		return nil
   457  	}
   458  	if n.children[i].count >= maxItems {
   459  		splitLa, splitNode := mut(&n.children[i]).split(maxItems / 2)
   460  		n.insertAt(i, splitLa, splitNode)
   461  
   462  		switch cmp := cmp(item, n.items[i]); {
   463  		case cmp < 0:
   464  			// no change, we want first split node
   465  		case cmp > 0:
   466  			i++ // we want second split node
   467  		default:
   468  			// cmp provides a total ordering of the files within a level.
   469  			// If we're inserting a metadata that's equal to an existing item
   470  			// in the tree, we're inserting a file into a level twice.
   471  			return errors.Errorf("files %s and %s collided on sort keys",
   472  				errors.Safe(item.FileNum), errors.Safe(n.items[i].FileNum))
   473  		}
   474  	}
   475  
   476  	err := mut(&n.children[i]).Insert(cmp, item)
   477  	if err == nil {
   478  		n.subtreeCount++
   479  	}
   480  	return err
   481  }
   482  
   483  // removeMax removes and returns the maximum item from the subtree rooted at
   484  // this node. This function is for use only as a helper function for internal
   485  // B-Tree code. Clients should not invoke it directly.
   486  func (n *node) removeMax() *FileMetadata {
   487  	if n.leaf {
   488  		n.count--
   489  		n.subtreeCount--
   490  		out := n.items[n.count]
   491  		n.items[n.count] = nil
   492  		return out
   493  	}
   494  	child := mut(&n.children[n.count])
   495  	if child.count <= minItems {
   496  		n.rebalanceOrMerge(int(n.count))
   497  		return n.removeMax()
   498  	}
   499  	n.subtreeCount--
   500  	return child.removeMax()
   501  }
   502  
   503  // Remove removes a item from the subtree rooted at this node. Returns
   504  // the item that was removed or nil if no matching item was found.
   505  func (n *node) Remove(cmp btreeCmp, item *FileMetadata) (out *FileMetadata) {
   506  	i, found := n.find(cmp, item)
   507  	if n.leaf {
   508  		if found {
   509  			out, _ = n.removeAt(i)
   510  			n.subtreeCount--
   511  			return out
   512  		}
   513  		return nil
   514  	}
   515  	if n.children[i].count <= minItems {
   516  		// Child not large enough to remove from.
   517  		n.rebalanceOrMerge(i)
   518  		return n.Remove(cmp, item)
   519  	}
   520  	child := mut(&n.children[i])
   521  	if found {
   522  		// Replace the item being removed with the max item in our left child.
   523  		out = n.items[i]
   524  		n.items[i] = child.removeMax()
   525  		n.subtreeCount--
   526  		return out
   527  	}
   528  	// File is not in this node and child is large enough to remove from.
   529  	out = child.Remove(cmp, item)
   530  	if out != nil {
   531  		n.subtreeCount--
   532  	}
   533  	return out
   534  }
   535  
   536  // rebalanceOrMerge grows child 'i' to ensure it has sufficient room to remove a
   537  // item from it while keeping it at or above minItems. This function is for use
   538  // only as a helper function for internal B-Tree code. Clients should not invoke
   539  // it directly.
   540  func (n *node) rebalanceOrMerge(i int) {
   541  	switch {
   542  	case i > 0 && n.children[i-1].count > minItems:
   543  		// Rebalance from left sibling.
   544  		//
   545  		//          +-----------+
   546  		//          |     y     |
   547  		//          +----/-\----+
   548  		//              /   \
   549  		//             v     v
   550  		// +-----------+     +-----------+
   551  		// |         x |     |           |
   552  		// +----------\+     +-----------+
   553  		//             \
   554  		//              v
   555  		//              a
   556  		//
   557  		// After:
   558  		//
   559  		//          +-----------+
   560  		//          |     x     |
   561  		//          +----/-\----+
   562  		//              /   \
   563  		//             v     v
   564  		// +-----------+     +-----------+
   565  		// |           |     | y         |
   566  		// +-----------+     +/----------+
   567  		//                   /
   568  		//                  v
   569  		//                  a
   570  		//
   571  		left := mut(&n.children[i-1])
   572  		child := mut(&n.children[i])
   573  		xLa, grandChild := left.popBack()
   574  		yLa := n.items[i-1]
   575  		child.pushFront(yLa, grandChild)
   576  		n.items[i-1] = xLa
   577  		child.subtreeCount++
   578  		left.subtreeCount--
   579  		if grandChild != nil {
   580  			child.subtreeCount += grandChild.subtreeCount
   581  			left.subtreeCount -= grandChild.subtreeCount
   582  		}
   583  
   584  	case i < int(n.count) && n.children[i+1].count > minItems:
   585  		// Rebalance from right sibling.
   586  		//
   587  		//          +-----------+
   588  		//          |     y     |
   589  		//          +----/-\----+
   590  		//              /   \
   591  		//             v     v
   592  		// +-----------+     +-----------+
   593  		// |           |     | x         |
   594  		// +-----------+     +/----------+
   595  		//                   /
   596  		//                  v
   597  		//                  a
   598  		//
   599  		// After:
   600  		//
   601  		//          +-----------+
   602  		//          |     x     |
   603  		//          +----/-\----+
   604  		//              /   \
   605  		//             v     v
   606  		// +-----------+     +-----------+
   607  		// |         y |     |           |
   608  		// +----------\+     +-----------+
   609  		//             \
   610  		//              v
   611  		//              a
   612  		//
   613  		right := mut(&n.children[i+1])
   614  		child := mut(&n.children[i])
   615  		xLa, grandChild := right.popFront()
   616  		yLa := n.items[i]
   617  		child.pushBack(yLa, grandChild)
   618  		child.subtreeCount++
   619  		right.subtreeCount--
   620  		if grandChild != nil {
   621  			child.subtreeCount += grandChild.subtreeCount
   622  			right.subtreeCount -= grandChild.subtreeCount
   623  		}
   624  		n.items[i] = xLa
   625  
   626  	default:
   627  		// Merge with either the left or right sibling.
   628  		//
   629  		//          +-----------+
   630  		//          |   u y v   |
   631  		//          +----/-\----+
   632  		//              /   \
   633  		//             v     v
   634  		// +-----------+     +-----------+
   635  		// |         x |     | z         |
   636  		// +-----------+     +-----------+
   637  		//
   638  		// After:
   639  		//
   640  		//          +-----------+
   641  		//          |    u v    |
   642  		//          +-----|-----+
   643  		//                |
   644  		//                v
   645  		//          +-----------+
   646  		//          |   x y z   |
   647  		//          +-----------+
   648  		//
   649  		if i >= int(n.count) {
   650  			i = int(n.count - 1)
   651  		}
   652  		child := mut(&n.children[i])
   653  		// Make mergeChild mutable, bumping the refcounts on its children if necessary.
   654  		_ = mut(&n.children[i+1])
   655  		mergeLa, mergeChild := n.removeAt(i)
   656  		child.items[child.count] = mergeLa
   657  		copy(child.items[child.count+1:], mergeChild.items[:mergeChild.count])
   658  		if !child.leaf {
   659  			copy(child.children[child.count+1:], mergeChild.children[:mergeChild.count+1])
   660  		}
   661  		child.count += mergeChild.count + 1
   662  		child.subtreeCount += mergeChild.subtreeCount + 1
   663  
   664  		mergeChild.decRef(false /* contentsToo */, nil)
   665  	}
   666  }
   667  
   668  // InvalidateAnnotation removes any existing cached annotations for the provided
   669  // annotator from this node's subtree.
   670  func (n *node) InvalidateAnnotation(a Annotator) {
   671  	// Find this annotator's annotation on this node.
   672  	var annot *annotation
   673  	for i := range n.annot {
   674  		if n.annot[i].annotator == a {
   675  			annot = &n.annot[i]
   676  		}
   677  	}
   678  
   679  	if annot != nil && annot.valid {
   680  		annot.valid = false
   681  		annot.v = a.Zero(annot.v)
   682  	}
   683  	if !n.leaf {
   684  		for i := int16(0); i <= n.count; i++ {
   685  			n.children[i].InvalidateAnnotation(a)
   686  		}
   687  	}
   688  }
   689  
   690  // Annotation retrieves, computing if not already computed, the provided
   691  // annotator's annotation of this node. The second return value indicates
   692  // whether the future reads of this annotation may use the first return value
   693  // as-is. If false, the annotation is not stable and may change on a subsequent
   694  // computation.
   695  func (n *node) Annotation(a Annotator) (interface{}, bool) {
   696  	// Find this annotator's annotation on this node.
   697  	var annot *annotation
   698  	for i := range n.annot {
   699  		if n.annot[i].annotator == a {
   700  			annot = &n.annot[i]
   701  		}
   702  	}
   703  
   704  	// If it exists and is marked as valid, we can return it without
   705  	// recomputing anything.
   706  	if annot != nil && annot.valid {
   707  		return annot.v, true
   708  	}
   709  
   710  	if annot == nil {
   711  		// This is n's first time being annotated by a.
   712  		// Create a new zeroed annotation.
   713  		n.annot = append(n.annot, annotation{
   714  			annotator: a,
   715  			v:         a.Zero(nil),
   716  		})
   717  		annot = &n.annot[len(n.annot)-1]
   718  	} else {
   719  		// There's an existing annotation that must be recomputed.
   720  		// Zero its value.
   721  		annot.v = a.Zero(annot.v)
   722  	}
   723  
   724  	annot.valid = true
   725  	for i := int16(0); i <= n.count; i++ {
   726  		if !n.leaf {
   727  			v, ok := n.children[i].Annotation(a)
   728  			annot.v = a.Merge(v, annot.v)
   729  			annot.valid = annot.valid && ok
   730  		}
   731  		if i < n.count {
   732  			v, ok := a.Accumulate(n.items[i], annot.v)
   733  			annot.v = v
   734  			annot.valid = annot.valid && ok
   735  		}
   736  	}
   737  	return annot.v, annot.valid
   738  }
   739  
   740  func (n *node) verifyInvariants() {
   741  	recomputedSubtreeCount := int(n.count)
   742  	if !n.leaf {
   743  		for i := int16(0); i <= n.count; i++ {
   744  			n.children[i].verifyInvariants()
   745  			recomputedSubtreeCount += n.children[i].subtreeCount
   746  		}
   747  	}
   748  	if recomputedSubtreeCount != n.subtreeCount {
   749  		panic(fmt.Sprintf("recomputed subtree count (%d) ≠ n.subtreeCount (%d)",
   750  			recomputedSubtreeCount, n.subtreeCount))
   751  	}
   752  }
   753  
   754  // btree is an implementation of a B-Tree.
   755  //
   756  // btree stores FileMetadata in an ordered structure, allowing easy insertion,
   757  // removal, and iteration. The B-Tree stores items in order based on cmp. The
   758  // first level of the LSM uses a cmp function that compares sequence numbers.
   759  // All other levels compare using the FileMetadata.Smallest.
   760  //
   761  // Write operations are not safe for concurrent mutation by multiple
   762  // goroutines, but Read operations are.
   763  type btree struct {
   764  	root *node
   765  	cmp  btreeCmp
   766  }
   767  
   768  // Release dereferences and clears the root node of the btree, removing all
   769  // items from the btree. In doing so, it decrements contained file counts.
   770  // It returns a slice of newly obsolete backing files, if any.
   771  func (t *btree) Release() (obsolete []*FileBacking) {
   772  	if t.root != nil {
   773  		t.root.decRef(true /* contentsToo */, &obsolete)
   774  		t.root = nil
   775  	}
   776  	return obsolete
   777  }
   778  
   779  // Clone clones the btree, lazily. It does so in constant time.
   780  func (t *btree) Clone() btree {
   781  	c := *t
   782  	if c.root != nil {
   783  		// Incrementing the reference count on the root node is sufficient to
   784  		// ensure that no node in the cloned tree can be mutated by an actor
   785  		// holding a reference to the original tree and vice versa. This
   786  		// property is upheld because the root node in the receiver btree and
   787  		// the returned btree will both necessarily have a reference count of at
   788  		// least 2 when this method returns. All tree mutations recursively
   789  		// acquire mutable node references (see mut) as they traverse down the
   790  		// tree. The act of acquiring a mutable node reference performs a clone
   791  		// if a node's reference count is greater than one. Cloning a node (see
   792  		// clone) increases the reference count on each of its children,
   793  		// ensuring that they have a reference count of at least 2. This, in
   794  		// turn, ensures that any of the child nodes that are modified will also
   795  		// be copied-on-write, recursively ensuring the immutability property
   796  		// over the entire tree.
   797  		c.root.incRef()
   798  	}
   799  	return c
   800  }
   801  
   802  // Delete removes the provided file from the tree.
   803  // It returns true if the file now has a zero reference count.
   804  func (t *btree) Delete(item *FileMetadata) (obsolete bool) {
   805  	if t.root == nil || t.root.count == 0 {
   806  		return false
   807  	}
   808  	if out := mut(&t.root).Remove(t.cmp, item); out != nil {
   809  		obsolete = out.Unref() == 0
   810  	}
   811  	if invariants.Enabled {
   812  		t.root.verifyInvariants()
   813  	}
   814  	if t.root.count == 0 {
   815  		old := t.root
   816  		if t.root.leaf {
   817  			t.root = nil
   818  		} else {
   819  			t.root = t.root.children[0]
   820  		}
   821  		old.decRef(false /* contentsToo */, nil)
   822  	}
   823  	return obsolete
   824  }
   825  
   826  // Insert adds the given item to the tree. If a item in the tree already
   827  // equals the given one, Insert panics.
   828  func (t *btree) Insert(item *FileMetadata) error {
   829  	if t.root == nil {
   830  		t.root = newLeafNode()
   831  	} else if t.root.count >= maxItems {
   832  		splitLa, splitNode := mut(&t.root).split(maxItems / 2)
   833  		newRoot := newNode()
   834  		newRoot.count = 1
   835  		newRoot.items[0] = splitLa
   836  		newRoot.children[0] = t.root
   837  		newRoot.children[1] = splitNode
   838  		newRoot.subtreeCount = t.root.subtreeCount + splitNode.subtreeCount + 1
   839  		t.root = newRoot
   840  	}
   841  	item.Ref()
   842  	err := mut(&t.root).Insert(t.cmp, item)
   843  	if invariants.Enabled {
   844  		t.root.verifyInvariants()
   845  	}
   846  	return err
   847  }
   848  
   849  // Iter returns a new iterator object. It is not safe to continue using an
   850  // iterator after modifications are made to the tree. If modifications are made,
   851  // create a new iterator.
   852  func (t *btree) Iter() iterator {
   853  	return iterator{r: t.root, pos: -1, cmp: t.cmp}
   854  }
   855  
   856  // Count returns the number of files contained within the B-Tree.
   857  func (t *btree) Count() int {
   858  	if t.root == nil {
   859  		return 0
   860  	}
   861  	return t.root.subtreeCount
   862  }
   863  
   864  // String returns a string description of the tree. The format is
   865  // similar to the https://en.wikipedia.org/wiki/Newick_format.
   866  func (t *btree) String() string {
   867  	if t.Count() == 0 {
   868  		return ";"
   869  	}
   870  	var b strings.Builder
   871  	t.root.writeString(&b)
   872  	return b.String()
   873  }
   874  
   875  func (n *node) writeString(b *strings.Builder) {
   876  	if n.leaf {
   877  		for i := int16(0); i < n.count; i++ {
   878  			if i != 0 {
   879  				b.WriteString(",")
   880  			}
   881  			b.WriteString(n.items[i].String())
   882  		}
   883  		return
   884  	}
   885  	for i := int16(0); i <= n.count; i++ {
   886  		b.WriteString("(")
   887  		n.children[i].writeString(b)
   888  		b.WriteString(")")
   889  		if i < n.count {
   890  			b.WriteString(n.items[i].String())
   891  		}
   892  	}
   893  }
   894  
   895  // iterStack represents a stack of (node, pos) tuples, which captures
   896  // iteration state as an iterator descends a btree.
   897  type iterStack struct {
   898  	// a contains aLen stack frames when an iterator stack is short enough.
   899  	// If the iterator stack overflows the capacity of iterStackArr, the stack
   900  	// is moved to s and aLen is set to -1.
   901  	a    iterStackArr
   902  	aLen int16 // -1 when using s
   903  	s    []iterFrame
   904  }
   905  
   906  // Used to avoid allocations for stacks below a certain size.
   907  type iterStackArr [3]iterFrame
   908  
   909  type iterFrame struct {
   910  	n   *node
   911  	pos int16
   912  }
   913  
   914  func (is *iterStack) push(f iterFrame) {
   915  	if is.aLen == -1 {
   916  		is.s = append(is.s, f)
   917  	} else if int(is.aLen) == len(is.a) {
   918  		is.s = make([]iterFrame, int(is.aLen)+1, 2*int(is.aLen))
   919  		copy(is.s, is.a[:])
   920  		is.s[int(is.aLen)] = f
   921  		is.aLen = -1
   922  	} else {
   923  		is.a[is.aLen] = f
   924  		is.aLen++
   925  	}
   926  }
   927  
   928  func (is *iterStack) pop() iterFrame {
   929  	if is.aLen == -1 {
   930  		f := is.s[len(is.s)-1]
   931  		is.s = is.s[:len(is.s)-1]
   932  		return f
   933  	}
   934  	is.aLen--
   935  	return is.a[is.aLen]
   936  }
   937  
   938  func (is *iterStack) len() int {
   939  	if is.aLen == -1 {
   940  		return len(is.s)
   941  	}
   942  	return int(is.aLen)
   943  }
   944  
   945  func (is *iterStack) clone() iterStack {
   946  	// If the iterator is using the embedded iterStackArr, we only need to
   947  	// copy the struct itself.
   948  	if is.s == nil {
   949  		return *is
   950  	}
   951  	clone := *is
   952  	clone.s = make([]iterFrame, len(is.s))
   953  	copy(clone.s, is.s)
   954  	return clone
   955  }
   956  
   957  func (is *iterStack) nth(n int) (f iterFrame, ok bool) {
   958  	if is.aLen == -1 {
   959  		if n >= len(is.s) {
   960  			return f, false
   961  		}
   962  		return is.s[n], true
   963  	}
   964  	if int16(n) >= is.aLen {
   965  		return f, false
   966  	}
   967  	return is.a[n], true
   968  }
   969  
   970  func (is *iterStack) reset() {
   971  	if is.aLen == -1 {
   972  		is.s = is.s[:0]
   973  	} else {
   974  		is.aLen = 0
   975  	}
   976  }
   977  
   978  // iterator is responsible for search and traversal within a btree.
   979  type iterator struct {
   980  	// the root node of the B-Tree.
   981  	r *node
   982  	// n and pos make up the current position of the iterator.
   983  	// If valid, n.items[pos] is the current value of the iterator.
   984  	//
   985  	// n may be nil iff i.r is nil.
   986  	n   *node
   987  	pos int16
   988  	// cmp dictates the ordering of the FileMetadata.
   989  	cmp func(*FileMetadata, *FileMetadata) int
   990  	// a stack of n's ancestors within the B-Tree, alongside the position
   991  	// taken to arrive at n. If non-empty, the bottommost frame of the stack
   992  	// will always contain the B-Tree root.
   993  	s iterStack
   994  }
   995  
   996  // countLeft returns the count of files that are to the left of the current
   997  // iterator position.
   998  func (i *iterator) countLeft() int {
   999  	if i.r == nil {
  1000  		return 0
  1001  	}
  1002  
  1003  	// Each iterator has a stack of frames marking the path from the root node
  1004  	// to the current iterator position. All files (n.items) and all subtrees
  1005  	// (n.children) with indexes less than [pos] are to the left of the current
  1006  	// iterator position.
  1007  	//
  1008  	//     +------------------------+  -
  1009  	//     |  Root            pos:5 |   |
  1010  	//     +------------------------+   | stack
  1011  	//     |  Root/5          pos:3 |   | frames
  1012  	//     +------------------------+   | [i.s]
  1013  	//     |  Root/5/3        pos:9 |   |
  1014  	//     +========================+  -
  1015  	//     |                        |
  1016  	//     | i.n: Root/5/3/9 i.pos:2|
  1017  	//     +------------------------+
  1018  	//
  1019  	var count int
  1020  	// Walk all the ancestors in the iterator stack [i.s], tallying up all the
  1021  	// files and subtrees to the left of the stack frame's position.
  1022  	f, ok := i.s.nth(0)
  1023  	for fi := 0; ok; fi++ {
  1024  		// There are [f.pos] files contained within [f.n.items] that sort to the
  1025  		// left of the subtree the iterator has descended.
  1026  		count += int(f.pos)
  1027  		// Any subtrees that fall before the stack frame's position are entirely
  1028  		// to the left of the iterator's current position.
  1029  		for j := int16(0); j < f.pos; j++ {
  1030  			count += f.n.children[j].subtreeCount
  1031  		}
  1032  		f, ok = i.s.nth(fi + 1)
  1033  	}
  1034  
  1035  	// The bottommost stack frame is inlined within the iterator struct. Again,
  1036  	// [i.pos] files fall to the left of the current iterator position.
  1037  	count += int(i.pos)
  1038  	if !i.n.leaf {
  1039  		// NB: Unlike above, we use a `<= i.pos` comparison. The iterator is
  1040  		// positioned at item `i.n.items[i.pos]`, which sorts after everything
  1041  		// in the subtree at `i.n.children[i.pos]`.
  1042  		for j := int16(0); j <= i.pos; j++ {
  1043  			count += i.n.children[j].subtreeCount
  1044  		}
  1045  	}
  1046  	return count
  1047  }
  1048  
  1049  func (i *iterator) clone() iterator {
  1050  	c := *i
  1051  	c.s = i.s.clone()
  1052  	return c
  1053  }
  1054  
  1055  func (i *iterator) reset() {
  1056  	i.n = i.r
  1057  	i.pos = -1
  1058  	i.s.reset()
  1059  }
  1060  
  1061  func (i iterator) String() string {
  1062  	var buf bytes.Buffer
  1063  	for n := 0; ; n++ {
  1064  		f, ok := i.s.nth(n)
  1065  		if !ok {
  1066  			break
  1067  		}
  1068  		fmt.Fprintf(&buf, "%p: %02d/%02d\n", f.n, f.pos, f.n.count)
  1069  	}
  1070  	if i.r == nil {
  1071  		fmt.Fprintf(&buf, "<nil>: %02d", i.pos)
  1072  	} else {
  1073  		fmt.Fprintf(&buf, "%p: %02d/%02d", i.n, i.pos, i.n.count)
  1074  	}
  1075  	return buf.String()
  1076  }
  1077  
  1078  func cmpIter(a, b iterator) int {
  1079  	if a.r != b.r {
  1080  		panic("compared iterators from different btrees")
  1081  	}
  1082  
  1083  	// Each iterator has a stack of frames marking the path from the root node
  1084  	// to the current iterator position. We walk both paths formed by the
  1085  	// iterators' stacks simultaneously, descending from the shared root node,
  1086  	// always comparing nodes at the same level in the tree.
  1087  	//
  1088  	// If the iterators' paths ever diverge and point to different nodes, the
  1089  	// iterators are not equal and we use the node positions to evaluate the
  1090  	// comparison.
  1091  	//
  1092  	// If an iterator's stack ends, we stop descending and use its current
  1093  	// node and position for the final comparison. One iterator's stack may
  1094  	// end before another's if one iterator is positioned deeper in the tree.
  1095  	//
  1096  	// a                                b
  1097  	// +------------------------+      +--------------------------+ -
  1098  	// |  Root            pos:5 |   =  |  Root              pos:5 |  |
  1099  	// +------------------------+      +--------------------------+  | stack
  1100  	// |  Root/5          pos:3 |   =  |  Root/5            pos:3 |  | frames
  1101  	// +------------------------+      +--------------------------+  |
  1102  	// |  Root/5/3        pos:9 |   >  |  Root/5/3          pos:1 |  |
  1103  	// +========================+      +==========================+ -
  1104  	// |                        |      |                          |
  1105  	// | a.n: Root/5/3/9 a.pos:2|      | b.n: Root/5/3/1, b.pos:5 |
  1106  	// +------------------------+      +--------------------------+
  1107  
  1108  	// Initialize with the iterator's current node and position. These are
  1109  	// conceptually the most-recent/current frame of the iterator stack.
  1110  	an, apos := a.n, a.pos
  1111  	bn, bpos := b.n, b.pos
  1112  
  1113  	// aok, bok are set while traversing the iterator's path down the B-Tree.
  1114  	// They're declared in the outer scope because they help distinguish the
  1115  	// sentinel case when both iterators' first frame points to the last child
  1116  	// of the root. If an iterator has no other frames in its stack, it's the
  1117  	// end sentinel state which sorts after everything else.
  1118  	var aok, bok bool
  1119  	for i := 0; ; i++ {
  1120  		var af, bf iterFrame
  1121  		af, aok = a.s.nth(i)
  1122  		bf, bok = b.s.nth(i)
  1123  		if !aok || !bok {
  1124  			if aok {
  1125  				// Iterator a, unlike iterator b, still has a frame. Set an,
  1126  				// apos so we compare using the frame from the stack.
  1127  				an, apos = af.n, af.pos
  1128  			}
  1129  			if bok {
  1130  				// Iterator b, unlike iterator a, still has a frame. Set bn,
  1131  				// bpos so we compare using the frame from the stack.
  1132  				bn, bpos = bf.n, bf.pos
  1133  			}
  1134  			break
  1135  		}
  1136  
  1137  		// aok && bok
  1138  		if af.n != bf.n {
  1139  			panic("nonmatching nodes during btree iterator comparison")
  1140  		}
  1141  		switch {
  1142  		case af.pos < bf.pos:
  1143  			return -1
  1144  		case af.pos > bf.pos:
  1145  			return +1
  1146  		default:
  1147  			// Continue up both iterators' stacks (equivalently, down the
  1148  			// B-Tree away from the root).
  1149  		}
  1150  	}
  1151  
  1152  	if aok && bok {
  1153  		panic("expected one or more stacks to have been exhausted")
  1154  	}
  1155  	if an != bn {
  1156  		panic("nonmatching nodes during btree iterator comparison")
  1157  	}
  1158  	switch {
  1159  	case apos < bpos:
  1160  		return -1
  1161  	case apos > bpos:
  1162  		return +1
  1163  	default:
  1164  		switch {
  1165  		case aok:
  1166  			// a is positioned at a leaf child at this position and b is at an
  1167  			// end sentinel state.
  1168  			return -1
  1169  		case bok:
  1170  			// b is positioned at a leaf child at this position and a is at an
  1171  			// end sentinel state.
  1172  			return +1
  1173  		default:
  1174  			return 0
  1175  		}
  1176  	}
  1177  }
  1178  
  1179  func (i *iterator) descend(n *node, pos int16) {
  1180  	i.s.push(iterFrame{n: n, pos: pos})
  1181  	i.n = n.children[pos]
  1182  	i.pos = 0
  1183  }
  1184  
  1185  // ascend ascends up to the current node's parent and resets the position
  1186  // to the one previously set for this parent node.
  1187  func (i *iterator) ascend() {
  1188  	f := i.s.pop()
  1189  	i.n = f.n
  1190  	i.pos = f.pos
  1191  }
  1192  
  1193  // seek repositions the iterator over the first file for which fn returns
  1194  // true, mirroring the semantics of the standard library's sort.Search
  1195  // function.  Like sort.Search, seek requires the iterator's B-Tree to be
  1196  // ordered such that fn returns false for some (possibly empty) prefix of the
  1197  // tree's files, and then true for the (possibly empty) remainder.
  1198  func (i *iterator) seek(fn func(*FileMetadata) bool) {
  1199  	i.reset()
  1200  	if i.r == nil {
  1201  		return
  1202  	}
  1203  
  1204  	for {
  1205  		// Logic copied from sort.Search.
  1206  		j, k := 0, int(i.n.count)
  1207  		for j < k {
  1208  			h := int(uint(j+k) >> 1) // avoid overflow when computing h
  1209  
  1210  			// j ≤ h < k
  1211  			if !fn(i.n.items[h]) {
  1212  				j = h + 1 // preserves f(j-1) == false
  1213  			} else {
  1214  				k = h // preserves f(k) == true
  1215  			}
  1216  		}
  1217  
  1218  		i.pos = int16(j)
  1219  		if i.n.leaf {
  1220  			if i.pos == i.n.count {
  1221  				i.next()
  1222  			}
  1223  			return
  1224  		}
  1225  		i.descend(i.n, i.pos)
  1226  	}
  1227  }
  1228  
  1229  // first seeks to the first item in the btree.
  1230  func (i *iterator) first() {
  1231  	i.reset()
  1232  	if i.r == nil {
  1233  		return
  1234  	}
  1235  	for !i.n.leaf {
  1236  		i.descend(i.n, 0)
  1237  	}
  1238  	i.pos = 0
  1239  }
  1240  
  1241  // last seeks to the last item in the btree.
  1242  func (i *iterator) last() {
  1243  	i.reset()
  1244  	if i.r == nil {
  1245  		return
  1246  	}
  1247  	for !i.n.leaf {
  1248  		i.descend(i.n, i.n.count)
  1249  	}
  1250  	i.pos = i.n.count - 1
  1251  }
  1252  
  1253  // next positions the iterator to the item immediately following
  1254  // its current position.
  1255  func (i *iterator) next() {
  1256  	if i.r == nil {
  1257  		return
  1258  	}
  1259  
  1260  	if i.n.leaf {
  1261  		if i.pos < i.n.count {
  1262  			i.pos++
  1263  		}
  1264  		if i.pos < i.n.count {
  1265  			return
  1266  		}
  1267  		for i.s.len() > 0 && i.pos >= i.n.count {
  1268  			i.ascend()
  1269  		}
  1270  		return
  1271  	}
  1272  
  1273  	i.descend(i.n, i.pos+1)
  1274  	for !i.n.leaf {
  1275  		i.descend(i.n, 0)
  1276  	}
  1277  	i.pos = 0
  1278  }
  1279  
  1280  // prev positions the iterator to the item immediately preceding
  1281  // its current position.
  1282  func (i *iterator) prev() {
  1283  	if i.r == nil {
  1284  		return
  1285  	}
  1286  
  1287  	if i.n.leaf {
  1288  		i.pos--
  1289  		if i.pos >= 0 {
  1290  			return
  1291  		}
  1292  		for i.s.len() > 0 && i.pos < 0 {
  1293  			i.ascend()
  1294  			i.pos--
  1295  		}
  1296  		return
  1297  	}
  1298  
  1299  	i.descend(i.n, i.pos)
  1300  	for !i.n.leaf {
  1301  		i.descend(i.n, i.n.count)
  1302  	}
  1303  	i.pos = i.n.count - 1
  1304  }
  1305  
  1306  // valid returns whether the iterator is positioned at a valid position.
  1307  func (i *iterator) valid() bool {
  1308  	return i.r != nil && i.pos >= 0 && i.pos < i.n.count
  1309  }
  1310  
  1311  // cur returns the item at the iterator's current position. It is illegal
  1312  // to call cur if the iterator is not valid.
  1313  func (i *iterator) cur() *FileMetadata {
  1314  	if invariants.Enabled && !i.valid() {
  1315  		panic("btree iterator.cur invoked on invalid iterator")
  1316  	}
  1317  	return i.n.items[i.pos]
  1318  }