github.com/ooni/psiphon/tunnel-core@v0.0.0-20230105123940-fe12a24c96ee/oovendor/bolt/node.go (about)

     1  package bolt
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"sort"
     7  	"unsafe"
     8  )
     9  
    10  // node represents an in-memory, deserialized page.
    11  type node struct {
    12  	bucket     *Bucket
    13  	isLeaf     bool
    14  	unbalanced bool
    15  	spilled    bool
    16  	key        []byte
    17  	pgid       pgid
    18  	parent     *node
    19  	children   nodes
    20  	inodes     inodes
    21  }
    22  
    23  // root returns the top-level node this node is attached to.
    24  func (n *node) root() *node {
    25  	if n.parent == nil {
    26  		return n
    27  	}
    28  	return n.parent.root()
    29  }
    30  
    31  // minKeys returns the minimum number of inodes this node should have.
    32  func (n *node) minKeys() int {
    33  	if n.isLeaf {
    34  		return 1
    35  	}
    36  	return 2
    37  }
    38  
    39  // size returns the size of the node after serialization.
    40  func (n *node) size() int {
    41  	sz, elsz := pageHeaderSize, n.pageElementSize()
    42  	for i := 0; i < len(n.inodes); i++ {
    43  		item := &n.inodes[i]
    44  		sz += elsz + uintptr(len(item.key)) + uintptr(len(item.value))
    45  	}
    46  	return int(sz)
    47  }
    48  
    49  // sizeLessThan returns true if the node is less than a given size.
    50  // This is an optimization to avoid calculating a large node when we only need
    51  // to know if it fits inside a certain page size.
    52  func (n *node) sizeLessThan(v uintptr) bool {
    53  	sz, elsz := pageHeaderSize, n.pageElementSize()
    54  	for i := 0; i < len(n.inodes); i++ {
    55  		item := &n.inodes[i]
    56  		sz += elsz + uintptr(len(item.key)) + uintptr(len(item.value))
    57  		if sz >= v {
    58  			return false
    59  		}
    60  	}
    61  	return true
    62  }
    63  
    64  // pageElementSize returns the size of each page element based on the type of node.
    65  func (n *node) pageElementSize() uintptr {
    66  	if n.isLeaf {
    67  		return leafPageElementSize
    68  	}
    69  	return branchPageElementSize
    70  }
    71  
    72  // childAt returns the child node at a given index.
    73  func (n *node) childAt(index int) *node {
    74  	if n.isLeaf {
    75  		panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index))
    76  	}
    77  	return n.bucket.node(n.inodes[index].pgid, n)
    78  }
    79  
    80  // childIndex returns the index of a given child node.
    81  func (n *node) childIndex(child *node) int {
    82  	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, child.key) != -1 })
    83  	return index
    84  }
    85  
    86  // numChildren returns the number of children.
    87  func (n *node) numChildren() int {
    88  	return len(n.inodes)
    89  }
    90  
    91  // nextSibling returns the next node with the same parent.
    92  func (n *node) nextSibling() *node {
    93  	if n.parent == nil {
    94  		return nil
    95  	}
    96  	index := n.parent.childIndex(n)
    97  	if index >= n.parent.numChildren()-1 {
    98  		return nil
    99  	}
   100  	return n.parent.childAt(index + 1)
   101  }
   102  
   103  // prevSibling returns the previous node with the same parent.
   104  func (n *node) prevSibling() *node {
   105  	if n.parent == nil {
   106  		return nil
   107  	}
   108  	index := n.parent.childIndex(n)
   109  	if index == 0 {
   110  		return nil
   111  	}
   112  	return n.parent.childAt(index - 1)
   113  }
   114  
   115  // put inserts a key/value.
   116  func (n *node) put(oldKey, newKey, value []byte, pgid pgid, flags uint32) {
   117  	if pgid >= n.bucket.tx.meta.pgid {
   118  		panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", pgid, n.bucket.tx.meta.pgid))
   119  	} else if len(oldKey) <= 0 {
   120  		panic("put: zero-length old key")
   121  	} else if len(newKey) <= 0 {
   122  		panic("put: zero-length new key")
   123  	}
   124  
   125  	// Find insertion index.
   126  	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, oldKey) != -1 })
   127  
   128  	// Add capacity and shift nodes if we don't have an exact match and need to insert.
   129  	exact := (len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].key, oldKey))
   130  	if !exact {
   131  		n.inodes = append(n.inodes, inode{})
   132  		copy(n.inodes[index+1:], n.inodes[index:])
   133  	}
   134  
   135  	inode := &n.inodes[index]
   136  	inode.flags = flags
   137  	inode.key = newKey
   138  	inode.value = value
   139  	inode.pgid = pgid
   140  	_assert(len(inode.key) > 0, "put: zero-length inode key")
   141  }
   142  
   143  // del removes a key from the node.
   144  func (n *node) del(key []byte) {
   145  	// Find index of key.
   146  	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, key) != -1 })
   147  
   148  	// Exit if the key isn't found.
   149  	if index >= len(n.inodes) || !bytes.Equal(n.inodes[index].key, key) {
   150  		return
   151  	}
   152  
   153  	// Delete inode from the node.
   154  	n.inodes = append(n.inodes[:index], n.inodes[index+1:]...)
   155  
   156  	// Mark the node as needing rebalancing.
   157  	n.unbalanced = true
   158  }
   159  
   160  // read initializes the node from a page.
   161  func (n *node) read(p *page) {
   162  	n.pgid = p.id
   163  	n.isLeaf = ((p.flags & leafPageFlag) != 0)
   164  	n.inodes = make(inodes, int(p.count))
   165  
   166  	for i := 0; i < int(p.count); i++ {
   167  		inode := &n.inodes[i]
   168  		if n.isLeaf {
   169  			elem := p.leafPageElement(uint16(i))
   170  			inode.flags = elem.flags
   171  			inode.key = elem.key()
   172  			inode.value = elem.value()
   173  		} else {
   174  			elem := p.branchPageElement(uint16(i))
   175  			inode.pgid = elem.pgid
   176  			inode.key = elem.key()
   177  		}
   178  		_assert(len(inode.key) > 0, "read: zero-length inode key")
   179  	}
   180  
   181  	// Save first key so we can find the node in the parent when we spill.
   182  	if len(n.inodes) > 0 {
   183  		n.key = n.inodes[0].key
   184  		_assert(len(n.key) > 0, "read: zero-length node key")
   185  	} else {
   186  		n.key = nil
   187  	}
   188  }
   189  
   190  // write writes the items onto one or more pages.
   191  func (n *node) write(p *page) {
   192  	// Initialize page.
   193  	if n.isLeaf {
   194  		p.flags |= leafPageFlag
   195  	} else {
   196  		p.flags |= branchPageFlag
   197  	}
   198  
   199  	if len(n.inodes) >= 0xFFFF {
   200  		panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.id))
   201  	}
   202  	p.count = uint16(len(n.inodes))
   203  
   204  	// Stop here if there are no items to write.
   205  	if p.count == 0 {
   206  		return
   207  	}
   208  
   209  	// Loop over each item and write it to the page.
   210  	// off tracks the offset into p of the start of the next data.
   211  	off := unsafe.Sizeof(*p) + n.pageElementSize()*uintptr(len(n.inodes))
   212  	for i, item := range n.inodes {
   213  		_assert(len(item.key) > 0, "write: zero-length inode key")
   214  
   215  		// Create a slice to write into of needed size and advance
   216  		// byte pointer for next iteration.
   217  		sz := len(item.key) + len(item.value)
   218  		b := unsafeByteSlice(unsafe.Pointer(p), off, 0, sz)
   219  		off += uintptr(sz)
   220  
   221  		// Write the page element.
   222  		if n.isLeaf {
   223  			elem := p.leafPageElement(uint16(i))
   224  			elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
   225  			elem.flags = item.flags
   226  			elem.ksize = uint32(len(item.key))
   227  			elem.vsize = uint32(len(item.value))
   228  		} else {
   229  			elem := p.branchPageElement(uint16(i))
   230  			elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
   231  			elem.ksize = uint32(len(item.key))
   232  			elem.pgid = item.pgid
   233  			_assert(elem.pgid != p.id, "write: circular dependency occurred")
   234  		}
   235  
   236  		// Write data for the element to the end of the page.
   237  		l := copy(b, item.key)
   238  		copy(b[l:], item.value)
   239  	}
   240  
   241  	// DEBUG ONLY: n.dump()
   242  }
   243  
   244  // split breaks up a node into multiple smaller nodes, if appropriate.
   245  // This should only be called from the spill() function.
   246  func (n *node) split(pageSize uintptr) []*node {
   247  	var nodes []*node
   248  
   249  	node := n
   250  	for {
   251  		// Split node into two.
   252  		a, b := node.splitTwo(pageSize)
   253  		nodes = append(nodes, a)
   254  
   255  		// If we can't split then exit the loop.
   256  		if b == nil {
   257  			break
   258  		}
   259  
   260  		// Set node to b so it gets split on the next iteration.
   261  		node = b
   262  	}
   263  
   264  	return nodes
   265  }
   266  
   267  // splitTwo breaks up a node into two smaller nodes, if appropriate.
   268  // This should only be called from the split() function.
   269  func (n *node) splitTwo(pageSize uintptr) (*node, *node) {
   270  	// Ignore the split if the page doesn't have at least enough nodes for
   271  	// two pages or if the nodes can fit in a single page.
   272  	if len(n.inodes) <= (minKeysPerPage*2) || n.sizeLessThan(pageSize) {
   273  		return n, nil
   274  	}
   275  
   276  	// Determine the threshold before starting a new node.
   277  	var fillPercent = n.bucket.FillPercent
   278  	if fillPercent < minFillPercent {
   279  		fillPercent = minFillPercent
   280  	} else if fillPercent > maxFillPercent {
   281  		fillPercent = maxFillPercent
   282  	}
   283  	threshold := int(float64(pageSize) * fillPercent)
   284  
   285  	// Determine split position and sizes of the two pages.
   286  	splitIndex, _ := n.splitIndex(threshold)
   287  
   288  	// Split node into two separate nodes.
   289  	// If there's no parent then we'll need to create one.
   290  	if n.parent == nil {
   291  		n.parent = &node{bucket: n.bucket, children: []*node{n}}
   292  	}
   293  
   294  	// Create a new node and add it to the parent.
   295  	next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
   296  	n.parent.children = append(n.parent.children, next)
   297  
   298  	// Split inodes across two nodes.
   299  	next.inodes = n.inodes[splitIndex:]
   300  	n.inodes = n.inodes[:splitIndex]
   301  
   302  	// Update the statistics.
   303  	n.bucket.tx.stats.Split++
   304  
   305  	return n, next
   306  }
   307  
   308  // splitIndex finds the position where a page will fill a given threshold.
   309  // It returns the index as well as the size of the first page.
   310  // This is only be called from split().
   311  func (n *node) splitIndex(threshold int) (index, sz uintptr) {
   312  	sz = pageHeaderSize
   313  
   314  	// Loop until we only have the minimum number of keys required for the second page.
   315  	for i := 0; i < len(n.inodes)-minKeysPerPage; i++ {
   316  		index = uintptr(i)
   317  		inode := n.inodes[i]
   318  		elsize := n.pageElementSize() + uintptr(len(inode.key)) + uintptr(len(inode.value))
   319  
   320  		// If we have at least the minimum number of keys and adding another
   321  		// node would put us over the threshold then exit and return.
   322  		if index >= minKeysPerPage && sz+elsize > uintptr(threshold) {
   323  			break
   324  		}
   325  
   326  		// Add the element size to the total size.
   327  		sz += elsize
   328  	}
   329  
   330  	return
   331  }
   332  
   333  // spill writes the nodes to dirty pages and splits nodes as it goes.
   334  // Returns an error if dirty pages cannot be allocated.
   335  func (n *node) spill() error {
   336  	var tx = n.bucket.tx
   337  	if n.spilled {
   338  		return nil
   339  	}
   340  
   341  	// Spill child nodes first. Child nodes can materialize sibling nodes in
   342  	// the case of split-merge so we cannot use a range loop. We have to check
   343  	// the children size on every loop iteration.
   344  	sort.Sort(n.children)
   345  	for i := 0; i < len(n.children); i++ {
   346  		if err := n.children[i].spill(); err != nil {
   347  			return err
   348  		}
   349  	}
   350  
   351  	// We no longer need the child list because it's only used for spill tracking.
   352  	n.children = nil
   353  
   354  	// Split nodes into appropriate sizes. The first node will always be n.
   355  	var nodes = n.split(uintptr(tx.db.pageSize))
   356  	for _, node := range nodes {
   357  		// Add node's page to the freelist if it's not new.
   358  		if node.pgid > 0 {
   359  			tx.db.freelist.free(tx.meta.txid, tx.page(node.pgid))
   360  			node.pgid = 0
   361  		}
   362  
   363  		// Allocate contiguous space for the node.
   364  		p, err := tx.allocate((node.size() + tx.db.pageSize - 1) / tx.db.pageSize)
   365  		if err != nil {
   366  			return err
   367  		}
   368  
   369  		// Write the node.
   370  		if p.id >= tx.meta.pgid {
   371  			panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid))
   372  		}
   373  		node.pgid = p.id
   374  		node.write(p)
   375  		node.spilled = true
   376  
   377  		// Insert into parent inodes.
   378  		if node.parent != nil {
   379  			var key = node.key
   380  			if key == nil {
   381  				key = node.inodes[0].key
   382  			}
   383  
   384  			node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0)
   385  			node.key = node.inodes[0].key
   386  			_assert(len(node.key) > 0, "spill: zero-length node key")
   387  		}
   388  
   389  		// Update the statistics.
   390  		tx.stats.Spill++
   391  	}
   392  
   393  	// If the root node split and created a new root then we need to spill that
   394  	// as well. We'll clear out the children to make sure it doesn't try to respill.
   395  	if n.parent != nil && n.parent.pgid == 0 {
   396  		n.children = nil
   397  		return n.parent.spill()
   398  	}
   399  
   400  	return nil
   401  }
   402  
   403  // rebalance attempts to combine the node with sibling nodes if the node fill
   404  // size is below a threshold or if there are not enough keys.
   405  func (n *node) rebalance() {
   406  	if !n.unbalanced {
   407  		return
   408  	}
   409  	n.unbalanced = false
   410  
   411  	// Update statistics.
   412  	n.bucket.tx.stats.Rebalance++
   413  
   414  	// Ignore if node is above threshold (25%) and has enough keys.
   415  	var threshold = n.bucket.tx.db.pageSize / 4
   416  	if n.size() > threshold && len(n.inodes) > n.minKeys() {
   417  		return
   418  	}
   419  
   420  	// Root node has special handling.
   421  	if n.parent == nil {
   422  		// If root node is a branch and only has one node then collapse it.
   423  		if !n.isLeaf && len(n.inodes) == 1 {
   424  			// Move root's child up.
   425  			child := n.bucket.node(n.inodes[0].pgid, n)
   426  			n.isLeaf = child.isLeaf
   427  			n.inodes = child.inodes[:]
   428  			n.children = child.children
   429  
   430  			// Reparent all child nodes being moved.
   431  			for _, inode := range n.inodes {
   432  				if child, ok := n.bucket.nodes[inode.pgid]; ok {
   433  					child.parent = n
   434  				}
   435  			}
   436  
   437  			// Remove old child.
   438  			child.parent = nil
   439  			delete(n.bucket.nodes, child.pgid)
   440  			child.free()
   441  		}
   442  
   443  		return
   444  	}
   445  
   446  	// If node has no keys then just remove it.
   447  	if n.numChildren() == 0 {
   448  		n.parent.del(n.key)
   449  		n.parent.removeChild(n)
   450  		delete(n.bucket.nodes, n.pgid)
   451  		n.free()
   452  		n.parent.rebalance()
   453  		return
   454  	}
   455  
   456  	_assert(n.parent.numChildren() > 1, "parent must have at least 2 children")
   457  
   458  	// Destination node is right sibling if idx == 0, otherwise left sibling.
   459  	var target *node
   460  	var useNextSibling = (n.parent.childIndex(n) == 0)
   461  	if useNextSibling {
   462  		target = n.nextSibling()
   463  	} else {
   464  		target = n.prevSibling()
   465  	}
   466  
   467  	// If both this node and the target node are too small then merge them.
   468  	if useNextSibling {
   469  		// Reparent all child nodes being moved.
   470  		for _, inode := range target.inodes {
   471  			if child, ok := n.bucket.nodes[inode.pgid]; ok {
   472  				child.parent.removeChild(child)
   473  				child.parent = n
   474  				child.parent.children = append(child.parent.children, child)
   475  			}
   476  		}
   477  
   478  		// Copy over inodes from target and remove target.
   479  		n.inodes = append(n.inodes, target.inodes...)
   480  		n.parent.del(target.key)
   481  		n.parent.removeChild(target)
   482  		delete(n.bucket.nodes, target.pgid)
   483  		target.free()
   484  	} else {
   485  		// Reparent all child nodes being moved.
   486  		for _, inode := range n.inodes {
   487  			if child, ok := n.bucket.nodes[inode.pgid]; ok {
   488  				child.parent.removeChild(child)
   489  				child.parent = target
   490  				child.parent.children = append(child.parent.children, child)
   491  			}
   492  		}
   493  
   494  		// Copy over inodes to target and remove node.
   495  		target.inodes = append(target.inodes, n.inodes...)
   496  		n.parent.del(n.key)
   497  		n.parent.removeChild(n)
   498  		delete(n.bucket.nodes, n.pgid)
   499  		n.free()
   500  	}
   501  
   502  	// Either this node or the target node was deleted from the parent so rebalance it.
   503  	n.parent.rebalance()
   504  }
   505  
   506  // removes a node from the list of in-memory children.
   507  // This does not affect the inodes.
   508  func (n *node) removeChild(target *node) {
   509  	for i, child := range n.children {
   510  		if child == target {
   511  			n.children = append(n.children[:i], n.children[i+1:]...)
   512  			return
   513  		}
   514  	}
   515  }
   516  
   517  // dereference causes the node to copy all its inode key/value references to heap memory.
   518  // This is required when the mmap is reallocated so inodes are not pointing to stale data.
   519  func (n *node) dereference() {
   520  	if n.key != nil {
   521  		key := make([]byte, len(n.key))
   522  		copy(key, n.key)
   523  		n.key = key
   524  		_assert(n.pgid == 0 || len(n.key) > 0, "dereference: zero-length node key on existing node")
   525  	}
   526  
   527  	for i := range n.inodes {
   528  		inode := &n.inodes[i]
   529  
   530  		key := make([]byte, len(inode.key))
   531  		copy(key, inode.key)
   532  		inode.key = key
   533  		_assert(len(inode.key) > 0, "dereference: zero-length inode key")
   534  
   535  		value := make([]byte, len(inode.value))
   536  		copy(value, inode.value)
   537  		inode.value = value
   538  	}
   539  
   540  	// Recursively dereference children.
   541  	for _, child := range n.children {
   542  		child.dereference()
   543  	}
   544  
   545  	// Update statistics.
   546  	n.bucket.tx.stats.NodeDeref++
   547  }
   548  
   549  // free adds the node's underlying page to the freelist.
   550  func (n *node) free() {
   551  	if n.pgid != 0 {
   552  		n.bucket.tx.db.freelist.free(n.bucket.tx.meta.txid, n.bucket.tx.page(n.pgid))
   553  		n.pgid = 0
   554  	}
   555  }
   556  
   557  // dump writes the contents of the node to STDERR for debugging purposes.
   558  /*
   559  func (n *node) dump() {
   560  	// Write node header.
   561  	var typ = "branch"
   562  	if n.isLeaf {
   563  		typ = "leaf"
   564  	}
   565  	warnf("[NODE %d {type=%s count=%d}]", n.pgid, typ, len(n.inodes))
   566  
   567  	// Write out abbreviated version of each item.
   568  	for _, item := range n.inodes {
   569  		if n.isLeaf {
   570  			if item.flags&bucketLeafFlag != 0 {
   571  				bucket := (*bucket)(unsafe.Pointer(&item.value[0]))
   572  				warnf("+L %08x -> (bucket root=%d)", trunc(item.key, 4), bucket.root)
   573  			} else {
   574  				warnf("+L %08x -> %08x", trunc(item.key, 4), trunc(item.value, 4))
   575  			}
   576  		} else {
   577  			warnf("+B %08x -> pgid=%d", trunc(item.key, 4), item.pgid)
   578  		}
   579  	}
   580  	warn("")
   581  }
   582  */
   583  
   584  type nodes []*node
   585  
   586  func (s nodes) Len() int      { return len(s) }
   587  func (s nodes) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
   588  func (s nodes) Less(i, j int) bool {
   589  	return bytes.Compare(s[i].inodes[0].key, s[j].inodes[0].key) == -1
   590  }
   591  
   592  // inode represents an internal node inside of a node.
   593  // It can be used to point to elements in a page or point
   594  // to an element which hasn't been added to a page yet.
   595  type inode struct {
   596  	flags uint32
   597  	pgid  pgid
   598  	key   []byte
   599  	value []byte
   600  }
   601  
   602  type inodes []inode