github.com/bhojpur/cache@v0.0.4/pkg/memory/node.go (about)

     1  package memory
     2  
     3  // Copyright (c) 2018 Bhojpur Consulting Private Limited, India. All rights reserved.
     4  
     5  // Permission is hereby granted, free of charge, to any person obtaining a copy
     6  // of this software and associated documentation files (the "Software"), to deal
     7  // in the Software without restriction, including without limitation the rights
     8  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     9  // copies of the Software, and to permit persons to whom the Software is
    10  // furnished to do so, subject to the following conditions:
    11  
    12  // The above copyright notice and this permission notice shall be included in
    13  // all copies or substantial portions of the Software.
    14  
    15  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    16  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    17  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    18  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    19  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    20  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    21  // THE SOFTWARE.
    22  
    23  import (
    24  	"bytes"
    25  	"fmt"
    26  	"sort"
    27  	"unsafe"
    28  )
    29  
// node represents an in-memory, deserialized page.
type node struct {
	bucket     *Bucket // bucket the node belongs to; used to reach the tx and to load child nodes
	isLeaf     bool    // true when the node was read from / will be written as a leaf page
	unbalanced bool    // set by del; rebalance does nothing unless this is set
	spilled    bool    // set once spill has written the node to a page
	key        []byte  // first key as of the last read/spill; used to locate the node in its parent
	pgid       pgid    // id of the backing page; zero when no page is assigned
	parent     *node   // parent node; nil for the root
	children   nodes   // in-memory child nodes, tracked so that spill can visit them
	inodes     inodes  // entries held by the node, searched by key with sort.Search
}
    42  
    43  // root returns the top-level node this node is attached to.
    44  func (n *node) root() *node {
    45  	if n.parent == nil {
    46  		return n
    47  	}
    48  	return n.parent.root()
    49  }
    50  
    51  // minKeys returns the minimum number of inodes this node should have.
    52  func (n *node) minKeys() int {
    53  	if n.isLeaf {
    54  		return 1
    55  	}
    56  	return 2
    57  }
    58  
    59  // size returns the size of the node after serialization.
    60  func (n *node) size() int {
    61  	sz, elsz := pageHeaderSize, n.pageElementSize()
    62  	for i := 0; i < len(n.inodes); i++ {
    63  		item := &n.inodes[i]
    64  		sz += elsz + len(item.key) + len(item.value)
    65  	}
    66  	return sz
    67  }
    68  
    69  // sizeLessThan returns true if the node is less than a given size.
    70  // This is an optimization to avoid calculating a large node when we only need
    71  // to know if it fits inside a certain page size.
    72  func (n *node) sizeLessThan(v int) bool {
    73  	sz, elsz := pageHeaderSize, n.pageElementSize()
    74  	for i := 0; i < len(n.inodes); i++ {
    75  		item := &n.inodes[i]
    76  		sz += elsz + len(item.key) + len(item.value)
    77  		if sz >= v {
    78  			return false
    79  		}
    80  	}
    81  	return true
    82  }
    83  
    84  // pageElementSize returns the size of each page element based on the type of node.
    85  func (n *node) pageElementSize() int {
    86  	if n.isLeaf {
    87  		return leafPageElementSize
    88  	}
    89  	return branchPageElementSize
    90  }
    91  
    92  // childAt returns the child node at a given index.
    93  func (n *node) childAt(index int) *node {
    94  	if n.isLeaf {
    95  		panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index))
    96  	}
    97  	return n.bucket.node(n.inodes[index].pgid, n)
    98  }
    99  
   100  // childIndex returns the index of a given child node.
   101  func (n *node) childIndex(child *node) int {
   102  	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, child.key) != -1 })
   103  	return index
   104  }
   105  
   106  // numChildren returns the number of children.
   107  func (n *node) numChildren() int {
   108  	return len(n.inodes)
   109  }
   110  
   111  // nextSibling returns the next node with the same parent.
   112  func (n *node) nextSibling() *node {
   113  	if n.parent == nil {
   114  		return nil
   115  	}
   116  	index := n.parent.childIndex(n)
   117  	if index >= n.parent.numChildren()-1 {
   118  		return nil
   119  	}
   120  	return n.parent.childAt(index + 1)
   121  }
   122  
   123  // prevSibling returns the previous node with the same parent.
   124  func (n *node) prevSibling() *node {
   125  	if n.parent == nil {
   126  		return nil
   127  	}
   128  	index := n.parent.childIndex(n)
   129  	if index == 0 {
   130  		return nil
   131  	}
   132  	return n.parent.childAt(index - 1)
   133  }
   134  
   135  // put inserts a key/value.
   136  func (n *node) put(oldKey, newKey, value []byte, pgid pgid, flags uint32) {
   137  	if pgid >= n.bucket.tx.meta.pgid {
   138  		panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", pgid, n.bucket.tx.meta.pgid))
   139  	} else if len(oldKey) <= 0 {
   140  		panic("put: zero-length old key")
   141  	} else if len(newKey) <= 0 {
   142  		panic("put: zero-length new key")
   143  	}
   144  
   145  	// Find insertion index.
   146  	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, oldKey) != -1 })
   147  
   148  	// Add capacity and shift nodes if we don't have an exact match and need to insert.
   149  	exact := (len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].key, oldKey))
   150  	if !exact {
   151  		n.inodes = append(n.inodes, inode{})
   152  		copy(n.inodes[index+1:], n.inodes[index:])
   153  	}
   154  
   155  	inode := &n.inodes[index]
   156  	inode.flags = flags
   157  	inode.key = newKey
   158  	inode.value = value
   159  	inode.pgid = pgid
   160  	_assert(len(inode.key) > 0, "put: zero-length inode key")
   161  }
   162  
   163  // del removes a key from the node.
   164  func (n *node) del(key []byte) {
   165  	// Find index of key.
   166  	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, key) != -1 })
   167  
   168  	// Exit if the key isn't found.
   169  	if index >= len(n.inodes) || !bytes.Equal(n.inodes[index].key, key) {
   170  		return
   171  	}
   172  
   173  	// Delete inode from the node.
   174  	n.inodes = append(n.inodes[:index], n.inodes[index+1:]...)
   175  
   176  	// Mark the node as needing rebalancing.
   177  	n.unbalanced = true
   178  }
   179  
   180  // read initializes the node from a page.
   181  func (n *node) read(p *page) {
   182  	n.pgid = p.id
   183  	n.isLeaf = ((p.flags & leafPageFlag) != 0)
   184  	n.inodes = make(inodes, int(p.count))
   185  
   186  	for i := 0; i < int(p.count); i++ {
   187  		inode := &n.inodes[i]
   188  		if n.isLeaf {
   189  			elem := p.leafPageElement(uint16(i))
   190  			inode.flags = elem.flags
   191  			inode.key = elem.key()
   192  			inode.value = elem.value()
   193  		} else {
   194  			elem := p.branchPageElement(uint16(i))
   195  			inode.pgid = elem.pgid
   196  			inode.key = elem.key()
   197  		}
   198  		_assert(len(inode.key) > 0, "read: zero-length inode key")
   199  	}
   200  
   201  	// Save first key so we can find the node in the parent when we spill.
   202  	if len(n.inodes) > 0 {
   203  		n.key = n.inodes[0].key
   204  		_assert(len(n.key) > 0, "read: zero-length node key")
   205  	} else {
   206  		n.key = nil
   207  	}
   208  }
   209  
   210  // write writes the items onto one or more pages.
   211  func (n *node) write(p *page) {
   212  	// Initialize page.
   213  	if n.isLeaf {
   214  		p.flags |= leafPageFlag
   215  	} else {
   216  		p.flags |= branchPageFlag
   217  	}
   218  
   219  	if len(n.inodes) >= 0xFFFF {
   220  		panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.id))
   221  	}
   222  	p.count = uint16(len(n.inodes))
   223  
   224  	// Stop here if there are no items to write.
   225  	if p.count == 0 {
   226  		return
   227  	}
   228  
   229  	// Loop over each item and write it to the page.
   230  	b := (*[maxAllocSize]byte)(unsafe.Pointer(&p.ptr))[n.pageElementSize()*len(n.inodes):]
   231  	for i, item := range n.inodes {
   232  		_assert(len(item.key) > 0, "write: zero-length inode key")
   233  
   234  		// Write the page element.
   235  		if n.isLeaf {
   236  			elem := p.leafPageElement(uint16(i))
   237  			elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
   238  			elem.flags = item.flags
   239  			elem.ksize = uint32(len(item.key))
   240  			elem.vsize = uint32(len(item.value))
   241  		} else {
   242  			elem := p.branchPageElement(uint16(i))
   243  			elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
   244  			elem.ksize = uint32(len(item.key))
   245  			elem.pgid = item.pgid
   246  			_assert(elem.pgid != p.id, "write: circular dependency occurred")
   247  		}
   248  
   249  		// If the length of key+value is larger than the max allocation size
   250  		// then we need to reallocate the byte array pointer.
   251  		klen, vlen := len(item.key), len(item.value)
   252  		if len(b) < klen+vlen {
   253  			b = (*[maxAllocSize]byte)(unsafe.Pointer(&b[0]))[:]
   254  		}
   255  
   256  		// Write data for the element to the end of the page.
   257  		copy(b[0:], item.key)
   258  		b = b[klen:]
   259  		copy(b[0:], item.value)
   260  		b = b[vlen:]
   261  	}
   262  
   263  	// DEBUG ONLY: n.dump()
   264  }
   265  
   266  // split breaks up a node into multiple smaller nodes, if appropriate.
   267  // This should only be called from the spill() function.
   268  func (n *node) split(pageSize int) []*node {
   269  	var nodes []*node
   270  
   271  	node := n
   272  	for {
   273  		// Split node into two.
   274  		a, b := node.splitTwo(pageSize)
   275  		nodes = append(nodes, a)
   276  
   277  		// If we can't split then exit the loop.
   278  		if b == nil {
   279  			break
   280  		}
   281  
   282  		// Set node to b so it gets split on the next iteration.
   283  		node = b
   284  	}
   285  
   286  	return nodes
   287  }
   288  
   289  // splitTwo breaks up a node into two smaller nodes, if appropriate.
   290  // This should only be called from the split() function.
   291  func (n *node) splitTwo(pageSize int) (*node, *node) {
   292  	// Ignore the split if the page doesn't have at least enough nodes for
   293  	// two pages or if the nodes can fit in a single page.
   294  	if len(n.inodes) <= (minKeysPerPage*2) || n.sizeLessThan(pageSize) {
   295  		return n, nil
   296  	}
   297  
   298  	// Determine the threshold before starting a new node.
   299  	var fillPercent = n.bucket.FillPercent
   300  	if fillPercent < minFillPercent {
   301  		fillPercent = minFillPercent
   302  	} else if fillPercent > maxFillPercent {
   303  		fillPercent = maxFillPercent
   304  	}
   305  	threshold := int(float64(pageSize) * fillPercent)
   306  
   307  	// Determine split position and sizes of the two pages.
   308  	splitIndex, _ := n.splitIndex(threshold)
   309  
   310  	// Split node into two separate nodes.
   311  	// If there's no parent then we'll need to create one.
   312  	if n.parent == nil {
   313  		n.parent = &node{bucket: n.bucket, children: []*node{n}}
   314  	}
   315  
   316  	// Create a new node and add it to the parent.
   317  	next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
   318  	n.parent.children = append(n.parent.children, next)
   319  
   320  	// Split inodes across two nodes.
   321  	next.inodes = n.inodes[splitIndex:]
   322  	n.inodes = n.inodes[:splitIndex]
   323  
   324  	// Update the statistics.
   325  	n.bucket.tx.stats.Split++
   326  
   327  	return n, next
   328  }
   329  
   330  // splitIndex finds the position where a page will fill a given threshold.
   331  // It returns the index as well as the size of the first page.
   332  // This is only be called from split().
   333  func (n *node) splitIndex(threshold int) (index, sz int) {
   334  	sz = pageHeaderSize
   335  
   336  	// Loop until we only have the minimum number of keys required for the second page.
   337  	for i := 0; i < len(n.inodes)-minKeysPerPage; i++ {
   338  		index = i
   339  		inode := n.inodes[i]
   340  		elsize := n.pageElementSize() + len(inode.key) + len(inode.value)
   341  
   342  		// If we have at least the minimum number of keys and adding another
   343  		// node would put us over the threshold then exit and return.
   344  		if i >= minKeysPerPage && sz+elsize > threshold {
   345  			break
   346  		}
   347  
   348  		// Add the element size to the total size.
   349  		sz += elsize
   350  	}
   351  
   352  	return
   353  }
   354  
   355  // spill writes the nodes to dirty pages and splits nodes as it goes.
   356  // Returns an error if dirty pages cannot be allocated.
   357  func (n *node) spill() error {
   358  	var tx = n.bucket.tx
   359  	if n.spilled {
   360  		return nil
   361  	}
   362  
   363  	// Spill child nodes first. Child nodes can materialize sibling nodes in
   364  	// the case of split-merge so we cannot use a range loop. We have to check
   365  	// the children size on every loop iteration.
   366  	sort.Sort(n.children)
   367  	for i := 0; i < len(n.children); i++ {
   368  		if err := n.children[i].spill(); err != nil {
   369  			return err
   370  		}
   371  	}
   372  
   373  	// We no longer need the child list because it's only used for spill tracking.
   374  	n.children = nil
   375  
   376  	// Split nodes into appropriate sizes. The first node will always be n.
   377  	var nodes = n.split(tx.db.pageSize)
   378  	for _, node := range nodes {
   379  		// Add node's page to the freelist if it's not new.
   380  		if node.pgid > 0 {
   381  			tx.db.freelist.free(tx.meta.txid, tx.page(node.pgid))
   382  			node.pgid = 0
   383  		}
   384  
   385  		// Allocate contiguous space for the node.
   386  		p, err := tx.allocate((node.size() / tx.db.pageSize) + 1)
   387  		if err != nil {
   388  			return err
   389  		}
   390  
   391  		// Write the node.
   392  		if p.id >= tx.meta.pgid {
   393  			panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid))
   394  		}
   395  		node.pgid = p.id
   396  		node.write(p)
   397  		node.spilled = true
   398  
   399  		// Insert into parent inodes.
   400  		if node.parent != nil {
   401  			var key = node.key
   402  			if key == nil {
   403  				key = node.inodes[0].key
   404  			}
   405  
   406  			node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0)
   407  			node.key = node.inodes[0].key
   408  			_assert(len(node.key) > 0, "spill: zero-length node key")
   409  		}
   410  
   411  		// Update the statistics.
   412  		tx.stats.Spill++
   413  	}
   414  
   415  	// If the root node split and created a new root then we need to spill that
   416  	// as well. We'll clear out the children to make sure it doesn't try to respill.
   417  	if n.parent != nil && n.parent.pgid == 0 {
   418  		n.children = nil
   419  		return n.parent.spill()
   420  	}
   421  
   422  	return nil
   423  }
   424  
   425  // rebalance attempts to combine the node with sibling nodes if the node fill
   426  // size is below a threshold or if there are not enough keys.
   427  func (n *node) rebalance() {
   428  	if !n.unbalanced {
   429  		return
   430  	}
   431  	n.unbalanced = false
   432  
   433  	// Update statistics.
   434  	n.bucket.tx.stats.Rebalance++
   435  
   436  	// Ignore if node is above threshold (25%) and has enough keys.
   437  	var threshold = n.bucket.tx.db.pageSize / 4
   438  	if n.size() > threshold && len(n.inodes) > n.minKeys() {
   439  		return
   440  	}
   441  
   442  	// Root node has special handling.
   443  	if n.parent == nil {
   444  		// If root node is a branch and only has one node then collapse it.
   445  		if !n.isLeaf && len(n.inodes) == 1 {
   446  			// Move root's child up.
   447  			child := n.bucket.node(n.inodes[0].pgid, n)
   448  			n.isLeaf = child.isLeaf
   449  			n.inodes = child.inodes[:]
   450  			n.children = child.children
   451  
   452  			// Reparent all child nodes being moved.
   453  			for _, inode := range n.inodes {
   454  				if child, ok := n.bucket.nodes[inode.pgid]; ok {
   455  					child.parent = n
   456  				}
   457  			}
   458  
   459  			// Remove old child.
   460  			child.parent = nil
   461  			delete(n.bucket.nodes, child.pgid)
   462  			child.free()
   463  		}
   464  
   465  		return
   466  	}
   467  
   468  	// If node has no keys then just remove it.
   469  	if n.numChildren() == 0 {
   470  		n.parent.del(n.key)
   471  		n.parent.removeChild(n)
   472  		delete(n.bucket.nodes, n.pgid)
   473  		n.free()
   474  		n.parent.rebalance()
   475  		return
   476  	}
   477  
   478  	_assert(n.parent.numChildren() > 1, "parent must have at least 2 children")
   479  
   480  	// Destination node is right sibling if idx == 0, otherwise left sibling.
   481  	var target *node
   482  	var useNextSibling = (n.parent.childIndex(n) == 0)
   483  	if useNextSibling {
   484  		target = n.nextSibling()
   485  	} else {
   486  		target = n.prevSibling()
   487  	}
   488  
   489  	// If both this node and the target node are too small then merge them.
   490  	if useNextSibling {
   491  		// Reparent all child nodes being moved.
   492  		for _, inode := range target.inodes {
   493  			if child, ok := n.bucket.nodes[inode.pgid]; ok {
   494  				child.parent.removeChild(child)
   495  				child.parent = n
   496  				child.parent.children = append(child.parent.children, child)
   497  			}
   498  		}
   499  
   500  		// Copy over inodes from target and remove target.
   501  		n.inodes = append(n.inodes, target.inodes...)
   502  		n.parent.del(target.key)
   503  		n.parent.removeChild(target)
   504  		delete(n.bucket.nodes, target.pgid)
   505  		target.free()
   506  	} else {
   507  		// Reparent all child nodes being moved.
   508  		for _, inode := range n.inodes {
   509  			if child, ok := n.bucket.nodes[inode.pgid]; ok {
   510  				child.parent.removeChild(child)
   511  				child.parent = target
   512  				child.parent.children = append(child.parent.children, child)
   513  			}
   514  		}
   515  
   516  		// Copy over inodes to target and remove node.
   517  		target.inodes = append(target.inodes, n.inodes...)
   518  		n.parent.del(n.key)
   519  		n.parent.removeChild(n)
   520  		delete(n.bucket.nodes, n.pgid)
   521  		n.free()
   522  	}
   523  
   524  	// Either this node or the target node was deleted from the parent so rebalance it.
   525  	n.parent.rebalance()
   526  }
   527  
   528  // removes a node from the list of in-memory children.
   529  // This does not affect the inodes.
   530  func (n *node) removeChild(target *node) {
   531  	for i, child := range n.children {
   532  		if child == target {
   533  			n.children = append(n.children[:i], n.children[i+1:]...)
   534  			return
   535  		}
   536  	}
   537  }
   538  
   539  // dereference causes the node to copy all its inode key/value references to heap memory.
   540  // This is required when the mmap is reallocated so inodes are not pointing to stale data.
   541  func (n *node) dereference() {
   542  	if n.key != nil {
   543  		key := make([]byte, len(n.key))
   544  		copy(key, n.key)
   545  		n.key = key
   546  		_assert(n.pgid == 0 || len(n.key) > 0, "dereference: zero-length node key on existing node")
   547  	}
   548  
   549  	for i := range n.inodes {
   550  		inode := &n.inodes[i]
   551  
   552  		key := make([]byte, len(inode.key))
   553  		copy(key, inode.key)
   554  		inode.key = key
   555  		_assert(len(inode.key) > 0, "dereference: zero-length inode key")
   556  
   557  		value := make([]byte, len(inode.value))
   558  		copy(value, inode.value)
   559  		inode.value = value
   560  	}
   561  
   562  	// Recursively dereference children.
   563  	for _, child := range n.children {
   564  		child.dereference()
   565  	}
   566  
   567  	// Update statistics.
   568  	n.bucket.tx.stats.NodeDeref++
   569  }
   570  
   571  // free adds the node's underlying page to the freelist.
   572  func (n *node) free() {
   573  	if n.pgid != 0 {
   574  		n.bucket.tx.db.freelist.free(n.bucket.tx.meta.txid, n.bucket.tx.page(n.pgid))
   575  		n.pgid = 0
   576  	}
   577  }
   578  
   579  // dump writes the contents of the node to STDERR for debugging purposes.
   580  /*
   581  func (n *node) dump() {
   582  	// Write node header.
   583  	var typ = "branch"
   584  	if n.isLeaf {
   585  		typ = "leaf"
   586  	}
   587  	warnf("[NODE %d {type=%s count=%d}]", n.pgid, typ, len(n.inodes))
   588  
   589  	// Write out abbreviated version of each item.
   590  	for _, item := range n.inodes {
   591  		if n.isLeaf {
   592  			if item.flags&bucketLeafFlag != 0 {
   593  				bucket := (*bucket)(unsafe.Pointer(&item.value[0]))
   594  				warnf("+L %08x -> (bucket root=%d)", trunc(item.key, 4), bucket.root)
   595  			} else {
   596  				warnf("+L %08x -> %08x", trunc(item.key, 4), trunc(item.value, 4))
   597  			}
   598  		} else {
   599  			warnf("+B %08x -> pgid=%d", trunc(item.key, 4), item.pgid)
   600  		}
   601  	}
   602  	warn("")
   603  }
   604  */
   605  
   606  type nodes []*node
   607  
   608  func (s nodes) Len() int      { return len(s) }
   609  func (s nodes) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
   610  func (s nodes) Less(i, j int) bool {
   611  	return bytes.Compare(s[i].inodes[0].key, s[j].inodes[0].key) == -1
   612  }
   613  
// inode represents an internal node inside of a node.
// It can be used to point to elements in a page or point
// to an element which hasn't been added to a page yet.
type inode struct {
	flags uint32 // element flags; read from and written to leaf page elements
	pgid  pgid   // page id of the referenced child; read from and written to branch page elements
	key   []byte // entry key; asserted to be non-empty on read, put and write
	value []byte // entry value; read from leaf page elements only
}

// inodes is the list of inode entries held by a node.
type inodes []inode