github.com/zuoyebang/bitalosdb@v1.1.1-0.20240516111551-79a8c4d8ce20/bitree/bdb/node.go (about)

     1  // Copyright 2021 The Bitalosdb author(hustxrb@163.com) and other contributors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package bdb
    16  
    17  import (
    18  	"bytes"
    19  	"fmt"
    20  	"sort"
    21  	"unsafe"
    22  )
    23  
// node is the in-memory form of a page: read fills a node from a *page and
// write serializes it back into one. Nodes are linked to their parent and
// carry a sorted list of inodes (keys with either a value or a child page id).
type node struct {
	bucket     *Bucket // bucket this node belongs to; used to reach the tx and to look up other nodes
	isLeaf     bool    // true when the node uses leaf page elements (key/value), false for branch elements (key/pgid)
	unbalanced bool    // set by del; tells rebalance the node may need merging
	spilled    bool    // set once spill has written the node; makes further spill calls a no-op
	key        []byte  // first inode key as of the last read/spill; used to find this node in its parent
	pgid       pgid    // page id backing the node; 0 means no page is currently assigned
	parent     *node   // nil for the root
	children   nodes   // child nodes tracked so spill can visit them; cleared by spill
	inodes     inodes  // entries kept ordered by key (put/del/childIndex binary-search them)
}
    35  
    36  func (n *node) root() *node {
    37  	if n.parent == nil {
    38  		return n
    39  	}
    40  	return n.parent.root()
    41  }
    42  
    43  func (n *node) minKeys() int {
    44  	if n.isLeaf {
    45  		return 1
    46  	}
    47  	return 2
    48  }
    49  
    50  func (n *node) size() int {
    51  	sz, elsz := pageHeaderSize, n.pageElementSize()
    52  	for i := 0; i < len(n.inodes); i++ {
    53  		item := &n.inodes[i]
    54  		sz += elsz + uintptr(len(item.key)) + uintptr(len(item.value))
    55  	}
    56  	return int(sz)
    57  }
    58  
    59  func (n *node) sizeLessThan(v uintptr) bool {
    60  	sz, elsz := pageHeaderSize, n.pageElementSize()
    61  	for i := 0; i < len(n.inodes); i++ {
    62  		item := &n.inodes[i]
    63  		sz += elsz + uintptr(len(item.key)) + uintptr(len(item.value))
    64  		if sz >= v {
    65  			return false
    66  		}
    67  	}
    68  	return true
    69  }
    70  
    71  func (n *node) pageElementSize() uintptr {
    72  	if n.isLeaf {
    73  		return leafPageElementSize
    74  	}
    75  	return branchPageElementSize
    76  }
    77  
    78  func (n *node) childAt(index int) *node {
    79  	if n.isLeaf {
    80  		panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index))
    81  	}
    82  	return n.bucket.node(n.inodes[index].pgid, n)
    83  }
    84  
    85  func (n *node) childIndex(child *node) int {
    86  	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, child.key) != -1 })
    87  	return index
    88  }
    89  
    90  func (n *node) numChildren() int {
    91  	return len(n.inodes)
    92  }
    93  
    94  func (n *node) nextSibling() *node {
    95  	if n.parent == nil {
    96  		return nil
    97  	}
    98  	index := n.parent.childIndex(n)
    99  	if index >= n.parent.numChildren()-1 {
   100  		return nil
   101  	}
   102  	return n.parent.childAt(index + 1)
   103  }
   104  
   105  func (n *node) prevSibling() *node {
   106  	if n.parent == nil {
   107  		return nil
   108  	}
   109  	index := n.parent.childIndex(n)
   110  	if index == 0 {
   111  		return nil
   112  	}
   113  	return n.parent.childAt(index - 1)
   114  }
   115  
   116  func (n *node) put(oldKey, newKey, value []byte, pgid pgid, flags uint32) {
   117  	if pgid >= n.bucket.tx.meta.pgid {
   118  		panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", pgid, n.bucket.tx.meta.pgid))
   119  	} else if len(oldKey) <= 0 {
   120  		panic("put: zero-length old key")
   121  	} else if len(newKey) <= 0 {
   122  		panic("put: zero-length new key")
   123  	}
   124  
   125  	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, oldKey) != -1 })
   126  
   127  	exact := (len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].key, oldKey))
   128  	if !exact {
   129  		n.inodes = append(n.inodes, inode{})
   130  		copy(n.inodes[index+1:], n.inodes[index:])
   131  	}
   132  
   133  	inode := &n.inodes[index]
   134  	inode.flags = flags
   135  	inode.key = newKey
   136  	inode.value = value
   137  	inode.pgid = pgid
   138  	_assert(len(inode.key) > 0, "put: zero-length inode key")
   139  }
   140  
   141  func (n *node) del(key []byte) {
   142  	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, key) != -1 })
   143  
   144  	if index >= len(n.inodes) || !bytes.Equal(n.inodes[index].key, key) {
   145  		return
   146  	}
   147  
   148  	n.inodes = append(n.inodes[:index], n.inodes[index+1:]...)
   149  
   150  	n.unbalanced = true
   151  }
   152  
   153  func (n *node) read(p *page) {
   154  	n.pgid = p.id
   155  	n.isLeaf = ((p.flags & leafPageFlag) != 0)
   156  	n.inodes = make(inodes, int(p.count))
   157  
   158  	for i := 0; i < int(p.count); i++ {
   159  		inode := &n.inodes[i]
   160  		if n.isLeaf {
   161  			elem := p.leafPageElement(uint16(i))
   162  			inode.flags = elem.flags
   163  			inode.key = elem.key()
   164  			inode.value = elem.value()
   165  		} else {
   166  			elem := p.branchPageElement(uint16(i))
   167  			inode.pgid = elem.pgid
   168  			inode.key = elem.key()
   169  		}
   170  		_assert(len(inode.key) > 0, "read: zero-length inode key")
   171  	}
   172  
   173  	if len(n.inodes) > 0 {
   174  		n.key = n.inodes[0].key
   175  		_assert(len(n.key) > 0, "read: zero-length node key")
   176  	} else {
   177  		n.key = nil
   178  	}
   179  }
   180  
   181  func (n *node) write(p *page) {
   182  	if n.isLeaf {
   183  		p.flags |= leafPageFlag
   184  	} else {
   185  		p.flags |= branchPageFlag
   186  	}
   187  
   188  	if len(n.inodes) >= 0xFFFF {
   189  		panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.id))
   190  	}
   191  	p.count = uint16(len(n.inodes))
   192  
   193  	if p.count == 0 {
   194  		return
   195  	}
   196  
   197  	off := unsafe.Sizeof(*p) + n.pageElementSize()*uintptr(len(n.inodes))
   198  	for i, item := range n.inodes {
   199  		_assert(len(item.key) > 0, "write: zero-length inode key")
   200  
   201  		sz := len(item.key) + len(item.value)
   202  		b := unsafeByteSlice(unsafe.Pointer(p), off, 0, sz)
   203  		off += uintptr(sz)
   204  
   205  		if n.isLeaf {
   206  			elem := p.leafPageElement(uint16(i))
   207  			elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
   208  			elem.flags = item.flags
   209  			elem.ksize = uint32(len(item.key))
   210  			elem.vsize = uint32(len(item.value))
   211  		} else {
   212  			elem := p.branchPageElement(uint16(i))
   213  			elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
   214  			elem.ksize = uint32(len(item.key))
   215  			elem.pgid = item.pgid
   216  			_assert(elem.pgid != p.id, "write: circular dependency occurred")
   217  		}
   218  
   219  		l := copy(b, item.key)
   220  		copy(b[l:], item.value)
   221  	}
   222  }
   223  
   224  func (n *node) split(pageSize uintptr) []*node {
   225  	var nodes []*node
   226  
   227  	node := n
   228  	for {
   229  		a, b := node.splitTwo(pageSize)
   230  		nodes = append(nodes, a)
   231  
   232  		if b == nil {
   233  			break
   234  		}
   235  
   236  		node = b
   237  	}
   238  
   239  	return nodes
   240  }
   241  
   242  func (n *node) splitTwo(pageSize uintptr) (*node, *node) {
   243  	if len(n.inodes) <= (minKeysPerPage*2) || n.sizeLessThan(pageSize) {
   244  		return n, nil
   245  	}
   246  
   247  	var fillPercent = n.bucket.FillPercent
   248  	if fillPercent < minFillPercent {
   249  		fillPercent = minFillPercent
   250  	} else if fillPercent > maxFillPercent {
   251  		fillPercent = maxFillPercent
   252  	}
   253  	threshold := int(float64(pageSize) * fillPercent)
   254  
   255  	splitIndex, _ := n.splitIndex(threshold)
   256  
   257  	if n.parent == nil {
   258  		n.parent = &node{bucket: n.bucket, children: []*node{n}}
   259  	}
   260  
   261  	next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
   262  	n.parent.children = append(n.parent.children, next)
   263  
   264  	next.inodes = n.inodes[splitIndex:]
   265  	n.inodes = n.inodes[:splitIndex]
   266  
   267  	n.bucket.tx.stats.Split++
   268  
   269  	return n, next
   270  }
   271  
   272  func (n *node) splitIndex(threshold int) (index, sz uintptr) {
   273  	sz = pageHeaderSize
   274  
   275  	for i := 0; i < len(n.inodes)-minKeysPerPage; i++ {
   276  		index = uintptr(i)
   277  		inode := n.inodes[i]
   278  		elsize := n.pageElementSize() + uintptr(len(inode.key)) + uintptr(len(inode.value))
   279  
   280  		if index >= minKeysPerPage && sz+elsize > uintptr(threshold) {
   281  			break
   282  		}
   283  
   284  		sz += elsize
   285  	}
   286  
   287  	return
   288  }
   289  
   290  func (n *node) spill() error {
   291  	var tx = n.bucket.tx
   292  	if n.spilled {
   293  		return nil
   294  	}
   295  
   296  	sort.Sort(n.children)
   297  	for i := 0; i < len(n.children); i++ {
   298  		if err := n.children[i].spill(); err != nil {
   299  			return err
   300  		}
   301  	}
   302  
   303  	n.children = nil
   304  
   305  	var nodes = n.split(uintptr(tx.db.pageSize))
   306  	for _, node := range nodes {
   307  		if node.pgid > 0 {
   308  			tx.db.freelist.free(tx.meta.txid, tx.page(node.pgid))
   309  			node.pgid = 0
   310  		}
   311  
   312  		p, _, err := tx.allocate((node.size() + tx.db.pageSize - 1) / tx.db.pageSize)
   313  		if err != nil {
   314  			return err
   315  		}
   316  
   317  		if p.id >= tx.meta.pgid {
   318  			panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid))
   319  		}
   320  		node.pgid = p.id
   321  		node.write(p)
   322  		node.spilled = true
   323  
   324  		if node.parent != nil {
   325  			var key = node.key
   326  			if key == nil {
   327  				key = node.inodes[0].key
   328  			}
   329  
   330  			node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0)
   331  			node.key = node.inodes[0].key
   332  			_assert(len(node.key) > 0, "spill: zero-length node key")
   333  		}
   334  
   335  		tx.stats.Spill++
   336  	}
   337  
   338  	if n.parent != nil && n.parent.pgid == 0 {
   339  		n.children = nil
   340  		return n.parent.spill()
   341  	}
   342  
   343  	return nil
   344  }
   345  
   346  func (n *node) rebalance() {
   347  	if !n.unbalanced {
   348  		return
   349  	}
   350  	n.unbalanced = false
   351  
   352  	n.bucket.tx.stats.Rebalance++
   353  
   354  	var threshold = n.bucket.tx.db.pageSize / 4
   355  	if n.size() > threshold && len(n.inodes) > n.minKeys() {
   356  		return
   357  	}
   358  
   359  	if n.parent == nil {
   360  		if !n.isLeaf && len(n.inodes) == 1 {
   361  			child := n.bucket.node(n.inodes[0].pgid, n)
   362  			n.isLeaf = child.isLeaf
   363  			n.inodes = child.inodes[:]
   364  			n.children = child.children
   365  
   366  			for _, inode := range n.inodes {
   367  				if child, ok := n.bucket.nodes[inode.pgid]; ok {
   368  					child.parent = n
   369  				}
   370  			}
   371  
   372  			child.parent = nil
   373  			delete(n.bucket.nodes, child.pgid)
   374  			child.free()
   375  		}
   376  
   377  		return
   378  	}
   379  
   380  	if n.numChildren() == 0 {
   381  		n.parent.del(n.key)
   382  		n.parent.removeChild(n)
   383  		delete(n.bucket.nodes, n.pgid)
   384  		n.free()
   385  		n.parent.rebalance()
   386  		return
   387  	}
   388  
   389  	_assert(n.parent.numChildren() > 1, "parent must have at least 2 children")
   390  
   391  	var target *node
   392  	var useNextSibling = (n.parent.childIndex(n) == 0)
   393  	if useNextSibling {
   394  		target = n.nextSibling()
   395  	} else {
   396  		target = n.prevSibling()
   397  	}
   398  
   399  	if useNextSibling {
   400  		for _, inode := range target.inodes {
   401  			if child, ok := n.bucket.nodes[inode.pgid]; ok {
   402  				child.parent.removeChild(child)
   403  				child.parent = n
   404  				child.parent.children = append(child.parent.children, child)
   405  			}
   406  		}
   407  
   408  		n.inodes = append(n.inodes, target.inodes...)
   409  		n.parent.del(target.key)
   410  		n.parent.removeChild(target)
   411  		delete(n.bucket.nodes, target.pgid)
   412  		target.free()
   413  	} else {
   414  		for _, inode := range n.inodes {
   415  			if child, ok := n.bucket.nodes[inode.pgid]; ok {
   416  				child.parent.removeChild(child)
   417  				child.parent = target
   418  				child.parent.children = append(child.parent.children, child)
   419  			}
   420  		}
   421  
   422  		target.inodes = append(target.inodes, n.inodes...)
   423  		n.parent.del(n.key)
   424  		n.parent.removeChild(n)
   425  		delete(n.bucket.nodes, n.pgid)
   426  		n.free()
   427  	}
   428  
   429  	n.parent.rebalance()
   430  }
   431  
   432  func (n *node) removeChild(target *node) {
   433  	for i, child := range n.children {
   434  		if child == target {
   435  			n.children = append(n.children[:i], n.children[i+1:]...)
   436  			return
   437  		}
   438  	}
   439  }
   440  
   441  func (n *node) dereference() {
   442  	if n.key != nil {
   443  		key := make([]byte, len(n.key))
   444  		copy(key, n.key)
   445  		n.key = key
   446  		_assert(n.pgid == 0 || len(n.key) > 0, "dereference: zero-length node key on existing node")
   447  	}
   448  
   449  	for i := range n.inodes {
   450  		inode := &n.inodes[i]
   451  
   452  		key := make([]byte, len(inode.key))
   453  		copy(key, inode.key)
   454  		inode.key = key
   455  		_assert(len(inode.key) > 0, "dereference: zero-length inode key")
   456  
   457  		value := make([]byte, len(inode.value))
   458  		copy(value, inode.value)
   459  		inode.value = value
   460  	}
   461  
   462  	for _, child := range n.children {
   463  		child.dereference()
   464  	}
   465  
   466  	n.bucket.tx.stats.NodeDeref++
   467  }
   468  
   469  func (n *node) free() {
   470  	if n.pgid != 0 {
   471  		n.bucket.tx.db.freelist.free(n.bucket.tx.meta.txid, n.bucket.tx.page(n.pgid))
   472  		n.pgid = 0
   473  	}
   474  }
   475  
   476  type nodes []*node
   477  
   478  func (s nodes) Len() int      { return len(s) }
   479  func (s nodes) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
   480  func (s nodes) Less(i, j int) bool {
   481  	return bytes.Compare(s[i].inodes[0].key, s[j].inodes[0].key) == -1
   482  }
   483  
// inode is one entry of a node. For leaf nodes it carries a key, a value and
// the element flags; for branch nodes it carries a key and the page id of the
// child it points to.
type inode struct {
	flags uint32 // copied from / written to leaf page elements
	pgid  pgid   // child page id; read from and written to branch page elements
	key   []byte // entry key; asserted non-empty when read, put and written
	value []byte // entry value; only read from and written to leaf page elements
}

// inodes is the ordered list of entries held by a node.
type inodes []inode