github.com/zuoyebang/bitalosdb@v1.1.1-0.20240516111551-79a8c4d8ce20/bitree/bdb/bucket.go (about)

     1  // Copyright 2021 The Bitalosdb author(hustxrb@163.com) and other contributors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package bdb
    16  
    17  import (
    18  	"bytes"
    19  	"fmt"
    20  	"unsafe"
    21  
    22  	"github.com/cockroachdb/errors"
    23  )
    24  
// Size limits applied by Bucket.Put.
const (
	// MaxKeySize is the longest key, in bytes, that Put stores; longer keys
	// are truncated to this length.
	MaxKeySize   = 10 << 10
	// MaxValueSize is the longest value, in bytes, that Put accepts; larger
	// values are rejected with ErrValueTooLarge.
	MaxValueSize = (1 << 31) - 2
)
    29  
// bucketHeaderSize is the size, in bytes, of the bucket header struct. In a
// serialized bucket value the inline page (if any) starts at this offset.
const bucketHeaderSize = int(unsafe.Sizeof(bucket{}))
    31  
// Lower and upper bounds for a fill percentage.
// NOTE(review): neither constant is referenced in this part of the file;
// presumably they clamp Bucket.FillPercent where nodes are split — confirm at
// the use site.
const (
	minFillPercent = 0.1
	maxFillPercent = 1.0
)
    36  
// DefaultFillPercent is the FillPercent given to every Bucket built by
// newBucket and to the empty bucket serialized by CreateBucket.
const DefaultFillPercent = 1.0
    38  
// Bucket is a handle on a collection of key/value pairs (and nested buckets)
// that is bound to a single transaction.
type Bucket struct {
	// Embedded header: root page id and sequence counter.
	*bucket
	// tx is the transaction this handle belongs to.
	tx          *Tx
	// buckets caches child buckets already opened through Bucket. It is only
	// allocated for writable transactions (see newBucket).
	buckets     map[string]*Bucket
	// page references the inline page when the bucket is stored inline
	// (root == 0); CreateBucket clears it on the parent.
	page        *page
	// rootNode is the in-memory root node, set once a node without a parent
	// has been loaded through node.
	rootNode    *node
	// nodes caches in-memory nodes by page id. It is only allocated for
	// writable transactions (see newBucket).
	nodes       map[pgid]*node
	// FillPercent is initialised to DefaultFillPercent.
	// NOTE(review): presumably the node split threshold — it is not read in
	// this part of the file; confirm at the use site.
	FillPercent float64
}
    48  
// bucket is the fixed-size header found at the start of a serialized bucket
// value. The struct is read from and written to raw bytes through unsafe
// pointer casts, so its layout is part of the stored format.
type bucket struct {
	// root is the page id of the bucket's root page; zero marks an inline
	// bucket whose page follows the header inside the same value.
	root     pgid
	// sequence is the counter exposed by Sequence, SetSequence and
	// NextSequence.
	sequence uint64
}
    53  
    54  func newBucket(tx *Tx) Bucket {
    55  	var b = Bucket{tx: tx, FillPercent: DefaultFillPercent}
    56  	if tx.writable {
    57  		b.buckets = make(map[string]*Bucket)
    58  		b.nodes = make(map[pgid]*node, 1<<4)
    59  	}
    60  	return b
    61  }
    62  
// Tx returns the transaction this bucket handle is bound to.
func (b *Bucket) Tx() *Tx {
	return b.tx
}
    66  
    67  func (b *Bucket) Root() pgid {
    68  	return b.root
    69  }
    70  
// Writable reports whether the owning transaction is writable.
func (b *Bucket) Writable() bool {
	return b.tx.writable
}
    74  func (b *Bucket) Cursor() *Cursor {
    75  	b.tx.stats.CursorCount++
    76  
    77  	return &Cursor{
    78  		bucket: b,
    79  		stack:  make([]elemRef, 0),
    80  	}
    81  }
    82  
    83  func (b *Bucket) Bucket(name []byte) *Bucket {
    84  	if b.buckets != nil {
    85  		if child := b.buckets[string(name)]; child != nil {
    86  			return child
    87  		}
    88  	}
    89  
    90  	c := b.Cursor()
    91  	k, v, flags := c.seek(name)
    92  
    93  	if !bytes.Equal(name, k) || (flags&bucketLeafFlag) == 0 {
    94  		return nil
    95  	}
    96  
    97  	var child = b.openBucket(v)
    98  	if b.buckets != nil {
    99  		b.buckets[string(name)] = child
   100  	}
   101  
   102  	return child
   103  }
   104  
// openBucket builds a Bucket handle from value, the raw bytes stored under a
// child bucket's key: a bucket header, followed by an inline page when the
// header's root is zero. value must not be empty; &value[0] panics otherwise.
func (b *Bucket) openBucket(value []byte) *Bucket {
	var child = newBucket(b.tx)

	// value is reinterpreted below as a bucket header and possibly a page, so
	// its address has to satisfy the stricter of the two alignments. A
	// misaligned value is copied and the copy is used from here on.
	// NOTE(review): this relies on the slice returned by cloneBytes being
	// suitably aligned — confirm.
	const unalignedMask = unsafe.Alignof(struct {
		bucket
		page
	}{}) - 1
	unaligned := uintptr(unsafe.Pointer(&value[0]))&unalignedMask != 0
	if unaligned {
		value = cloneBytes(value)
	}

	// Writable transactions get a private copy of the header so that later
	// changes to root/sequence do not write through to value. In the
	// unaligned case value is already a private clone, so it is pointed at
	// directly, exactly as read-only transactions do.
	if b.tx.writable && !unaligned {
		child.bucket = &bucket{}
		*child.bucket = *(*bucket)(unsafe.Pointer(&value[0]))
	} else {
		child.bucket = (*bucket)(unsafe.Pointer(&value[0]))
	}

	// A zero root marks an inline bucket: its page sits right after the
	// header inside value.
	if child.root == 0 {
		child.page = (*page)(unsafe.Pointer(&value[bucketHeaderSize]))
	}

	return &child
}
   130  
   131  func openBucketPage(value []byte) *page {
   132  	var pg *page
   133  	bkt := (*bucket)(unsafe.Pointer(&value[0]))
   134  	if bkt.root == 0 {
   135  		pg = (*page)(unsafe.Pointer(&value[bucketHeaderSize]))
   136  	}
   137  	return pg
   138  }
   139  
   140  func (b *Bucket) CreateBucket(key []byte) (*Bucket, error) {
   141  	if b.tx.db == nil {
   142  		return nil, ErrTxClosed
   143  	} else if !b.tx.writable {
   144  		return nil, ErrTxNotWritable
   145  	} else if len(key) == 0 {
   146  		return nil, ErrBucketNameRequired
   147  	}
   148  
   149  	c := b.Cursor()
   150  	k, _, flags := c.seek(key)
   151  
   152  	if bytes.Equal(key, k) {
   153  		if (flags & bucketLeafFlag) != 0 {
   154  			return nil, ErrBucketExists
   155  		}
   156  		return nil, ErrIncompatibleValue
   157  	}
   158  
   159  	var bucket = Bucket{
   160  		bucket:      &bucket{},
   161  		rootNode:    &node{isLeaf: true},
   162  		FillPercent: DefaultFillPercent,
   163  	}
   164  	var value = bucket.write()
   165  
   166  	key = cloneBytes(key)
   167  	c.node().put(key, key, value, 0, bucketLeafFlag)
   168  
   169  	b.page = nil
   170  
   171  	return b.Bucket(key), nil
   172  }
   173  
   174  func (b *Bucket) CreateBucketIfNotExists(key []byte) (*Bucket, error) {
   175  	child, err := b.CreateBucket(key)
   176  	if err == ErrBucketExists {
   177  		return b.Bucket(key), nil
   178  	} else if err != nil {
   179  		return nil, err
   180  	}
   181  	return child, nil
   182  }
   183  
   184  func (b *Bucket) DeleteBucket(key []byte) error {
   185  	if b.tx.db == nil {
   186  		return ErrTxClosed
   187  	} else if !b.Writable() {
   188  		return ErrTxNotWritable
   189  	}
   190  
   191  	c := b.Cursor()
   192  	k, _, flags := c.seek(key)
   193  
   194  	if !bytes.Equal(key, k) {
   195  		return ErrBucketNotFound
   196  	} else if (flags & bucketLeafFlag) == 0 {
   197  		return ErrIncompatibleValue
   198  	}
   199  
   200  	child := b.Bucket(key)
   201  	err := child.ForEach(func(k, v []byte) error {
   202  		if _, _, childFlags := child.Cursor().seek(k); (childFlags & bucketLeafFlag) != 0 {
   203  			if err := child.DeleteBucket(k); err != nil {
   204  				return errors.Wrap(err, "delete bucket err")
   205  			}
   206  		}
   207  		return nil
   208  	})
   209  	if err != nil {
   210  		return err
   211  	}
   212  
   213  	delete(b.buckets, string(key))
   214  
   215  	child.nodes = nil
   216  	child.rootNode = nil
   217  	child.free()
   218  
   219  	c.node().del(key)
   220  
   221  	return nil
   222  }
   223  
   224  func (b *Bucket) Get(key []byte) []byte {
   225  	k, v, flags := b.Cursor().seek(key)
   226  
   227  	if (flags & bucketLeafFlag) != 0 {
   228  		return nil
   229  	}
   230  
   231  	if !bytes.Equal(key, k) {
   232  		return nil
   233  	}
   234  	return v
   235  }
   236  
   237  func (b *Bucket) Seek(key []byte) ([]byte, []byte) {
   238  	return b.Cursor().Seek(key)
   239  }
   240  
   241  func (b *Bucket) Put(key []byte, value []byte) error {
   242  	if b.tx.db == nil {
   243  		return ErrTxClosed
   244  	} else if !b.Writable() {
   245  		return ErrTxNotWritable
   246  	} else if len(key) == 0 {
   247  		return ErrKeyRequired
   248  	} else if int64(len(value)) > MaxValueSize {
   249  		return ErrValueTooLarge
   250  	}
   251  
   252  	if len(key) > MaxKeySize {
   253  		key = key[:MaxKeySize]
   254  	}
   255  
   256  	c := b.Cursor()
   257  	k, _, flags := c.seek(key)
   258  
   259  	if bytes.Equal(key, k) && (flags&bucketLeafFlag) != 0 {
   260  		return ErrIncompatibleValue
   261  	}
   262  
   263  	key = cloneBytes(key)
   264  	c.node().put(key, key, value, 0, 0)
   265  
   266  	return nil
   267  }
   268  
   269  func (b *Bucket) Delete(key []byte) error {
   270  	if b.tx.db == nil {
   271  		return ErrTxClosed
   272  	} else if !b.Writable() {
   273  		return ErrTxNotWritable
   274  	}
   275  
   276  	c := b.Cursor()
   277  	k, _, flags := c.seek(key)
   278  
   279  	if !bytes.Equal(key, k) {
   280  		return nil
   281  	}
   282  
   283  	if (flags & bucketLeafFlag) != 0 {
   284  		return ErrIncompatibleValue
   285  	}
   286  
   287  	c.node().del(key)
   288  
   289  	return nil
   290  }
   291  
   292  func (b *Bucket) Sequence() uint64 { return b.bucket.sequence }
   293  
   294  func (b *Bucket) SetSequence(v uint64) error {
   295  	if b.tx.db == nil {
   296  		return ErrTxClosed
   297  	} else if !b.Writable() {
   298  		return ErrTxNotWritable
   299  	}
   300  
   301  	if b.rootNode == nil {
   302  		_ = b.node(b.root, nil)
   303  	}
   304  
   305  	b.bucket.sequence = v
   306  	return nil
   307  }
   308  
   309  func (b *Bucket) NextSequence() (uint64, error) {
   310  	if b.tx.db == nil {
   311  		return 0, ErrTxClosed
   312  	} else if !b.Writable() {
   313  		return 0, ErrTxNotWritable
   314  	}
   315  
   316  	if b.rootNode == nil {
   317  		_ = b.node(b.root, nil)
   318  	}
   319  
   320  	b.bucket.sequence++
   321  	return b.bucket.sequence, nil
   322  }
   323  
   324  func (b *Bucket) ForEach(fn func(k, v []byte) error) error {
   325  	if b.tx.db == nil {
   326  		return ErrTxClosed
   327  	}
   328  	c := b.Cursor()
   329  	for k, v := c.First(); k != nil; k, v = c.Next() {
   330  		if err := fn(k, v); err != nil {
   331  			return err
   332  		}
   333  	}
   334  	return nil
   335  }
   336  
// Stats walks every page of the bucket and returns page, key and space usage
// counters for it, including the statistics of nested buckets found on its
// leaf pages.
func (b *Bucket) Stats() BucketStats {
	// s collects this bucket's own numbers, subStats those of nested buckets.
	var s, subStats BucketStats
	pageSize := b.tx.db.pageSize
	s.BucketN += 1
	if b.root == 0 {
		// A zero root marks an inline bucket.
		s.InlineBucketN += 1
	}
	b.forEachPage(func(p *page, depth int) {
		if (p.flags & leafPageFlag) != 0 {
			s.KeyN += int(p.count)

			// Bytes in use on this page: page header, the element headers
			// before the last one, then the last element's pos plus its key
			// and value sizes.
			// NOTE(review): presumably pos is the offset from the last
			// element header to its key/value data, which would make this the
			// total of all headers and payloads — confirm against page.go.
			used := pageHeaderSize

			if p.count != 0 {
				used += leafPageElementSize * uintptr(p.count-1)

				lastElement := p.leafPageElement(p.count - 1)
				used += uintptr(lastElement.pos + lastElement.ksize + lastElement.vsize)
			}

			if b.root == 0 {
				// Inline buckets occupy no page of their own; only their
				// in-use bytes are recorded.
				s.InlineBucketInuse += int(used)
			} else {
				s.LeafPageN++
				s.LeafInuse += int(used)
				s.LeafOverflowN += int(p.overflow)

				// Recurse into every nested bucket stored on this leaf.
				for i := uint16(0); i < p.count; i++ {
					e := p.leafPageElement(i)
					if (e.flags & bucketLeafFlag) != 0 {
						subStats.Add(b.openBucket(e.value()).Stats())
					}
				}
			}
		} else if (p.flags & branchPageFlag) != 0 {
			s.BranchPageN++
			lastElement := p.branchPageElement(p.count - 1)

			// Same accounting as for leaves, but branch elements carry keys
			// only.
			used := pageHeaderSize + (branchPageElementSize * uintptr(p.count-1))

			used += uintptr(lastElement.pos + lastElement.ksize)
			s.BranchInuse += int(used)
			s.BranchOverflowN += int(p.overflow)
		}

		// depth is zero-based; track the deepest level seen.
		if depth+1 > s.Depth {
			s.Depth = (depth + 1)
		}
	})

	// Allocated bytes count whole pages, overflow pages included.
	s.BranchAlloc = (s.BranchPageN + s.BranchOverflowN) * pageSize
	s.LeafAlloc = (s.LeafPageN + s.LeafOverflowN) * pageSize

	// The nested buckets' depth is stacked on top of this bucket's own depth
	// before their remaining counters are merged in.
	s.Depth += subStats.Depth
	s.Add(subStats)
	return s
}
   394  
   395  func (b *Bucket) forEachPage(fn func(*page, int)) {
   396  	if b.page != nil {
   397  		fn(b.page, 0)
   398  		return
   399  	}
   400  
   401  	b.tx.forEachPage(b.root, 0, fn)
   402  }
   403  
   404  func (b *Bucket) forEachPageNode(fn func(*page, *node, int)) {
   405  	if b.page != nil {
   406  		fn(b.page, nil, 0)
   407  		return
   408  	}
   409  	b._forEachPageNode(b.root, 0, fn)
   410  }
   411  
   412  func (b *Bucket) _forEachPageNode(pgid pgid, depth int, fn func(*page, *node, int)) {
   413  	var p, n = b.pageNode(pgid)
   414  
   415  	fn(p, n, depth)
   416  
   417  	if p != nil {
   418  		if (p.flags & branchPageFlag) != 0 {
   419  			for i := 0; i < int(p.count); i++ {
   420  				elem := p.branchPageElement(uint16(i))
   421  				b._forEachPageNode(elem.pgid, depth+1, fn)
   422  			}
   423  		}
   424  	} else {
   425  		if !n.isLeaf {
   426  			for _, inode := range n.inodes {
   427  				b._forEachPageNode(inode.pgid, depth+1, fn)
   428  			}
   429  		}
   430  	}
   431  }
   432  
// spill writes the bucket's in-memory state back: every cached child bucket
// is serialized and its header re-inserted into this bucket, then this
// bucket's own root node is spilled and the header's root is updated to the
// new root page id. It panics when a child's header cannot be found under its
// name or when the new root page id is not below the transaction's high
// water mark.
func (b *Bucket) spill() error {
	for name, child := range b.buckets {
		// Small enough children are stored inline: their pages are released
		// and the whole bucket is serialized into the value. Otherwise the
		// child is spilled to its own pages and only its header is stored.
		var value []byte
		if child.inlineable() {
			child.free()
			value = child.write()
		} else {
			if err := child.spill(); err != nil {
				return err
			}
			value = make([]byte, unsafe.Sizeof(bucket{}))
			var bucket = (*bucket)(unsafe.Pointer(&value[0]))
			*bucket = *child.bucket
		}

		// A child without a root node has no header update to write back.
		if child.rootNode == nil {
			continue
		}

		// Replace the child's entry in this bucket; the entry must already
		// exist and be flagged as a bucket.
		var c = b.Cursor()
		k, _, flags := c.seek([]byte(name))
		if !bytes.Equal([]byte(name), k) {
			panic(fmt.Sprintf("misplaced bucket header: %x -> %x", []byte(name), k))
		}
		if flags&bucketLeafFlag == 0 {
			panic(fmt.Sprintf("unexpected bucket header flag: %x", flags))
		}
		c.node().put([]byte(name), []byte(name), value, 0, bucketLeafFlag)
	}

	// Nothing was loaded for this bucket, so there is nothing to spill.
	if b.rootNode == nil {
		return nil
	}

	if err := b.rootNode.spill(); err != nil {
		return err
	}
	// Re-resolve the root after the spill.
	// NOTE(review): presumably because spilling can split the root and create
	// a new parent above it — confirm in node.spill.
	b.rootNode = b.rootNode.root()

	if b.rootNode.pgid >= b.tx.meta.pgid {
		panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", b.rootNode.pgid, b.tx.meta.pgid))
	}
	b.root = b.rootNode.pgid

	return nil
}
   479  
   480  func (b *Bucket) inlineable() bool {
   481  	var n = b.rootNode
   482  
   483  	if n == nil || !n.isLeaf {
   484  		return false
   485  	}
   486  
   487  	var size = pageHeaderSize
   488  	for _, inode := range n.inodes {
   489  		size += leafPageElementSize + uintptr(len(inode.key)) + uintptr(len(inode.value))
   490  
   491  		if inode.flags&bucketLeafFlag != 0 {
   492  			return false
   493  		} else if size > b.maxInlineBucketSize() {
   494  			return false
   495  		}
   496  	}
   497  
   498  	return true
   499  }
   500  
   501  func (b *Bucket) maxInlineBucketSize() uintptr {
   502  	return uintptr(b.tx.db.pageSize / 4)
   503  }
   504  
   505  func (b *Bucket) write() []byte {
   506  	var n = b.rootNode
   507  	var value = make([]byte, bucketHeaderSize+n.size())
   508  
   509  	var bucket = (*bucket)(unsafe.Pointer(&value[0]))
   510  	*bucket = *b.bucket
   511  
   512  	var p = (*page)(unsafe.Pointer(&value[bucketHeaderSize]))
   513  	n.write(p)
   514  
   515  	return value
   516  }
   517  
   518  func (b *Bucket) rebalance() {
   519  	for _, n := range b.nodes {
   520  		n.rebalance()
   521  	}
   522  	for _, child := range b.buckets {
   523  		child.rebalance()
   524  	}
   525  }
   526  
   527  func (b *Bucket) node(pgid pgid, parent *node) *node {
   528  	_assert(b.nodes != nil, "nodes map expected")
   529  
   530  	if n := b.nodes[pgid]; n != nil {
   531  		return n
   532  	}
   533  
   534  	n := &node{bucket: b, parent: parent}
   535  	if parent == nil {
   536  		b.rootNode = n
   537  	} else {
   538  		parent.children = append(parent.children, n)
   539  	}
   540  
   541  	var p = b.page
   542  	if p == nil {
   543  		p = b.tx.page(pgid)
   544  	}
   545  
   546  	n.read(p)
   547  	b.nodes[pgid] = n
   548  
   549  	b.tx.stats.NodeCount++
   550  
   551  	return n
   552  }
   553  
   554  func (b *Bucket) free() {
   555  	if b.root == 0 {
   556  		return
   557  	}
   558  
   559  	var tx = b.tx
   560  	b.forEachPageNode(func(p *page, n *node, _ int) {
   561  		if p != nil {
   562  			tx.db.freelist.free(tx.meta.txid, p)
   563  		} else {
   564  			n.free()
   565  		}
   566  	})
   567  	b.root = 0
   568  }
   569  
   570  func (b *Bucket) dereference() {
   571  	if b.rootNode != nil {
   572  		b.rootNode.root().dereference()
   573  	}
   574  
   575  	for _, child := range b.buckets {
   576  		child.dereference()
   577  	}
   578  }
   579  
   580  func (b *Bucket) pageNode(id pgid) (*page, *node) {
   581  	if b.root == 0 {
   582  		if id != 0 {
   583  			panic(fmt.Sprintf("inline bucket non-zero page access(2): %d != 0", id))
   584  		}
   585  		if b.rootNode != nil {
   586  			return nil, b.rootNode
   587  		}
   588  		return b.page, nil
   589  	}
   590  
   591  	if b.nodes != nil {
   592  		if n := b.nodes[id]; n != nil {
   593  			return nil, n
   594  		}
   595  	}
   596  
   597  	return b.tx.page(id), nil
   598  }
   599  
   600  type BucketStats struct {
   601  	BranchPageN       int
   602  	BranchOverflowN   int
   603  	LeafPageN         int
   604  	LeafOverflowN     int
   605  	KeyN              int
   606  	Depth             int
   607  	BranchAlloc       int
   608  	BranchInuse       int
   609  	LeafAlloc         int
   610  	LeafInuse         int
   611  	BucketN           int
   612  	InlineBucketN     int
   613  	InlineBucketInuse int
   614  }
   615  
   616  func (s *BucketStats) Add(other BucketStats) {
   617  	s.BranchPageN += other.BranchPageN
   618  	s.BranchOverflowN += other.BranchOverflowN
   619  	s.LeafPageN += other.LeafPageN
   620  	s.LeafOverflowN += other.LeafOverflowN
   621  	s.KeyN += other.KeyN
   622  	if s.Depth < other.Depth {
   623  		s.Depth = other.Depth
   624  	}
   625  	s.BranchAlloc += other.BranchAlloc
   626  	s.BranchInuse += other.BranchInuse
   627  	s.LeafAlloc += other.LeafAlloc
   628  	s.LeafInuse += other.LeafInuse
   629  
   630  	s.BucketN += other.BucketN
   631  	s.InlineBucketN += other.InlineBucketN
   632  	s.InlineBucketInuse += other.InlineBucketInuse
   633  }
   634  
   635  func cloneBytes(v []byte) []byte {
   636  	var clone = make([]byte, len(v))
   637  	copy(clone, v)
   638  	return clone
   639  }