github.com/bhojpur/cache@v0.0.4/pkg/memory/bucket.go (about)

     1  package memory
     2  
     3  // Copyright (c) 2018 Bhojpur Consulting Private Limited, India. All rights reserved.
     4  
     5  // Permission is hereby granted, free of charge, to any person obtaining a copy
     6  // of this software and associated documentation files (the "Software"), to deal
     7  // in the Software without restriction, including without limitation the rights
     8  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     9  // copies of the Software, and to permit persons to whom the Software is
    10  // furnished to do so, subject to the following conditions:
    11  
    12  // The above copyright notice and this permission notice shall be included in
    13  // all copies or substantial portions of the Software.
    14  
    15  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    16  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    17  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    18  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    19  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    20  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    21  // THE SOFTWARE.
    22  
    23  import (
    24  	"bytes"
    25  	"fmt"
    26  	"unsafe"
    27  )
    28  
    29  const (
    30  	// MaxKeySize is the maximum length of a key, in bytes.
    31  	MaxKeySize = 32768
    32  
    33  	// MaxValueSize is the maximum length of a value, in bytes.
    34  	MaxValueSize = (1 << 31) - 2
    35  )
    36  
    37  const (
    38  	maxUint = ^uint(0)
    39  	minUint = 0
    40  	maxInt  = int(^uint(0) >> 1)
    41  	minInt  = -maxInt - 1
    42  )
    43  
// bucketHeaderSize is the size, in bytes, of the bucket header struct as
// reported by unsafe.Sizeof. When a bucket is stored inline, its page data
// begins at this offset within the stored value.
const bucketHeaderSize = int(unsafe.Sizeof(bucket{}))
    45  
// Lower and upper bounds for a fill percentage.
// NOTE(review): neither constant is referenced in this file; presumably they
// clamp Bucket.FillPercent where nodes are split — confirm against the node code.
const (
	minFillPercent = 0.1
	maxFillPercent = 1.0
)
    50  
// DefaultFillPercent is the percentage that split pages are filled.
// It is the value assigned to Bucket.FillPercent when a bucket is created;
// callers can override it per bucket by setting Bucket.FillPercent.
const DefaultFillPercent = 0.5
    54  
// Bucket represents a collection of key/value pairs inside the database.
//
// It embeds the persisted bucket header (root page id and sequence) and adds
// the per-transaction state used to read or modify the bucket. The buckets and
// nodes maps are only allocated by newBucket for writable transactions.
type Bucket struct {
	*bucket
	tx       *Tx                // the associated transaction
	buckets  map[string]*Bucket // subbucket cache
	page     *page              // inline page reference
	rootNode *node              // materialized node for the root page.
	nodes    map[pgid]*node     // node cache

	// Sets the threshold for filling nodes when they split. By default,
	// the bucket will fill to 50% but it can be useful to increase this
	// amount if you know that your write workloads are mostly append-only.
	//
	// This is non-persisted across transactions so it must be set in every Tx.
	FillPercent float64
}
    71  
// bucket represents the on-file representation of a bucket.
// This is stored as the "value" of a bucket key. If the bucket is small enough,
// then its root page can be stored inline in the "value", after the bucket
// header. In the case of inline buckets, the "root" will be 0.
//
// The struct is read from and written to raw byte slices through unsafe
// pointer casts, so its field order and types define the stored layout and
// must not be changed casually.
type bucket struct {
	root     pgid   // page id of the bucket's root-level page
	sequence uint64 // monotonically incrementing, used by NextSequence()
}
    80  
    81  // newBucket returns a new bucket associated with a transaction.
    82  func newBucket(tx *Tx) Bucket {
    83  	var b = Bucket{tx: tx, FillPercent: DefaultFillPercent}
    84  	if tx.writable {
    85  		b.buckets = make(map[string]*Bucket)
    86  		b.nodes = make(map[pgid]*node)
    87  	}
    88  	return b
    89  }
    90  
// Tx returns the transaction this bucket is associated with.
func (b *Bucket) Tx() *Tx {
	return b.tx
}
    95  
    96  // Root returns the root of the bucket.
    97  func (b *Bucket) Root() pgid {
    98  	return b.root
    99  }
   100  
// Writable returns whether the bucket is writable, i.e. whether the
// transaction it belongs to is a writable transaction.
func (b *Bucket) Writable() bool {
	return b.tx.writable
}
   105  
   106  // Cursor creates a cursor associated with the bucket.
   107  // The cursor is only valid as long as the transaction is open.
   108  // Do not use a cursor after the transaction is closed.
   109  func (b *Bucket) Cursor() *Cursor {
   110  	// Update transaction statistics.
   111  	b.tx.stats.CursorCount++
   112  
   113  	// Allocate and return a cursor.
   114  	return &Cursor{
   115  		bucket: b,
   116  		stack:  make([]elemRef, 0),
   117  	}
   118  }
   119  
   120  // Bucket retrieves a nested bucket by name.
   121  // Returns nil if the bucket does not exist.
   122  // The bucket instance is only valid for the lifetime of the transaction.
   123  func (b *Bucket) Bucket(name []byte) *Bucket {
   124  	if b.buckets != nil {
   125  		if child := b.buckets[string(name)]; child != nil {
   126  			return child
   127  		}
   128  	}
   129  
   130  	// Move cursor to key.
   131  	c := b.Cursor()
   132  	k, v, flags := c.seek(name)
   133  
   134  	// Return nil if the key doesn't exist or it is not a bucket.
   135  	if !bytes.Equal(name, k) || (flags&bucketLeafFlag) == 0 {
   136  		return nil
   137  	}
   138  
   139  	// Otherwise create a bucket and cache it.
   140  	var child = b.openBucket(v)
   141  	if b.buckets != nil {
   142  		b.buckets[string(name)] = child
   143  	}
   144  
   145  	return child
   146  }
   147  
// openBucket re-interprets a sub-bucket value from a parent into a Bucket.
//
// value is the raw bytes stored under a bucket key: a bucket header,
// optionally followed by an inline page. The returned Bucket is bound to the
// same transaction as b. value must be non-empty, since its first byte is
// addressed unconditionally.
func (b *Bucket) openBucket(value []byte) *Bucket {
	var child = newBucket(b.tx)

	// If unaligned load/stores are broken on this arch and value is
	// unaligned simply clone to an aligned byte array.
	unaligned := brokenUnaligned && uintptr(unsafe.Pointer(&value[0]))&3 != 0

	if unaligned {
		value = cloneBytes(value)
	}

	// If this is a writable transaction then we need to copy the bucket entry.
	// Read-only transactions can point directly at the mmap entry.
	// When value was cloned above it is already a private copy, so the header
	// is referenced in place instead of being copied a second time.
	if b.tx.writable && !unaligned {
		child.bucket = &bucket{}
		*child.bucket = *(*bucket)(unsafe.Pointer(&value[0]))
	} else {
		child.bucket = (*bucket)(unsafe.Pointer(&value[0]))
	}

	// Save a reference to the inline page if the bucket is inline.
	// The inline page starts immediately after the bucket header.
	if child.root == 0 {
		child.page = (*page)(unsafe.Pointer(&value[bucketHeaderSize]))
	}

	return &child
}
   177  
   178  // CreateBucket creates a new bucket at the given key and returns the new bucket.
   179  // Returns an error if the key already exists, if the bucket name is blank, or if the bucket name is too long.
   180  // The bucket instance is only valid for the lifetime of the transaction.
   181  func (b *Bucket) CreateBucket(key []byte) (*Bucket, error) {
   182  	if b.tx.db == nil {
   183  		return nil, ErrTxClosed
   184  	} else if !b.tx.writable {
   185  		return nil, ErrTxNotWritable
   186  	} else if len(key) == 0 {
   187  		return nil, ErrBucketNameRequired
   188  	}
   189  
   190  	// Move cursor to correct position.
   191  	c := b.Cursor()
   192  	k, _, flags := c.seek(key)
   193  
   194  	// Return an error if there is an existing key.
   195  	if bytes.Equal(key, k) {
   196  		if (flags & bucketLeafFlag) != 0 {
   197  			return nil, ErrBucketExists
   198  		}
   199  		return nil, ErrIncompatibleValue
   200  	}
   201  
   202  	// Create empty, inline bucket.
   203  	var bucket = Bucket{
   204  		bucket:      &bucket{},
   205  		rootNode:    &node{isLeaf: true},
   206  		FillPercent: DefaultFillPercent,
   207  	}
   208  	var value = bucket.write()
   209  
   210  	// Insert into node.
   211  	key = cloneBytes(key)
   212  	c.node().put(key, key, value, 0, bucketLeafFlag)
   213  
   214  	// Since subbuckets are not allowed on inline buckets, we need to
   215  	// dereference the inline page, if it exists. This will cause the bucket
   216  	// to be treated as a regular, non-inline bucket for the rest of the tx.
   217  	b.page = nil
   218  
   219  	return b.Bucket(key), nil
   220  }
   221  
   222  // CreateBucketIfNotExists creates a new bucket if it doesn't already exist and returns a reference to it.
   223  // Returns an error if the bucket name is blank, or if the bucket name is too long.
   224  // The bucket instance is only valid for the lifetime of the transaction.
   225  func (b *Bucket) CreateBucketIfNotExists(key []byte) (*Bucket, error) {
   226  	child, err := b.CreateBucket(key)
   227  	if err == ErrBucketExists {
   228  		return b.Bucket(key), nil
   229  	} else if err != nil {
   230  		return nil, err
   231  	}
   232  	return child, nil
   233  }
   234  
   235  // DeleteBucket deletes a bucket at the given key.
   236  // Returns an error if the bucket does not exists, or if the key represents a non-bucket value.
   237  func (b *Bucket) DeleteBucket(key []byte) error {
   238  	if b.tx.db == nil {
   239  		return ErrTxClosed
   240  	} else if !b.Writable() {
   241  		return ErrTxNotWritable
   242  	}
   243  
   244  	// Move cursor to correct position.
   245  	c := b.Cursor()
   246  	k, _, flags := c.seek(key)
   247  
   248  	// Return an error if bucket doesn't exist or is not a bucket.
   249  	if !bytes.Equal(key, k) {
   250  		return ErrBucketNotFound
   251  	} else if (flags & bucketLeafFlag) == 0 {
   252  		return ErrIncompatibleValue
   253  	}
   254  
   255  	// Recursively delete all child buckets.
   256  	child := b.Bucket(key)
   257  	err := child.ForEach(func(k, v []byte) error {
   258  		if v == nil {
   259  			if err := child.DeleteBucket(k); err != nil {
   260  				return fmt.Errorf("delete bucket: %s", err)
   261  			}
   262  		}
   263  		return nil
   264  	})
   265  	if err != nil {
   266  		return err
   267  	}
   268  
   269  	// Remove cached copy.
   270  	delete(b.buckets, string(key))
   271  
   272  	// Release all bucket pages to freelist.
   273  	child.nodes = nil
   274  	child.rootNode = nil
   275  	child.free()
   276  
   277  	// Delete the node if we have a matching key.
   278  	c.node().del(key)
   279  
   280  	return nil
   281  }
   282  
   283  // Get retrieves the value for a key in the bucket.
   284  // Returns a nil value if the key does not exist or if the key is a nested bucket.
   285  // The returned value is only valid for the life of the transaction.
   286  func (b *Bucket) Get(key []byte) []byte {
   287  	k, v, flags := b.Cursor().seek(key)
   288  
   289  	// Return nil if this is a bucket.
   290  	if (flags & bucketLeafFlag) != 0 {
   291  		return nil
   292  	}
   293  
   294  	// If our target node isn't the same key as what's passed in then return nil.
   295  	if !bytes.Equal(key, k) {
   296  		return nil
   297  	}
   298  	return v
   299  }
   300  
   301  // Put sets the value for a key in the bucket.
   302  // If the key exist then its previous value will be overwritten.
   303  // Supplied value must remain valid for the life of the transaction.
   304  // Returns an error if the bucket was created from a read-only transaction, if the key is blank, if the key is too large, or if the value is too large.
   305  func (b *Bucket) Put(key []byte, value []byte) error {
   306  	if b.tx.db == nil {
   307  		return ErrTxClosed
   308  	} else if !b.Writable() {
   309  		return ErrTxNotWritable
   310  	} else if len(key) == 0 {
   311  		return ErrKeyRequired
   312  	} else if len(key) > MaxKeySize {
   313  		return ErrKeyTooLarge
   314  	} else if int64(len(value)) > MaxValueSize {
   315  		return ErrValueTooLarge
   316  	}
   317  
   318  	// Move cursor to correct position.
   319  	c := b.Cursor()
   320  	k, _, flags := c.seek(key)
   321  
   322  	// Return an error if there is an existing key with a bucket value.
   323  	if bytes.Equal(key, k) && (flags&bucketLeafFlag) != 0 {
   324  		return ErrIncompatibleValue
   325  	}
   326  
   327  	// Insert into node.
   328  	key = cloneBytes(key)
   329  	c.node().put(key, key, value, 0, 0)
   330  
   331  	return nil
   332  }
   333  
   334  // Delete removes a key from the bucket.
   335  // If the key does not exist then nothing is done and a nil error is returned.
   336  // Returns an error if the bucket was created from a read-only transaction.
   337  func (b *Bucket) Delete(key []byte) error {
   338  	if b.tx.db == nil {
   339  		return ErrTxClosed
   340  	} else if !b.Writable() {
   341  		return ErrTxNotWritable
   342  	}
   343  
   344  	// Move cursor to correct position.
   345  	c := b.Cursor()
   346  	_, _, flags := c.seek(key)
   347  
   348  	// Return an error if there is already existing bucket value.
   349  	if (flags & bucketLeafFlag) != 0 {
   350  		return ErrIncompatibleValue
   351  	}
   352  
   353  	// Delete the node if we have a matching key.
   354  	c.node().del(key)
   355  
   356  	return nil
   357  }
   358  
   359  // Sequence returns the current integer for the bucket without incrementing it.
   360  func (b *Bucket) Sequence() uint64 { return b.bucket.sequence }
   361  
   362  // SetSequence updates the sequence number for the bucket.
   363  func (b *Bucket) SetSequence(v uint64) error {
   364  	if b.tx.db == nil {
   365  		return ErrTxClosed
   366  	} else if !b.Writable() {
   367  		return ErrTxNotWritable
   368  	}
   369  
   370  	// Materialize the root node if it hasn't been already so that the
   371  	// bucket will be saved during commit.
   372  	if b.rootNode == nil {
   373  		_ = b.node(b.root, nil)
   374  	}
   375  
   376  	// Increment and return the sequence.
   377  	b.bucket.sequence = v
   378  	return nil
   379  }
   380  
   381  // NextSequence returns an autoincrementing integer for the bucket.
   382  func (b *Bucket) NextSequence() (uint64, error) {
   383  	if b.tx.db == nil {
   384  		return 0, ErrTxClosed
   385  	} else if !b.Writable() {
   386  		return 0, ErrTxNotWritable
   387  	}
   388  
   389  	// Materialize the root node if it hasn't been already so that the
   390  	// bucket will be saved during commit.
   391  	if b.rootNode == nil {
   392  		_ = b.node(b.root, nil)
   393  	}
   394  
   395  	// Increment and return the sequence.
   396  	b.bucket.sequence++
   397  	return b.bucket.sequence, nil
   398  }
   399  
   400  // ForEach executes a function for each key/value pair in a bucket.
   401  // If the provided function returns an error then the iteration is stopped and
   402  // the error is returned to the caller. The provided function must not modify
   403  // the bucket; this will result in undefined behavior.
   404  func (b *Bucket) ForEach(fn func(k, v []byte) error) error {
   405  	if b.tx.db == nil {
   406  		return ErrTxClosed
   407  	}
   408  	c := b.Cursor()
   409  	for k, v := c.First(); k != nil; k, v = c.Next() {
   410  		if err := fn(k, v); err != nil {
   411  			return err
   412  		}
   413  	}
   414  	return nil
   415  }
   416  
// Stats returns stats on a bucket.
//
// It walks every page of the bucket, accumulating page counts, key counts and
// byte usage. For non-inline buckets, nested buckets found on leaf pages are
// opened and their statistics are gathered recursively and added to the result.
func (b *Bucket) Stats() BucketStats {
	// s holds this bucket's own stats; subStats accumulates nested buckets.
	var s, subStats BucketStats
	pageSize := b.tx.db.pageSize
	s.BucketN += 1
	if b.root == 0 {
		s.InlineBucketN += 1
	}
	b.forEachPage(func(p *page, depth int) {
		if (p.flags & leafPageFlag) != 0 {
			s.KeyN += int(p.count)

			// used totals the used bytes for the page
			used := pageHeaderSize

			if p.count != 0 {
				// If page has any elements, add all element headers.
				used += leafPageElementSize * int(p.count-1)

				// Add all element key, value sizes.
				// The computation takes advantage of the fact that the position
				// of the last element's key/value equals to the total of the sizes
				// of all previous elements' keys and values.
				// It also includes the last element's header.
				lastElement := p.leafPageElement(p.count - 1)
				used += int(lastElement.pos + lastElement.ksize + lastElement.vsize)
			}

			if b.root == 0 {
				// For inlined bucket just update the inline stats
				s.InlineBucketInuse += used
			} else {
				// For non-inlined bucket update all the leaf stats
				s.LeafPageN++
				s.LeafInuse += used
				s.LeafOverflowN += int(p.overflow)

				// Collect stats from sub-buckets.
				// Do that by iterating over all element headers
				// looking for the ones with the bucketLeafFlag.
				for i := uint16(0); i < p.count; i++ {
					e := p.leafPageElement(i)
					if (e.flags & bucketLeafFlag) != 0 {
						// For any bucket element, open the element value
						// and recursively call Stats on the contained bucket.
						subStats.Add(b.openBucket(e.value()).Stats())
					}
				}
			}
		} else if (p.flags & branchPageFlag) != 0 {
			s.BranchPageN++
			// NOTE(review): p.count-1 assumes a branch page always has at
			// least one element — confirm this invariant holds.
			lastElement := p.branchPageElement(p.count - 1)

			// used totals the used bytes for the page
			// Add header and all element headers.
			used := pageHeaderSize + (branchPageElementSize * int(p.count-1))

			// Add size of all keys and values.
			// Again, use the fact that last element's position equals to
			// the total of key, value sizes of all previous elements.
			used += int(lastElement.pos + lastElement.ksize)
			s.BranchInuse += used
			s.BranchOverflowN += int(p.overflow)
		}

		// Keep track of maximum page depth.
		if depth+1 > s.Depth {
			s.Depth = (depth + 1)
		}
	})

	// Alloc stats can be computed from page counts and pageSize.
	s.BranchAlloc = (s.BranchPageN + s.BranchOverflowN) * pageSize
	s.LeafAlloc = (s.LeafPageN + s.LeafOverflowN) * pageSize

	// Add the max depth of sub-buckets to get total nested depth.
	s.Depth += subStats.Depth
	// Add the stats for all sub-buckets
	s.Add(subStats)
	return s
}
   498  
   499  // forEachPage iterates over every page in a bucket, including inline pages.
   500  func (b *Bucket) forEachPage(fn func(*page, int)) {
   501  	// If we have an inline page then just use that.
   502  	if b.page != nil {
   503  		fn(b.page, 0)
   504  		return
   505  	}
   506  
   507  	// Otherwise traverse the page hierarchy.
   508  	b.tx.forEachPage(b.root, 0, fn)
   509  }
   510  
   511  // forEachPageNode iterates over every page (or node) in a bucket.
   512  // This also includes inline pages.
   513  func (b *Bucket) forEachPageNode(fn func(*page, *node, int)) {
   514  	// If we have an inline page or root node then just use that.
   515  	if b.page != nil {
   516  		fn(b.page, nil, 0)
   517  		return
   518  	}
   519  	b._forEachPageNode(b.root, 0, fn)
   520  }
   521  
   522  func (b *Bucket) _forEachPageNode(pgid pgid, depth int, fn func(*page, *node, int)) {
   523  	var p, n = b.pageNode(pgid)
   524  
   525  	// Execute function.
   526  	fn(p, n, depth)
   527  
   528  	// Recursively loop over children.
   529  	if p != nil {
   530  		if (p.flags & branchPageFlag) != 0 {
   531  			for i := 0; i < int(p.count); i++ {
   532  				elem := p.branchPageElement(uint16(i))
   533  				b._forEachPageNode(elem.pgid, depth+1, fn)
   534  			}
   535  		}
   536  	} else {
   537  		if !n.isLeaf {
   538  			for _, inode := range n.inodes {
   539  				b._forEachPageNode(inode.pgid, depth+1, fn)
   540  			}
   541  		}
   542  	}
   543  }
   544  
// spill writes all the nodes for this bucket to dirty pages.
//
// Cached child buckets are handled first: each is either serialized inline or
// spilled recursively, and its header value is then rewritten in this bucket.
// Afterwards this bucket's own root node is spilled and the bucket's root page
// id is updated. It returns the first error reported by a child spill or by
// the root node's spill.
//
// It panics if a child's key cannot be found again, if that key is not flagged
// as a bucket, or if the resulting root page id is not below the
// transaction's high water mark.
func (b *Bucket) spill() error {
	// Spill all child buckets first.
	for name, child := range b.buckets {
		// If the child bucket is small enough and it has no child buckets then
		// write it inline into the parent bucket's page. Otherwise spill it
		// like a normal bucket and make the parent value a pointer to the page.
		var value []byte
		if child.inlineable() {
			// Release any pages the child held before it became inlineable.
			child.free()
			value = child.write()
		} else {
			if err := child.spill(); err != nil {
				return err
			}

			// Update the child bucket header in this bucket.
			// The value holds only the header (no inline page follows).
			value = make([]byte, unsafe.Sizeof(bucket{}))
			var bucket = (*bucket)(unsafe.Pointer(&value[0]))
			*bucket = *child.bucket
		}

		// Skip writing the bucket if there are no materialized nodes.
		if child.rootNode == nil {
			continue
		}

		// Update parent node.
		var c = b.Cursor()
		k, _, flags := c.seek([]byte(name))
		if !bytes.Equal([]byte(name), k) {
			panic(fmt.Sprintf("misplaced bucket header: %x -> %x", []byte(name), k))
		}
		if flags&bucketLeafFlag == 0 {
			panic(fmt.Sprintf("unexpected bucket header flag: %x", flags))
		}
		c.node().put([]byte(name), []byte(name), value, 0, bucketLeafFlag)
	}

	// Ignore if there's not a materialized root node.
	if b.rootNode == nil {
		return nil
	}

	// Spill nodes.
	if err := b.rootNode.spill(); err != nil {
		return err
	}
	// Re-resolve the root after spilling; root() returns the current top of the tree.
	b.rootNode = b.rootNode.root()

	// Update the root node for this bucket.
	if b.rootNode.pgid >= b.tx.meta.pgid {
		panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", b.rootNode.pgid, b.tx.meta.pgid))
	}
	b.root = b.rootNode.pgid

	return nil
}
   603  
   604  // inlineable returns true if a bucket is small enough to be written inline
   605  // and if it contains no subbuckets. Otherwise returns false.
   606  func (b *Bucket) inlineable() bool {
   607  	var n = b.rootNode
   608  
   609  	// Bucket must only contain a single leaf node.
   610  	if n == nil || !n.isLeaf {
   611  		return false
   612  	}
   613  
   614  	// Bucket is not inlineable if it contains subbuckets or if it goes beyond
   615  	// our threshold for inline bucket size.
   616  	var size = pageHeaderSize
   617  	for _, inode := range n.inodes {
   618  		size += leafPageElementSize + len(inode.key) + len(inode.value)
   619  
   620  		if inode.flags&bucketLeafFlag != 0 {
   621  			return false
   622  		} else if size > b.maxInlineBucketSize() {
   623  			return false
   624  		}
   625  	}
   626  
   627  	return true
   628  }
   629  
   630  // Returns the maximum total size of a bucket to make it a candidate for inlining.
   631  func (b *Bucket) maxInlineBucketSize() int {
   632  	return b.tx.db.pageSize / 4
   633  }
   634  
   635  // write allocates and writes a bucket to a byte slice.
   636  func (b *Bucket) write() []byte {
   637  	// Allocate the appropriate size.
   638  	var n = b.rootNode
   639  	var value = make([]byte, bucketHeaderSize+n.size())
   640  
   641  	// Write a bucket header.
   642  	var bucket = (*bucket)(unsafe.Pointer(&value[0]))
   643  	*bucket = *b.bucket
   644  
   645  	// Convert byte slice to a fake page and write the root node.
   646  	var p = (*page)(unsafe.Pointer(&value[bucketHeaderSize]))
   647  	n.write(p)
   648  
   649  	return value
   650  }
   651  
   652  // rebalance attempts to balance all nodes.
   653  func (b *Bucket) rebalance() {
   654  	for _, n := range b.nodes {
   655  		n.rebalance()
   656  	}
   657  	for _, child := range b.buckets {
   658  		child.rebalance()
   659  	}
   660  }
   661  
   662  // node creates a node from a page and associates it with a given parent.
   663  func (b *Bucket) node(pgid pgid, parent *node) *node {
   664  	_assert(b.nodes != nil, "nodes map expected")
   665  
   666  	// Retrieve node if it's already been created.
   667  	if n := b.nodes[pgid]; n != nil {
   668  		return n
   669  	}
   670  
   671  	// Otherwise create a node and cache it.
   672  	n := &node{bucket: b, parent: parent}
   673  	if parent == nil {
   674  		b.rootNode = n
   675  	} else {
   676  		parent.children = append(parent.children, n)
   677  	}
   678  
   679  	// Use the inline page if this is an inline bucket.
   680  	var p = b.page
   681  	if p == nil {
   682  		p = b.tx.page(pgid)
   683  	}
   684  
   685  	// Read the page into the node and cache it.
   686  	n.read(p)
   687  	b.nodes[pgid] = n
   688  
   689  	// Update statistics.
   690  	b.tx.stats.NodeCount++
   691  
   692  	return n
   693  }
   694  
   695  // free recursively frees all pages in the bucket.
   696  func (b *Bucket) free() {
   697  	if b.root == 0 {
   698  		return
   699  	}
   700  
   701  	var tx = b.tx
   702  	b.forEachPageNode(func(p *page, n *node, _ int) {
   703  		if p != nil {
   704  			tx.db.freelist.free(tx.meta.txid, p)
   705  		} else {
   706  			n.free()
   707  		}
   708  	})
   709  	b.root = 0
   710  }
   711  
   712  // dereference removes all references to the old mmap.
   713  func (b *Bucket) dereference() {
   714  	if b.rootNode != nil {
   715  		b.rootNode.root().dereference()
   716  	}
   717  
   718  	for _, child := range b.buckets {
   719  		child.dereference()
   720  	}
   721  }
   722  
   723  // pageNode returns the Bhojpur Cache in-memory storage node, if it exists.
   724  // Otherwise returns the underlying page.
   725  func (b *Bucket) pageNode(id pgid) (*page, *node) {
   726  	// Inline buckets have a fake page embedded in their value so treat them
   727  	// differently. We'll return the rootNode (if available) or the fake page.
   728  	if b.root == 0 {
   729  		if id != 0 {
   730  			panic(fmt.Sprintf("inline bucket non-zero page access(2): %d != 0", id))
   731  		}
   732  		if b.rootNode != nil {
   733  			return nil, b.rootNode
   734  		}
   735  		return b.page, nil
   736  	}
   737  
   738  	// Check the node cache for non-inline buckets.
   739  	if b.nodes != nil {
   740  		if n := b.nodes[id]; n != nil {
   741  			return nil, n
   742  		}
   743  	}
   744  
   745  	// Finally lookup the page from the transaction if no node is materialized.
   746  	return b.tx.page(id), nil
   747  }
   748  
   749  // BucketStats records statistics about resources used by a bucket.
   750  type BucketStats struct {
   751  	// Page count statistics.
   752  	BranchPageN     int // number of logical branch pages
   753  	BranchOverflowN int // number of physical branch overflow pages
   754  	LeafPageN       int // number of logical leaf pages
   755  	LeafOverflowN   int // number of physical leaf overflow pages
   756  
   757  	// Tree statistics.
   758  	KeyN  int // number of keys/value pairs
   759  	Depth int // number of levels in B+tree
   760  
   761  	// Page size utilization.
   762  	BranchAlloc int // bytes allocated for physical branch pages
   763  	BranchInuse int // bytes actually used for branch data
   764  	LeafAlloc   int // bytes allocated for physical leaf pages
   765  	LeafInuse   int // bytes actually used for leaf data
   766  
   767  	// Bucket statistics
   768  	BucketN           int // total number of buckets including the top bucket
   769  	InlineBucketN     int // total number on inlined buckets
   770  	InlineBucketInuse int // bytes used for inlined buckets (also accounted for in LeafInuse)
   771  }
   772  
   773  func (s *BucketStats) Add(other BucketStats) {
   774  	s.BranchPageN += other.BranchPageN
   775  	s.BranchOverflowN += other.BranchOverflowN
   776  	s.LeafPageN += other.LeafPageN
   777  	s.LeafOverflowN += other.LeafOverflowN
   778  	s.KeyN += other.KeyN
   779  	if s.Depth < other.Depth {
   780  		s.Depth = other.Depth
   781  	}
   782  	s.BranchAlloc += other.BranchAlloc
   783  	s.BranchInuse += other.BranchInuse
   784  	s.LeafAlloc += other.LeafAlloc
   785  	s.LeafInuse += other.LeafInuse
   786  
   787  	s.BucketN += other.BucketN
   788  	s.InlineBucketN += other.InlineBucketN
   789  	s.InlineBucketInuse += other.InlineBucketInuse
   790  }
   791  
   792  // cloneBytes returns a copy of a given slice.
   793  func cloneBytes(v []byte) []byte {
   794  	var clone = make([]byte, len(v))
   795  	copy(clone, v)
   796  	return clone
   797  }