github.com/bhojpur/cache@v0.0.4/pkg/memory/node.go (about) 1 package memory 2 3 // Copyright (c) 2018 Bhojpur Consulting Private Limited, India. All rights reserved. 4 5 // Permission is hereby granted, free of charge, to any person obtaining a copy 6 // of this software and associated documentation files (the "Software"), to deal 7 // in the Software without restriction, including without limitation the rights 8 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 // copies of the Software, and to permit persons to whom the Software is 10 // furnished to do so, subject to the following conditions: 11 12 // The above copyright notice and this permission notice shall be included in 13 // all copies or substantial portions of the Software. 14 15 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 // THE SOFTWARE. 22 23 import ( 24 "bytes" 25 "fmt" 26 "sort" 27 "unsafe" 28 ) 29 30 // node represents an in-memory, deserialized page. 31 type node struct { 32 bucket *Bucket 33 isLeaf bool 34 unbalanced bool 35 spilled bool 36 key []byte 37 pgid pgid 38 parent *node 39 children nodes 40 inodes inodes 41 } 42 43 // root returns the top-level node this node is attached to. 44 func (n *node) root() *node { 45 if n.parent == nil { 46 return n 47 } 48 return n.parent.root() 49 } 50 51 // minKeys returns the minimum number of inodes this node should have. 52 func (n *node) minKeys() int { 53 if n.isLeaf { 54 return 1 55 } 56 return 2 57 } 58 59 // size returns the size of the node after serialization. 60 func (n *node) size() int { 61 sz, elsz := pageHeaderSize, n.pageElementSize() 62 for i := 0; i < len(n.inodes); i++ { 63 item := &n.inodes[i] 64 sz += elsz + len(item.key) + len(item.value) 65 } 66 return sz 67 } 68 69 // sizeLessThan returns true if the node is less than a given size. 70 // This is an optimization to avoid calculating a large node when we only need 71 // to know if it fits inside a certain page size. 72 func (n *node) sizeLessThan(v int) bool { 73 sz, elsz := pageHeaderSize, n.pageElementSize() 74 for i := 0; i < len(n.inodes); i++ { 75 item := &n.inodes[i] 76 sz += elsz + len(item.key) + len(item.value) 77 if sz >= v { 78 return false 79 } 80 } 81 return true 82 } 83 84 // pageElementSize returns the size of each page element based on the type of node. 85 func (n *node) pageElementSize() int { 86 if n.isLeaf { 87 return leafPageElementSize 88 } 89 return branchPageElementSize 90 } 91 92 // childAt returns the child node at a given index. 93 func (n *node) childAt(index int) *node { 94 if n.isLeaf { 95 panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index)) 96 } 97 return n.bucket.node(n.inodes[index].pgid, n) 98 } 99 100 // childIndex returns the index of a given child node. 101 func (n *node) childIndex(child *node) int { 102 index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, child.key) != -1 }) 103 return index 104 } 105 106 // numChildren returns the number of children. 107 func (n *node) numChildren() int { 108 return len(n.inodes) 109 } 110 111 // nextSibling returns the next node with the same parent. 112 func (n *node) nextSibling() *node { 113 if n.parent == nil { 114 return nil 115 } 116 index := n.parent.childIndex(n) 117 if index >= n.parent.numChildren()-1 { 118 return nil 119 } 120 return n.parent.childAt(index + 1) 121 } 122 123 // prevSibling returns the previous node with the same parent. 124 func (n *node) prevSibling() *node { 125 if n.parent == nil { 126 return nil 127 } 128 index := n.parent.childIndex(n) 129 if index == 0 { 130 return nil 131 } 132 return n.parent.childAt(index - 1) 133 } 134 135 // put inserts a key/value. 136 func (n *node) put(oldKey, newKey, value []byte, pgid pgid, flags uint32) { 137 if pgid >= n.bucket.tx.meta.pgid { 138 panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", pgid, n.bucket.tx.meta.pgid)) 139 } else if len(oldKey) <= 0 { 140 panic("put: zero-length old key") 141 } else if len(newKey) <= 0 { 142 panic("put: zero-length new key") 143 } 144 145 // Find insertion index. 146 index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, oldKey) != -1 }) 147 148 // Add capacity and shift nodes if we don't have an exact match and need to insert. 149 exact := (len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].key, oldKey)) 150 if !exact { 151 n.inodes = append(n.inodes, inode{}) 152 copy(n.inodes[index+1:], n.inodes[index:]) 153 } 154 155 inode := &n.inodes[index] 156 inode.flags = flags 157 inode.key = newKey 158 inode.value = value 159 inode.pgid = pgid 160 _assert(len(inode.key) > 0, "put: zero-length inode key") 161 } 162 163 // del removes a key from the node. 164 func (n *node) del(key []byte) { 165 // Find index of key. 166 index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, key) != -1 }) 167 168 // Exit if the key isn't found. 169 if index >= len(n.inodes) || !bytes.Equal(n.inodes[index].key, key) { 170 return 171 } 172 173 // Delete inode from the node. 174 n.inodes = append(n.inodes[:index], n.inodes[index+1:]...) 175 176 // Mark the node as needing rebalancing. 177 n.unbalanced = true 178 } 179 180 // read initializes the node from a page. 181 func (n *node) read(p *page) { 182 n.pgid = p.id 183 n.isLeaf = ((p.flags & leafPageFlag) != 0) 184 n.inodes = make(inodes, int(p.count)) 185 186 for i := 0; i < int(p.count); i++ { 187 inode := &n.inodes[i] 188 if n.isLeaf { 189 elem := p.leafPageElement(uint16(i)) 190 inode.flags = elem.flags 191 inode.key = elem.key() 192 inode.value = elem.value() 193 } else { 194 elem := p.branchPageElement(uint16(i)) 195 inode.pgid = elem.pgid 196 inode.key = elem.key() 197 } 198 _assert(len(inode.key) > 0, "read: zero-length inode key") 199 } 200 201 // Save first key so we can find the node in the parent when we spill. 202 if len(n.inodes) > 0 { 203 n.key = n.inodes[0].key 204 _assert(len(n.key) > 0, "read: zero-length node key") 205 } else { 206 n.key = nil 207 } 208 } 209 210 // write writes the items onto one or more pages. 211 func (n *node) write(p *page) { 212 // Initialize page. 213 if n.isLeaf { 214 p.flags |= leafPageFlag 215 } else { 216 p.flags |= branchPageFlag 217 } 218 219 if len(n.inodes) >= 0xFFFF { 220 panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.id)) 221 } 222 p.count = uint16(len(n.inodes)) 223 224 // Stop here if there are no items to write. 225 if p.count == 0 { 226 return 227 } 228 229 // Loop over each item and write it to the page. 230 b := (*[maxAllocSize]byte)(unsafe.Pointer(&p.ptr))[n.pageElementSize()*len(n.inodes):] 231 for i, item := range n.inodes { 232 _assert(len(item.key) > 0, "write: zero-length inode key") 233 234 // Write the page element. 235 if n.isLeaf { 236 elem := p.leafPageElement(uint16(i)) 237 elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem))) 238 elem.flags = item.flags 239 elem.ksize = uint32(len(item.key)) 240 elem.vsize = uint32(len(item.value)) 241 } else { 242 elem := p.branchPageElement(uint16(i)) 243 elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem))) 244 elem.ksize = uint32(len(item.key)) 245 elem.pgid = item.pgid 246 _assert(elem.pgid != p.id, "write: circular dependency occurred") 247 } 248 249 // If the length of key+value is larger than the max allocation size 250 // then we need to reallocate the byte array pointer. 251 klen, vlen := len(item.key), len(item.value) 252 if len(b) < klen+vlen { 253 b = (*[maxAllocSize]byte)(unsafe.Pointer(&b[0]))[:] 254 } 255 256 // Write data for the element to the end of the page. 257 copy(b[0:], item.key) 258 b = b[klen:] 259 copy(b[0:], item.value) 260 b = b[vlen:] 261 } 262 263 // DEBUG ONLY: n.dump() 264 } 265 266 // split breaks up a node into multiple smaller nodes, if appropriate. 267 // This should only be called from the spill() function. 268 func (n *node) split(pageSize int) []*node { 269 var nodes []*node 270 271 node := n 272 for { 273 // Split node into two. 274 a, b := node.splitTwo(pageSize) 275 nodes = append(nodes, a) 276 277 // If we can't split then exit the loop. 278 if b == nil { 279 break 280 } 281 282 // Set node to b so it gets split on the next iteration. 283 node = b 284 } 285 286 return nodes 287 } 288 289 // splitTwo breaks up a node into two smaller nodes, if appropriate. 290 // This should only be called from the split() function. 291 func (n *node) splitTwo(pageSize int) (*node, *node) { 292 // Ignore the split if the page doesn't have at least enough nodes for 293 // two pages or if the nodes can fit in a single page. 294 if len(n.inodes) <= (minKeysPerPage*2) || n.sizeLessThan(pageSize) { 295 return n, nil 296 } 297 298 // Determine the threshold before starting a new node. 299 var fillPercent = n.bucket.FillPercent 300 if fillPercent < minFillPercent { 301 fillPercent = minFillPercent 302 } else if fillPercent > maxFillPercent { 303 fillPercent = maxFillPercent 304 } 305 threshold := int(float64(pageSize) * fillPercent) 306 307 // Determine split position and sizes of the two pages. 308 splitIndex, _ := n.splitIndex(threshold) 309 310 // Split node into two separate nodes. 311 // If there's no parent then we'll need to create one. 312 if n.parent == nil { 313 n.parent = &node{bucket: n.bucket, children: []*node{n}} 314 } 315 316 // Create a new node and add it to the parent. 317 next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent} 318 n.parent.children = append(n.parent.children, next) 319 320 // Split inodes across two nodes. 321 next.inodes = n.inodes[splitIndex:] 322 n.inodes = n.inodes[:splitIndex] 323 324 // Update the statistics. 325 n.bucket.tx.stats.Split++ 326 327 return n, next 328 } 329 330 // splitIndex finds the position where a page will fill a given threshold. 331 // It returns the index as well as the size of the first page. 332 // This is only be called from split(). 333 func (n *node) splitIndex(threshold int) (index, sz int) { 334 sz = pageHeaderSize 335 336 // Loop until we only have the minimum number of keys required for the second page. 337 for i := 0; i < len(n.inodes)-minKeysPerPage; i++ { 338 index = i 339 inode := n.inodes[i] 340 elsize := n.pageElementSize() + len(inode.key) + len(inode.value) 341 342 // If we have at least the minimum number of keys and adding another 343 // node would put us over the threshold then exit and return. 344 if i >= minKeysPerPage && sz+elsize > threshold { 345 break 346 } 347 348 // Add the element size to the total size. 349 sz += elsize 350 } 351 352 return 353 } 354 355 // spill writes the nodes to dirty pages and splits nodes as it goes. 356 // Returns an error if dirty pages cannot be allocated. 357 func (n *node) spill() error { 358 var tx = n.bucket.tx 359 if n.spilled { 360 return nil 361 } 362 363 // Spill child nodes first. Child nodes can materialize sibling nodes in 364 // the case of split-merge so we cannot use a range loop. We have to check 365 // the children size on every loop iteration. 366 sort.Sort(n.children) 367 for i := 0; i < len(n.children); i++ { 368 if err := n.children[i].spill(); err != nil { 369 return err 370 } 371 } 372 373 // We no longer need the child list because it's only used for spill tracking. 374 n.children = nil 375 376 // Split nodes into appropriate sizes. The first node will always be n. 377 var nodes = n.split(tx.db.pageSize) 378 for _, node := range nodes { 379 // Add node's page to the freelist if it's not new. 380 if node.pgid > 0 { 381 tx.db.freelist.free(tx.meta.txid, tx.page(node.pgid)) 382 node.pgid = 0 383 } 384 385 // Allocate contiguous space for the node. 386 p, err := tx.allocate((node.size() / tx.db.pageSize) + 1) 387 if err != nil { 388 return err 389 } 390 391 // Write the node. 392 if p.id >= tx.meta.pgid { 393 panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid)) 394 } 395 node.pgid = p.id 396 node.write(p) 397 node.spilled = true 398 399 // Insert into parent inodes. 400 if node.parent != nil { 401 var key = node.key 402 if key == nil { 403 key = node.inodes[0].key 404 } 405 406 node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0) 407 node.key = node.inodes[0].key 408 _assert(len(node.key) > 0, "spill: zero-length node key") 409 } 410 411 // Update the statistics. 412 tx.stats.Spill++ 413 } 414 415 // If the root node split and created a new root then we need to spill that 416 // as well. We'll clear out the children to make sure it doesn't try to respill. 417 if n.parent != nil && n.parent.pgid == 0 { 418 n.children = nil 419 return n.parent.spill() 420 } 421 422 return nil 423 } 424 425 // rebalance attempts to combine the node with sibling nodes if the node fill 426 // size is below a threshold or if there are not enough keys. 427 func (n *node) rebalance() { 428 if !n.unbalanced { 429 return 430 } 431 n.unbalanced = false 432 433 // Update statistics. 434 n.bucket.tx.stats.Rebalance++ 435 436 // Ignore if node is above threshold (25%) and has enough keys. 437 var threshold = n.bucket.tx.db.pageSize / 4 438 if n.size() > threshold && len(n.inodes) > n.minKeys() { 439 return 440 } 441 442 // Root node has special handling. 443 if n.parent == nil { 444 // If root node is a branch and only has one node then collapse it. 445 if !n.isLeaf && len(n.inodes) == 1 { 446 // Move root's child up. 447 child := n.bucket.node(n.inodes[0].pgid, n) 448 n.isLeaf = child.isLeaf 449 n.inodes = child.inodes[:] 450 n.children = child.children 451 452 // Reparent all child nodes being moved. 453 for _, inode := range n.inodes { 454 if child, ok := n.bucket.nodes[inode.pgid]; ok { 455 child.parent = n 456 } 457 } 458 459 // Remove old child. 460 child.parent = nil 461 delete(n.bucket.nodes, child.pgid) 462 child.free() 463 } 464 465 return 466 } 467 468 // If node has no keys then just remove it. 469 if n.numChildren() == 0 { 470 n.parent.del(n.key) 471 n.parent.removeChild(n) 472 delete(n.bucket.nodes, n.pgid) 473 n.free() 474 n.parent.rebalance() 475 return 476 } 477 478 _assert(n.parent.numChildren() > 1, "parent must have at least 2 children") 479 480 // Destination node is right sibling if idx == 0, otherwise left sibling. 481 var target *node 482 var useNextSibling = (n.parent.childIndex(n) == 0) 483 if useNextSibling { 484 target = n.nextSibling() 485 } else { 486 target = n.prevSibling() 487 } 488 489 // If both this node and the target node are too small then merge them. 490 if useNextSibling { 491 // Reparent all child nodes being moved. 492 for _, inode := range target.inodes { 493 if child, ok := n.bucket.nodes[inode.pgid]; ok { 494 child.parent.removeChild(child) 495 child.parent = n 496 child.parent.children = append(child.parent.children, child) 497 } 498 } 499 500 // Copy over inodes from target and remove target. 501 n.inodes = append(n.inodes, target.inodes...) 502 n.parent.del(target.key) 503 n.parent.removeChild(target) 504 delete(n.bucket.nodes, target.pgid) 505 target.free() 506 } else { 507 // Reparent all child nodes being moved. 508 for _, inode := range n.inodes { 509 if child, ok := n.bucket.nodes[inode.pgid]; ok { 510 child.parent.removeChild(child) 511 child.parent = target 512 child.parent.children = append(child.parent.children, child) 513 } 514 } 515 516 // Copy over inodes to target and remove node. 517 target.inodes = append(target.inodes, n.inodes...) 518 n.parent.del(n.key) 519 n.parent.removeChild(n) 520 delete(n.bucket.nodes, n.pgid) 521 n.free() 522 } 523 524 // Either this node or the target node was deleted from the parent so rebalance it. 525 n.parent.rebalance() 526 } 527 528 // removes a node from the list of in-memory children. 529 // This does not affect the inodes. 530 func (n *node) removeChild(target *node) { 531 for i, child := range n.children { 532 if child == target { 533 n.children = append(n.children[:i], n.children[i+1:]...) 534 return 535 } 536 } 537 } 538 539 // dereference causes the node to copy all its inode key/value references to heap memory. 540 // This is required when the mmap is reallocated so inodes are not pointing to stale data. 541 func (n *node) dereference() { 542 if n.key != nil { 543 key := make([]byte, len(n.key)) 544 copy(key, n.key) 545 n.key = key 546 _assert(n.pgid == 0 || len(n.key) > 0, "dereference: zero-length node key on existing node") 547 } 548 549 for i := range n.inodes { 550 inode := &n.inodes[i] 551 552 key := make([]byte, len(inode.key)) 553 copy(key, inode.key) 554 inode.key = key 555 _assert(len(inode.key) > 0, "dereference: zero-length inode key") 556 557 value := make([]byte, len(inode.value)) 558 copy(value, inode.value) 559 inode.value = value 560 } 561 562 // Recursively dereference children. 563 for _, child := range n.children { 564 child.dereference() 565 } 566 567 // Update statistics. 568 n.bucket.tx.stats.NodeDeref++ 569 } 570 571 // free adds the node's underlying page to the freelist. 572 func (n *node) free() { 573 if n.pgid != 0 { 574 n.bucket.tx.db.freelist.free(n.bucket.tx.meta.txid, n.bucket.tx.page(n.pgid)) 575 n.pgid = 0 576 } 577 } 578 579 // dump writes the contents of the node to STDERR for debugging purposes. 580 /* 581 func (n *node) dump() { 582 // Write node header. 583 var typ = "branch" 584 if n.isLeaf { 585 typ = "leaf" 586 } 587 warnf("[NODE %d {type=%s count=%d}]", n.pgid, typ, len(n.inodes)) 588 589 // Write out abbreviated version of each item. 590 for _, item := range n.inodes { 591 if n.isLeaf { 592 if item.flags&bucketLeafFlag != 0 { 593 bucket := (*bucket)(unsafe.Pointer(&item.value[0])) 594 warnf("+L %08x -> (bucket root=%d)", trunc(item.key, 4), bucket.root) 595 } else { 596 warnf("+L %08x -> %08x", trunc(item.key, 4), trunc(item.value, 4)) 597 } 598 } else { 599 warnf("+B %08x -> pgid=%d", trunc(item.key, 4), item.pgid) 600 } 601 } 602 warn("") 603 } 604 */ 605 606 type nodes []*node 607 608 func (s nodes) Len() int { return len(s) } 609 func (s nodes) Swap(i, j int) { s[i], s[j] = s[j], s[i] } 610 func (s nodes) Less(i, j int) bool { 611 return bytes.Compare(s[i].inodes[0].key, s[j].inodes[0].key) == -1 612 } 613 614 // inode represents an internal node inside of a node. 615 // It can be used to point to elements in a page or point 616 // to an element which hasn't been added to a page yet. 617 type inode struct { 618 flags uint32 619 pgid pgid 620 key []byte 621 value []byte 622 } 623 624 type inodes []inode