github.com/ooni/psiphon/tunnel-core@v0.0.0-20230105123940-fe12a24c96ee/oovendor/bolt/node.go (about) 1 package bolt 2 3 import ( 4 "bytes" 5 "fmt" 6 "sort" 7 "unsafe" 8 ) 9 10 // node represents an in-memory, deserialized page. 11 type node struct { 12 bucket *Bucket 13 isLeaf bool 14 unbalanced bool 15 spilled bool 16 key []byte 17 pgid pgid 18 parent *node 19 children nodes 20 inodes inodes 21 } 22 23 // root returns the top-level node this node is attached to. 24 func (n *node) root() *node { 25 if n.parent == nil { 26 return n 27 } 28 return n.parent.root() 29 } 30 31 // minKeys returns the minimum number of inodes this node should have. 32 func (n *node) minKeys() int { 33 if n.isLeaf { 34 return 1 35 } 36 return 2 37 } 38 39 // size returns the size of the node after serialization. 40 func (n *node) size() int { 41 sz, elsz := pageHeaderSize, n.pageElementSize() 42 for i := 0; i < len(n.inodes); i++ { 43 item := &n.inodes[i] 44 sz += elsz + uintptr(len(item.key)) + uintptr(len(item.value)) 45 } 46 return int(sz) 47 } 48 49 // sizeLessThan returns true if the node is less than a given size. 50 // This is an optimization to avoid calculating a large node when we only need 51 // to know if it fits inside a certain page size. 52 func (n *node) sizeLessThan(v uintptr) bool { 53 sz, elsz := pageHeaderSize, n.pageElementSize() 54 for i := 0; i < len(n.inodes); i++ { 55 item := &n.inodes[i] 56 sz += elsz + uintptr(len(item.key)) + uintptr(len(item.value)) 57 if sz >= v { 58 return false 59 } 60 } 61 return true 62 } 63 64 // pageElementSize returns the size of each page element based on the type of node. 65 func (n *node) pageElementSize() uintptr { 66 if n.isLeaf { 67 return leafPageElementSize 68 } 69 return branchPageElementSize 70 } 71 72 // childAt returns the child node at a given index. 73 func (n *node) childAt(index int) *node { 74 if n.isLeaf { 75 panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index)) 76 } 77 return n.bucket.node(n.inodes[index].pgid, n) 78 } 79 80 // childIndex returns the index of a given child node. 81 func (n *node) childIndex(child *node) int { 82 index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, child.key) != -1 }) 83 return index 84 } 85 86 // numChildren returns the number of children. 87 func (n *node) numChildren() int { 88 return len(n.inodes) 89 } 90 91 // nextSibling returns the next node with the same parent. 92 func (n *node) nextSibling() *node { 93 if n.parent == nil { 94 return nil 95 } 96 index := n.parent.childIndex(n) 97 if index >= n.parent.numChildren()-1 { 98 return nil 99 } 100 return n.parent.childAt(index + 1) 101 } 102 103 // prevSibling returns the previous node with the same parent. 104 func (n *node) prevSibling() *node { 105 if n.parent == nil { 106 return nil 107 } 108 index := n.parent.childIndex(n) 109 if index == 0 { 110 return nil 111 } 112 return n.parent.childAt(index - 1) 113 } 114 115 // put inserts a key/value. 116 func (n *node) put(oldKey, newKey, value []byte, pgid pgid, flags uint32) { 117 if pgid >= n.bucket.tx.meta.pgid { 118 panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", pgid, n.bucket.tx.meta.pgid)) 119 } else if len(oldKey) <= 0 { 120 panic("put: zero-length old key") 121 } else if len(newKey) <= 0 { 122 panic("put: zero-length new key") 123 } 124 125 // Find insertion index. 126 index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, oldKey) != -1 }) 127 128 // Add capacity and shift nodes if we don't have an exact match and need to insert. 129 exact := (len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].key, oldKey)) 130 if !exact { 131 n.inodes = append(n.inodes, inode{}) 132 copy(n.inodes[index+1:], n.inodes[index:]) 133 } 134 135 inode := &n.inodes[index] 136 inode.flags = flags 137 inode.key = newKey 138 inode.value = value 139 inode.pgid = pgid 140 _assert(len(inode.key) > 0, "put: zero-length inode key") 141 } 142 143 // del removes a key from the node. 144 func (n *node) del(key []byte) { 145 // Find index of key. 146 index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, key) != -1 }) 147 148 // Exit if the key isn't found. 149 if index >= len(n.inodes) || !bytes.Equal(n.inodes[index].key, key) { 150 return 151 } 152 153 // Delete inode from the node. 154 n.inodes = append(n.inodes[:index], n.inodes[index+1:]...) 155 156 // Mark the node as needing rebalancing. 157 n.unbalanced = true 158 } 159 160 // read initializes the node from a page. 161 func (n *node) read(p *page) { 162 n.pgid = p.id 163 n.isLeaf = ((p.flags & leafPageFlag) != 0) 164 n.inodes = make(inodes, int(p.count)) 165 166 for i := 0; i < int(p.count); i++ { 167 inode := &n.inodes[i] 168 if n.isLeaf { 169 elem := p.leafPageElement(uint16(i)) 170 inode.flags = elem.flags 171 inode.key = elem.key() 172 inode.value = elem.value() 173 } else { 174 elem := p.branchPageElement(uint16(i)) 175 inode.pgid = elem.pgid 176 inode.key = elem.key() 177 } 178 _assert(len(inode.key) > 0, "read: zero-length inode key") 179 } 180 181 // Save first key so we can find the node in the parent when we spill. 182 if len(n.inodes) > 0 { 183 n.key = n.inodes[0].key 184 _assert(len(n.key) > 0, "read: zero-length node key") 185 } else { 186 n.key = nil 187 } 188 } 189 190 // write writes the items onto one or more pages. 191 func (n *node) write(p *page) { 192 // Initialize page. 193 if n.isLeaf { 194 p.flags |= leafPageFlag 195 } else { 196 p.flags |= branchPageFlag 197 } 198 199 if len(n.inodes) >= 0xFFFF { 200 panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.id)) 201 } 202 p.count = uint16(len(n.inodes)) 203 204 // Stop here if there are no items to write. 205 if p.count == 0 { 206 return 207 } 208 209 // Loop over each item and write it to the page. 210 // off tracks the offset into p of the start of the next data. 211 off := unsafe.Sizeof(*p) + n.pageElementSize()*uintptr(len(n.inodes)) 212 for i, item := range n.inodes { 213 _assert(len(item.key) > 0, "write: zero-length inode key") 214 215 // Create a slice to write into of needed size and advance 216 // byte pointer for next iteration. 217 sz := len(item.key) + len(item.value) 218 b := unsafeByteSlice(unsafe.Pointer(p), off, 0, sz) 219 off += uintptr(sz) 220 221 // Write the page element. 222 if n.isLeaf { 223 elem := p.leafPageElement(uint16(i)) 224 elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem))) 225 elem.flags = item.flags 226 elem.ksize = uint32(len(item.key)) 227 elem.vsize = uint32(len(item.value)) 228 } else { 229 elem := p.branchPageElement(uint16(i)) 230 elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem))) 231 elem.ksize = uint32(len(item.key)) 232 elem.pgid = item.pgid 233 _assert(elem.pgid != p.id, "write: circular dependency occurred") 234 } 235 236 // Write data for the element to the end of the page. 237 l := copy(b, item.key) 238 copy(b[l:], item.value) 239 } 240 241 // DEBUG ONLY: n.dump() 242 } 243 244 // split breaks up a node into multiple smaller nodes, if appropriate. 245 // This should only be called from the spill() function. 246 func (n *node) split(pageSize uintptr) []*node { 247 var nodes []*node 248 249 node := n 250 for { 251 // Split node into two. 252 a, b := node.splitTwo(pageSize) 253 nodes = append(nodes, a) 254 255 // If we can't split then exit the loop. 256 if b == nil { 257 break 258 } 259 260 // Set node to b so it gets split on the next iteration. 261 node = b 262 } 263 264 return nodes 265 } 266 267 // splitTwo breaks up a node into two smaller nodes, if appropriate. 268 // This should only be called from the split() function. 269 func (n *node) splitTwo(pageSize uintptr) (*node, *node) { 270 // Ignore the split if the page doesn't have at least enough nodes for 271 // two pages or if the nodes can fit in a single page. 272 if len(n.inodes) <= (minKeysPerPage*2) || n.sizeLessThan(pageSize) { 273 return n, nil 274 } 275 276 // Determine the threshold before starting a new node. 277 var fillPercent = n.bucket.FillPercent 278 if fillPercent < minFillPercent { 279 fillPercent = minFillPercent 280 } else if fillPercent > maxFillPercent { 281 fillPercent = maxFillPercent 282 } 283 threshold := int(float64(pageSize) * fillPercent) 284 285 // Determine split position and sizes of the two pages. 286 splitIndex, _ := n.splitIndex(threshold) 287 288 // Split node into two separate nodes. 289 // If there's no parent then we'll need to create one. 290 if n.parent == nil { 291 n.parent = &node{bucket: n.bucket, children: []*node{n}} 292 } 293 294 // Create a new node and add it to the parent. 295 next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent} 296 n.parent.children = append(n.parent.children, next) 297 298 // Split inodes across two nodes. 299 next.inodes = n.inodes[splitIndex:] 300 n.inodes = n.inodes[:splitIndex] 301 302 // Update the statistics. 303 n.bucket.tx.stats.Split++ 304 305 return n, next 306 } 307 308 // splitIndex finds the position where a page will fill a given threshold. 309 // It returns the index as well as the size of the first page. 310 // This is only be called from split(). 311 func (n *node) splitIndex(threshold int) (index, sz uintptr) { 312 sz = pageHeaderSize 313 314 // Loop until we only have the minimum number of keys required for the second page. 315 for i := 0; i < len(n.inodes)-minKeysPerPage; i++ { 316 index = uintptr(i) 317 inode := n.inodes[i] 318 elsize := n.pageElementSize() + uintptr(len(inode.key)) + uintptr(len(inode.value)) 319 320 // If we have at least the minimum number of keys and adding another 321 // node would put us over the threshold then exit and return. 322 if index >= minKeysPerPage && sz+elsize > uintptr(threshold) { 323 break 324 } 325 326 // Add the element size to the total size. 327 sz += elsize 328 } 329 330 return 331 } 332 333 // spill writes the nodes to dirty pages and splits nodes as it goes. 334 // Returns an error if dirty pages cannot be allocated. 335 func (n *node) spill() error { 336 var tx = n.bucket.tx 337 if n.spilled { 338 return nil 339 } 340 341 // Spill child nodes first. Child nodes can materialize sibling nodes in 342 // the case of split-merge so we cannot use a range loop. We have to check 343 // the children size on every loop iteration. 344 sort.Sort(n.children) 345 for i := 0; i < len(n.children); i++ { 346 if err := n.children[i].spill(); err != nil { 347 return err 348 } 349 } 350 351 // We no longer need the child list because it's only used for spill tracking. 352 n.children = nil 353 354 // Split nodes into appropriate sizes. The first node will always be n. 355 var nodes = n.split(uintptr(tx.db.pageSize)) 356 for _, node := range nodes { 357 // Add node's page to the freelist if it's not new. 358 if node.pgid > 0 { 359 tx.db.freelist.free(tx.meta.txid, tx.page(node.pgid)) 360 node.pgid = 0 361 } 362 363 // Allocate contiguous space for the node. 364 p, err := tx.allocate((node.size() + tx.db.pageSize - 1) / tx.db.pageSize) 365 if err != nil { 366 return err 367 } 368 369 // Write the node. 370 if p.id >= tx.meta.pgid { 371 panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid)) 372 } 373 node.pgid = p.id 374 node.write(p) 375 node.spilled = true 376 377 // Insert into parent inodes. 378 if node.parent != nil { 379 var key = node.key 380 if key == nil { 381 key = node.inodes[0].key 382 } 383 384 node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0) 385 node.key = node.inodes[0].key 386 _assert(len(node.key) > 0, "spill: zero-length node key") 387 } 388 389 // Update the statistics. 390 tx.stats.Spill++ 391 } 392 393 // If the root node split and created a new root then we need to spill that 394 // as well. We'll clear out the children to make sure it doesn't try to respill. 395 if n.parent != nil && n.parent.pgid == 0 { 396 n.children = nil 397 return n.parent.spill() 398 } 399 400 return nil 401 } 402 403 // rebalance attempts to combine the node with sibling nodes if the node fill 404 // size is below a threshold or if there are not enough keys. 405 func (n *node) rebalance() { 406 if !n.unbalanced { 407 return 408 } 409 n.unbalanced = false 410 411 // Update statistics. 412 n.bucket.tx.stats.Rebalance++ 413 414 // Ignore if node is above threshold (25%) and has enough keys. 415 var threshold = n.bucket.tx.db.pageSize / 4 416 if n.size() > threshold && len(n.inodes) > n.minKeys() { 417 return 418 } 419 420 // Root node has special handling. 421 if n.parent == nil { 422 // If root node is a branch and only has one node then collapse it. 423 if !n.isLeaf && len(n.inodes) == 1 { 424 // Move root's child up. 425 child := n.bucket.node(n.inodes[0].pgid, n) 426 n.isLeaf = child.isLeaf 427 n.inodes = child.inodes[:] 428 n.children = child.children 429 430 // Reparent all child nodes being moved. 431 for _, inode := range n.inodes { 432 if child, ok := n.bucket.nodes[inode.pgid]; ok { 433 child.parent = n 434 } 435 } 436 437 // Remove old child. 438 child.parent = nil 439 delete(n.bucket.nodes, child.pgid) 440 child.free() 441 } 442 443 return 444 } 445 446 // If node has no keys then just remove it. 447 if n.numChildren() == 0 { 448 n.parent.del(n.key) 449 n.parent.removeChild(n) 450 delete(n.bucket.nodes, n.pgid) 451 n.free() 452 n.parent.rebalance() 453 return 454 } 455 456 _assert(n.parent.numChildren() > 1, "parent must have at least 2 children") 457 458 // Destination node is right sibling if idx == 0, otherwise left sibling. 459 var target *node 460 var useNextSibling = (n.parent.childIndex(n) == 0) 461 if useNextSibling { 462 target = n.nextSibling() 463 } else { 464 target = n.prevSibling() 465 } 466 467 // If both this node and the target node are too small then merge them. 468 if useNextSibling { 469 // Reparent all child nodes being moved. 470 for _, inode := range target.inodes { 471 if child, ok := n.bucket.nodes[inode.pgid]; ok { 472 child.parent.removeChild(child) 473 child.parent = n 474 child.parent.children = append(child.parent.children, child) 475 } 476 } 477 478 // Copy over inodes from target and remove target. 479 n.inodes = append(n.inodes, target.inodes...) 480 n.parent.del(target.key) 481 n.parent.removeChild(target) 482 delete(n.bucket.nodes, target.pgid) 483 target.free() 484 } else { 485 // Reparent all child nodes being moved. 486 for _, inode := range n.inodes { 487 if child, ok := n.bucket.nodes[inode.pgid]; ok { 488 child.parent.removeChild(child) 489 child.parent = target 490 child.parent.children = append(child.parent.children, child) 491 } 492 } 493 494 // Copy over inodes to target and remove node. 495 target.inodes = append(target.inodes, n.inodes...) 496 n.parent.del(n.key) 497 n.parent.removeChild(n) 498 delete(n.bucket.nodes, n.pgid) 499 n.free() 500 } 501 502 // Either this node or the target node was deleted from the parent so rebalance it. 503 n.parent.rebalance() 504 } 505 506 // removes a node from the list of in-memory children. 507 // This does not affect the inodes. 508 func (n *node) removeChild(target *node) { 509 for i, child := range n.children { 510 if child == target { 511 n.children = append(n.children[:i], n.children[i+1:]...) 512 return 513 } 514 } 515 } 516 517 // dereference causes the node to copy all its inode key/value references to heap memory. 518 // This is required when the mmap is reallocated so inodes are not pointing to stale data. 519 func (n *node) dereference() { 520 if n.key != nil { 521 key := make([]byte, len(n.key)) 522 copy(key, n.key) 523 n.key = key 524 _assert(n.pgid == 0 || len(n.key) > 0, "dereference: zero-length node key on existing node") 525 } 526 527 for i := range n.inodes { 528 inode := &n.inodes[i] 529 530 key := make([]byte, len(inode.key)) 531 copy(key, inode.key) 532 inode.key = key 533 _assert(len(inode.key) > 0, "dereference: zero-length inode key") 534 535 value := make([]byte, len(inode.value)) 536 copy(value, inode.value) 537 inode.value = value 538 } 539 540 // Recursively dereference children. 541 for _, child := range n.children { 542 child.dereference() 543 } 544 545 // Update statistics. 546 n.bucket.tx.stats.NodeDeref++ 547 } 548 549 // free adds the node's underlying page to the freelist. 550 func (n *node) free() { 551 if n.pgid != 0 { 552 n.bucket.tx.db.freelist.free(n.bucket.tx.meta.txid, n.bucket.tx.page(n.pgid)) 553 n.pgid = 0 554 } 555 } 556 557 // dump writes the contents of the node to STDERR for debugging purposes. 558 /* 559 func (n *node) dump() { 560 // Write node header. 561 var typ = "branch" 562 if n.isLeaf { 563 typ = "leaf" 564 } 565 warnf("[NODE %d {type=%s count=%d}]", n.pgid, typ, len(n.inodes)) 566 567 // Write out abbreviated version of each item. 568 for _, item := range n.inodes { 569 if n.isLeaf { 570 if item.flags&bucketLeafFlag != 0 { 571 bucket := (*bucket)(unsafe.Pointer(&item.value[0])) 572 warnf("+L %08x -> (bucket root=%d)", trunc(item.key, 4), bucket.root) 573 } else { 574 warnf("+L %08x -> %08x", trunc(item.key, 4), trunc(item.value, 4)) 575 } 576 } else { 577 warnf("+B %08x -> pgid=%d", trunc(item.key, 4), item.pgid) 578 } 579 } 580 warn("") 581 } 582 */ 583 584 type nodes []*node 585 586 func (s nodes) Len() int { return len(s) } 587 func (s nodes) Swap(i, j int) { s[i], s[j] = s[j], s[i] } 588 func (s nodes) Less(i, j int) bool { 589 return bytes.Compare(s[i].inodes[0].key, s[j].inodes[0].key) == -1 590 } 591 592 // inode represents an internal node inside of a node. 593 // It can be used to point to elements in a page or point 594 // to an element which hasn't been added to a page yet. 595 type inode struct { 596 flags uint32 597 pgid pgid 598 key []byte 599 value []byte 600 } 601 602 type inodes []inode