github.com/zuoyebang/bitalosdb@v1.1.1-0.20240516111551-79a8c4d8ce20/bitree/bdb/node.go

// Copyright 2021 The Bitalosdb author(hustxrb@163.com) and other contributors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package bdb

import (
	"bytes"
	"fmt"
	"sort"
	"unsafe"
)

// node represents an in-memory, deserialized page.
type node struct {
	bucket     *Bucket
	isLeaf     bool
	unbalanced bool
	spilled    bool
	key        []byte
	pgid       pgid
	parent     *node
	children   nodes
	inodes     inodes
}

// root returns the top-level node this node is attached to.
func (n *node) root() *node {
	if n.parent == nil {
		return n
	}
	return n.parent.root()
}

// minKeys returns the minimum number of inodes this node should have.
func (n *node) minKeys() int {
	if n.isLeaf {
		return 1
	}
	return 2
}

// size returns the size of the node after serialization.
func (n *node) size() int {
	sz, elsz := pageHeaderSize, n.pageElementSize()
	for i := 0; i < len(n.inodes); i++ {
		item := &n.inodes[i]
		sz += elsz + uintptr(len(item.key)) + uintptr(len(item.value))
	}
	return int(sz)
}

// sizeLessThan returns true if the node is less than a given size.
// This is an optimization to avoid calculating the full size of a large
// node when we only need to know whether it fits inside a page.
func (n *node) sizeLessThan(v uintptr) bool {
	sz, elsz := pageHeaderSize, n.pageElementSize()
	for i := 0; i < len(n.inodes); i++ {
		item := &n.inodes[i]
		sz += elsz + uintptr(len(item.key)) + uintptr(len(item.value))
		if sz >= v {
			return false
		}
	}
	return true
}

// pageElementSize returns the size of each page element based on the type of node.
func (n *node) pageElementSize() uintptr {
	if n.isLeaf {
		return leafPageElementSize
	}
	return branchPageElementSize
}

// childAt returns the child node at a given index.
func (n *node) childAt(index int) *node {
	if n.isLeaf {
		panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index))
	}
	return n.bucket.node(n.inodes[index].pgid, n)
}

// childIndex returns the index of a given child node.
func (n *node) childIndex(child *node) int {
	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, child.key) != -1 })
	return index
}

// numChildren returns the number of children.
func (n *node) numChildren() int {
	return len(n.inodes)
}

// nextSibling returns the next node with the same parent.
func (n *node) nextSibling() *node {
	if n.parent == nil {
		return nil
	}
	index := n.parent.childIndex(n)
	if index >= n.parent.numChildren()-1 {
		return nil
	}
	return n.parent.childAt(index + 1)
}

// prevSibling returns the previous node with the same parent.
func (n *node) prevSibling() *node {
	if n.parent == nil {
		return nil
	}
	index := n.parent.childIndex(n)
	if index == 0 {
		return nil
	}
	return n.parent.childAt(index - 1)
}

// put inserts a key/value.
func (n *node) put(oldKey, newKey, value []byte, pgid pgid, flags uint32) {
	if pgid >= n.bucket.tx.meta.pgid {
		panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", pgid, n.bucket.tx.meta.pgid))
	} else if len(oldKey) <= 0 {
		panic("put: zero-length old key")
	} else if len(newKey) <= 0 {
		panic("put: zero-length new key")
	}

	// Find the insertion index.
	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, oldKey) != -1 })

	// Add capacity and shift inodes if we don't have an exact match and need to insert.
	exact := (len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].key, oldKey))
	if !exact {
		n.inodes = append(n.inodes, inode{})
		copy(n.inodes[index+1:], n.inodes[index:])
	}

	inode := &n.inodes[index]
	inode.flags = flags
	inode.key = newKey
	inode.value = value
	inode.pgid = pgid
	_assert(len(inode.key) > 0, "put: zero-length inode key")
}

// del removes a key from the node.
func (n *node) del(key []byte) {
	// Find the index of the key.
	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, key) != -1 })

	// Exit if the key isn't found.
	if index >= len(n.inodes) || !bytes.Equal(n.inodes[index].key, key) {
		return
	}

	// Delete the inode from the node.
	n.inodes = append(n.inodes[:index], n.inodes[index+1:]...)

	// Mark the node as needing rebalancing.
	n.unbalanced = true
}

// read initializes the node from a page.
func (n *node) read(p *page) {
	n.pgid = p.id
	n.isLeaf = ((p.flags & leafPageFlag) != 0)
	n.inodes = make(inodes, int(p.count))

	for i := 0; i < int(p.count); i++ {
		inode := &n.inodes[i]
		if n.isLeaf {
			elem := p.leafPageElement(uint16(i))
			inode.flags = elem.flags
			inode.key = elem.key()
			inode.value = elem.value()
		} else {
			elem := p.branchPageElement(uint16(i))
			inode.pgid = elem.pgid
			inode.key = elem.key()
		}
		_assert(len(inode.key) > 0, "read: zero-length inode key")
	}

	// Save the first key so we can find the node in the parent when we spill.
	if len(n.inodes) > 0 {
		n.key = n.inodes[0].key
		_assert(len(n.key) > 0, "read: zero-length node key")
	} else {
		n.key = nil
	}
}

// write writes the items onto one or more pages.
func (n *node) write(p *page) {
	// Initialize the page flags.
	if n.isLeaf {
		p.flags |= leafPageFlag
	} else {
		p.flags |= branchPageFlag
	}

	if len(n.inodes) >= 0xFFFF {
		panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.id))
	}
	p.count = uint16(len(n.inodes))

	// Stop here if there are no items to write.
	if p.count == 0 {
		return
	}

	// Loop over each item and write it to the page.
	// off tracks the offset into p where the next key/value data will be written.
	off := unsafe.Sizeof(*p) + n.pageElementSize()*uintptr(len(n.inodes))
	for i, item := range n.inodes {
		_assert(len(item.key) > 0, "write: zero-length inode key")

		// Create a slice of the needed size to write into and advance the
		// offset for the next iteration.
		sz := len(item.key) + len(item.value)
		b := unsafeByteSlice(unsafe.Pointer(p), off, 0, sz)
		off += uintptr(sz)

		// Write the page element.
		if n.isLeaf {
			elem := p.leafPageElement(uint16(i))
			elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
			elem.flags = item.flags
			elem.ksize = uint32(len(item.key))
			elem.vsize = uint32(len(item.value))
		} else {
			elem := p.branchPageElement(uint16(i))
			elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
			elem.ksize = uint32(len(item.key))
			elem.pgid = item.pgid
			_assert(elem.pgid != p.id, "write: circular dependency occurred")
		}

		// Write the element's key and value data.
		l := copy(b, item.key)
		copy(b[l:], item.value)
	}
}

// split breaks up a node into multiple smaller nodes, if appropriate.
// This should only be called from the spill() function.
func (n *node) split(pageSize uintptr) []*node {
	var nodes []*node

	node := n
	for {
		// Split the node into two.
		a, b := node.splitTwo(pageSize)
		nodes = append(nodes, a)

		// If we can't split any further then exit the loop.
		if b == nil {
			break
		}

		// Set node to b so it gets split on the next iteration.
		node = b
	}

	return nodes
}

// splitTwo breaks up a node into two smaller nodes, if appropriate.
// This should only be called from the split() function.
func (n *node) splitTwo(pageSize uintptr) (*node, *node) {
	// Ignore the split if the page doesn't have at least enough nodes for
	// two pages or if the nodes can fit in a single page.
	if len(n.inodes) <= (minKeysPerPage*2) || n.sizeLessThan(pageSize) {
		return n, nil
	}

	// Determine the threshold before starting a new node.
	var fillPercent = n.bucket.FillPercent
	if fillPercent < minFillPercent {
		fillPercent = minFillPercent
	} else if fillPercent > maxFillPercent {
		fillPercent = maxFillPercent
	}
	threshold := int(float64(pageSize) * fillPercent)

	// Determine the split position.
	splitIndex, _ := n.splitIndex(threshold)

	// Split the node into two separate nodes.
	// If there's no parent then we'll need to create one.
	if n.parent == nil {
		n.parent = &node{bucket: n.bucket, children: []*node{n}}
	}

	// Create a new node and add it to the parent.
	next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
	n.parent.children = append(n.parent.children, next)

	// Split the inodes across the two nodes.
	next.inodes = n.inodes[splitIndex:]
	n.inodes = n.inodes[:splitIndex]

	// Update the statistics.
	n.bucket.tx.stats.Split++

	return n, next
}

// splitIndex finds the position where a page will fill a given threshold.
// It returns the index as well as the size of the first page.
// This is only called from splitTwo().
func (n *node) splitIndex(threshold int) (index, sz uintptr) {
	sz = pageHeaderSize

	// Loop until we only have the minimum number of keys required for the second page.
	for i := 0; i < len(n.inodes)-minKeysPerPage; i++ {
		index = uintptr(i)
		inode := n.inodes[i]
		elsize := n.pageElementSize() + uintptr(len(inode.key)) + uintptr(len(inode.value))

		// If we have at least the minimum number of keys and adding another
		// element would put us over the threshold then exit and return.
		if index >= minKeysPerPage && sz+elsize > uintptr(threshold) {
			break
		}

		// Add the element size to the total size.
		sz += elsize
	}

	return
}

// spill writes the nodes to dirty pages and splits nodes as it goes.
// Returns an error if dirty pages cannot be allocated.
func (n *node) spill() error {
	var tx = n.bucket.tx
	if n.spilled {
		return nil
	}

	// Spill child nodes first. Child nodes can materialize sibling nodes in
	// the case of split-merge, so the children length is re-checked on every
	// iteration instead of using a range loop.
	sort.Sort(n.children)
	for i := 0; i < len(n.children); i++ {
		if err := n.children[i].spill(); err != nil {
			return err
		}
	}

	// The child list is only used for spill tracking, so it is no longer needed.
	n.children = nil

	// Split the node into appropriately sized nodes. The first node is always n.
	var nodes = n.split(uintptr(tx.db.pageSize))
	for _, node := range nodes {
		// Add the node's page to the freelist if it's not new.
		if node.pgid > 0 {
			tx.db.freelist.free(tx.meta.txid, tx.page(node.pgid))
			node.pgid = 0
		}

		// Allocate contiguous space for the node.
		p, _, err := tx.allocate((node.size() + tx.db.pageSize - 1) / tx.db.pageSize)
		if err != nil {
			return err
		}

		// Write the node to the allocated page.
		if p.id >= tx.meta.pgid {
			panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid))
		}
		node.pgid = p.id
		node.write(p)
		node.spilled = true

		// Insert the node into the parent's inodes.
		if node.parent != nil {
			var key = node.key
			if key == nil {
				key = node.inodes[0].key
			}

			node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0)
			node.key = node.inodes[0].key
			_assert(len(node.key) > 0, "spill: zero-length node key")
		}

		// Update the statistics.
		tx.stats.Spill++
	}

	// If the root node split and created a new root then we need to spill that
	// as well. Clear out the children to make sure it doesn't try to respill.
	if n.parent != nil && n.parent.pgid == 0 {
		n.children = nil
		return n.parent.spill()
	}

	return nil
}

// rebalance attempts to combine the node with sibling nodes if the node fill
// size is below a threshold or if there are not enough keys.
func (n *node) rebalance() {
	if !n.unbalanced {
		return
	}
	n.unbalanced = false

	// Update the statistics.
	n.bucket.tx.stats.Rebalance++

	// Ignore if the node is above the threshold (25% of a page) and has enough keys.
	var threshold = n.bucket.tx.db.pageSize / 4
	if n.size() > threshold && len(n.inodes) > n.minKeys() {
		return
	}

	// The root node has special handling.
	if n.parent == nil {
		// If the root node is a branch and only has one inode then collapse it.
		if !n.isLeaf && len(n.inodes) == 1 {
			// Move the root's child up.
			child := n.bucket.node(n.inodes[0].pgid, n)
			n.isLeaf = child.isLeaf
			n.inodes = child.inodes[:]
			n.children = child.children

			// Reparent all child nodes being moved.
			for _, inode := range n.inodes {
				if child, ok := n.bucket.nodes[inode.pgid]; ok {
					child.parent = n
				}
			}

			// Remove the old child.
			child.parent = nil
			delete(n.bucket.nodes, child.pgid)
			child.free()
		}

		return
	}

	// If the node has no keys then just remove it.
	if n.numChildren() == 0 {
		n.parent.del(n.key)
		n.parent.removeChild(n)
		delete(n.bucket.nodes, n.pgid)
		n.free()
		n.parent.rebalance()
		return
	}

	_assert(n.parent.numChildren() > 1, "parent must have at least 2 children")

	// The merge target is the right sibling if this is the first child,
	// otherwise the left sibling.
	var target *node
	var useNextSibling = (n.parent.childIndex(n) == 0)
	if useNextSibling {
		target = n.nextSibling()
	} else {
		target = n.prevSibling()
	}

	// Merge this node with the target sibling.
	if useNextSibling {
		// Reparent all child nodes being moved.
		for _, inode := range target.inodes {
			if child, ok := n.bucket.nodes[inode.pgid]; ok {
				child.parent.removeChild(child)
				child.parent = n
				child.parent.children = append(child.parent.children, child)
			}
		}

		// Copy over the inodes from the target and remove the target.
		n.inodes = append(n.inodes, target.inodes...)
		n.parent.del(target.key)
		n.parent.removeChild(target)
		delete(n.bucket.nodes, target.pgid)
		target.free()
	} else {
		// Reparent all child nodes being moved.
		for _, inode := range n.inodes {
			if child, ok := n.bucket.nodes[inode.pgid]; ok {
				child.parent.removeChild(child)
				child.parent = target
				child.parent.children = append(child.parent.children, child)
			}
		}

		// Copy over the inodes to the target and remove this node.
		target.inodes = append(target.inodes, n.inodes...)
		n.parent.del(n.key)
		n.parent.removeChild(n)
		delete(n.bucket.nodes, n.pgid)
		n.free()
	}

	// Either this node or the target node was deleted from the parent, so rebalance it.
	n.parent.rebalance()
}

// removeChild removes a node from the list of in-memory children.
// This does not affect the inodes.
func (n *node) removeChild(target *node) {
	for i, child := range n.children {
		if child == target {
			n.children = append(n.children[:i], n.children[i+1:]...)
			return
		}
	}
}

// dereference causes the node to copy all its inode key/value references to heap memory.
// This is required when the mmap is reallocated so inodes are not pointing to stale data.
func (n *node) dereference() {
	if n.key != nil {
		key := make([]byte, len(n.key))
		copy(key, n.key)
		n.key = key
		_assert(n.pgid == 0 || len(n.key) > 0, "dereference: zero-length node key on existing node")
	}

	for i := range n.inodes {
		inode := &n.inodes[i]

		key := make([]byte, len(inode.key))
		copy(key, inode.key)
		inode.key = key
		_assert(len(inode.key) > 0, "dereference: zero-length inode key")

		value := make([]byte, len(inode.value))
		copy(value, inode.value)
		inode.value = value
	}

	// Recursively dereference children.
	for _, child := range n.children {
		child.dereference()
	}

	// Update the statistics.
	n.bucket.tx.stats.NodeDeref++
}

// free adds the node's underlying page to the freelist.
func (n *node) free() {
	if n.pgid != 0 {
		n.bucket.tx.db.freelist.free(n.bucket.tx.meta.txid, n.bucket.tx.page(n.pgid))
		n.pgid = 0
	}
}

// nodes is a sortable list of nodes, ordered by each node's first key.
type nodes []*node

func (s nodes) Len() int      { return len(s) }
func (s nodes) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
func (s nodes) Less(i, j int) bool {
	return bytes.Compare(s[i].inodes[0].key, s[j].inodes[0].key) == -1
}

// inode represents an internal node inside of a node. It can be used to point
// to elements in a page or to an element which hasn't been added to a page yet.
type inode struct {
	flags uint32
	pgid  pgid
	key   []byte
	value []byte
}

type inodes []inode