github.com/zuoyebang/bitalosdb@v1.1.1-0.20240516111551-79a8c4d8ce20/bitree/bdb/bucket.go (about) 1 // Copyright 2021 The Bitalosdb author(hustxrb@163.com) and other contributors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package bdb 16 17 import ( 18 "bytes" 19 "fmt" 20 "unsafe" 21 22 "github.com/cockroachdb/errors" 23 ) 24 25 const ( 26 MaxKeySize = 10 << 10 27 MaxValueSize = (1 << 31) - 2 28 ) 29 30 const bucketHeaderSize = int(unsafe.Sizeof(bucket{})) 31 32 const ( 33 minFillPercent = 0.1 34 maxFillPercent = 1.0 35 ) 36 37 const DefaultFillPercent = 1.0 38 39 type Bucket struct { 40 *bucket 41 tx *Tx 42 buckets map[string]*Bucket 43 page *page 44 rootNode *node 45 nodes map[pgid]*node 46 FillPercent float64 47 } 48 49 type bucket struct { 50 root pgid 51 sequence uint64 52 } 53 54 func newBucket(tx *Tx) Bucket { 55 var b = Bucket{tx: tx, FillPercent: DefaultFillPercent} 56 if tx.writable { 57 b.buckets = make(map[string]*Bucket) 58 b.nodes = make(map[pgid]*node, 1<<4) 59 } 60 return b 61 } 62 63 func (b *Bucket) Tx() *Tx { 64 return b.tx 65 } 66 67 func (b *Bucket) Root() pgid { 68 return b.root 69 } 70 71 func (b *Bucket) Writable() bool { 72 return b.tx.writable 73 } 74 func (b *Bucket) Cursor() *Cursor { 75 b.tx.stats.CursorCount++ 76 77 return &Cursor{ 78 bucket: b, 79 stack: make([]elemRef, 0), 80 } 81 } 82 83 func (b *Bucket) Bucket(name []byte) *Bucket { 84 if b.buckets != nil { 85 if child := b.buckets[string(name)]; child != nil { 86 return child 87 } 88 } 89 90 c := b.Cursor() 91 k, v, flags := c.seek(name) 92 93 if !bytes.Equal(name, k) || (flags&bucketLeafFlag) == 0 { 94 return nil 95 } 96 97 var child = b.openBucket(v) 98 if b.buckets != nil { 99 b.buckets[string(name)] = child 100 } 101 102 return child 103 } 104 105 func (b *Bucket) openBucket(value []byte) *Bucket { 106 var child = newBucket(b.tx) 107 108 const unalignedMask = unsafe.Alignof(struct { 109 bucket 110 page 111 }{}) - 1 112 unaligned := uintptr(unsafe.Pointer(&value[0]))&unalignedMask != 0 113 if unaligned { 114 value = cloneBytes(value) 115 } 116 117 if b.tx.writable && !unaligned { 118 child.bucket = &bucket{} 119 *child.bucket = *(*bucket)(unsafe.Pointer(&value[0])) 120 } else { 121 child.bucket = (*bucket)(unsafe.Pointer(&value[0])) 122 } 123 124 if child.root == 0 { 125 child.page = (*page)(unsafe.Pointer(&value[bucketHeaderSize])) 126 } 127 128 return &child 129 } 130 131 func openBucketPage(value []byte) *page { 132 var pg *page 133 bkt := (*bucket)(unsafe.Pointer(&value[0])) 134 if bkt.root == 0 { 135 pg = (*page)(unsafe.Pointer(&value[bucketHeaderSize])) 136 } 137 return pg 138 } 139 140 func (b *Bucket) CreateBucket(key []byte) (*Bucket, error) { 141 if b.tx.db == nil { 142 return nil, ErrTxClosed 143 } else if !b.tx.writable { 144 return nil, ErrTxNotWritable 145 } else if len(key) == 0 { 146 return nil, ErrBucketNameRequired 147 } 148 149 c := b.Cursor() 150 k, _, flags := c.seek(key) 151 152 if bytes.Equal(key, k) { 153 if (flags & bucketLeafFlag) != 0 { 154 return nil, ErrBucketExists 155 } 156 return nil, ErrIncompatibleValue 157 } 158 159 var bucket = Bucket{ 160 bucket: &bucket{}, 161 rootNode: &node{isLeaf: true}, 162 FillPercent: DefaultFillPercent, 163 } 164 var value = bucket.write() 165 166 key = cloneBytes(key) 167 c.node().put(key, key, value, 0, bucketLeafFlag) 168 169 b.page = nil 170 171 return b.Bucket(key), nil 172 } 173 174 func (b *Bucket) CreateBucketIfNotExists(key []byte) (*Bucket, error) { 175 child, err := b.CreateBucket(key) 176 if err == ErrBucketExists { 177 return b.Bucket(key), nil 178 } else if err != nil { 179 return nil, err 180 } 181 return child, nil 182 } 183 184 func (b *Bucket) DeleteBucket(key []byte) error { 185 if b.tx.db == nil { 186 return ErrTxClosed 187 } else if !b.Writable() { 188 return ErrTxNotWritable 189 } 190 191 c := b.Cursor() 192 k, _, flags := c.seek(key) 193 194 if !bytes.Equal(key, k) { 195 return ErrBucketNotFound 196 } else if (flags & bucketLeafFlag) == 0 { 197 return ErrIncompatibleValue 198 } 199 200 child := b.Bucket(key) 201 err := child.ForEach(func(k, v []byte) error { 202 if _, _, childFlags := child.Cursor().seek(k); (childFlags & bucketLeafFlag) != 0 { 203 if err := child.DeleteBucket(k); err != nil { 204 return errors.Wrap(err, "delete bucket err") 205 } 206 } 207 return nil 208 }) 209 if err != nil { 210 return err 211 } 212 213 delete(b.buckets, string(key)) 214 215 child.nodes = nil 216 child.rootNode = nil 217 child.free() 218 219 c.node().del(key) 220 221 return nil 222 } 223 224 func (b *Bucket) Get(key []byte) []byte { 225 k, v, flags := b.Cursor().seek(key) 226 227 if (flags & bucketLeafFlag) != 0 { 228 return nil 229 } 230 231 if !bytes.Equal(key, k) { 232 return nil 233 } 234 return v 235 } 236 237 func (b *Bucket) Seek(key []byte) ([]byte, []byte) { 238 return b.Cursor().Seek(key) 239 } 240 241 func (b *Bucket) Put(key []byte, value []byte) error { 242 if b.tx.db == nil { 243 return ErrTxClosed 244 } else if !b.Writable() { 245 return ErrTxNotWritable 246 } else if len(key) == 0 { 247 return ErrKeyRequired 248 } else if int64(len(value)) > MaxValueSize { 249 return ErrValueTooLarge 250 } 251 252 if len(key) > MaxKeySize { 253 key = key[:MaxKeySize] 254 } 255 256 c := b.Cursor() 257 k, _, flags := c.seek(key) 258 259 if bytes.Equal(key, k) && (flags&bucketLeafFlag) != 0 { 260 return ErrIncompatibleValue 261 } 262 263 key = cloneBytes(key) 264 c.node().put(key, key, value, 0, 0) 265 266 return nil 267 } 268 269 func (b *Bucket) Delete(key []byte) error { 270 if b.tx.db == nil { 271 return ErrTxClosed 272 } else if !b.Writable() { 273 return ErrTxNotWritable 274 } 275 276 c := b.Cursor() 277 k, _, flags := c.seek(key) 278 279 if !bytes.Equal(key, k) { 280 return nil 281 } 282 283 if (flags & bucketLeafFlag) != 0 { 284 return ErrIncompatibleValue 285 } 286 287 c.node().del(key) 288 289 return nil 290 } 291 292 func (b *Bucket) Sequence() uint64 { return b.bucket.sequence } 293 294 func (b *Bucket) SetSequence(v uint64) error { 295 if b.tx.db == nil { 296 return ErrTxClosed 297 } else if !b.Writable() { 298 return ErrTxNotWritable 299 } 300 301 if b.rootNode == nil { 302 _ = b.node(b.root, nil) 303 } 304 305 b.bucket.sequence = v 306 return nil 307 } 308 309 func (b *Bucket) NextSequence() (uint64, error) { 310 if b.tx.db == nil { 311 return 0, ErrTxClosed 312 } else if !b.Writable() { 313 return 0, ErrTxNotWritable 314 } 315 316 if b.rootNode == nil { 317 _ = b.node(b.root, nil) 318 } 319 320 b.bucket.sequence++ 321 return b.bucket.sequence, nil 322 } 323 324 func (b *Bucket) ForEach(fn func(k, v []byte) error) error { 325 if b.tx.db == nil { 326 return ErrTxClosed 327 } 328 c := b.Cursor() 329 for k, v := c.First(); k != nil; k, v = c.Next() { 330 if err := fn(k, v); err != nil { 331 return err 332 } 333 } 334 return nil 335 } 336 337 func (b *Bucket) Stats() BucketStats { 338 var s, subStats BucketStats 339 pageSize := b.tx.db.pageSize 340 s.BucketN += 1 341 if b.root == 0 { 342 s.InlineBucketN += 1 343 } 344 b.forEachPage(func(p *page, depth int) { 345 if (p.flags & leafPageFlag) != 0 { 346 s.KeyN += int(p.count) 347 348 used := pageHeaderSize 349 350 if p.count != 0 { 351 used += leafPageElementSize * uintptr(p.count-1) 352 353 lastElement := p.leafPageElement(p.count - 1) 354 used += uintptr(lastElement.pos + lastElement.ksize + lastElement.vsize) 355 } 356 357 if b.root == 0 { 358 s.InlineBucketInuse += int(used) 359 } else { 360 s.LeafPageN++ 361 s.LeafInuse += int(used) 362 s.LeafOverflowN += int(p.overflow) 363 364 for i := uint16(0); i < p.count; i++ { 365 e := p.leafPageElement(i) 366 if (e.flags & bucketLeafFlag) != 0 { 367 subStats.Add(b.openBucket(e.value()).Stats()) 368 } 369 } 370 } 371 } else if (p.flags & branchPageFlag) != 0 { 372 s.BranchPageN++ 373 lastElement := p.branchPageElement(p.count - 1) 374 375 used := pageHeaderSize + (branchPageElementSize * uintptr(p.count-1)) 376 377 used += uintptr(lastElement.pos + lastElement.ksize) 378 s.BranchInuse += int(used) 379 s.BranchOverflowN += int(p.overflow) 380 } 381 382 if depth+1 > s.Depth { 383 s.Depth = (depth + 1) 384 } 385 }) 386 387 s.BranchAlloc = (s.BranchPageN + s.BranchOverflowN) * pageSize 388 s.LeafAlloc = (s.LeafPageN + s.LeafOverflowN) * pageSize 389 390 s.Depth += subStats.Depth 391 s.Add(subStats) 392 return s 393 } 394 395 func (b *Bucket) forEachPage(fn func(*page, int)) { 396 if b.page != nil { 397 fn(b.page, 0) 398 return 399 } 400 401 b.tx.forEachPage(b.root, 0, fn) 402 } 403 404 func (b *Bucket) forEachPageNode(fn func(*page, *node, int)) { 405 if b.page != nil { 406 fn(b.page, nil, 0) 407 return 408 } 409 b._forEachPageNode(b.root, 0, fn) 410 } 411 412 func (b *Bucket) _forEachPageNode(pgid pgid, depth int, fn func(*page, *node, int)) { 413 var p, n = b.pageNode(pgid) 414 415 fn(p, n, depth) 416 417 if p != nil { 418 if (p.flags & branchPageFlag) != 0 { 419 for i := 0; i < int(p.count); i++ { 420 elem := p.branchPageElement(uint16(i)) 421 b._forEachPageNode(elem.pgid, depth+1, fn) 422 } 423 } 424 } else { 425 if !n.isLeaf { 426 for _, inode := range n.inodes { 427 b._forEachPageNode(inode.pgid, depth+1, fn) 428 } 429 } 430 } 431 } 432 433 func (b *Bucket) spill() error { 434 for name, child := range b.buckets { 435 var value []byte 436 if child.inlineable() { 437 child.free() 438 value = child.write() 439 } else { 440 if err := child.spill(); err != nil { 441 return err 442 } 443 value = make([]byte, unsafe.Sizeof(bucket{})) 444 var bucket = (*bucket)(unsafe.Pointer(&value[0])) 445 *bucket = *child.bucket 446 } 447 448 if child.rootNode == nil { 449 continue 450 } 451 452 var c = b.Cursor() 453 k, _, flags := c.seek([]byte(name)) 454 if !bytes.Equal([]byte(name), k) { 455 panic(fmt.Sprintf("misplaced bucket header: %x -> %x", []byte(name), k)) 456 } 457 if flags&bucketLeafFlag == 0 { 458 panic(fmt.Sprintf("unexpected bucket header flag: %x", flags)) 459 } 460 c.node().put([]byte(name), []byte(name), value, 0, bucketLeafFlag) 461 } 462 463 if b.rootNode == nil { 464 return nil 465 } 466 467 if err := b.rootNode.spill(); err != nil { 468 return err 469 } 470 b.rootNode = b.rootNode.root() 471 472 if b.rootNode.pgid >= b.tx.meta.pgid { 473 panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", b.rootNode.pgid, b.tx.meta.pgid)) 474 } 475 b.root = b.rootNode.pgid 476 477 return nil 478 } 479 480 func (b *Bucket) inlineable() bool { 481 var n = b.rootNode 482 483 if n == nil || !n.isLeaf { 484 return false 485 } 486 487 var size = pageHeaderSize 488 for _, inode := range n.inodes { 489 size += leafPageElementSize + uintptr(len(inode.key)) + uintptr(len(inode.value)) 490 491 if inode.flags&bucketLeafFlag != 0 { 492 return false 493 } else if size > b.maxInlineBucketSize() { 494 return false 495 } 496 } 497 498 return true 499 } 500 501 func (b *Bucket) maxInlineBucketSize() uintptr { 502 return uintptr(b.tx.db.pageSize / 4) 503 } 504 505 func (b *Bucket) write() []byte { 506 var n = b.rootNode 507 var value = make([]byte, bucketHeaderSize+n.size()) 508 509 var bucket = (*bucket)(unsafe.Pointer(&value[0])) 510 *bucket = *b.bucket 511 512 var p = (*page)(unsafe.Pointer(&value[bucketHeaderSize])) 513 n.write(p) 514 515 return value 516 } 517 518 func (b *Bucket) rebalance() { 519 for _, n := range b.nodes { 520 n.rebalance() 521 } 522 for _, child := range b.buckets { 523 child.rebalance() 524 } 525 } 526 527 func (b *Bucket) node(pgid pgid, parent *node) *node { 528 _assert(b.nodes != nil, "nodes map expected") 529 530 if n := b.nodes[pgid]; n != nil { 531 return n 532 } 533 534 n := &node{bucket: b, parent: parent} 535 if parent == nil { 536 b.rootNode = n 537 } else { 538 parent.children = append(parent.children, n) 539 } 540 541 var p = b.page 542 if p == nil { 543 p = b.tx.page(pgid) 544 } 545 546 n.read(p) 547 b.nodes[pgid] = n 548 549 b.tx.stats.NodeCount++ 550 551 return n 552 } 553 554 func (b *Bucket) free() { 555 if b.root == 0 { 556 return 557 } 558 559 var tx = b.tx 560 b.forEachPageNode(func(p *page, n *node, _ int) { 561 if p != nil { 562 tx.db.freelist.free(tx.meta.txid, p) 563 } else { 564 n.free() 565 } 566 }) 567 b.root = 0 568 } 569 570 func (b *Bucket) dereference() { 571 if b.rootNode != nil { 572 b.rootNode.root().dereference() 573 } 574 575 for _, child := range b.buckets { 576 child.dereference() 577 } 578 } 579 580 func (b *Bucket) pageNode(id pgid) (*page, *node) { 581 if b.root == 0 { 582 if id != 0 { 583 panic(fmt.Sprintf("inline bucket non-zero page access(2): %d != 0", id)) 584 } 585 if b.rootNode != nil { 586 return nil, b.rootNode 587 } 588 return b.page, nil 589 } 590 591 if b.nodes != nil { 592 if n := b.nodes[id]; n != nil { 593 return nil, n 594 } 595 } 596 597 return b.tx.page(id), nil 598 } 599 600 type BucketStats struct { 601 BranchPageN int 602 BranchOverflowN int 603 LeafPageN int 604 LeafOverflowN int 605 KeyN int 606 Depth int 607 BranchAlloc int 608 BranchInuse int 609 LeafAlloc int 610 LeafInuse int 611 BucketN int 612 InlineBucketN int 613 InlineBucketInuse int 614 } 615 616 func (s *BucketStats) Add(other BucketStats) { 617 s.BranchPageN += other.BranchPageN 618 s.BranchOverflowN += other.BranchOverflowN 619 s.LeafPageN += other.LeafPageN 620 s.LeafOverflowN += other.LeafOverflowN 621 s.KeyN += other.KeyN 622 if s.Depth < other.Depth { 623 s.Depth = other.Depth 624 } 625 s.BranchAlloc += other.BranchAlloc 626 s.BranchInuse += other.BranchInuse 627 s.LeafAlloc += other.LeafAlloc 628 s.LeafInuse += other.LeafInuse 629 630 s.BucketN += other.BucketN 631 s.InlineBucketN += other.InlineBucketN 632 s.InlineBucketInuse += other.InlineBucketInuse 633 } 634 635 func cloneBytes(v []byte) []byte { 636 var clone = make([]byte, len(v)) 637 copy(clone, v) 638 return clone 639 }