github.com/zuoyebang/bitalosdb@v1.1.1-0.20240516111551-79a8c4d8ce20/bitree/bdb/tx.go (about) 1 // Copyright 2021 The Bitalosdb author(hustxrb@163.com) and other contributors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package bdb 16 17 import ( 18 "io" 19 "os" 20 "sort" 21 "strings" 22 "sync/atomic" 23 "time" 24 "unsafe" 25 26 "github.com/cockroachdb/errors" 27 ) 28 29 type txid uint64 30 31 type ReadTx struct { 32 ref atomic.Int32 33 tx *Tx 34 bkt *Bucket 35 bdb *DB 36 } 37 38 func (rt *ReadTx) Init(tx *Tx, bkt *Bucket, bdb *DB) { 39 rt.ref.Store(1) 40 rt.tx = tx 41 rt.bkt = bkt 42 rt.bdb = bdb 43 } 44 45 func (rt *ReadTx) Bucket() *Bucket { 46 return rt.bkt 47 } 48 49 func (rt *ReadTx) Ref() { 50 rt.ref.Add(1) 51 } 52 53 func (rt *ReadTx) Unref(update bool) (err error) { 54 if rt.ref.Add(-1) == 0 { 55 err = rt.tx.Rollback() 56 if update { 57 err = rt.bdb.Update(func(tx *Tx) error { return nil }) 58 } 59 } 60 return err 61 } 62 63 type Tx struct { 64 writable bool 65 managed bool 66 db *DB 67 meta *meta 68 root Bucket 69 pages map[pgid]*page 70 stats TxStats 71 commitHandlers []func() 72 WriteFlag int 73 } 74 75 func (tx *Tx) init(db *DB) { 76 tx.db = db 77 tx.pages = nil 78 79 tx.meta = &meta{} 80 db.meta().copy(tx.meta) 81 82 tx.root = newBucket(tx) 83 tx.root.bucket = &bucket{} 84 *tx.root.bucket = tx.meta.root 85 86 if tx.writable { 87 tx.pages = make(map[pgid]*page, 1<<4) 88 tx.meta.txid += txid(1) 89 } 90 } 91 92 func (tx *Tx) ID() int { 93 return int(tx.meta.txid) 94 } 95 96 func (tx *Tx) DB() *DB { 97 return tx.db 98 } 99 100 func (tx *Tx) Size() int64 { 101 return int64(tx.meta.pgid) * int64(tx.db.pageSize) 102 } 103 104 func (tx *Tx) Writable() bool { 105 return tx.writable 106 } 107 108 func (tx *Tx) Cursor() *Cursor { 109 return tx.root.Cursor() 110 } 111 112 func (tx *Tx) Stats() TxStats { 113 return tx.stats 114 } 115 116 func (tx *Tx) Bucket(name []byte) *Bucket { 117 return tx.root.Bucket(name) 118 } 119 120 func (tx *Tx) CreateBucket(name []byte) (*Bucket, error) { 121 return tx.root.CreateBucket(name) 122 } 123 124 func (tx *Tx) CreateBucketIfNotExists(name []byte) (*Bucket, error) { 125 return tx.root.CreateBucketIfNotExists(name) 126 } 127 128 func (tx *Tx) DeleteBucket(name []byte) error { 129 return tx.root.DeleteBucket(name) 130 } 131 132 func (tx *Tx) ForEach(fn func(name []byte, b *Bucket) error) error { 133 return tx.root.ForEach(func(k, v []byte) error { 134 return fn(k, tx.root.Bucket(k)) 135 }) 136 } 137 138 func (tx *Tx) OnCommit(fn func()) { 139 tx.commitHandlers = append(tx.commitHandlers, fn) 140 } 141 142 func (tx *Tx) Commit() error { 143 _assert(!tx.managed, "managed tx commit not allowed") 144 if tx.db == nil { 145 return ErrTxClosed 146 } else if !tx.writable { 147 return ErrTxNotWritable 148 } 149 150 var startTime = time.Now() 151 tx.root.rebalance() 152 if tx.stats.Rebalance > 0 { 153 tx.stats.RebalanceTime += time.Since(startTime) 154 } 155 156 startTime = time.Now() 157 if err := tx.root.spill(); err != nil { 158 tx.rollback() 159 return err 160 } 161 tx.stats.SpillTime += time.Since(startTime) 162 163 tx.meta.root.root = tx.root.root 164 165 if tx.meta.freelist != pgidNoFreelist { 166 tx.db.freelist.free(tx.meta.txid, tx.db.page(tx.meta.freelist)) 167 } 168 169 if !tx.db.NoFreelistSync { 170 if err := tx.commitFreelist(); err != nil { 171 return err 172 } 173 } else { 174 tx.meta.freelist = pgidNoFreelist 175 } 176 177 startTime = time.Now() 178 if err := tx.write(); err != nil { 179 tx.rollback() 180 return err 181 } 182 183 if tx.db.StrictMode { 184 ch := tx.Check() 185 var errs []string 186 for { 187 err, ok := <-ch 188 if !ok { 189 break 190 } 191 errs = append(errs, err.Error()) 192 } 193 if len(errs) > 0 { 194 panic("check fail: " + strings.Join(errs, "\n")) 195 } 196 } 197 198 if err := tx.writeMeta(); err != nil { 199 tx.rollback() 200 return err 201 } 202 tx.stats.WriteTime += time.Since(startTime) 203 204 tx.close() 205 206 for _, fn := range tx.commitHandlers { 207 fn() 208 } 209 210 return nil 211 } 212 213 func (tx *Tx) commitFreelist() error { 214 opgid := tx.meta.pgid 215 rb, freeCount := tx.db.freelist.convertBitmap() 216 rbLen := rb.GetSerializedSizeInBytes() 217 count := ((int(rbLen) + freelistBitmapHeaderSize) / tx.db.pageSize) + 2 218 p, isFreePage, err := tx.allocate(count) 219 if err != nil { 220 tx.rollback() 221 return err 222 } 223 224 if isFreePage { 225 for pid := p.id; pid < p.id+pgid(count); pid++ { 226 if rb.Contains(uint64(pid)) { 227 rb.Remove(uint64(pid)) 228 freeCount-- 229 } 230 } 231 rbLen = rb.GetSerializedSizeInBytes() 232 } 233 234 if freeCount < 0 { 235 freeCount = 0 236 } 237 238 if err := tx.db.freelist.writeBitmap(p, rb, rbLen, freeCount); err != nil { 239 tx.rollback() 240 return err 241 } 242 243 tx.meta.freelist = p.id 244 if tx.meta.pgid > opgid { 245 if err := tx.db.grow(int(tx.meta.pgid+1) * tx.db.pageSize); err != nil { 246 tx.rollback() 247 return err 248 } 249 } 250 251 return nil 252 } 253 254 func (tx *Tx) Rollback() error { 255 _assert(!tx.managed, "managed tx rollback not allowed") 256 if tx.db == nil { 257 return ErrTxClosed 258 } 259 tx.nonPhysicalRollback() 260 return nil 261 } 262 263 func (tx *Tx) nonPhysicalRollback() { 264 if tx.db == nil { 265 return 266 } 267 if tx.writable { 268 tx.db.freelist.rollback(tx.meta.txid) 269 } 270 tx.close() 271 } 272 273 func (tx *Tx) rollback() { 274 if tx.db == nil { 275 return 276 } 277 if tx.writable { 278 tx.db.freelist.rollback(tx.meta.txid) 279 if !tx.db.hasSyncedFreelist() { 280 tx.db.freelist.noSyncReload(tx.db.freepages()) 281 } else { 282 tx.db.freelist.reload(tx.db.page(tx.db.meta().freelist), tx.db.meta().version) 283 } 284 } 285 tx.close() 286 } 287 288 func (tx *Tx) close() { 289 if tx.db == nil { 290 return 291 } 292 if tx.writable { 293 var freelistFreeN = tx.db.freelist.free_count() 294 var freelistPendingN = tx.db.freelist.pending_count() 295 var freelistAlloc = tx.db.freelist.size() 296 297 tx.db.rwtx = nil 298 tx.db.rwlock.Unlock() 299 300 tx.db.statlock.Lock() 301 tx.db.stats.FreePageN = freelistFreeN 302 tx.db.stats.PendingPageN = freelistPendingN 303 tx.db.stats.FreeAlloc = (freelistFreeN + freelistPendingN) * tx.db.pageSize 304 tx.db.stats.FreelistInuse = freelistAlloc 305 tx.db.stats.TxStats.add(&tx.stats) 306 tx.db.statlock.Unlock() 307 } else { 308 tx.db.removeTx(tx) 309 } 310 311 tx.db = nil 312 tx.meta = nil 313 tx.root = Bucket{tx: tx} 314 tx.pages = nil 315 } 316 317 func (tx *Tx) Copy(w io.Writer) error { 318 _, err := tx.WriteTo(w) 319 return err 320 } 321 322 func (tx *Tx) WriteTo(w io.Writer) (n int64, err error) { 323 f, err := tx.db.openFile(tx.db.path, os.O_RDONLY|tx.WriteFlag, 0) 324 if err != nil { 325 return 0, err 326 } 327 defer func() { 328 if cerr := f.Close(); err == nil { 329 err = cerr 330 } 331 }() 332 333 buf := make([]byte, tx.db.pageSize) 334 page := (*page)(unsafe.Pointer(&buf[0])) 335 page.flags = metaPageFlag 336 *page.meta() = *tx.meta 337 338 page.id = 0 339 page.meta().checksum = page.meta().sum64() 340 nn, err := w.Write(buf) 341 n += int64(nn) 342 if err != nil { 343 return n, errors.Wrap(err, "meta 0 copy err") 344 } 345 346 page.id = 1 347 page.meta().txid -= 1 348 page.meta().checksum = page.meta().sum64() 349 nn, err = w.Write(buf) 350 n += int64(nn) 351 if err != nil { 352 return n, errors.Wrap(err, "meta 1 copy err") 353 } 354 355 if _, err := f.Seek(int64(tx.db.pageSize*2), io.SeekStart); err != nil { 356 return n, errors.Wrap(err, "seek err") 357 } 358 359 wn, err := io.CopyN(w, f, tx.Size()-int64(tx.db.pageSize*2)) 360 n += wn 361 if err != nil { 362 return n, err 363 } 364 365 return n, nil 366 } 367 368 func (tx *Tx) CopyFile(path string, mode os.FileMode) error { 369 f, err := tx.db.openFile(path, os.O_RDWR|os.O_CREATE|os.O_TRUNC, mode) 370 if err != nil { 371 return err 372 } 373 374 _, err = tx.WriteTo(f) 375 if err != nil { 376 _ = f.Close() 377 return err 378 } 379 return f.Close() 380 } 381 382 func (tx *Tx) Check() <-chan error { 383 ch := make(chan error) 384 go tx.check(ch) 385 return ch 386 } 387 388 func (tx *Tx) check(ch chan error) { 389 tx.db.loadFreelist() 390 391 freed := make(map[pgid]bool, 1<<4) 392 all := make([]pgid, tx.db.freelist.count()) 393 tx.db.freelist.copyall(all) 394 for _, id := range all { 395 if freed[id] { 396 ch <- errors.Errorf("page %d: already freed", id) 397 } 398 freed[id] = true 399 } 400 401 reachable := make(map[pgid]*page, 1<<4) 402 reachable[0] = tx.page(0) 403 reachable[1] = tx.page(1) 404 if tx.meta.freelist != pgidNoFreelist { 405 for i := uint32(0); i <= tx.page(tx.meta.freelist).overflow; i++ { 406 reachable[tx.meta.freelist+pgid(i)] = tx.page(tx.meta.freelist) 407 } 408 } 409 410 tx.checkBucket(&tx.root, reachable, freed, ch) 411 412 for i := pgid(0); i < tx.meta.pgid; i++ { 413 _, isReachable := reachable[i] 414 if !isReachable && !freed[i] { 415 ch <- errors.Errorf("page %d: unreachable unfreed", int(i)) 416 } 417 } 418 419 close(ch) 420 } 421 422 func (tx *Tx) checkBucket(b *Bucket, reachable map[pgid]*page, freed map[pgid]bool, ch chan error) { 423 if b.root == 0 { 424 return 425 } 426 427 b.tx.forEachPage(b.root, 0, func(p *page, _ int) { 428 if p.id > tx.meta.pgid { 429 ch <- errors.Errorf("page %d: out of bounds: %d", int(p.id), int(b.tx.meta.pgid)) 430 } 431 432 for i := pgid(0); i <= pgid(p.overflow); i++ { 433 var id = p.id + i 434 if _, ok := reachable[id]; ok { 435 ch <- errors.Errorf("page %d: multiple references", int(id)) 436 } 437 reachable[id] = p 438 } 439 440 if freed[p.id] { 441 ch <- errors.Errorf("page %d: reachable freed", int(p.id)) 442 } else if (p.flags&branchPageFlag) == 0 && (p.flags&leafPageFlag) == 0 { 443 ch <- errors.Errorf("page %d: invalid type: %s", int(p.id), p.typ()) 444 } 445 }) 446 447 _ = b.ForEach(func(k, v []byte) error { 448 if child := b.Bucket(k); child != nil { 449 tx.checkBucket(child, reachable, freed, ch) 450 } 451 return nil 452 }) 453 } 454 455 func (tx *Tx) allocate(count int) (*page, bool, error) { 456 p, isFreePage, err := tx.db.allocate(tx.meta.txid, count) 457 if err != nil { 458 return nil, isFreePage, err 459 } 460 461 tx.pages[p.id] = p 462 463 tx.stats.PageCount += count 464 tx.stats.PageAlloc += count * tx.db.pageSize 465 466 return p, isFreePage, nil 467 } 468 469 func (tx *Tx) write() error { 470 pages := make(pages, 0, len(tx.pages)) 471 for _, p := range tx.pages { 472 pages = append(pages, p) 473 } 474 475 tx.pages = make(map[pgid]*page, 1<<4) 476 sort.Sort(pages) 477 478 for _, p := range pages { 479 rem := (uint64(p.overflow) + 1) * uint64(tx.db.pageSize) 480 offset := int64(p.id) * int64(tx.db.pageSize) 481 var written uintptr 482 483 for { 484 sz := rem 485 if sz > maxAllocSize-1 { 486 sz = maxAllocSize - 1 487 } 488 buf := unsafeByteSlice(unsafe.Pointer(p), written, 0, int(sz)) 489 490 if _, err := tx.db.ops.writeAt(buf, offset); err != nil { 491 return err 492 } 493 494 tx.stats.Write++ 495 496 rem -= sz 497 if rem == 0 { 498 break 499 } 500 501 offset += int64(sz) 502 written += uintptr(sz) 503 } 504 } 505 506 if !tx.db.NoSync || IgnoreNoSync { 507 if err := fdatasync(tx.db); err != nil { 508 return err 509 } 510 } 511 512 for _, p := range pages { 513 if int(p.overflow) != 0 { 514 continue 515 } 516 517 buf := unsafeByteSlice(unsafe.Pointer(p), 0, 0, tx.db.pageSize) 518 519 for i := range buf { 520 buf[i] = 0 521 } 522 tx.db.pagePool.Put(buf) 523 } 524 525 return nil 526 } 527 528 func (tx *Tx) writeMeta() (err error) { 529 buf := make([]byte, tx.db.pageSize) 530 p := tx.db.pageInBuffer(buf, 0) 531 532 metaOldVersion := tx.meta.version 533 if tx.db.version == versionFreelistBitmap && metaOldVersion != versionFreelistBitmap { 534 tx.meta.version = versionFreelistBitmap 535 } 536 537 tx.meta.write(p) 538 539 defer func() { 540 if err != nil && tx.meta.version != metaOldVersion { 541 tx.meta.version = metaOldVersion 542 } 543 }() 544 545 if _, err = tx.db.ops.writeAt(buf, int64(p.id)*int64(tx.db.pageSize)); err != nil { 546 547 return err 548 } 549 if !tx.db.NoSync || IgnoreNoSync { 550 if err = fdatasync(tx.db); err != nil { 551 tx.meta.version = metaOldVersion 552 return err 553 } 554 } 555 556 tx.stats.Write++ 557 558 return nil 559 } 560 561 func (tx *Tx) page(id pgid) *page { 562 if tx.pages != nil { 563 if p, ok := tx.pages[id]; ok { 564 return p 565 } 566 } 567 568 return tx.db.page(id) 569 } 570 571 func (tx *Tx) forEachPage(pgid pgid, depth int, fn func(*page, int)) { 572 p := tx.page(pgid) 573 574 fn(p, depth) 575 576 if (p.flags & branchPageFlag) != 0 { 577 for i := 0; i < int(p.count); i++ { 578 elem := p.branchPageElement(uint16(i)) 579 tx.forEachPage(elem.pgid, depth+1, fn) 580 } 581 } 582 } 583 584 func (tx *Tx) Page(id int) (*PageInfo, error) { 585 if tx.db == nil { 586 return nil, ErrTxClosed 587 } else if pgid(id) >= tx.meta.pgid { 588 return nil, nil 589 } 590 591 p := tx.db.page(pgid(id)) 592 info := &PageInfo{ 593 ID: id, 594 Count: int(p.count), 595 OverflowCount: int(p.overflow), 596 } 597 598 if tx.db.freelist.freed(pgid(id)) { 599 info.Type = "free" 600 } else { 601 info.Type = p.typ() 602 } 603 604 return info, nil 605 } 606 607 type TxStats struct { 608 PageCount int 609 PageAlloc int 610 CursorCount int 611 NodeCount int 612 NodeDeref int 613 Rebalance int 614 RebalanceTime time.Duration 615 Split int 616 Spill int 617 SpillTime time.Duration 618 Write int 619 WriteTime time.Duration 620 } 621 622 func (s *TxStats) add(other *TxStats) { 623 s.PageCount += other.PageCount 624 s.PageAlloc += other.PageAlloc 625 s.CursorCount += other.CursorCount 626 s.NodeCount += other.NodeCount 627 s.NodeDeref += other.NodeDeref 628 s.Rebalance += other.Rebalance 629 s.RebalanceTime += other.RebalanceTime 630 s.Split += other.Split 631 s.Spill += other.Spill 632 s.SpillTime += other.SpillTime 633 s.Write += other.Write 634 s.WriteTime += other.WriteTime 635 } 636 637 func (s *TxStats) Sub(other *TxStats) TxStats { 638 var diff TxStats 639 diff.PageCount = s.PageCount - other.PageCount 640 diff.PageAlloc = s.PageAlloc - other.PageAlloc 641 diff.CursorCount = s.CursorCount - other.CursorCount 642 diff.NodeCount = s.NodeCount - other.NodeCount 643 diff.NodeDeref = s.NodeDeref - other.NodeDeref 644 diff.Rebalance = s.Rebalance - other.Rebalance 645 diff.RebalanceTime = s.RebalanceTime - other.RebalanceTime 646 diff.Split = s.Split - other.Split 647 diff.Spill = s.Spill - other.Spill 648 diff.SpillTime = s.SpillTime - other.SpillTime 649 diff.Write = s.Write - other.Write 650 diff.WriteTime = s.WriteTime - other.WriteTime 651 return diff 652 }