// Copyright 2021 The Bitalosdb author(hustxrb@163.com) and other contributors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package bitalosdb

import (
	"bytes"
	"io"
	"os"
	"sort"
	"sync"
	"sync/atomic"

	"github.com/cockroachdb/errors"
	"github.com/zuoyebang/bitalosdb/bitree"
	"github.com/zuoyebang/bitalosdb/internal/arenaskl"
	"github.com/zuoyebang/bitalosdb/internal/base"
	"github.com/zuoyebang/bitalosdb/internal/hash"
	"github.com/zuoyebang/bitalosdb/internal/invariants"
	"github.com/zuoyebang/bitalosdb/internal/manual"
	"github.com/zuoyebang/bitalosdb/internal/options"
	"github.com/zuoyebang/bitalosdb/internal/record"
	"github.com/zuoyebang/bitalosdb/internal/utils"
	"github.com/zuoyebang/bitalosdb/internal/vfs"
)

type Bitower struct {
	db               *DB
	btree            *bitree.Bitree
	index            int
	walDirname       string
	walDir           vfs.File
	commit           *commitPipeline
	logRecycler      logRecycler
	memTableSize     int
	memTableReserved atomic.Int64
	iterSlowCount    atomic.Uint64

	readState struct {
		sync.RWMutex
		val *readState
	}

	mu struct {
		sync.Mutex
		metaEdit *bitowerMetaEditor

		log struct {
			queue []fileInfo
			*record.LogWriter
		}

		mem struct {
			cond      sync.Cond
			mutable   *memTable
			queue     flushableList
			switching bool
		}

		compact struct {
			cond     sync.Cond
			flushing bool
		}
	}
}

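// openBitower opens the Bitower for the given index: it creates the per-index
// WAL directory if needed, wires up the commit pipeline, opens the underlying
// bitree, replays every WAL file at or above the minimum unflushed log number,
// switches to a freshly created WAL, and finally scans for and deletes
// obsolete files.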
func openBitower(d *DB, index int) (*Bitower, error) {
	s := &Bitower{
		db:           d,
		index:        index,
		memTableSize: d.opts.MemTableSize,
		logRecycler:  logRecycler{limit: d.opts.MemTableStopWritesThreshold + 1},
	}

	s.walDirname = base.MakeWalpath(d.walDirname, index)
	if err := d.opts.FS.MkdirAll(s.walDirname, 0755); err != nil {
		return nil, err
	}
	walDir, err := d.opts.FS.OpenDir(s.walDirname)
	if err != nil {
		return nil, err
	}
	s.walDir = walDir

	s.commit = newCommitPipeline(commitEnv{
		logSeqNum:     &d.meta.atomic.logSeqNum,
		visibleSeqNum: &d.meta.atomic.visibleSeqNum,
		apply:         s.commitApply,
		write:         s.commitWrite,
		useQueue:      !d.opts.DisableWAL,
	})

	s.mu.mem.cond.L = &s.mu.Mutex
	s.mu.compact.cond.L = &s.mu.Mutex

	s.mu.Lock()
	defer s.mu.Unlock()

	s.initMetaEdit()

	btreeOpts := s.db.optspool.CloneBitreeOptions()
	btreeOpts.IsFlushedBitableCB = s.db.isFlushedBitable
	btreeOpts.Index = s.index
	s.btree, err = bitree.NewBitree(s.db.dirname, btreeOpts)
	if err != nil {
		return nil, err
	}

	var entry *flushableEntry
	s.mu.mem.mutable, entry = s.newMemTable(0, d.meta.atomic.logSeqNum)
	s.mu.mem.queue = append(s.mu.mem.queue, entry)

	ls, err := d.opts.FS.List(s.walDirname)
	if err != nil {
		return nil, err
	}
	type fileNumAndName struct {
		num  FileNum
		name string
	}
	var logFiles []fileNumAndName
	var maxSeqNum uint64
	for _, filename := range ls {
		ft, fn, ok := base.ParseFilename(d.opts.FS, filename)
		if !ok {
			continue
		}

		if s.mu.metaEdit.NextFileNum <= fn {
			s.mu.metaEdit.NextFileNum = fn + 1
		}

		switch ft {
		case fileTypeLog:
			if fn >= s.getMinUnflushedLogNum() {
				logFiles = append(logFiles, fileNumAndName{fn, filename})
			}
			if s.logRecycler.minRecycleLogNum <= fn {
				s.logRecycler.minRecycleLogNum = fn + 1
			}
		}
	}

	sort.Slice(logFiles, func(i, j int) bool {
		return logFiles[i].num < logFiles[j].num
	})

	for _, lf := range logFiles {
		walFilename := d.opts.FS.PathJoin(s.walDirname, lf.name)
		maxSeqNum, err = s.replayWAL(walFilename, lf.num)
		if err != nil {
			return nil, err
		}
		d.opts.Logger.Infof("[BITOWER %d] replayWAL ok wal:%s maxSeqNum:%d", s.index, walFilename, maxSeqNum)
		s.mu.metaEdit.MarkFileNumUsed(lf.num)
		if d.meta.atomic.logSeqNum < maxSeqNum {
			d.meta.atomic.logSeqNum = maxSeqNum
		}
	}

	newLogNum := s.mu.metaEdit.GetNextFileNum()
	sme := &bitowerMetaEditor{MinUnflushedLogNum: newLogNum}
	if err = s.metaApply(sme); err != nil {
		return nil, err
	}

	newLogName := s.makeWalFilename(newLogNum)
	s.mu.log.queue = append(s.mu.log.queue, fileInfo{fileNum: newLogNum, fileSize: 0})
	logFile, err := d.opts.FS.Create(newLogName)
	if err != nil {
		return nil, err
	}
	if err = s.walDir.Sync(); err != nil {
		return nil, err
	}

	d.opts.EventListener.WALCreated(WALCreateInfo{
		Index:   index,
		Path:    newLogName,
		FileNum: newLogNum,
	})

	s.mu.mem.queue[len(s.mu.mem.queue)-1].logNum = newLogNum

	logFile = vfs.NewSyncingFile(logFile, vfs.SyncingFileOptions{
		BytesPerSync:    d.opts.WALBytesPerSync,
		PreallocateSize: s.walPreallocateSize(),
	})
	s.mu.log.LogWriter = record.NewLogWriter(logFile, newLogNum)
	s.mu.log.LogWriter.SetMinSyncInterval(d.opts.WALMinSyncInterval)

	s.updateReadState()

	s.scanObsoleteFiles(ls)
	s.doDeleteObsoleteFiles()

	return s, nil
}

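// replayWAL replays the WAL file at filename, re-applying each batch to one or
// more replay memtables (rotating to a new memtable whenever the arena fills
// up) and flushing them through runCompaction. It returns the sequence number
// immediately after the last replayed entry so the caller can advance the
// database-wide log sequence number.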
func (s *Bitower) replayWAL(filename string, logNum FileNum) (maxSeqNum uint64, err error) {
	file, err := s.db.opts.FS.Open(filename)
	if err != nil {
		return 0, err
	}
	defer file.Close()

	var (
		b               BatchBitower
		buf             bytes.Buffer
		mem             *memTable
		entry           *flushableEntry
		toFlush         flushableList
		rr              = record.NewReader(file, logNum)
		offset          int64
		lastFlushOffset int64
	)

	flushMem := func() {
		if mem == nil {
			return
		}
		var logSize uint64
		if offset >= lastFlushOffset {
			logSize = uint64(offset - lastFlushOffset)
		}
		lastFlushOffset = offset
		entry.logSize = logSize
		toFlush = append(toFlush, entry)
		mem, entry = nil, nil
	}

	ensureMem := func(seqNum uint64) {
		if mem != nil {
			return
		}
		mem, entry = s.newMemTable(logNum, seqNum)
	}

	for {
		offset = rr.Offset()
		r, err := rr.Next()
		if err == nil {
			_, err = io.Copy(&buf, r)
		}
		if err != nil {
			if err == io.EOF {
				break
			} else if record.IsInvalidRecord(err) {
				break
			}
			return 0, errors.Wrap(err, "bitalosdb: error when replaying WAL")
		}

		if buf.Len() < batchHeaderLen {
			return 0, base.CorruptionErrorf("bitalosdb: corrupt log file %q (num %s)",
				filename, errors.Safe(logNum))
		}

		b = BatchBitower{
			db:    s.db,
			index: s.index,
		}
		b.SetRepr(buf.Bytes())
		seqNum := b.SeqNum()
		maxSeqNum = seqNum + uint64(b.Count())

		ensureMem(seqNum)
		if err = mem.prepare(&b, false); err != nil && err != arenaskl.ErrArenaFull {
			return 0, err
		}
		for err == arenaskl.ErrArenaFull {
			flushMem()
			ensureMem(seqNum)
			err = mem.prepare(&b, false)
			if err != nil && err != arenaskl.ErrArenaFull {
				return 0, err
			}
		}
		if err = mem.apply(&b, seqNum); err != nil {
			return 0, err
		}
		mem.writerUnref()
		buf.Reset()
	}
	flushMem()

	if len(toFlush) > 0 {
		c := newFlush(s.db.opts, toFlush)
		if err = s.runCompaction(c, len(toFlush)); err != nil {
			return 0, err
		}
		for i := range toFlush {
			toFlush[i].readerUnref()
		}
	}

	return maxSeqNum, err
}

func (s *Bitower) initMetaEdit() {
	s.mu.metaEdit = &s.db.meta.meta.Bmes[s.index]

	if s.mu.metaEdit.NextFileNum == 0 {
		s.mu.metaEdit.NextFileNum = 1
	}

	s.mu.metaEdit.MarkFileNumUsed(s.mu.metaEdit.MinUnflushedLogNum)
}

func (s *Bitower) getMinUnflushedLogNum() FileNum {
	return s.mu.metaEdit.MinUnflushedLogNum
}

func (s *Bitower) metaApply(sme *bitowerMetaEditor) error {
	if sme.MinUnflushedLogNum != 0 {
		if sme.MinUnflushedLogNum < s.getMinUnflushedLogNum() || s.mu.metaEdit.NextFileNum <= sme.MinUnflushedLogNum {
			return errors.Errorf("inconsistent bitowerMetaEditor minUnflushedLogNum %d", sme.MinUnflushedLogNum)
		}
	}

	sme.Index = s.index
	sme.NextFileNum = s.mu.metaEdit.NextFileNum
	return s.db.meta.apply(sme)
}

// walPreallocateSize returns the WAL preallocation size: the memtable size
// plus ten percent.
func (s *Bitower) walPreallocateSize() int {
	size := s.memTableSize
	size = (size / 10) + size
	return size
}

func (s *Bitower) makeWalFilename(fileNum FileNum) string {
	return base.MakeFilepath(s.db.opts.FS, s.walDirname, fileTypeLog, fileNum)
}

// newMemTable allocates an arena-backed memtable of memTableSize bytes and
// wraps it in a flushableEntry whose releaseMemAccounting callback frees the
// arena and the associated memory reservation.
func (s *Bitower) newMemTable(logNum FileNum, logSeqNum uint64) (*memTable, *flushableEntry) {
	size := s.memTableSize
	mem := newMemTable(memTableOptions{
		Options:   s.db.opts,
		arenaBuf:  manual.New(size),
		size:      size,
		logSeqNum: logSeqNum,
	})

	s.memTableReserved.Add(int64(size))

	invariants.SetFinalizer(mem, checkMemTable)

	entry := newFlushableEntry(mem, logNum, logSeqNum)
	entry.releaseMemAccounting = func() {
		manual.Free(mem.arenaBuf)
		mem.arenaBuf = nil
		s.memTableReserved.Add(-int64(size))
	}
	return mem, entry
}

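// Get looks up key and returns its value together with a closer that releases
// the resources pinning the returned slice. The lookup consults the memtable
// queue from newest to oldest, then the db-level cache if one is configured,
// and finally the bitree; a delete tombstone found in a memtable
// short-circuits to ErrNotFound.
//
// Caller-side sketch (tower and use are illustrative names only):
//
//	v, closer, err := tower.Get(key)
//	if err == nil {
//		use(v)
//		if closer != nil {
//			closer()
//		}
//	}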
func (s *Bitower) Get(key []byte) ([]byte, func(), error) {
	rs := s.loadReadState()

	for n := len(rs.memtables) - 1; n >= 0; n-- {
		m := rs.memtables[n]
		mValue, mExist, kind := m.get(key)
		if mExist {
			switch kind {
			case InternalKeyKindSet, InternalKeyKindPrefixDelete:
				return mValue, func() { rs.unref() }, nil
			case InternalKeyKindDelete:
				rs.unref()
				return nil, nil, ErrNotFound
			}
		}
	}

	rs.unref()

	useCache := false
	keyHash := hash.Crc32(key)
	if s.db.cache != nil {
		useCache = true
		ivCache, ivCloser, ivExist := s.db.cache.Get(key, keyHash)
		if ivExist {
			return ivCache, ivCloser, nil
		}
	}

	v, exist, closer := s.btree.Get(key, keyHash)
	if exist {
		if useCache && v != nil {
			s.db.cache.Set(key, v, keyHash)
		}

		return v, closer, nil
	}
	return nil, nil, ErrNotFound
}

// Exist reports whether key is present, consulting the memtable queue from
// newest to oldest, then the db-level cache if one is configured, and
// otherwise the bitree.
func (s *Bitower) Exist(key []byte) bool {
	rs := s.loadReadState()
	defer rs.unref()

	for n := len(rs.memtables) - 1; n >= 0; n-- {
		m := rs.memtables[n]
		_, mExist, kind := m.get(key)
		if mExist {
			switch kind {
			case InternalKeyKindSet:
				return true
			case InternalKeyKindDelete, InternalKeyKindPrefixDelete:
				return false
			}
		}
	}

	keyHash := hash.Crc32(key)

	if s.db.cache != nil {
		_, ivCloser, ivExist := s.db.cache.Get(key, keyHash)
		if ivCloser != nil {
			ivCloser()
		}
		return ivExist
	}

	return s.btree.Exist(key, keyHash)
}

// Flush forces a memtable rotation and blocks until the queued memtables have
// been flushed.
func (s *Bitower) Flush() error {
	flushDone, err := s.AsyncFlush()
	if err != nil {
		return err
	}
	if flushDone != nil {
		<-flushDone
	}
	return nil
}

// AsyncFlush forces a memtable rotation and returns a channel that is
// signalled once the queued memtables have been flushed. It returns a nil
// channel when every queued memtable is already empty.
func (s *Bitower) AsyncFlush() (<-chan struct{}, error) {
	if s.db.IsClosed() {
		return nil, ErrClosed
	}

	s.commit.mu.Lock()
	defer s.commit.mu.Unlock()
	s.mu.Lock()
	defer s.mu.Unlock()
	empty := true
	for i := range s.mu.mem.queue {
		if !s.mu.mem.queue[i].empty() {
			empty = false
			break
		}
	}
	if empty {
		return nil, nil
	}
	flushed := s.mu.mem.queue[len(s.mu.mem.queue)-1].flushed
	err := s.makeRoomForWrite(nil, false)
	if err != nil {
		return nil, err
	}
	return flushed, nil
}

func (s *Bitower) newBitreeIter(o *options.IterOptions) (iters []base.InternalIterator) {
	return s.btree.NewIters(o)
}

// checkpoint copies the WAL files referenced by the memtable queue, followed
// by the bitree state, into destDir. When isSync is true the WAL is synced
// first so the checkpoint captures all acknowledged writes.
func (s *Bitower) checkpoint(fs vfs.FS, destDir string, isSync bool) error {
	s.mu.Lock()
	memQueue := s.mu.mem.queue
	s.mu.Unlock()

	if isSync {
		if err := s.db.LogData(nil, s.index, Sync); err != nil {
			return err
		}
	}

	for i := range memQueue {
		logNum := memQueue[i].logNum
		if logNum == 0 {
			continue
		}
		srcPath := base.MakeFilepath(fs, s.walDirname, fileTypeLog, logNum)
		destPath := fs.PathJoin(destDir, fs.PathBase(srcPath))
		if err := vfs.Copy(fs, srcPath, destPath); err != nil {
			return err
		}
	}

	return s.checkpointBitree(fs, destDir)
}

func (s *Bitower) checkpointBitree(fs vfs.FS, destDir string) error {
	srcPath := base.MakeBitreeFilepath(s.db.dirname, s.index)
	destPath := fs.PathJoin(destDir, fs.PathBase(srcPath))
	if err := vfs.Copy(fs, srcPath, destPath); err != nil {
		return err
	}
	if err := s.btree.Checkpoint(fs, destDir, s.db.dirname); err != nil {
		return err
	}

	return nil
}

// Close waits for any in-flight flush to finish, then closes the WAL writer,
// releases the read state and memtables, and closes the WAL directory handle
// and the bitree. A non-zero memtable reservation at this point is reported
// as a leak.
func (s *Bitower) Close() (err error) {
	s.mu.Lock()
	defer s.mu.Unlock()

	for s.mu.compact.flushing {
		s.mu.compact.cond.Wait()
	}

	err = utils.FirstError(err, s.mu.log.Close())

	s.readState.val.unref()

	for _, mem := range s.mu.mem.queue {
		mem.readerUnref()
	}

	if reserved := s.memTableReserved.Load(); reserved != 0 {
		err = utils.FirstError(err, errors.Errorf("leaked memtable reservation:%d", reserved))
	}

	err = utils.FirstError(err, s.walDir.Close())
	err = utils.FirstError(err, s.btree.Close())
	s.db.opts.Logger.Infof("[BITOWER %d] closed...", s.index)
	return err
}

// commitApply is the apply half of the commit pipeline: it applies the batch
// to its memtable and, once the memtable's last writer is done, may schedule
// a flush.
func (s *Bitower) commitApply(b *BatchBitower, mem *memTable) error {
	err := mem.apply(b, b.SeqNum())
	if err != nil {
		return err
	}

	if mem.writerUnref() {
		s.mu.Lock()
		s.maybeScheduleFlush(true)
		s.mu.Unlock()
	}
	return nil
}

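// commitWrite is the write half of the commit pipeline: it makes room in the
// mutable memtable for the batch (possibly rotating the WAL and memtable) and,
// unless the WAL is disabled, appends the batch representation to the current
// log via SyncRecord. The returned memtable is the one the batch will later be
// applied to by commitApply.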
func (s *Bitower) commitWrite(b *BatchBitower, syncWG *sync.WaitGroup, syncErr *error) (*memTable, error) {
	repr := b.Repr()

	s.mu.Lock()

	err := s.makeRoomForWrite(b, true)

	mem := s.mu.mem.mutable

	s.mu.Unlock()
	if err != nil {
		return nil, err
	}

	if s.db.opts.DisableWAL {
		return mem, nil
	}

	if _, err = s.mu.log.SyncRecord(repr, syncWG, syncErr); err != nil {
		return nil, err
	}

	return mem, err
}

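// makeRoomForWrite ensures the mutable memtable can accept the batch, or
// forces a rotation when b is nil. When the memtable is full it rotates the
// WAL (reusing a recycled log file when one is available), enqueues the old
// memtable for flushing, and installs a fresh mutable memtable. If the total
// size of queued memtables reaches MemTableStopWritesThreshold*MemTableSize,
// the write stalls until a flush completes. Requires s.mu to be held; the
// lock is temporarily released while the new WAL file is created.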
func (s *Bitower) makeRoomForWrite(b *BatchBitower, needReport bool) error {
	force := b == nil
	stalled := false
	for {
		if s.mu.mem.switching {
			s.mu.mem.cond.Wait()
			continue
		}
		if b != nil {
			err := s.mu.mem.mutable.prepare(b, true)
			if err != arenaskl.ErrArenaFull && err != errMemExceedDelPercent {
				if stalled {
					s.db.opts.EventListener.WriteStallEnd()
				}
				return err
			}
		} else if !force {
			if stalled {
				s.db.opts.EventListener.WriteStallEnd()
			}
			return nil
		}

		{
			var size uint64
			for i := range s.mu.mem.queue {
				size += s.mu.mem.queue[i].totalBytes()
			}
			if size >= uint64(s.db.opts.MemTableStopWritesThreshold*s.db.opts.MemTableSize) {
				if !stalled {
					stalled = true
					s.db.opts.EventListener.WriteStallBegin(WriteStallBeginInfo{
						Index:  s.index,
						Reason: "memtable count limit reached",
					})
				}
				s.mu.compact.cond.Wait()
				continue
			}
		}

		var newLogNum FileNum
		var newLogFile vfs.File
		var newLogSize uint64
		var prevLogSize uint64
		var err error

		if !s.db.opts.DisableWAL {
			newLogNum = s.mu.metaEdit.GetNextFileNum()
			s.mu.mem.switching = true
			prevLogSize = uint64(s.mu.log.Size())

			if s.mu.log.queue[len(s.mu.log.queue)-1].fileSize < prevLogSize {
				s.mu.log.queue[len(s.mu.log.queue)-1].fileSize = prevLogSize
			}

			s.mu.Unlock()

			err = s.mu.log.Close()
			newLogName := s.makeWalFilename(newLogNum)

			var recycleLog fileInfo
			var recycleOK bool
			if err == nil {
				recycleLog, recycleOK = s.logRecycler.peek()
				if recycleOK {
					recycleLogName := s.makeWalFilename(recycleLog.fileNum)
					newLogFile, err = s.db.opts.FS.ReuseForWrite(recycleLogName, newLogName)
				} else {
					newLogFile, err = s.db.opts.FS.Create(newLogName)
				}
			}

			if err == nil && recycleOK {
				var finfo os.FileInfo
				finfo, err = newLogFile.Stat()
				if err == nil {
					newLogSize = uint64(finfo.Size())
				}
			}

			if err == nil {
				err = s.walDir.Sync()
			}

			if err != nil && newLogFile != nil {
				newLogFile.Close()
			} else if err == nil {
				newLogFile = vfs.NewSyncingFile(newLogFile, vfs.SyncingFileOptions{
					BytesPerSync:    s.db.opts.WALBytesPerSync,
					PreallocateSize: s.walPreallocateSize(),
				})
			}

			if recycleOK {
				err = utils.FirstError(err, s.logRecycler.pop(recycleLog.fileNum))
			}

			s.db.opts.EventListener.WALCreated(WALCreateInfo{
				Index:           s.index,
				Path:            newLogName,
				FileNum:         newLogNum,
				RecycledFileNum: recycleLog.fileNum,
				Err:             err,
			})

			s.mu.Lock()
			s.mu.mem.switching = false
			s.mu.mem.cond.Broadcast()
		}

		if err != nil {
			s.db.opts.Logger.Errorf("panic: makeRoomForWrite err:%s", err)
			return err
		}

		if !s.db.opts.DisableWAL {
			s.mu.log.queue = append(s.mu.log.queue, fileInfo{fileNum: newLogNum, fileSize: newLogSize})
			s.mu.log.LogWriter = record.NewLogWriter(newLogFile, newLogNum)
			s.mu.log.LogWriter.SetMinSyncInterval(s.db.opts.WALMinSyncInterval)
		}

		immMem := s.mu.mem.mutable
		imm := s.mu.mem.queue[len(s.mu.mem.queue)-1]
		imm.logSize = prevLogSize
		imm.flushForced = imm.flushForced || (b == nil)

		var logSeqNum uint64
		if b != nil {
			logSeqNum = b.SeqNum()
		} else {
			logSeqNum = atomic.LoadUint64(&s.db.meta.atomic.logSeqNum)
		}

		var entry *flushableEntry
		s.mu.mem.mutable, entry = s.newMemTable(newLogNum, logSeqNum)
		s.mu.mem.queue = append(s.mu.mem.queue, entry)
		s.updateReadState()
		if immMem.writerUnref() {
			s.maybeScheduleFlush(needReport)
		}
		force = false
	}
}

// newFlushWriter returns a flushBitowerWriter that writes flushed memtable
// contents into the bitree.
func (s *Bitower) newFlushWriter() (*flushBitowerWriter, error) {
	bitreeWriter, err := s.btree.NewBitreeWriter()
	if err != nil {
		return nil, err
	}

	w := &flushBitowerWriter{
		s:      s,
		writer: bitreeWriter,
	}

	return w, nil
}
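// Caller-side flush sketch (illustrative only; tower stands in for an already
// opened *Bitower):
//
//	done, err := tower.AsyncFlush() // a nil channel means there was nothing to flush
//	if err == nil && done != nil {
//		<-done // blocks until the queued memtables have been flushed
//	}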