github.com/df-mc/goleveldb@v1.1.9/leveldb/db_compaction.go (about) 1 // Copyright (c) 2012, Suryandaru Triandana <syndtr@gmail.com> 2 // All rights reserved. 3 // 4 // Use of this source code is governed by a BSD-style license that can be 5 // found in the LICENSE file. 6 7 package leveldb 8 9 import ( 10 "sync" 11 "sync/atomic" 12 "time" 13 14 "github.com/df-mc/goleveldb/leveldb/errors" 15 "github.com/df-mc/goleveldb/leveldb/opt" 16 "github.com/df-mc/goleveldb/leveldb/storage" 17 ) 18 19 var ( 20 errCompactionTransactExiting = errors.New("leveldb: compaction transact exiting") 21 ) 22 23 type cStat struct { 24 duration time.Duration 25 read int64 26 write int64 27 } 28 29 func (p *cStat) add(n *cStatStaging) { 30 p.duration += n.duration 31 p.read += n.read 32 p.write += n.write 33 } 34 35 func (p *cStat) get() (duration time.Duration, read, write int64) { 36 return p.duration, p.read, p.write 37 } 38 39 type cStatStaging struct { 40 start time.Time 41 duration time.Duration 42 on bool 43 read int64 44 write int64 45 } 46 47 func (p *cStatStaging) startTimer() { 48 if !p.on { 49 p.start = time.Now() 50 p.on = true 51 } 52 } 53 54 func (p *cStatStaging) stopTimer() { 55 if p.on { 56 p.duration += time.Since(p.start) 57 p.on = false 58 } 59 } 60 61 type cStats struct { 62 lk sync.Mutex 63 stats []cStat 64 } 65 66 func (p *cStats) addStat(level int, n *cStatStaging) { 67 p.lk.Lock() 68 if level >= len(p.stats) { 69 newStats := make([]cStat, level+1) 70 copy(newStats, p.stats) 71 p.stats = newStats 72 } 73 p.stats[level].add(n) 74 p.lk.Unlock() 75 } 76 77 func (p *cStats) getStat(level int) (duration time.Duration, read, write int64) { 78 p.lk.Lock() 79 defer p.lk.Unlock() 80 if level < len(p.stats) { 81 return p.stats[level].get() 82 } 83 return 84 } 85 86 func (db *DB) compactionError() { 87 var err error 88 noerr: 89 // No error. 90 for { 91 select { 92 case err = <-db.compErrSetC: 93 switch { 94 case err == nil: 95 case err == ErrReadOnly, errors.IsCorrupted(err): 96 goto hasperr 97 default: 98 goto haserr 99 } 100 case <-db.closeC: 101 return 102 } 103 } 104 haserr: 105 // Transient error. 106 for { 107 select { 108 case db.compErrC <- err: 109 case err = <-db.compErrSetC: 110 switch { 111 case err == nil: 112 goto noerr 113 case err == ErrReadOnly, errors.IsCorrupted(err): 114 goto hasperr 115 default: 116 } 117 case <-db.closeC: 118 return 119 } 120 } 121 hasperr: 122 // Persistent error. 123 for { 124 select { 125 case db.compErrC <- err: 126 case db.compPerErrC <- err: 127 case db.writeLockC <- struct{}{}: 128 // Hold write lock, so that write won't pass-through. 129 db.compWriteLocking = true 130 case <-db.closeC: 131 if db.compWriteLocking { 132 // We should release the lock or Close will hang. 133 <-db.writeLockC 134 } 135 return 136 } 137 } 138 } 139 140 type compactionTransactCounter int 141 142 func (cnt *compactionTransactCounter) incr() { 143 *cnt++ 144 } 145 146 type compactionTransactInterface interface { 147 run(cnt *compactionTransactCounter) error 148 revert() error 149 } 150 151 func (db *DB) compactionTransact(name string, t compactionTransactInterface) { 152 defer func() { 153 if x := recover(); x != nil { 154 if x == errCompactionTransactExiting { 155 if err := t.revert(); err != nil { 156 db.logf("%s revert error %q", name, err) 157 } 158 } 159 panic(x) 160 } 161 }() 162 163 const ( 164 backoffMin = 1 * time.Second 165 backoffMax = 8 * time.Second 166 backoffMul = 2 * time.Second 167 ) 168 var ( 169 backoff = backoffMin 170 backoffT = time.NewTimer(backoff) 171 lastCnt = compactionTransactCounter(0) 172 173 disableBackoff = db.s.o.GetDisableCompactionBackoff() 174 ) 175 for n := 0; ; n++ { 176 // Check whether the DB is closed. 177 if db.isClosed() { 178 db.logf("%s exiting", name) 179 db.compactionExitTransact() 180 } else if n > 0 { 181 db.logf("%s retrying N·%d", name, n) 182 } 183 184 // Execute. 185 cnt := compactionTransactCounter(0) 186 err := t.run(&cnt) 187 if err != nil { 188 db.logf("%s error I·%d %q", name, cnt, err) 189 } 190 191 // Set compaction error status. 192 select { 193 case db.compErrSetC <- err: 194 case perr := <-db.compPerErrC: 195 if err != nil { 196 db.logf("%s exiting (persistent error %q)", name, perr) 197 db.compactionExitTransact() 198 } 199 case <-db.closeC: 200 db.logf("%s exiting", name) 201 db.compactionExitTransact() 202 } 203 if err == nil { 204 return 205 } 206 if errors.IsCorrupted(err) { 207 db.logf("%s exiting (corruption detected)", name) 208 db.compactionExitTransact() 209 } 210 211 if !disableBackoff { 212 // Reset backoff duration if counter is advancing. 213 if cnt > lastCnt { 214 backoff = backoffMin 215 lastCnt = cnt 216 } 217 218 // Backoff. 219 backoffT.Reset(backoff) 220 if backoff < backoffMax { 221 backoff *= backoffMul 222 if backoff > backoffMax { 223 backoff = backoffMax 224 } 225 } 226 select { 227 case <-backoffT.C: 228 case <-db.closeC: 229 db.logf("%s exiting", name) 230 db.compactionExitTransact() 231 } 232 } 233 } 234 } 235 236 type compactionTransactFunc struct { 237 runFunc func(cnt *compactionTransactCounter) error 238 revertFunc func() error 239 } 240 241 func (t *compactionTransactFunc) run(cnt *compactionTransactCounter) error { 242 return t.runFunc(cnt) 243 } 244 245 func (t *compactionTransactFunc) revert() error { 246 if t.revertFunc != nil { 247 return t.revertFunc() 248 } 249 return nil 250 } 251 252 func (db *DB) compactionTransactFunc(name string, run func(cnt *compactionTransactCounter) error, revert func() error) { 253 db.compactionTransact(name, &compactionTransactFunc{run, revert}) 254 } 255 256 func (db *DB) compactionExitTransact() { 257 panic(errCompactionTransactExiting) 258 } 259 260 func (db *DB) compactionCommit(name string, rec *sessionRecord) { 261 db.compCommitLk.Lock() 262 defer db.compCommitLk.Unlock() // Defer is necessary. 263 db.compactionTransactFunc(name+"@commit", func(cnt *compactionTransactCounter) error { 264 return db.s.commit(rec, true) 265 }, nil) 266 } 267 268 func (db *DB) memCompaction() { 269 mdb := db.getFrozenMem() 270 if mdb == nil { 271 return 272 } 273 defer mdb.decref() 274 275 db.logf("memdb@flush N·%d S·%s", mdb.Len(), shortenb(mdb.Size())) 276 277 // Don't compact empty memdb. 278 if mdb.Len() == 0 { 279 db.logf("memdb@flush skipping") 280 // drop frozen memdb 281 db.dropFrozenMem() 282 return 283 } 284 285 // Pause table compaction. 286 resumeC := make(chan struct{}) 287 select { 288 case db.tcompPauseC <- (chan<- struct{})(resumeC): 289 case <-db.compPerErrC: 290 close(resumeC) 291 resumeC = nil 292 case <-db.closeC: 293 db.compactionExitTransact() 294 } 295 296 var ( 297 rec = &sessionRecord{} 298 stats = &cStatStaging{} 299 flushLevel int 300 ) 301 302 // Generate tables. 303 db.compactionTransactFunc("memdb@flush", func(cnt *compactionTransactCounter) (err error) { 304 stats.startTimer() 305 flushLevel, err = db.s.flushMemdb(rec, mdb.DB, db.memdbMaxLevel) 306 stats.stopTimer() 307 return 308 }, func() error { 309 for _, r := range rec.addedTables { 310 db.logf("memdb@flush revert @%d", r.num) 311 if err := db.s.stor.Remove(storage.FileDesc{Type: storage.TypeTable, Num: r.num}); err != nil { 312 return err 313 } 314 } 315 return nil 316 }) 317 318 rec.setJournalNum(db.journalFd.Num) 319 rec.setSeqNum(db.frozenSeq) 320 321 // Commit. 322 stats.startTimer() 323 db.compactionCommit("memdb", rec) 324 stats.stopTimer() 325 326 db.logf("memdb@flush committed F·%d T·%v", len(rec.addedTables), stats.duration) 327 328 // Save compaction stats 329 for _, r := range rec.addedTables { 330 stats.write += r.size 331 } 332 db.compStats.addStat(flushLevel, stats) 333 atomic.AddUint32(&db.memComp, 1) 334 335 // Drop frozen memdb. 336 db.dropFrozenMem() 337 338 // Resume table compaction. 339 if resumeC != nil { 340 select { 341 case <-resumeC: 342 close(resumeC) 343 case <-db.closeC: 344 db.compactionExitTransact() 345 } 346 } 347 348 // Trigger table compaction. 349 db.compTrigger(db.tcompCmdC) 350 } 351 352 type tableCompactionBuilder struct { 353 db *DB 354 s *session 355 c *compaction 356 rec *sessionRecord 357 stat0, stat1 *cStatStaging 358 359 snapHasLastUkey bool 360 snapLastUkey []byte 361 snapLastSeq uint64 362 snapIter int 363 snapKerrCnt int 364 snapDropCnt int 365 366 kerrCnt int 367 dropCnt int 368 369 minSeq uint64 370 strict bool 371 tableSize int 372 373 tw *tWriter 374 } 375 376 func (b *tableCompactionBuilder) appendKV(key, value []byte) error { 377 // Create new table if not already. 378 if b.tw == nil { 379 // Check for pause event. 380 if b.db != nil { 381 select { 382 case ch := <-b.db.tcompPauseC: 383 b.db.pauseCompaction(ch) 384 case <-b.db.closeC: 385 b.db.compactionExitTransact() 386 default: 387 } 388 } 389 390 // Create new table. 391 var err error 392 b.tw, err = b.s.tops.create() 393 if err != nil { 394 return err 395 } 396 } 397 398 // Write key/value into table. 399 return b.tw.append(key, value) 400 } 401 402 func (b *tableCompactionBuilder) needFlush() bool { 403 return b.tw.tw.BytesLen() >= b.tableSize 404 } 405 406 func (b *tableCompactionBuilder) flush() error { 407 t, err := b.tw.finish() 408 if err != nil { 409 return err 410 } 411 b.rec.addTableFile(b.c.sourceLevel+1, t) 412 b.stat1.write += t.size 413 b.s.logf("table@build created L%d@%d N·%d S·%s %q:%q", b.c.sourceLevel+1, t.fd.Num, b.tw.tw.EntriesLen(), shortenb(int(t.size)), t.imin, t.imax) 414 b.tw = nil 415 return nil 416 } 417 418 func (b *tableCompactionBuilder) cleanup() { 419 if b.tw != nil { 420 b.tw.drop() 421 b.tw = nil 422 } 423 } 424 425 func (b *tableCompactionBuilder) run(cnt *compactionTransactCounter) error { 426 snapResumed := b.snapIter > 0 427 hasLastUkey := b.snapHasLastUkey // The key might has zero length, so this is necessary. 428 lastUkey := append([]byte{}, b.snapLastUkey...) 429 lastSeq := b.snapLastSeq 430 b.kerrCnt = b.snapKerrCnt 431 b.dropCnt = b.snapDropCnt 432 // Restore compaction state. 433 b.c.restore() 434 435 defer b.cleanup() 436 437 b.stat1.startTimer() 438 defer b.stat1.stopTimer() 439 440 iter := b.c.newIterator() 441 defer iter.Release() 442 for i := 0; iter.Next(); i++ { 443 // Incr transact counter. 444 cnt.incr() 445 446 // Skip until last state. 447 if i < b.snapIter { 448 continue 449 } 450 451 resumed := false 452 if snapResumed { 453 resumed = true 454 snapResumed = false 455 } 456 457 ikey := iter.Key() 458 ukey, seq, kt, kerr := parseInternalKey(ikey) 459 460 if kerr == nil { 461 shouldStop := !resumed && b.c.shouldStopBefore(ikey) 462 463 if !hasLastUkey || b.s.icmp.uCompare(lastUkey, ukey) != 0 { 464 // First occurrence of this user key. 465 466 // Only rotate tables if ukey doesn't hop across. 467 if b.tw != nil && (shouldStop || b.needFlush()) { 468 if err := b.flush(); err != nil { 469 return err 470 } 471 472 // Creates snapshot of the state. 473 b.c.save() 474 b.snapHasLastUkey = hasLastUkey 475 b.snapLastUkey = append(b.snapLastUkey[:0], lastUkey...) 476 b.snapLastSeq = lastSeq 477 b.snapIter = i 478 b.snapKerrCnt = b.kerrCnt 479 b.snapDropCnt = b.dropCnt 480 } 481 482 hasLastUkey = true 483 lastUkey = append(lastUkey[:0], ukey...) 484 lastSeq = keyMaxSeq 485 } 486 487 switch { 488 case lastSeq <= b.minSeq: 489 // Dropped because newer entry for same user key exist 490 fallthrough // (A) 491 case kt == keyTypeDel && seq <= b.minSeq && b.c.baseLevelForKey(lastUkey): 492 // For this user key: 493 // (1) there is no data in higher levels 494 // (2) data in lower levels will have larger seq numbers 495 // (3) data in layers that are being compacted here and have 496 // smaller seq numbers will be dropped in the next 497 // few iterations of this loop (by rule (A) above). 498 // Therefore this deletion marker is obsolete and can be dropped. 499 lastSeq = seq 500 b.dropCnt++ 501 continue 502 default: 503 lastSeq = seq 504 } 505 } else { 506 if b.strict { 507 return kerr 508 } 509 510 // Don't drop corrupted keys. 511 hasLastUkey = false 512 lastUkey = lastUkey[:0] 513 lastSeq = keyMaxSeq 514 b.kerrCnt++ 515 } 516 517 if err := b.appendKV(ikey, iter.Value()); err != nil { 518 return err 519 } 520 } 521 522 if err := iter.Error(); err != nil { 523 return err 524 } 525 526 // Finish last table. 527 if b.tw != nil && !b.tw.empty() { 528 return b.flush() 529 } 530 return nil 531 } 532 533 func (b *tableCompactionBuilder) revert() error { 534 for _, at := range b.rec.addedTables { 535 b.s.logf("table@build revert @%d", at.num) 536 if err := b.s.stor.Remove(storage.FileDesc{Type: storage.TypeTable, Num: at.num}); err != nil { 537 return err 538 } 539 } 540 return nil 541 } 542 543 func (db *DB) tableCompaction(c *compaction, noTrivial bool) { 544 defer c.release() 545 546 rec := &sessionRecord{} 547 rec.addCompPtr(c.sourceLevel, c.imax) 548 549 if !noTrivial && c.trivial() { 550 t := c.levels[0][0] 551 db.logf("table@move L%d@%d -> L%d", c.sourceLevel, t.fd.Num, c.sourceLevel+1) 552 rec.delTable(c.sourceLevel, t.fd.Num) 553 rec.addTableFile(c.sourceLevel+1, t) 554 db.compactionCommit("table-move", rec) 555 return 556 } 557 558 var stats [2]cStatStaging 559 for i, tables := range c.levels { 560 for _, t := range tables { 561 stats[i].read += t.size 562 // Insert deleted tables into record 563 rec.delTable(c.sourceLevel+i, t.fd.Num) 564 } 565 } 566 sourceSize := int(stats[0].read + stats[1].read) 567 minSeq := db.minSeq() 568 db.logf("table@compaction L%d·%d -> L%d·%d S·%s Q·%d", c.sourceLevel, len(c.levels[0]), c.sourceLevel+1, len(c.levels[1]), shortenb(sourceSize), minSeq) 569 570 b := &tableCompactionBuilder{ 571 db: db, 572 s: db.s, 573 c: c, 574 rec: rec, 575 stat1: &stats[1], 576 minSeq: minSeq, 577 strict: db.s.o.GetStrict(opt.StrictCompaction), 578 tableSize: db.s.o.GetCompactionTableSize(c.sourceLevel + 1), 579 } 580 db.compactionTransact("table@build", b) 581 582 // Commit. 583 stats[1].startTimer() 584 db.compactionCommit("table", rec) 585 stats[1].stopTimer() 586 587 resultSize := int(stats[1].write) 588 db.logf("table@compaction committed F%s S%s Ke·%d D·%d T·%v", sint(len(rec.addedTables)-len(rec.deletedTables)), sshortenb(resultSize-sourceSize), b.kerrCnt, b.dropCnt, stats[1].duration) 589 590 // Save compaction stats 591 for i := range stats { 592 db.compStats.addStat(c.sourceLevel+1, &stats[i]) 593 } 594 switch c.typ { 595 case level0Compaction: 596 atomic.AddUint32(&db.level0Comp, 1) 597 case nonLevel0Compaction: 598 atomic.AddUint32(&db.nonLevel0Comp, 1) 599 case seekCompaction: 600 atomic.AddUint32(&db.seekComp, 1) 601 } 602 } 603 604 func (db *DB) tableRangeCompaction(level int, umin, umax []byte) error { 605 db.logf("table@compaction range L%d %q:%q", level, umin, umax) 606 if level >= 0 { 607 if c := db.s.getCompactionRange(level, umin, umax, true); c != nil { 608 db.tableCompaction(c, true) 609 } 610 } else { 611 // Retry until nothing to compact. 612 for { 613 compacted := false 614 615 // Scan for maximum level with overlapped tables. 616 v := db.s.version() 617 m := 1 618 for i := m; i < len(v.levels); i++ { 619 tables := v.levels[i] 620 if tables.overlaps(db.s.icmp, umin, umax, false) { 621 m = i 622 } 623 } 624 v.release() 625 626 for level := 0; level < m; level++ { 627 if c := db.s.getCompactionRange(level, umin, umax, false); c != nil { 628 db.tableCompaction(c, true) 629 compacted = true 630 } 631 } 632 633 if !compacted { 634 break 635 } 636 } 637 } 638 639 return nil 640 } 641 642 func (db *DB) tableAutoCompaction() { 643 if c := db.s.pickCompaction(); c != nil { 644 db.tableCompaction(c, false) 645 } 646 } 647 648 func (db *DB) tableNeedCompaction() bool { 649 v := db.s.version() 650 defer v.release() 651 return v.needCompaction() 652 } 653 654 // resumeWrite returns an indicator whether we should resume write operation if enough level0 files are compacted. 655 func (db *DB) resumeWrite() bool { 656 v := db.s.version() 657 defer v.release() 658 if v.tLen(0) < db.s.o.GetWriteL0PauseTrigger() { 659 return true 660 } 661 return false 662 } 663 664 func (db *DB) pauseCompaction(ch chan<- struct{}) { 665 select { 666 case ch <- struct{}{}: 667 case <-db.closeC: 668 db.compactionExitTransact() 669 } 670 } 671 672 type cCmd interface { 673 ack(err error) 674 } 675 676 type cAuto struct { 677 // Note for table compaction, an non-empty ackC represents it's a compaction waiting command. 678 ackC chan<- error 679 } 680 681 func (r cAuto) ack(err error) { 682 if r.ackC != nil { 683 defer func() { 684 recover() 685 }() 686 r.ackC <- err 687 } 688 } 689 690 type cRange struct { 691 level int 692 min, max []byte 693 ackC chan<- error 694 } 695 696 func (r cRange) ack(err error) { 697 if r.ackC != nil { 698 defer func() { 699 recover() 700 }() 701 r.ackC <- err 702 } 703 } 704 705 // This will trigger auto compaction but will not wait for it. 706 func (db *DB) compTrigger(compC chan<- cCmd) { 707 select { 708 case compC <- cAuto{}: 709 default: 710 } 711 } 712 713 // This will trigger auto compaction and/or wait for all compaction to be done. 714 func (db *DB) compTriggerWait(compC chan<- cCmd) (err error) { 715 ch := make(chan error) 716 defer close(ch) 717 // Send cmd. 718 select { 719 case compC <- cAuto{ch}: 720 case err = <-db.compErrC: 721 return 722 case <-db.closeC: 723 return ErrClosed 724 } 725 // Wait cmd. 726 select { 727 case err = <-ch: 728 case err = <-db.compErrC: 729 case <-db.closeC: 730 return ErrClosed 731 } 732 return err 733 } 734 735 // Send range compaction request. 736 func (db *DB) compTriggerRange(compC chan<- cCmd, level int, min, max []byte) (err error) { 737 ch := make(chan error) 738 defer close(ch) 739 // Send cmd. 740 select { 741 case compC <- cRange{level, min, max, ch}: 742 case err := <-db.compErrC: 743 return err 744 case <-db.closeC: 745 return ErrClosed 746 } 747 // Wait cmd. 748 select { 749 case err = <-ch: 750 case err = <-db.compErrC: 751 case <-db.closeC: 752 return ErrClosed 753 } 754 return err 755 } 756 757 func (db *DB) mCompaction() { 758 var x cCmd 759 760 defer func() { 761 if x := recover(); x != nil { 762 if x != errCompactionTransactExiting { 763 panic(x) 764 } 765 } 766 if x != nil { 767 x.ack(ErrClosed) 768 } 769 db.closeW.Done() 770 }() 771 772 for { 773 select { 774 case x = <-db.mcompCmdC: 775 switch x.(type) { 776 case cAuto: 777 db.memCompaction() 778 x.ack(nil) 779 x = nil 780 default: 781 panic("leveldb: unknown command") 782 } 783 case <-db.closeC: 784 return 785 } 786 } 787 } 788 789 func (db *DB) tCompaction() { 790 var ( 791 x cCmd 792 waitQ []cCmd 793 ) 794 795 defer func() { 796 if x := recover(); x != nil { 797 if x != errCompactionTransactExiting { 798 panic(x) 799 } 800 } 801 for i := range waitQ { 802 waitQ[i].ack(ErrClosed) 803 waitQ[i] = nil 804 } 805 if x != nil { 806 x.ack(ErrClosed) 807 } 808 db.closeW.Done() 809 }() 810 811 for { 812 if db.tableNeedCompaction() { 813 select { 814 case x = <-db.tcompCmdC: 815 case ch := <-db.tcompPauseC: 816 db.pauseCompaction(ch) 817 continue 818 case <-db.closeC: 819 return 820 default: 821 } 822 // Resume write operation as soon as possible. 823 if len(waitQ) > 0 && db.resumeWrite() { 824 for i := range waitQ { 825 waitQ[i].ack(nil) 826 waitQ[i] = nil 827 } 828 waitQ = waitQ[:0] 829 } 830 } else { 831 for i := range waitQ { 832 waitQ[i].ack(nil) 833 waitQ[i] = nil 834 } 835 waitQ = waitQ[:0] 836 select { 837 case x = <-db.tcompCmdC: 838 case ch := <-db.tcompPauseC: 839 db.pauseCompaction(ch) 840 continue 841 case <-db.closeC: 842 return 843 } 844 } 845 if x != nil { 846 switch cmd := x.(type) { 847 case cAuto: 848 if cmd.ackC != nil { 849 // Check the write pause state before caching it. 850 if db.resumeWrite() { 851 x.ack(nil) 852 } else { 853 waitQ = append(waitQ, x) 854 } 855 } 856 case cRange: 857 x.ack(db.tableRangeCompaction(cmd.level, cmd.min, cmd.max)) 858 default: 859 panic("leveldb: unknown command") 860 } 861 x = nil 862 } 863 db.tableAutoCompaction() 864 } 865 }