github.com/coocood/badger@v1.5.1-0.20200528065104-c02ac3616d04/db.go

/*
 * Copyright 2017 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package badger

import (
	"bytes"
	"io"
	"math"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"sync"
	"sync/atomic"
	"time"

	"github.com/coocood/badger/cache"
	"github.com/coocood/badger/epoch"
	"github.com/coocood/badger/options"
	"github.com/coocood/badger/protos"
	"github.com/coocood/badger/table"
	"github.com/coocood/badger/table/memtable"
	"github.com/coocood/badger/table/sstable"
	"github.com/coocood/badger/y"
	"github.com/dgryski/go-farm"
	"github.com/ncw/directio"
	"github.com/pingcap/errors"
	"github.com/pingcap/log"
	"go.uber.org/zap"
	"golang.org/x/time/rate"
)

var (
	txnKey = []byte("!badger!txn") // For indicating the end of entries in a txn.
)

type closers struct {
	updateSize      *y.Closer
	compactors      *y.Closer
	resourceManager *y.Closer
	blobManager     *y.Closer
	memtable        *y.Closer
	writes          *y.Closer
}

// DB provides the various functions required to interact with Badger.
// DB is thread-safe.
type DB struct {
	dirLockGuard *directoryLockGuard
	// nil if Dir and ValueDir are the same
	valueDirGuard *directoryLockGuard

	closers   closers
	mtbls     atomic.Value
	opt       Options
	manifest  *manifestFile
	lc        *levelsController
	vlog      valueLog
	logOff    logOffset // less than or equal to a pointer to the last vlog value put into mt
	syncedFid uint32    // The log fid that has been flushed to SST, older log files are safe to be deleted.
	writeCh   chan *request
	flushChan chan *flushTask // For flushing memtables.
	ingestCh  chan *ingestTask

	// memtable buffer, so we can avoid expensive allocations of big chunks of memory
	memTableCh chan *memtable.Table

	orc           *oracle
	safeTsTracker safeTsTracker

	limiter *rate.Limiter

	blockCache *cache.Cache
	indexCache *cache.Cache

	metrics      *y.MetricsSet
	lsmSize      int64
	vlogSize     int64
	volatileMode bool

	blobManger blobManager

	resourceMgr *epoch.ResourceManager
}

type memTables struct {
	tables []*memtable.Table // tables from new to old, the first one is mutable.
	length uint32            // The length is updated by the flusher.
}

func (tbls *memTables) getMutable() *memtable.Table {
	return tbls.tables[0]
}

func newMemTables(mt *memtable.Table, old *memTables) *memTables {
	newTbls := &memTables{}
	newTbls.tables = make([]*memtable.Table, 1+atomic.LoadUint32(&old.length))
	newTbls.tables[0] = mt
	copy(newTbls.tables[1:], old.tables)
	newTbls.length = uint32(len(newTbls.tables))
	return newTbls
}
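// NOTE (illustrative sketch, not called anywhere): the memtable list above is
// copy-on-write. Writers never mutate a published *memTables; they build a
// fresh snapshot with newMemTables and atomically Store it, so readers that
// Load need no locks. The real publish step lives in DB.flushMemTable below;
// this is a minimal sketch of the same pattern.
func examplePublishMemTable(db *DB, fresh *memtable.Table) {
	old := db.mtbls.Load().(*memTables)
	db.mtbls.Store(newMemTables(fresh, old)) // readers keep seeing the old snapshot until this Store
}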
const (
	kvWriteChCapacity = 1000
)

func replayFunction(out *DB) func(Entry) error {
	type txnEntry struct {
		nk y.Key
		v  y.ValueStruct
	}

	var txn []txnEntry
	var lastCommit uint64

	toLSM := func(nk y.Key, vs y.ValueStruct) {
		e := memtable.Entry{Key: nk.UserKey, Value: vs}
		mTbls := out.mtbls.Load().(*memTables)
		if out.ensureRoomForWrite(mTbls.getMutable(), e.EstimateSize()) == out.opt.MaxMemTableSize {
			mTbls = out.mtbls.Load().(*memTables)
		}
		mTbls.getMutable().PutToSkl(nk.UserKey, vs)
	}

	first := true
	return func(e Entry) error { // Function for replaying.
		if first {
			log.Info("replay wal", zap.Stringer("first key", e.Key))
		}
		first = false

		if out.orc.curRead < e.Key.Version {
			out.orc.curRead = e.Key.Version
		}

		var nk y.Key
		nk.Copy(e.Key)
		nv := make([]byte, len(e.Value))
		copy(nv, e.Value)

		v := y.ValueStruct{
			Value:    nv,
			Meta:     e.meta,
			UserMeta: e.UserMeta,
			Version:  nk.Version,
		}

		if e.meta&bitFinTxn > 0 {
			txnTs, err := strconv.ParseUint(string(e.Value), 10, 64)
			if err != nil {
				return errors.Wrapf(err, "Unable to parse txn fin: %q", e.Value)
			}
			if !out.IsManaged() {
				y.Assert(lastCommit == txnTs)
			}
			y.Assert(len(txn) > 0)
			// Got the end of txn. Now we can store them.
			for _, t := range txn {
				toLSM(t.nk, t.v)
			}
			txn = txn[:0]
			lastCommit = 0

		} else if e.meta&bitTxn == 0 {
			// This entry is from a rewrite.
			toLSM(nk, v)

			// We shouldn't get this entry in the middle of a transaction.
			y.Assert(lastCommit == 0)
			y.Assert(len(txn) == 0)

		} else {
			if lastCommit == 0 {
				lastCommit = e.Key.Version
			}
			if !out.IsManaged() {
				y.Assert(lastCommit == e.Key.Version)
			}
			te := txnEntry{nk: nk, v: v}
			txn = append(txn, te)
		}
		return nil
	}
}
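// NOTE (illustrative sketch, not called anywhere): replay groups WAL entries
// by transaction. Entries carrying bitTxn are buffered until a bitFinTxn
// marker arrives, whose value is the commit ts in decimal; entries with
// neither bit come from a value-log rewrite and are applied immediately. A
// hedged classification sketch of the branches above:
func exampleClassifyReplayEntry(e Entry) string {
	switch {
	case e.meta&bitFinTxn > 0:
		return "txn-finish" // value holds the decimal commit ts
	case e.meta&bitTxn == 0:
		return "rewrite" // standalone entry, applied directly to the LSM
	default:
		return "txn-member" // buffered until the finish marker arrives
	}
}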
// Open returns a new DB object.
func Open(opt Options) (db *DB, err error) {
	opt.maxBatchSize = (15 * opt.MaxMemTableSize) / 100
	opt.maxBatchCount = opt.maxBatchSize / int64(memtable.MaxNodeSize)

	if opt.ValueThreshold > math.MaxUint16-16 {
		return nil, ErrValueThreshold
	}

	if opt.ReadOnly {
		// Can't truncate if the DB is read only.
		opt.Truncate = false
	}

	for _, path := range []string{opt.Dir, opt.ValueDir} {
		dirExists, err := exists(path)
		if err != nil {
			return nil, y.Wrapf(err, "Invalid Dir: %q", path)
		}
		if !dirExists {
			if opt.ReadOnly {
				return nil, errors.Errorf("Cannot find Dir for read-only open: %q", path)
			}
			// Try to create the directory.
			err = os.Mkdir(path, 0700)
			if err != nil {
				return nil, y.Wrapf(err, "Error Creating Dir: %q", path)
			}
		}
	}
	absDir, err := filepath.Abs(opt.Dir)
	if err != nil {
		return nil, err
	}
	absValueDir, err := filepath.Abs(opt.ValueDir)
	if err != nil {
		return nil, err
	}
	var dirLockGuard, valueDirLockGuard *directoryLockGuard
	dirLockGuard, err = acquireDirectoryLock(opt.Dir, lockFile, opt.ReadOnly)
	if err != nil {
		return nil, err
	}
	defer func() {
		if dirLockGuard != nil {
			_ = dirLockGuard.release()
		}
	}()
	if absValueDir != absDir {
		valueDirLockGuard, err = acquireDirectoryLock(opt.ValueDir, lockFile, opt.ReadOnly)
		if err != nil {
			return nil, err
		}
	}
	defer func() {
		if valueDirLockGuard != nil {
			_ = valueDirLockGuard.release()
		}
	}()
	if !(opt.ValueLogFileSize <= 2<<30 && opt.ValueLogFileSize >= 1<<20) {
		return nil, ErrValueLogSize
	}
	manifestFile, manifest, err := openOrCreateManifestFile(opt.Dir, opt.ReadOnly)
	if err != nil {
		return nil, err
	}
	defer func() {
		if manifestFile != nil {
			_ = manifestFile.close()
		}
	}()

	orc := &oracle{
		isManaged:  opt.ManagedTxns,
		nextCommit: 1,
		commits:    make(map[uint64]uint64),
	}

	var blkCache, idxCache *cache.Cache
	if opt.MaxBlockCacheSize != 0 {
		var err error
		blkCache, err = cache.NewCache(&cache.Config{
			// The expected number of keys is MaxBlockCacheSize / BlockSize, times 10 as the
			// cache documentation suggests.
			NumCounters: opt.MaxBlockCacheSize / int64(opt.TableBuilderOptions.BlockSize) * 10,
			MaxCost:     opt.MaxBlockCacheSize,
			BufferItems: 64,
		})
		if err != nil {
			return nil, errors.Wrap(err, "failed to create block cache")
		}

		indexSizeHint := float64(opt.MaxTableSize) / 6.0
		idxCache, err = cache.NewCache(&cache.Config{
			NumCounters: int64(float64(opt.MaxIndexCacheSize) / indexSizeHint * 10),
			MaxCost:     opt.MaxIndexCacheSize,
			BufferItems: 64,
		})
		if err != nil {
			return nil, errors.Wrap(err, "failed to create index cache")
		}
	}
	db = &DB{
		flushChan:     make(chan *flushTask, opt.NumMemtables),
		writeCh:       make(chan *request, kvWriteChCapacity),
		memTableCh:    make(chan *memtable.Table, 1),
		ingestCh:      make(chan *ingestTask),
		opt:           opt,
		manifest:      manifestFile,
		dirLockGuard:  dirLockGuard,
		valueDirGuard: valueDirLockGuard,
		orc:           orc,
		metrics:       y.NewMetricSet(opt.Dir),
		blockCache:    blkCache,
		indexCache:    idxCache,
		volatileMode:  opt.VolatileMode,
	}
	db.vlog.metrics = db.metrics

	rateLimit := opt.TableBuilderOptions.BytesPerSecond
	if rateLimit > 0 {
		db.limiter = rate.NewLimiter(rate.Limit(rateLimit), rateLimit)
	}

	// Calculate initial size.
	db.calculateSize()
	db.closers.updateSize = y.NewCloser(1)
	go db.updateSize(db.closers.updateSize)

	db.closers.resourceManager = y.NewCloser(0)
	db.resourceMgr = epoch.NewResourceManager(db.closers.resourceManager, &db.safeTsTracker)

	// newLevelsController potentially loads files in directory.
	if db.lc, err = newLevelsController(db, &manifest, db.resourceMgr, opt.TableBuilderOptions); err != nil {
		return nil, err
	}

	db.closers.memtable = y.NewCloser(1)
	go func() {
		lc := db.closers.memtable
		for {
			select {
			case db.memTableCh <- memtable.New(arenaSize(db.opt), db.lc.reserveFileID()):
			case <-lc.HasBeenClosed():
				lc.Done()
				return
			}
		}
	}()
	db.mtbls.Store(newMemTables(<-db.memTableCh, &memTables{}))

	if err = db.blobManger.Open(db, opt); err != nil {
		return nil, err
	}

	if !opt.ReadOnly {
		db.closers.compactors = y.NewCloser(1)
		db.lc.startCompact(db.closers.compactors)

		db.closers.memtable.AddRunning(1)
		go db.runFlushMemTable(db.closers.memtable) // Needs the levels controller to be up.
	}

	if err = db.vlog.Open(db, opt); err != nil {
		return nil, err
	}

	var logOff logOffset
	head := manifest.Head
	if head != nil {
		db.orc.curRead = head.Version
		logOff.fid = head.LogID
		logOff.offset = head.LogOffset
	}

	// lastUsedCasCounter will either be the value stored in !badger!head, or some subsequently
	// written value log entry that we replay. (Subsequent value log entries might be _less_
	// than lastUsedCasCounter, if there was value log gc so we have to max() values while
	// replaying.)
	// out.lastUsedCasCounter = item.casCounter
	// TODO: Figure this out. This would update the read timestamp, and set nextCommitTs.

	replayCloser := startWriteWorker(db)

	if err = db.vlog.Replay(logOff, replayFunction(db)); err != nil {
		return db, err
	}

	replayCloser.SignalAndWait() // Wait for replay to be applied first.
	// Now that we have the curRead, we can update the nextCommit.
	db.orc.Lock()
	db.orc.nextCommit = db.orc.curRead + 1
	db.orc.Unlock()

	db.writeCh = make(chan *request, kvWriteChCapacity)
	db.closers.writes = startWriteWorker(db)

	valueDirLockGuard = nil
	dirLockGuard = nil
	manifestFile = nil
	return db, nil
}
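// NOTE (illustrative sketch, not called anywhere): the typical lifecycle is
// configure Options, Open, and defer Close. This assumes the package-level
// DefaultOptions value; real callers will tune sizes and modes for their
// workload.
func exampleOpen(dir string) (*DB, error) {
	opt := DefaultOptions
	opt.Dir = dir
	opt.ValueDir = dir // may point at different storage than Dir
	return Open(opt)
}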
// DeleteFilesInRange deletes files fully covered by the key range [start, end).
// If a file contains keys outside the range, it will not be deleted.
// This function is designed to reclaim space quickly.
// If you want to ensure no future transaction can read keys in the range,
// consider iterating over and deleting the remaining keys, or using a compaction
// filter to clean them up asynchronously.
func (db *DB) DeleteFilesInRange(start, end []byte) {
	var (
		changes   []*protos.ManifestChange
		pruneTbls []table.Table
		startKey  = y.KeyWithTs(start, math.MaxUint64)
		endKey    = y.KeyWithTs(end, 0)
		guard     = db.resourceMgr.Acquire()
	)

	for level, lc := range db.lc.levels {
		lc.Lock()
		left, right := 0, len(lc.tables)
		if lc.level > 0 {
			left, right = getTablesInRange(lc.tables, startKey, endKey)
		}
		if left >= right {
			lc.Unlock()
			continue
		}

		newTables := lc.tables[:left]
		for _, tbl := range lc.tables[left:right] {
			if !isRangeCoversTable(startKey, endKey, tbl) || tbl.IsCompacting() {
				newTables = append(newTables, tbl)
				continue
			}
			pruneTbls = append(pruneTbls, tbl)
			changes = append(changes, newDeleteChange(tbl.ID()))
		}
		newTables = append(newTables, lc.tables[right:]...)
		for i := len(newTables); i < len(lc.tables); i++ {
			lc.tables[i] = nil
		}
		assertTablesOrder(level, newTables, nil)
		lc.tables = newTables
		lc.Unlock()
	}

	db.manifest.addChanges(changes, nil)
	var discardStats DiscardStats
	deletes := make([]epoch.Resource, len(pruneTbls))
	for i, tbl := range pruneTbls {
		it := tbl.NewIterator(false)
		// TODO: use rate limiter to avoid burst IO.
		for it.Rewind(); it.Valid(); y.NextAllVersion(it) {
			discardStats.collect(it.Value())
		}
		deletes[i] = tbl
	}
	if len(discardStats.ptrs) > 0 {
		db.blobManger.discardCh <- &discardStats
	}
	guard.Delete(deletes)
	guard.Done()
}

func isRangeCoversTable(start, end y.Key, t table.Table) bool {
	left := start.Compare(t.Smallest()) <= 0
	right := t.Biggest().Compare(end) < 0
	return left && right
}

// NewExternalTableBuilder returns a new SST builder for constructing external tables.
func (db *DB) NewExternalTableBuilder(f *os.File, compression options.CompressionType, limiter *rate.Limiter) *sstable.Builder {
	return sstable.NewExternalTableBuilder(f, limiter, db.opt.TableBuilderOptions, compression)
}
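// NOTE (illustrative sketch, not called anywhere): DeleteFilesInRange only
// unlinks tables fully covered by [start, end); keys in partially covered
// tables survive. Its doc comment suggests a follow-up pass for exact
// cleanup; deleteRemainingKeys is an assumed caller-side helper here.
func exampleReclaimRange(db *DB, start, end []byte, deleteRemainingKeys func(start, end []byte)) {
	db.DeleteFilesInRange(start, end) // fast, coarse space reclamation
	deleteRemainingKeys(start, end)   // precise cleanup of whatever keys remain
}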
// ErrExternalTableOverlap is returned by IngestExternalFiles when the key
// ranges of the external tables overlap.
var ErrExternalTableOverlap = errors.New("keys of external tables overlap")

// ExternalTableSpec describes an external table to be ingested.
type ExternalTableSpec struct {
	Filename string
}

// IngestExternalFiles ingests externally constructed tables into the DB.
// Note: ensure there is no concurrent write that overlaps with the tables to be ingested.
func (db *DB) IngestExternalFiles(files []ExternalTableSpec) (int, error) {
	tbls, err := db.prepareExternalFiles(files)
	if err != nil {
		return 0, err
	}

	if err := db.checkExternalTables(tbls); err != nil {
		return 0, err
	}

	task := &ingestTask{tbls: tbls}
	task.Add(1)
	db.ingestCh <- task
	task.Wait()
	return task.cnt, task.err
}

func (db *DB) prepareExternalFiles(specs []ExternalTableSpec) ([]table.Table, error) {
	tbls := make([]table.Table, len(specs))
	for i, spec := range specs {
		id := db.lc.reserveFileID()
		filename := sstable.NewFilename(id, db.opt.Dir)

		err := os.Link(spec.Filename, filename)
		if err != nil {
			return nil, err
		}

		err = os.Link(sstable.IndexFilename(spec.Filename), sstable.IndexFilename(filename))
		if err != nil {
			return nil, err
		}

		tbl, err := sstable.OpenTable(filename, db.blockCache, db.indexCache)
		if err != nil {
			return nil, err
		}

		tbls[i] = tbl
	}

	sort.Slice(tbls, func(i, j int) bool {
		return tbls[i].Smallest().Compare(tbls[j].Smallest()) < 0
	})

	return tbls, syncDir(db.lc.kv.opt.Dir)
}

func (db *DB) checkExternalTables(tbls []table.Table) error {
	keys := make([][]byte, 0, len(tbls)*2)
	for _, t := range tbls {
		keys = append(keys, t.Smallest().UserKey, t.Biggest().UserKey)
	}
	ok := sort.SliceIsSorted(keys, func(i, j int) bool {
		return bytes.Compare(keys[i], keys[j]) < 0
	})
	if !ok {
		return ErrExternalTableOverlap
	}

	for i := 1; i < len(keys)-1; i += 2 {
		if bytes.Equal(keys[i], keys[i+1]) {
			return ErrExternalTableOverlap
		}
	}

	return nil
}
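// NOTE (illustrative sketch, not called anywhere): ingestion is a two-step
// flow. First build sorted, mutually non-overlapping SSTs (for example with
// NewExternalTableBuilder above), then hand the filenames to
// IngestExternalFiles, which fails with ErrExternalTableOverlap otherwise.
func exampleIngest(db *DB, filenames []string) (int, error) {
	specs := make([]ExternalTableSpec, len(filenames))
	for i, name := range filenames {
		specs[i] = ExternalTableSpec{Filename: name}
	}
	return db.IngestExternalFiles(specs)
}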
// CacheMetrics returns the metrics for the underlying cache.
func (db *DB) CacheMetrics() *cache.Metrics {
	// Do not enable ristretto metrics in badger until issue
	// https://github.com/dgraph-io/ristretto/issues/92 is resolved.
	// return db.blockCache.Metrics()
	return nil
}

// Close closes a DB. It's crucial to call it to ensure all the pending updates
// make their way to disk. Calling DB.Close() multiple times is not safe: it
// causes a panic.
func (db *DB) Close() (err error) {
	log.Info("Closing database")

	// Stop writes first.
	db.closers.writes.SignalAndWait()

	// Now close the value log.
	if vlogErr := db.vlog.Close(); err == nil {
		err = errors.Wrap(vlogErr, "DB.Close")
	}

	// Make sure that the block writer is done pushing stuff into the memtable!
	// Otherwise, you will have a race condition: we are trying to flush memtables
	// and remove them completely, while the block / memtable writer is still
	// trying to push stuff into the memtable. This will also resolve the value
	// offset problem: as we push into the memtable, we update value offsets there.
	mTbls := db.mtbls.Load().(*memTables)
	if !mTbls.getMutable().Empty() && !db.volatileMode {
		log.Info("Flushing memtable")
		db.mtbls.Store(newMemTables(nil, mTbls))
		db.flushChan <- newFlushTask(mTbls.getMutable(), db.logOff)
	}
	db.flushChan <- newFlushTask(nil, logOffset{}) // Tell flusher to quit.

	if db.closers.memtable != nil {
		db.closers.memtable.SignalAndWait()
		log.Info("Memtable flushed")
	}
	if db.closers.compactors != nil {
		db.closers.compactors.SignalAndWait()
		log.Info("Compaction finished")
	}
	if db.opt.CompactL0WhenClose && !db.volatileMode {
		// Force-compact L0. We don't need to care about cstatus since no parallel
		// compaction is running at this point.
		cd := &compactDef{
			thisLevel: db.lc.levels[0],
			nextLevel: db.lc.levels[1],
		}
		guard := db.resourceMgr.Acquire()
		defer guard.Done()
		if db.lc.fillTablesL0(cd) {
			if err := db.lc.runCompactDef(0, cd, nil, guard); err != nil {
				log.Info("LOG Compact FAILED", zap.Stringer("compact def", cd), zap.Error(err))
			}
		} else {
			log.Info("fillTables failed for level zero. No compaction required")
		}
	}

	if db.closers.blobManager != nil {
		db.closers.blobManager.SignalAndWait()
		log.Info("BlobManager finished")
	}
	if db.closers.resourceManager != nil {
		db.closers.resourceManager.SignalAndWait()
		log.Info("ResourceManager finished")
	}

	if lcErr := db.lc.close(); err == nil {
		err = errors.Wrap(lcErr, "DB.Close")
	}
	log.Info("Waiting for closer")
	db.closers.updateSize.SignalAndWait()
	if db.blockCache != nil {
		db.blockCache.Close()
	}

	if db.dirLockGuard != nil {
		if guardErr := db.dirLockGuard.release(); err == nil {
			err = errors.Wrap(guardErr, "DB.Close")
		}
	}
	if db.valueDirGuard != nil {
		if guardErr := db.valueDirGuard.release(); err == nil {
			err = errors.Wrap(guardErr, "DB.Close")
		}
	}
	if manifestErr := db.manifest.close(); err == nil {
		err = errors.Wrap(manifestErr, "DB.Close")
	}

	// Fsync directories to ensure that lock file, and any other removed files whose directory
	// we haven't specifically fsynced, are guaranteed to have their directory entry removal
	// persisted to disk.
	if syncErr := syncDir(db.opt.Dir); err == nil {
		err = errors.Wrap(syncErr, "DB.Close")
	}
	if syncErr := syncDir(db.opt.ValueDir); err == nil {
		err = errors.Wrap(syncErr, "DB.Close")
	}

	return err
}
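// NOTE (illustrative sketch, not part of the API): since Close panics when
// called twice, callers whose shutdown paths may race can wrap the DB with a
// sync.Once guard like this.
type onceCloser struct {
	db   *DB
	once sync.Once
}

func (o *onceCloser) Close() (err error) {
	o.once.Do(func() { err = o.db.Close() }) // only the first call reaches DB.Close
	return err
}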
const (
	lockFile = "LOCK"
)

// When you create or delete a file, you have to ensure the directory entry for the file is synced
// in order to guarantee the file is visible after a system crash. (See the man page for fsync,
// or https://github.com/coreos/etcd/issues/6368 for an example.)
func syncDir(dir string) error {
	f, err := openDir(dir)
	if err != nil {
		return errors.Wrapf(err, "While opening directory: %s.", dir)
	}
	err = f.Sync()
	closeErr := f.Close()
	if err != nil {
		return errors.Wrapf(err, "While syncing directory: %s.", dir)
	}
	return errors.Wrapf(closeErr, "While closing directory: %s.", dir)
}
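// NOTE (illustrative sketch, not called anywhere): putting syncDir to work,
// durably creating a file takes three steps: write the data, fsync the file,
// then fsync the parent directory so the new directory entry itself survives
// a crash.
func exampleCreateDurably(path string, data []byte) error {
	f, err := os.Create(path)
	if err != nil {
		return err
	}
	if _, err = f.Write(data); err != nil {
		f.Close()
		return err
	}
	if err = f.Sync(); err != nil { // persist the file contents
		f.Close()
		return err
	}
	if err = f.Close(); err != nil {
		return err
	}
	return syncDir(filepath.Dir(path)) // persist the directory entry
}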
// getMemTables returns the current memtables.
func (db *DB) getMemTables() []*memtable.Table {
	tbls := db.mtbls.Load().(*memTables)
	l := atomic.LoadUint32(&tbls.length)
	return tbls.tables[:l]
}

// get returns the value in the memtables or on disk for the given key.
// Note that the value will include the meta byte.
//
// IMPORTANT: We should never write an entry with an older timestamp for the same key. We need to
// maintain this invariant to search for the latest value of a key, or else we would need to search
// in all tables and find the max version among them. To maintain this invariant, we also need to
// ensure that all versions of a key are always present in the same table from level 1 onwards,
// because compaction can push any table down.
func (db *DB) get(key y.Key) y.ValueStruct {
	tables := db.getMemTables() // Lock should be released.

	db.metrics.NumGets.Inc()
	for _, table := range tables {
		db.metrics.NumMemtableGets.Inc()
		vs, err := table.Get(key, 0)
		if err != nil {
			log.Error("search table meets error", zap.Error(err))
		}
		if vs.Valid() {
			return vs
		}
	}
	keyHash := farm.Fingerprint64(key.UserKey)
	return db.lc.get(key, keyHash)
}

func (db *DB) multiGet(pairs []keyValuePair) {
	tables := db.getMemTables() // Lock should be released.

	var foundCount, mtGets int
	for _, table := range tables {
		for j := range pairs {
			pair := &pairs[j]
			if pair.found {
				continue
			}
			val, err := table.Get(pair.key, 0)
			if err != nil {
				log.Error("search table meets error", zap.Error(err))
			}
			if val.Valid() {
				pair.val = val
				pair.found = true
				foundCount++
			}
			mtGets++
		}
	}
	db.metrics.NumMemtableGets.Add(float64(mtGets))
	db.metrics.NumGets.Add(float64(len(pairs)))

	if foundCount == len(pairs) {
		return
	}
	db.lc.multiGet(pairs)
}

func (db *DB) updateOffset(off logOffset) {
	y.Assert(!off.Less(db.logOff))
	// We don't need to protect it with a lock because the value is never accessed
	// by more than one goroutine at the same time.
	db.logOff = off
}

var requestPool = sync.Pool{
	New: func() interface{} {
		return new(request)
	},
}

func (db *DB) sendToWriteCh(entries []*Entry) (*request, error) {
	var count, size int64
	for _, e := range entries {
		size += int64(e.estimateSize())
		count++
	}

	// We can only service one request because we need each txn to be stored in a contiguous section.
	// Txns should not interleave among other txns or rewrites.
	req := requestPool.Get().(*request)
	req.Entries = entries
	req.Wg = sync.WaitGroup{}
	req.Wg.Add(1)
	db.writeCh <- req // Handled in writeWorker.
	db.metrics.NumPuts.Add(float64(len(entries)))

	return req, nil
}

// batchSet applies a list of badger.Entry. If a request-level error occurs, it
// will be returned. Usage:
//   Check(kv.BatchSet(entries))
func (db *DB) batchSet(entries []*Entry) error {
	sort.Slice(entries, func(i, j int) bool {
		return entries[i].Key.Compare(entries[j].Key) < 0
	})
	req, err := db.sendToWriteCh(entries)
	if err != nil {
		return err
	}

	return req.Wait()
}

// batchSetAsync is the asynchronous version of batchSet. It accepts a callback
// function which is called when all the sets are complete. If a request-level
// error occurs, it will be passed back via the callback. Usage:
//   err := kv.BatchSetAsync(entries, func(err error) {
//       Check(err)
//   })
func (db *DB) batchSetAsync(entries []*Entry, f func(error)) error {
	req, err := db.sendToWriteCh(entries)
	if err != nil {
		return err
	}
	go func() {
		err := req.Wait()
		// Write is complete. Let's call the callback function now.
		f(err)
	}()
	return nil
}

// ensureRoomForWrite is always called serially.
func (db *DB) ensureRoomForWrite(mt *memtable.Table, minSize int64) int64 {
	free := db.opt.MaxMemTableSize - mt.Size()
	if free >= minSize {
		return free
	}
	_ = db.flushMemTable()
	return db.opt.MaxMemTableSize
}

func (db *DB) flushMemTable() *sync.WaitGroup {
	mTbls := db.mtbls.Load().(*memTables)
	newTbls := newMemTables(<-db.memTableCh, mTbls)
	db.mtbls.Store(newTbls)
	ft := newFlushTask(mTbls.getMutable(), db.logOff)
	db.flushChan <- ft
	log.Info("flushing memtable",
		zap.Int64("memtable size", mTbls.getMutable().Size()),
		zap.Int("size of flushChan", len(db.flushChan)))

	// The new memtable is empty. We certainly have room.
	return &ft.wg
}

func arenaSize(opt Options) int64 {
	return opt.MaxMemTableSize + opt.maxBatchCount*int64(memtable.MaxNodeSize)
}
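// NOTE (illustrative sketch, not called anywhere): batchSetAsync decouples
// submission from completion; the callback runs on a helper goroutine once
// the write worker finishes the request. A fire-and-log usage sketch:
func exampleBatchSetAsync(db *DB, entries []*Entry) error {
	return db.batchSetAsync(entries, func(err error) {
		if err != nil {
			log.Error("async batch write failed", zap.Error(err))
		}
	})
}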
// writeLevel0Table flushes a memtable to an L0 table file. It drops delete values.
func (db *DB) writeLevel0Table(s *memtable.Table, f *os.File) error {
	iter := s.NewIterator(false)
	var (
		bb                   *blobFileBuilder
		numWrite, bytesWrite int
		err                  error
	)
	b := sstable.NewTableBuilder(f, db.limiter, 0, db.opt.TableBuilderOptions)
	defer b.Close()

	for iter.Rewind(); iter.Valid(); y.NextAllVersion(iter) {
		key := iter.Key()
		value := iter.Value()
		if db.opt.ValueThreshold > 0 && len(value.Value) > db.opt.ValueThreshold {
			if bb == nil {
				if bb, err = db.newBlobFileBuilder(); err != nil {
					return y.Wrap(err)
				}
			}

			bp, err := bb.append(value.Value)
			if err != nil {
				return err
			}
			value.Meta |= bitValuePointer
			value.Value = bp
		}
		if err := b.Add(key, value); err != nil {
			return err
		}
		numWrite++
		bytesWrite += key.Len() + int(value.EncodedSize())
	}
	stats := &y.CompactionStats{
		KeysWrite:  numWrite,
		BytesWrite: bytesWrite,
	}
	db.lc.levels[0].metrics.UpdateCompactionStats(stats)

	if err := b.Finish(); err != nil {
		return y.Wrap(err)
	}
	if bb != nil {
		bf, err1 := bb.finish()
		if err1 != nil {
			return err1
		}
		log.Info("build L0 blob", zap.Uint32("id", bf.fid), zap.Uint32("size", bf.fileSize))
		err1 = db.blobManger.addFile(bf)
		if err1 != nil {
			return err1
		}
	}
	return nil
}

func (db *DB) newBlobFileBuilder() (*blobFileBuilder, error) {
	return newBlobFileBuilder(db.blobManger.allocFileID(), db.opt.Dir, db.opt.TableBuilderOptions.WriteBufferSize)
}

type flushTask struct {
	mt  *memtable.Table
	off logOffset
	wg  sync.WaitGroup
}

func newFlushTask(mt *memtable.Table, off logOffset) *flushTask {
	ft := &flushTask{mt: mt, off: off}
	ft.wg.Add(1)
	return ft
}

// fastL0Table is an L0 table that keeps serving reads from the flushed
// memtable (via embedding) while it is backed by the SST file on disk.
type fastL0Table struct {
	*memtable.Table
	sst *sstable.Table
}

func newFastL0Table(mt *memtable.Table, sst *sstable.Table) *fastL0Table {
	return &fastL0Table{
		Table: mt,
		sst:   sst,
	}
}

func (t *fastL0Table) Close() error {
	return t.sst.Close()
}

func (t *fastL0Table) Delete() error {
	_ = t.Table.Delete()
	return t.sst.Delete()
}
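// NOTE (illustrative sketch, not called anywhere): writeLevel0Table separates
// large values into blob files. Any value longer than Options.ValueThreshold
// is replaced in the SST by a blob pointer with bitValuePointer set in its
// meta byte; this predicate mirrors that decision.
func exampleShouldSeparateValue(opt Options, v y.ValueStruct) bool {
	return opt.ValueThreshold > 0 && len(v.Value) > opt.ValueThreshold
}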
// TODO: Ensure that this function doesn't return, or is handled by another wrapper function.
// Otherwise, we would have no goroutine which can flush memtables.
func (db *DB) runFlushMemTable(c *y.Closer) error {
	defer c.Done()

	for ft := range db.flushChan {
		if ft.mt == nil {
			return nil
		}
		var headInfo *protos.HeadInfo
		if !ft.mt.Empty() {
			headInfo = &protos.HeadInfo{
				// Pick the max commit ts, so in case of crash, our read ts would be higher than all the
				// commits.
				Version:   db.orc.commitTs(),
				LogID:     ft.off.fid,
				LogOffset: ft.off.offset,
			}
			// Store the badger head even if vptr is zero; we need it for readTs.
			log.Info("flush memtable storing offset", zap.Uint32("fid", ft.off.fid), zap.Uint32("offset", ft.off.offset))
		}

		fileID := ft.mt.ID()
		filename := sstable.NewFilename(fileID, db.opt.Dir)
		fd, err := directio.OpenFile(filename, os.O_CREATE|os.O_RDWR, 0666)
		if err != nil {
			log.Error("error while writing to level 0", zap.Error(err))
			return y.Wrap(err)
		}

		// Don't block just to sync the directory entry.
		dirSyncCh := make(chan error)
		go func() { dirSyncCh <- syncDir(db.opt.Dir) }()

		err = db.writeLevel0Table(ft.mt, fd)
		dirSyncErr := <-dirSyncCh
		if err != nil {
			log.Error("error while writing to level 0", zap.Error(err))
			return err
		}
		if dirSyncErr != nil {
			log.Error("error while syncing level directory", zap.Error(dirSyncErr))
			return dirSyncErr
		}
		atomic.StoreUint32(&db.syncedFid, ft.off.fid)
		fd.Close()
		tbl, err := sstable.OpenTable(filename, db.blockCache, db.indexCache)
		if err != nil {
			log.Error("error while opening table", zap.Error(err))
			return err
		}
		err = db.lc.addLevel0Table(newFastL0Table(ft.mt, tbl), headInfo)
		if err != nil {
			log.Error("error while adding the level 0 table", zap.Error(err))
			return err
		}
		mTbls := db.mtbls.Load().(*memTables)
		// Update the length of mTbls so the flushed memtable is no longer visible to readers.
		for i, tbl := range mTbls.tables {
			if tbl == ft.mt {
				atomic.StoreUint32(&mTbls.length, uint32(i))
				break
			}
		}
		ft.wg.Done()
	}
	return nil
}

func exists(path string) (bool, error) {
	_, err := os.Stat(path)
	if err == nil {
		return true, nil
	}
	if os.IsNotExist(err) {
		return false, nil
	}
	return true, err
}

// calculateSize does a file walk, calculates the size of the vlog and sst files, and stores
// them in db.lsmSize and db.vlogSize (and the corresponding metrics).
func (db *DB) calculateSize() {
	totalSize := func(dir string) (int64, int64) {
		var lsmSize, vlogSize int64
		err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
			if err != nil {
				return err
			}
			ext := filepath.Ext(path)
			if ext == ".sst" {
				lsmSize += info.Size()
			} else if ext == ".vlog" {
				vlogSize += info.Size()
			}
			return nil
		})
		if err != nil {
			log.Info("error while calculating total size of directory", zap.String("path", dir))
		}
		return lsmSize, vlogSize
	}

	lsmSize, vlogSize := totalSize(db.opt.Dir)
	// If valueDir is different from dir, we'd have to do another walk.
	if db.opt.ValueDir != db.opt.Dir {
		_, vlogSize = totalSize(db.opt.ValueDir)
	}
	atomic.StoreInt64(&db.lsmSize, lsmSize)
	atomic.StoreInt64(&db.vlogSize, vlogSize)
	db.metrics.LSMSize.Set(float64(lsmSize))
	db.metrics.VlogSize.Set(float64(vlogSize))
}

func (db *DB) updateSize(c *y.Closer) {
	defer c.Done()

	metricsTicker := time.NewTicker(time.Minute)
	defer metricsTicker.Stop()

	for {
		select {
		case <-metricsTicker.C:
			db.calculateSize()
		case <-c.HasBeenClosed():
			return
		}
	}
}

// Size returns the size of the LSM and value log files in bytes. It can be used to decide how
// often to call RunValueLogGC.
func (db *DB) Size() (lsm int64, vlog int64) {
	return atomic.LoadInt64(&db.lsmSize), atomic.LoadInt64(&db.vlogSize)
}

// Tables returns info about the LSM tables.
func (db *DB) Tables() []TableInfo {
	return db.lc.getTableInfo()
}

// GetVLogOffset returns the current max write position of the value log,
// packed as fid<<32 | offset.
func (db *DB) GetVLogOffset() uint64 {
	return db.vlog.getMaxPtr()
}
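// NOTE (illustrative sketch, not called anywhere): the value-log position
// returned by GetVLogOffset packs the file id into the high 32 bits and the
// in-file offset into the low 32 bits, matching the decoding at the top of
// IterateVLog below.
func examplePackVlogPtr(fid, offset uint32) uint64 {
	return uint64(fid)<<32 | uint64(offset)
}

func exampleUnpackVlogPtr(ptr uint64) (fid, offset uint32) {
	return uint32(ptr >> 32), uint32(ptr)
}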
// IterateVLog iterates over the VLog for external replay. This function should be called only
// when there is no concurrent write operation on the DB.
func (db *DB) IterateVLog(offset uint64, fn func(e Entry)) error {
	startFid := uint32(offset >> 32)
	vOffset := uint32(offset)
	for fid := startFid; fid <= db.vlog.maxFid(); fid++ {
		lf, err := db.vlog.getFile(fid)
		if err != nil {
			return err
		}
		if fid != startFid {
			vOffset = 0
		}
		endOffset, err := db.vlog.iterate(lf, vOffset, func(e Entry) error {
			if e.meta&bitTxn > 0 {
				fn(e)
			}
			return nil
		})
		if err != nil {
			return err
		}
		if fid == db.vlog.maxFid() {
			_, err = lf.fd.Seek(int64(endOffset), io.SeekStart)
			if err != nil {
				return err
			}
		}
	}
	return nil
}

func (db *DB) getCompactSafeTs() uint64 {
	return atomic.LoadUint64(&db.safeTsTracker.safeTs)
}

// UpdateSafeTs is used with a managed DB. During compaction, old versions smaller than the safe
// ts will be discarded. If this is never called, all old versions are kept.
func (db *DB) UpdateSafeTs(ts uint64) {
	y.Assert(db.IsManaged())
	for {
		old := db.getCompactSafeTs()
		if old < ts {
			if !atomic.CompareAndSwapUint64(&db.safeTsTracker.safeTs, old, ts) {
				continue
			}
		}
		break
	}
}

// IsManaged reports whether the DB runs in managed-transaction mode.
func (db *DB) IsManaged() bool {
	return db.opt.ManagedTxns
}

type safeTsTracker struct {
	safeTs uint64

	maxInactive uint64
	minActive   uint64
}

func (t *safeTsTracker) Begin() {
	// t.maxInactive = 0
	t.minActive = math.MaxUint64
}

func (t *safeTsTracker) Inspect(payload interface{}, isActive bool) {
	ts, ok := payload.(uint64)
	if !ok {
		return
	}

	if isActive {
		if ts < t.minActive {
			t.minActive = ts
		}
	} else {
		if ts > t.maxInactive {
			t.maxInactive = ts
		}
	}
}

func (t *safeTsTracker) End() {
	var safe uint64
	if t.minActive == math.MaxUint64 {
		safe = t.maxInactive
	} else {
		safe = t.minActive - 1
	}

	if safe > atomic.LoadUint64(&t.safeTs) {
		atomic.StoreUint64(&t.safeTs, safe)
	}
}
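// NOTE (illustrative sketch, not called anywhere): in managed mode a caller
// that tracks its readers can periodically advance the safe ts so compaction
// may drop stale versions. oldestActiveReadTs is an assumed caller-side
// function; passing 0 (no readers yet) is a no-op, since safeTs only ever
// moves forward.
func exampleAdvanceSafeTs(db *DB, oldestActiveReadTs func() uint64) {
	if db.IsManaged() {
		db.UpdateSafeTs(oldestActiveReadTs())
	}
}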