github.com/pingcap/badger@v1.5.1-0.20230103063557-828f39b09b6d/db.go

/*
 * Copyright 2017 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package badger

import (
	"bytes"
	"io"
	"math"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"sync"
	"sync/atomic"
	"time"

	"github.com/dgryski/go-farm"
	"github.com/pingcap/badger/cache"
	"github.com/pingcap/badger/directio"
	"github.com/pingcap/badger/epoch"
	"github.com/pingcap/badger/options"
	"github.com/pingcap/badger/protos"
	"github.com/pingcap/badger/table"
	"github.com/pingcap/badger/table/memtable"
	"github.com/pingcap/badger/table/sstable"
	"github.com/pingcap/badger/y"
	"github.com/pingcap/errors"
	"github.com/pingcap/log"
	"go.uber.org/zap"
	"golang.org/x/time/rate"
)

var (
	txnKey = []byte("!badger!txn") // For indicating the end of entries in a txn.
)

type closers struct {
	updateSize      *y.Closer
	compactors      *y.Closer
	resourceManager *y.Closer
	blobManager     *y.Closer
	memtable        *y.Closer
	writes          *y.Closer
}

// DB provides the various functions required to interact with Badger.
// DB is thread-safe.
type DB struct {
	dirLockGuard *directoryLockGuard
	// nil if Dir and ValueDir are the same
	valueDirGuard *directoryLockGuard

	closers   closers
	mtbls     atomic.Value
	opt       Options
	manifest  *manifestFile
	lc        *levelsController
	vlog      valueLog
	logOff    logOffset // less than or equal to the position of the last vlog value put into the memtable
	syncedFid uint32    // The log fid that has been flushed to SST; older log files are safe to delete.
	writeCh   chan *request
	flushChan chan *flushTask // For flushing memtables.
	ingestCh  chan *ingestTask

	// mem table buffer to avoid expensive allocation of big chunks of memory
	memTableCh chan *memtable.Table

	orc           *oracle
	safeTsTracker safeTsTracker

	limiter *rate.Limiter

	blockCache *cache.Cache
	indexCache *cache.Cache

	metrics      *y.MetricsSet
	lsmSize      int64
	vlogSize     int64
	volatileMode bool

	blobManger blobManager

	resourceMgr *epoch.ResourceManager
}

type memTables struct {
	tables []*memtable.Table // tables from new to old, the first one is mutable.
	length uint32            // The length is updated by the flusher.
}
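
// A hedged illustration (not part of the API) of how the copy-on-write snapshot
// above is consumed: readers load the current snapshot from db.mtbls without
// locking, and the flusher publishes shrunken views by storing a smaller length:
//
//	tbls := db.mtbls.Load().(*memTables)
//	mutable := tbls.getMutable() // tables[0] accepts writes; the rest are frozen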

func (tbls *memTables) getMutable() *memtable.Table {
	return tbls.tables[0]
}

func newMemTables(mt *memtable.Table, old *memTables) *memTables {
	newTbls := &memTables{}
	newTbls.tables = make([]*memtable.Table, 1+atomic.LoadUint32(&old.length))
	newTbls.tables[0] = mt
	copy(newTbls.tables[1:], old.tables)
	newTbls.length = uint32(len(newTbls.tables))
	return newTbls
}

const (
	kvWriteChCapacity = 1000
)

func replayFunction(out *DB) func(Entry) error {
	type txnEntry struct {
		nk y.Key
		v  y.ValueStruct
	}

	var txn []txnEntry
	var lastCommit uint64

	toLSM := func(nk y.Key, vs y.ValueStruct) {
		e := memtable.Entry{Key: nk.UserKey, Value: vs}
		mTbls := out.mtbls.Load().(*memTables)
		if out.ensureRoomForWrite(mTbls.getMutable(), e.EstimateSize()) == out.opt.MaxMemTableSize {
			mTbls = out.mtbls.Load().(*memTables)
		}
		mTbls.getMutable().PutToSkl(nk.UserKey, vs)
	}

	first := true
	return func(e Entry) error { // Function for replaying.
		if first {
			log.Info("replay wal", zap.Stringer("first key", e.Key))
		}
		first = false

		if out.orc.curRead < e.Key.Version {
			out.orc.curRead = e.Key.Version
		}

		var nk y.Key
		nk.Copy(e.Key)
		nv := make([]byte, len(e.Value))
		copy(nv, e.Value)

		v := y.ValueStruct{
			Value:    nv,
			Meta:     e.meta,
			UserMeta: e.UserMeta,
			Version:  nk.Version,
		}

		if e.meta&bitFinTxn > 0 {
			txnTs, err := strconv.ParseUint(string(e.Value), 10, 64)
			if err != nil {
				return errors.Wrapf(err, "Unable to parse txn fin: %q", e.Value)
			}
			if !out.IsManaged() {
				y.Assert(lastCommit == txnTs)
			}
			y.Assert(len(txn) > 0)
			// Got the end of txn. Now we can store them.
			for _, t := range txn {
				toLSM(t.nk, t.v)
			}
			txn = txn[:0]
			lastCommit = 0

		} else if e.meta&bitTxn == 0 {
			// This entry is from a rewrite.
			toLSM(nk, v)

			// We shouldn't get this entry in the middle of a transaction.
			y.Assert(lastCommit == 0)
			y.Assert(len(txn) == 0)

		} else {
			if lastCommit == 0 {
				lastCommit = e.Key.Version
			}
			if !out.IsManaged() {
				y.Assert(lastCommit == e.Key.Version)
			}
			te := txnEntry{nk: nk, v: v}
			txn = append(txn, te)
		}
		return nil
	}
}

// Open returns a new DB object.
func Open(opt Options) (db *DB, err error) {
	opt.maxBatchSize = (15 * opt.MaxMemTableSize) / 100
	opt.maxBatchCount = opt.maxBatchSize / int64(memtable.MaxNodeSize)

	if opt.ValueThreshold > math.MaxUint16-16 {
		return nil, ErrValueThreshold
	}

	if opt.ReadOnly {
		// Can't truncate if the DB is read only.
		opt.Truncate = false
	}

	for _, path := range []string{opt.Dir, opt.ValueDir} {
		dirExists, err := exists(path)
		if err != nil {
			return nil, y.Wrapf(err, "Invalid Dir: %q", path)
		}
		if !dirExists {
			if opt.ReadOnly {
				return nil, errors.Errorf("Cannot find Dir for read-only open: %q", path)
			}
			// Try to create the directory
			err = os.Mkdir(path, 0700)
			if err != nil {
				return nil, y.Wrapf(err, "Error Creating Dir: %q", path)
			}
		}
	}
	absDir, err := filepath.Abs(opt.Dir)
	if err != nil {
		return nil, err
	}
	absValueDir, err := filepath.Abs(opt.ValueDir)
	if err != nil {
		return nil, err
	}
	var dirLockGuard, valueDirLockGuard *directoryLockGuard
	dirLockGuard, err = acquireDirectoryLock(opt.Dir, lockFile, opt.ReadOnly)
	if err != nil {
		return nil, err
	}
	defer func() {
		if dirLockGuard != nil {
			_ = dirLockGuard.release()
		}
	}()
	if absValueDir != absDir {
		valueDirLockGuard, err = acquireDirectoryLock(opt.ValueDir, lockFile, opt.ReadOnly)
		if err != nil {
			return nil, err
		}
	}
	defer func() {
		if valueDirLockGuard != nil {
			_ = valueDirLockGuard.release()
		}
	}()
	if !(opt.ValueLogFileSize <= 2<<30 && opt.ValueLogFileSize >= 1<<20) {
		return nil, ErrValueLogSize
	}
	manifestFile, manifest, err := openOrCreateManifestFile(opt.Dir, opt.ReadOnly)
	if err != nil {
		return nil, err
	}
	defer func() {
		if manifestFile != nil {
			_ = manifestFile.close()
		}
	}()

	orc := &oracle{
		isManaged:  opt.ManagedTxns,
		nextCommit: 1,
		commits:    make(map[uint64]uint64),
	}

	var blkCache, idxCache *cache.Cache
	if opt.MaxBlockCacheSize != 0 {
		var err error
		blkCache, err = cache.NewCache(&cache.Config{
			// The expected number of keys is MaxBlockCacheSize / BlockSize, times 10 as the
			// cache documentation suggests.
			NumCounters: opt.MaxBlockCacheSize / int64(opt.TableBuilderOptions.BlockSize) * 10,
			MaxCost:     opt.MaxBlockCacheSize,
			BufferItems: 64,
			OnEvict:     sstable.OnEvict,
		})
		if err != nil {
			return nil, errors.Wrap(err, "failed to create block cache")
		}

		indexSizeHint := float64(opt.TableBuilderOptions.MaxTableSize) / 6.0
		idxCache, err = cache.NewCache(&cache.Config{
			NumCounters: int64(float64(opt.MaxIndexCacheSize) / indexSizeHint * 10),
			MaxCost:     opt.MaxIndexCacheSize,
			BufferItems: 64,
		})
		if err != nil {
			return nil, errors.Wrap(err, "failed to create index cache")
		}
	}
	db = &DB{
		flushChan:     make(chan *flushTask, opt.NumMemtables),
		writeCh:       make(chan *request, kvWriteChCapacity),
		memTableCh:    make(chan *memtable.Table, 1),
		ingestCh:      make(chan *ingestTask),
		opt:           opt,
		manifest:      manifestFile,
		dirLockGuard:  dirLockGuard,
		valueDirGuard: valueDirLockGuard,
		orc:           orc,
		metrics:       y.NewMetricSet(opt.Dir),
		blockCache:    blkCache,
		indexCache:    idxCache,
		volatileMode:  opt.VolatileMode,
	}
	db.vlog.metrics = db.metrics

	rateLimit := opt.TableBuilderOptions.BytesPerSecond
	if rateLimit > 0 {
		db.limiter = rate.NewLimiter(rate.Limit(rateLimit), rateLimit)
	}

	// Calculate initial size.
	db.calculateSize()
	db.closers.updateSize = y.NewCloser(1)
	go db.updateSize(db.closers.updateSize)

	db.closers.resourceManager = y.NewCloser(0)
	db.resourceMgr = epoch.NewResourceManager(db.closers.resourceManager, &db.safeTsTracker)

	// newLevelsController potentially loads files in directory.
	if db.lc, err = newLevelsController(db, &manifest, db.resourceMgr, opt.TableBuilderOptions); err != nil {
		return nil, err
	}

	db.closers.memtable = y.NewCloser(1)
	go func() {
		lc := db.closers.memtable
		for {
			select {
			case db.memTableCh <- memtable.New(arenaSize(db.opt), db.lc.reserveFileID()):
			case <-lc.HasBeenClosed():
				lc.Done()
				return
			}
		}
	}()
	db.mtbls.Store(newMemTables(<-db.memTableCh, &memTables{}))

	if err = db.blobManger.Open(db, opt); err != nil {
		return nil, err
	}

	if !opt.ReadOnly {
		db.closers.compactors = y.NewCloser(1)
		db.lc.startCompact(db.closers.compactors)

		db.closers.memtable.AddRunning(1)
		go db.runFlushMemTable(db.closers.memtable) // Need levels controller to be up.
	}

	if err = db.vlog.Open(db, opt); err != nil {
		return nil, err
	}

	var logOff logOffset
	head := manifest.Head
	if head != nil {
		db.orc.curRead = head.Version
		logOff.fid = head.LogID
		logOff.offset = head.LogOffset
	}

	// lastUsedCasCounter will either be the value stored in !badger!head, or some subsequently
	// written value log entry that we replay. (Subsequent value log entries might be _less_
	// than lastUsedCasCounter, if there was value log gc, so we have to max() values while
	// replaying.)
	// out.lastUsedCasCounter = item.casCounter
	// TODO: Figure this out. This would update the read timestamp, and set nextCommitTs.

	replayCloser := startWriteWorker(db)

	if err = db.vlog.Replay(logOff, replayFunction(db)); err != nil {
		return db, err
	}

	replayCloser.SignalAndWait() // Wait for replay to be applied first.
	// Now that we have the curRead, we can update the nextCommit.
	db.orc.Lock()
	db.orc.nextCommit = db.orc.curRead + 1
	db.orc.Unlock()

	db.writeCh = make(chan *request, kvWriteChCapacity)
	db.closers.writes = startWriteWorker(db)

	valueDirLockGuard = nil
	dirLockGuard = nil
	manifestFile = nil
	return db, nil
}
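
// A hedged usage sketch for Open/Close (the values are illustrative, and
// DefaultOptions is assumed to come from this package's options.go):
//
//	opts := DefaultOptions
//	opts.Dir = "/tmp/badger"
//	opts.ValueDir = opts.Dir
//	db, err := Open(opts)
//	if err != nil {
//		panic(err)
//	}
//	defer db.Close()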

// DeleteFilesInRange deletes the files whose key range falls entirely inside [start, end).
// Files that contain keys outside the range will not be deleted.
// This function is designed to reclaim space quickly.
// If you want to ensure no future transaction can read keys in the range,
// consider iterating over and deleting the remaining keys, or use a compaction filter to
// clean them up asynchronously.
func (db *DB) DeleteFilesInRange(start, end []byte) {
	var (
		changes   []*protos.ManifestChange
		pruneTbls []table.Table
		startKey  = y.KeyWithTs(start, math.MaxUint64)
		endKey    = y.KeyWithTs(end, 0)
		guard     = db.resourceMgr.Acquire()
	)

	for level, lc := range db.lc.levels {
		lc.Lock()
		left, right := 0, len(lc.tables)
		if lc.level > 0 {
			left, right = getTablesInRange(lc.tables, startKey, endKey)
		}
		if left >= right {
			lc.Unlock()
			continue
		}

		newTables := lc.tables[:left]
		for _, tbl := range lc.tables[left:right] {
			if !isRangeCoversTable(startKey, endKey, tbl) || tbl.IsCompacting() {
				newTables = append(newTables, tbl)
				continue
			}
			pruneTbls = append(pruneTbls, tbl)
			changes = append(changes, newDeleteChange(tbl.ID()))
		}
		newTables = append(newTables, lc.tables[right:]...)
		for i := len(newTables); i < len(lc.tables); i++ {
			lc.tables[i] = nil
		}
		assertTablesOrder(level, newTables, nil)
		lc.tables = newTables
		lc.Unlock()
	}

	db.manifest.addChanges(changes, nil)
	var discardStats DiscardStats
	deletes := make([]epoch.Resource, len(pruneTbls))
	for i, tbl := range pruneTbls {
		it := tbl.NewIterator(false)
		// TODO: use rate limiter to avoid burst IO.
		for it.Rewind(); it.Valid(); y.NextAllVersion(it) {
			discardStats.collect(it.Value())
		}
		deletes[i] = tbl
		it.Close()
	}
	if len(discardStats.ptrs) > 0 {
		db.blobManger.discardCh <- &discardStats
	}
	guard.Delete(deletes)
	guard.Done()
}

func isRangeCoversTable(start, end y.Key, t table.Table) bool {
	left := start.Compare(t.Smallest()) <= 0
	right := t.Biggest().Compare(end) < 0
	return left && right
}

// NewExternalTableBuilder returns a new sst builder.
func (db *DB) NewExternalTableBuilder(f *os.File, compression options.CompressionType, limiter *rate.Limiter) *sstable.Builder {
	return sstable.NewExternalTableBuilder(f, limiter, db.opt.TableBuilderOptions, compression)
}

// ErrExternalTableOverlap is returned by IngestExternalFiles when the keys of the
// external tables overlap.
var ErrExternalTableOverlap = errors.New("keys of external tables overlap")

type ExternalTableSpec struct {
	Filename string
}
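
// A hedged usage sketch for the ingest flow below (the path is illustrative):
// build SSTs with NewExternalTableBuilder, then hand the finished,
// non-overlapping files to IngestExternalFiles:
//
//	specs := []ExternalTableSpec{{Filename: "/path/to/external.sst"}}
//	n, err := db.IngestExternalFiles(specs)
//	if err == ErrExternalTableOverlap {
//		// key ranges of the external tables overlap; split or merge them first
//	}
//	_ = n // the number of tables ingested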

// IngestExternalFiles ingests externally constructed tables into the DB.
// Note: ensure there is no concurrent write that overlaps with the tables to be ingested.
func (db *DB) IngestExternalFiles(files []ExternalTableSpec) (int, error) {
	tbls, err := db.prepareExternalFiles(files)
	if err != nil {
		return 0, err
	}

	if err := db.checkExternalTables(tbls); err != nil {
		return 0, err
	}

	task := &ingestTask{tbls: tbls}
	task.Add(1)
	db.ingestCh <- task
	task.Wait()
	return task.cnt, task.err
}

func (db *DB) prepareExternalFiles(specs []ExternalTableSpec) ([]table.Table, error) {
	tbls := make([]table.Table, len(specs))
	for i, spec := range specs {
		id := db.lc.reserveFileID()
		filename := sstable.NewFilename(id, db.opt.Dir)

		err := os.Link(spec.Filename, filename)
		if err != nil {
			return nil, err
		}

		err = os.Link(sstable.IndexFilename(spec.Filename), sstable.IndexFilename(filename))
		if err != nil {
			return nil, err
		}

		tbl, err := sstable.OpenTable(filename, db.blockCache, db.indexCache)
		if err != nil {
			return nil, err
		}

		tbls[i] = tbl
	}

	sort.Slice(tbls, func(i, j int) bool {
		return tbls[i].Smallest().Compare(tbls[j].Smallest()) < 0
	})

	return tbls, syncDir(db.lc.kv.opt.Dir)
}

func (db *DB) checkExternalTables(tbls []table.Table) error {
	keys := make([][]byte, 0, len(tbls)*2)
	for _, t := range tbls {
		keys = append(keys, t.Smallest().UserKey, t.Biggest().UserKey)
	}
	ok := sort.SliceIsSorted(keys, func(i, j int) bool {
		return bytes.Compare(keys[i], keys[j]) < 0
	})
	if !ok {
		return ErrExternalTableOverlap
	}

	for i := 1; i < len(keys)-1; i += 2 {
		if bytes.Equal(keys[i], keys[i+1]) {
			return ErrExternalTableOverlap
		}
	}

	return nil
}

// CacheMetrics returns the metrics for the underlying cache.
func (db *DB) CacheMetrics() *cache.Metrics {
	// Do not enable ristretto metrics in badger until issue
	// https://github.com/dgraph-io/ristretto/issues/92 is resolved.
	// return db.blockCache.Metrics()
	return nil
}

// Close closes a DB. It's crucial to call it to ensure all the pending updates
// make their way to disk. Calling DB.Close() multiple times is not safe and
// would cause a panic.
func (db *DB) Close() (err error) {
	log.Info("Closing database")

	// Stop writes.
	db.closers.writes.SignalAndWait()

	// Now close the value log.
	if vlogErr := db.vlog.Close(); err == nil {
		err = errors.Wrap(vlogErr, "DB.Close")
	}

	// Make sure that the block writer is done pushing stuff into the memtable!
	// Otherwise, you will have a race condition: we are trying to flush memtables
	// and remove them completely, while the block / memtable writer is still
	// trying to push stuff into the memtable. This will also resolve the value
	// offset problem: as we push into the memtable, we update value offsets there.
	mTbls := db.mtbls.Load().(*memTables)
	if !mTbls.getMutable().Empty() && !db.volatileMode {
		log.Info("Flushing memtable")
		db.mtbls.Store(newMemTables(nil, mTbls))
		db.flushChan <- newFlushTask(mTbls.getMutable(), db.logOff)
	}
	db.flushChan <- newFlushTask(nil, logOffset{}) // Tell flusher to quit.

	if db.closers.memtable != nil {
		db.closers.memtable.SignalAndWait()
		log.Info("Memtable flushed")
	}
	if db.closers.compactors != nil {
		db.closers.compactors.SignalAndWait()
		log.Info("Compaction finished")
	}
	if db.opt.CompactL0WhenClose && !db.volatileMode {
		// Force compact L0.
		// We don't need to care about cstatus since no parallel compaction is running.
		cd := &CompactDef{}
		guard := db.resourceMgr.Acquire()
		defer guard.Done()
		if cd.fillTablesL0(&db.lc.cstatus, db.lc.levels[0], db.lc.levels[1]) {
			if err := db.lc.runCompactDef(cd, guard); err != nil {
				log.Info("LOG Compact FAILED", zap.Stringer("compact def", cd), zap.Error(err))
			}
		} else {
			log.Info("fillTables failed for level zero. No compaction required")
		}
	}

	if db.closers.blobManager != nil {
		db.closers.blobManager.SignalAndWait()
		log.Info("BlobManager finished")
	}
	if db.closers.resourceManager != nil {
		db.closers.resourceManager.SignalAndWait()
		log.Info("ResourceManager finished")
	}

	if lcErr := db.lc.close(); err == nil {
		err = errors.Wrap(lcErr, "DB.Close")
	}
	log.Info("Waiting for closer")
	db.closers.updateSize.SignalAndWait()
	if db.blockCache != nil {
		db.blockCache.Close()
	}

	if db.indexCache != nil {
		db.indexCache.Close()
	}

	if db.dirLockGuard != nil {
		if guardErr := db.dirLockGuard.release(); err == nil {
			err = errors.Wrap(guardErr, "DB.Close")
		}
	}
	if db.valueDirGuard != nil {
		if guardErr := db.valueDirGuard.release(); err == nil {
			err = errors.Wrap(guardErr, "DB.Close")
		}
	}
	if manifestErr := db.manifest.close(); err == nil {
		err = errors.Wrap(manifestErr, "DB.Close")
	}

	// Fsync directories to ensure that the lock file, and any other removed files whose directory
	// we haven't specifically fsynced, are guaranteed to have their directory entry removal
	// persisted to disk.
	if syncErr := syncDir(db.opt.Dir); err == nil {
		err = errors.Wrap(syncErr, "DB.Close")
	}
	if syncErr := syncDir(db.opt.ValueDir); err == nil {
		err = errors.Wrap(syncErr, "DB.Close")
	}

	return err
}

const (
	lockFile = "LOCK"
)

// When you create or delete a file, you have to ensure the directory entry for the file is synced
// in order to guarantee the file is visible (if the system crashes). (See the man page for fsync,
// or see https://github.com/coreos/etcd/issues/6368 for an example.)
func syncDir(dir string) error {
	f, err := openDir(dir)
	if err != nil {
		return errors.Wrapf(err, "While opening directory: %s.", dir)
	}
	err = f.Sync()
	closeErr := f.Close()
	if err != nil {
		return errors.Wrapf(err, "While syncing directory: %s.", dir)
	}
	return errors.Wrapf(closeErr, "While closing directory: %s.", dir)
}

// getMemTables returns the current memtables.
func (db *DB) getMemTables() []*memtable.Table {
	tbls := db.mtbls.Load().(*memTables)
	l := atomic.LoadUint32(&tbls.length)
	return tbls.tables[:l]
}

// get returns the value in the memtable or on disk for the given key.
// Note that the value will include the meta byte.
//
// IMPORTANT: We should never write an entry with an older timestamp for the same key. We need to
// maintain this invariant to search for the latest value of a key, or else we need to search in all
// tables and find the max version among them.
// To maintain this invariant, we also need to ensure that all versions of a key are always
// present in the same table from level 1, because compaction can push any table down.
func (db *DB) get(key y.Key) y.ValueStruct {
	tables := db.getMemTables() // Lock should be released.

	db.metrics.NumGets.Inc()
	for _, table := range tables {
		db.metrics.NumMemtableGets.Inc()
		vs, err := table.Get(key, 0)
		if err != nil {
			log.Error("search table meets error", zap.Error(err))
		}
		if vs.Valid() {
			return vs
		}
	}
	keyHash := farm.Fingerprint64(key.UserKey)
	return db.lc.get(key, keyHash)
}

func (db *DB) multiGet(pairs []keyValuePair) {
	tables := db.getMemTables() // Lock should be released.

	var foundCount, mtGets int
	for _, table := range tables {
		for j := range pairs {
			pair := &pairs[j]
			if pair.found {
				continue
			}
			val, err := table.Get(pair.key, 0)
			if err != nil {
				log.Error("search table meets error", zap.Error(err))
			}
			if val.Valid() {
				pair.val = val
				pair.found = true
				foundCount++
			}
			mtGets++
		}
	}
	db.metrics.NumMemtableGets.Add(float64(mtGets))
	db.metrics.NumGets.Add(float64(len(pairs)))

	if foundCount == len(pairs) {
		return
	}
	db.lc.multiGet(pairs)
}

func (db *DB) updateOffset(off logOffset) {
	y.Assert(!off.Less(db.logOff))
	// We don't need to protect it with a lock because the value is never accessed
	// by more than one goroutine at the same time.
	db.logOff = off
}

var requestPool = sync.Pool{
	New: func() interface{} {
		return new(request)
	},
}

func (db *DB) sendToWriteCh(entries []*Entry) (*request, error) {
	var count, size int64
	for _, e := range entries {
		size += int64(e.estimateSize())
		count++
	}

	// We can only service one request because we need each txn to be stored in a contiguous section.
	// Txns should not interleave among other txns or rewrites.
	req := requestPool.Get().(*request)
	req.Entries = entries
	req.Wg = sync.WaitGroup{}
	req.Wg.Add(1)
	db.writeCh <- req // Handled in writeWorker.
	db.metrics.NumPuts.Add(float64(len(entries)))

	return req, nil
}

// batchSet applies a list of badger.Entry. If a request-level error occurs, it
// will be returned.
//
//	Check(db.batchSet(entries))
func (db *DB) batchSet(entries []*Entry) error {
	sort.Slice(entries, func(i, j int) bool {
		return entries[i].Key.Compare(entries[j].Key) < 0
	})
	req, err := db.sendToWriteCh(entries)
	if err != nil {
		return err
	}

	return req.Wait()
}

// batchSetAsync is the asynchronous version of batchSet. It accepts a callback
// function which is called when all the sets are complete. If a request-level
// error occurs, it will be passed back via the callback.
//
//	err := db.batchSetAsync(entries, func(err error) {
//		Check(err)
//	})
func (db *DB) batchSetAsync(entries []*Entry, f func(error)) error {
	req, err := db.sendToWriteCh(entries)
	if err != nil {
		return err
	}
	go func() {
		err := req.Wait()
		// Write is complete. Let's call the callback function now.
		f(err)
	}()
	return nil
}
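
// A hedged sketch of the internal write path (it assumes Entry's exported
// Key/Value fields as used by replayFunction above; the key and the ts value
// are illustrative):
//
//	entries := []*Entry{{
//		Key:   y.KeyWithTs([]byte("k1"), ts),
//		Value: []byte("v1"),
//	}}
//	if err := db.batchSet(entries); err != nil {
//		// handle the request-level error from the write worker
//	}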

// ensureRoomForWrite is always called serially.
func (db *DB) ensureRoomForWrite(mt *memtable.Table, minSize int64) int64 {
	free := db.opt.MaxMemTableSize - mt.Size()
	if free >= minSize {
		return free
	}
	_ = db.flushMemTable()
	return db.opt.MaxMemTableSize
}

func (db *DB) flushMemTable() *sync.WaitGroup {
	mTbls := db.mtbls.Load().(*memTables)
	newTbls := newMemTables(<-db.memTableCh, mTbls)
	db.mtbls.Store(newTbls)
	ft := newFlushTask(mTbls.getMutable(), db.logOff)
	db.flushChan <- ft
	log.Info("flushing memtable",
		zap.Int64("memtable size", mTbls.getMutable().Size()),
		zap.Int("size of flushChan", len(db.flushChan)))

	// New memtable is empty. We certainly have room.
	return &ft.wg
}

func arenaSize(opt Options) int64 {
	return opt.MaxMemTableSize + opt.maxBatchCount*int64(memtable.MaxNodeSize)
}

// writeLevel0Table flushes a memtable to a level 0 table. It drops deleteValues.
func (db *DB) writeLevel0Table(s *memtable.Table, f *os.File) error {
	iter := s.NewIterator(false)
	defer iter.Close()
	var (
		bb                   *blobFileBuilder
		numWrite, bytesWrite int
		err                  error
	)
	b := sstable.NewTableBuilder(f, db.limiter, 0, db.opt.TableBuilderOptions)
	defer b.Close()

	for iter.Rewind(); iter.Valid(); y.NextAllVersion(iter) {
		key := iter.Key()
		value := iter.Value()
		if db.opt.ValueThreshold > 0 && len(value.Value) > db.opt.ValueThreshold {
			if bb == nil {
				if bb, err = db.newBlobFileBuilder(); err != nil {
					return y.Wrap(err)
				}
			}

			bp, err := bb.append(value.Value)
			if err != nil {
				return err
			}
			value.Meta |= bitValuePointer
			value.Value = bp
		}
		if err = b.Add(key, value); err != nil {
			return err
		}
		numWrite++
		bytesWrite += key.Len() + int(value.EncodedSize())
	}
	stats := &y.CompactionStats{
		KeysWrite:  numWrite,
		BytesWrite: bytesWrite,
	}
	db.lc.levels[0].metrics.UpdateCompactionStats(stats)

	if _, err = b.Finish(); err != nil {
		return y.Wrap(err)
	}
	if bb != nil {
		bf, err1 := bb.finish()
		if err1 != nil {
			return err1
		}
		log.Info("build L0 blob", zap.Uint32("id", bf.fid), zap.Uint32("size", bf.fileSize))
		err1 = db.blobManger.addFile(bf)
		if err1 != nil {
			return err1
		}
	}
	return nil
}

func (db *DB) newBlobFileBuilder() (*blobFileBuilder, error) {
	return newBlobFileBuilder(db.blobManger.allocFileID(), db.opt.Dir, db.opt.TableBuilderOptions.WriteBufferSize)
}

type flushTask struct {
	mt  *memtable.Table
	off logOffset
	wg  sync.WaitGroup
}

func newFlushTask(mt *memtable.Table, off logOffset) *flushTask {
	ft := &flushTask{mt: mt, off: off}
	ft.wg.Add(1)
	return ft
}

// TODO: Ensure that this function doesn't return, or is handled by another wrapper function.
// Otherwise, we would have no goroutine which can flush memtables.
func (db *DB) runFlushMemTable(c *y.Closer) error {
	defer c.Done()

	for ft := range db.flushChan {
		if ft.mt == nil {
			return nil
		}
		guard := db.resourceMgr.Acquire()
		var headInfo *protos.HeadInfo
		if !ft.mt.Empty() {
			headInfo = &protos.HeadInfo{
				// Pick the max commit ts, so in case of crash, our read ts would be higher than all the
				// commits.
				Version:   db.orc.commitTs(),
				LogID:     ft.off.fid,
				LogOffset: ft.off.offset,
			}
			// Store the badger head even if the vptr is zero; we need it for the readTs.
			log.Info("flush memtable storing offset", zap.Uint32("fid", ft.off.fid), zap.Uint32("offset", ft.off.offset))
		}

		fileID := ft.mt.ID()
		filename := sstable.NewFilename(fileID, db.opt.Dir)
		fd, err := directio.OpenFile(filename, os.O_CREATE|os.O_RDWR, 0666)
		if err != nil {
			log.Error("error while writing to level 0", zap.Error(err))
			return y.Wrap(err)
		}

		// Don't block just to sync the directory entry.
		dirSyncCh := make(chan error)
		go func() { dirSyncCh <- syncDir(db.opt.Dir) }()

		err = db.writeLevel0Table(ft.mt, fd)
		dirSyncErr := <-dirSyncCh
		if err != nil {
			log.Error("error while writing to level 0", zap.Error(err))
			return err
		}
		if dirSyncErr != nil {
			log.Error("error while syncing level directory", zap.Error(dirSyncErr))
			return dirSyncErr
		}
		atomic.StoreUint32(&db.syncedFid, ft.off.fid)
		fd.Close()
		tbl, err := sstable.OpenTable(filename, db.blockCache, db.indexCache)
		if err != nil {
			log.Error("error while opening table", zap.Error(err))
			return err
		}
		err = db.lc.addLevel0Table(tbl, headInfo)
		if err != nil {
			log.Error("error while adding table to level 0", zap.Error(err))
			return err
		}
		mTbls := db.mtbls.Load().(*memTables)
		// Update the length of mTbls.
		for i, tbl := range mTbls.tables {
			if tbl == ft.mt {
				atomic.StoreUint32(&mTbls.length, uint32(i))
				break
			}
		}
		guard.Delete([]epoch.Resource{ft.mt})
		guard.Done()
		ft.wg.Done()
	}
	return nil
}

func exists(path string) (bool, error) {
	_, err := os.Stat(path)
	if err == nil {
		return true, nil
	}
	if os.IsNotExist(err) {
		return false, nil
	}
	return true, err
}

// calculateSize does a file walk, calculates the sizes of the vlog and sst files and stores
// them in db.lsmSize and db.vlogSize.
func (db *DB) calculateSize() {
	totalSize := func(dir string) (int64, int64) {
		var lsmSize, vlogSize int64
		err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
			if err != nil {
				return err
			}
			ext := filepath.Ext(path)
			if ext == ".sst" {
				lsmSize += info.Size()
			} else if ext == ".vlog" {
				vlogSize += info.Size()
			}
			return nil
		})
		if err != nil {
			log.Info("error while calculating total size of directory", zap.String("path", dir), zap.Error(err))
		}
		return lsmSize, vlogSize
	}

	lsmSize, vlogSize := totalSize(db.opt.Dir)
	// If valueDir is different from dir, we'd have to do another walk.
	if db.opt.ValueDir != db.opt.Dir {
		_, vlogSize = totalSize(db.opt.ValueDir)
	}
	atomic.StoreInt64(&db.lsmSize, lsmSize)
	atomic.StoreInt64(&db.vlogSize, vlogSize)
	db.metrics.LSMSize.Set(float64(lsmSize))
	db.metrics.VlogSize.Set(float64(vlogSize))
}

func (db *DB) updateSize(c *y.Closer) {
	defer c.Done()

	metricsTicker := time.NewTicker(time.Minute)
	defer metricsTicker.Stop()

	for {
		select {
		case <-metricsTicker.C:
			db.calculateSize()
		case <-c.HasBeenClosed():
			return
		}
	}
}
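
// A hedged usage sketch: polling Size (defined below) to decide when to trigger
// value-log GC; the threshold is illustrative, not part of the API:
//
//	lsm, vlog := db.Size()
//	if vlog > 1<<30 {
//		// e.g. schedule a value log GC pass
//	}
//	_ = lsm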

// Size returns the size of the lsm and value log files in bytes. It can be used to decide
// how often to call RunValueLogGC.
func (db *DB) Size() (lsm int64, vlog int64) {
	return atomic.LoadInt64(&db.lsmSize), atomic.LoadInt64(&db.vlogSize)
}

func (db *DB) Tables() []TableInfo {
	return db.lc.getTableInfo()
}

func (db *DB) GetVLogOffset() uint64 {
	return db.vlog.getMaxPtr()
}

// IterateVLog iterates the VLog for external replay. This function should be called only
// when there is no concurrent write operation on the DB.
func (db *DB) IterateVLog(offset uint64, fn func(e Entry)) error {
	startFid := uint32(offset >> 32)
	vOffset := uint32(offset)
	for fid := startFid; fid <= db.vlog.maxFid(); fid++ {
		lf, err := db.vlog.getFile(fid)
		if err != nil {
			return err
		}
		if fid != startFid {
			vOffset = 0
		}
		endOffset, err := db.vlog.iterate(lf, vOffset, func(e Entry) error {
			if e.meta&bitTxn > 0 {
				fn(e)
			}
			return nil
		})
		if err != nil {
			return err
		}
		if fid == db.vlog.maxFid() {
			_, err = lf.fd.Seek(int64(endOffset), io.SeekStart)
			if err != nil {
				return err
			}
		}
	}
	return nil
}

func (db *DB) getCompactSafeTs() uint64 {
	return atomic.LoadUint64(&db.safeTsTracker.safeTs)
}

// UpdateSafeTs is used for a managed DB: during compaction, old versions smaller than the
// safe ts will be discarded. If this is never called, all old versions are kept.
func (db *DB) UpdateSafeTs(ts uint64) {
	y.Assert(db.IsManaged())
	for {
		old := db.getCompactSafeTs()
		if old < ts {
			if !atomic.CompareAndSwapUint64(&db.safeTsTracker.safeTs, old, ts) {
				continue
			}
		}
		break
	}
}

func (db *DB) IsManaged() bool {
	return db.opt.ManagedTxns
}

type safeTsTracker struct {
	safeTs uint64

	maxInactive uint64
	minActive   uint64
}

func (t *safeTsTracker) Begin() {
	// t.maxInactive = 0
	t.minActive = math.MaxUint64
}

func (t *safeTsTracker) Inspect(payload interface{}, isActive bool) {
	ts, ok := payload.(uint64)
	if !ok {
		return
	}

	if isActive {
		if ts < t.minActive {
			t.minActive = ts
		}
	} else {
		if ts > t.maxInactive {
			t.maxInactive = ts
		}
	}
}

func (t *safeTsTracker) End() {
	var safe uint64
	if t.minActive == math.MaxUint64 {
		safe = t.maxInactive
	} else {
		safe = t.minActive - 1
	}

	if safe > atomic.LoadUint64(&t.safeTs) {
		atomic.StoreUint64(&t.safeTs, safe)
	}
}
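
// A hedged usage sketch for managed-transaction deployments: advance the compaction
// safe point once every reader at or below a timestamp has finished (how
// minActiveReadTs is obtained is application-specific and illustrative here):
//
//	if db.IsManaged() {
//		db.UpdateSafeTs(minActiveReadTs - 1)
//	}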