github.com/cellofellow/gopkg@v0.0.0-20140722061823-eec0544a62ad/database/leveldb.chai2010/src/db_impl.cc

     1  // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style license that can be
     3  // found in the LICENSE file. See the AUTHORS file for names of contributors.
     4  
     5  #include "db/db_impl.h"
     6  
     7  #include <algorithm>
     8  #include <set>
     9  #include <string>
    10  #include <stdint.h>
    11  #include <stdio.h>
    12  #include <vector>
    13  #include "db/builder.h"
    14  #include "db/db_iter.h"
    15  #include "db/dbformat.h"
    16  #include "db/filename.h"
    17  #include "db/log_reader.h"
    18  #include "db/log_writer.h"
    19  #include "db/memtable.h"
    20  #include "db/table_cache.h"
    21  #include "db/version_set.h"
    22  #include "db/write_batch_internal.h"
    23  #include "leveldb/db.h"
    24  #include "leveldb/env.h"
    25  #include "leveldb/status.h"
    26  #include "leveldb/table.h"
    27  #include "leveldb/table_builder.h"
    28  #include "port/port.h"
    29  #include "table/block.h"
    30  #include "table/merger.h"
    31  #include "table/two_level_iterator.h"
    32  #include "util/coding.h"
    33  #include "util/logging.h"
    34  #include "util/mutexlock.h"
    35  
    36  namespace leveldb {
    37  
    38  const int kNumNonTableCacheFiles = 10;
    39  
    40  // Information kept for every waiting writer
    41  struct DBImpl::Writer {
    42    Status status;
    43    WriteBatch* batch;
    44    bool sync;
    45    bool done;
    46    port::CondVar cv;
    47  
    48    explicit Writer(port::Mutex* mu) : cv(mu) { }
    49  };
    50  
    51  struct DBImpl::CompactionState {
    52    Compaction* const compaction;
    53  
    54    // Sequence numbers < smallest_snapshot are not significant since we
    55    // will never have to service a snapshot below smallest_snapshot.
    56    // Therefore if we have seen a sequence number S <= smallest_snapshot,
    57    // we can drop all entries for the same key with sequence numbers < S.
    58    SequenceNumber smallest_snapshot;
    59  
    60    // Files produced by compaction
    61    struct Output {
    62      uint64_t number;
    63      uint64_t file_size;
    64      InternalKey smallest, largest;
    65    };
    66    std::vector<Output> outputs;
    67  
    68    // State kept for output being generated
    69    WritableFile* outfile;
    70    TableBuilder* builder;
    71  
    72    uint64_t total_bytes;
    73  
    74    Output* current_output() { return &outputs[outputs.size()-1]; }
    75  
    76    explicit CompactionState(Compaction* c)
    77        : compaction(c),
    78          outfile(NULL),
    79          builder(NULL),
    80          total_bytes(0) {
    81    }
    82  };
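
        // A worked example of the smallest_snapshot rule above (illustrative
        // sequence numbers only): with smallest_snapshot == 100 and entries
        // for one user key at sequences 150, 90, and 80, the entry at 150 is
        // kept (a live snapshot may still need it), 90 is kept because it is
        // the newest entry at or below 100, and 80 can be dropped: every
        // snapshot that could see 80 sees 90 instead.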
    83  
    84  // Fix user-supplied options to be reasonable
    85  template <class T, class V>
    86  static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
    87    if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue;
    88    if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue;
    89  }
    90  Options SanitizeOptions(const std::string& dbname,
    91                          const InternalKeyComparator* icmp,
    92                          const InternalFilterPolicy* ipolicy,
    93                          const Options& src) {
    94    Options result = src;
    95    result.comparator = icmp;
    96    result.filter_policy = (src.filter_policy != NULL) ? ipolicy : NULL;
    97    ClipToRange(&result.max_open_files,    64 + kNumNonTableCacheFiles, 50000);
    98    ClipToRange(&result.write_buffer_size, 64<<10,                      1<<30);
    99    ClipToRange(&result.block_size,        1<<10,                       4<<20);
   100    if (result.info_log == NULL) {
   101      // Open a log file in the same directory as the db
   102      src.env->CreateDir(dbname);  // In case it does not exist
   103      src.env->RenameFile(InfoLogFileName(dbname), OldInfoLogFileName(dbname));
   104      Status s = src.env->NewLogger(InfoLogFileName(dbname), &result.info_log);
   105      if (!s.ok()) {
   106        // No place suitable for logging
   107        result.info_log = NULL;
   108      }
   109    }
   110    if (result.block_cache == NULL) {
   111      result.block_cache = NewLRUCache(8 << 20);
   112    }
   113    return result;
   114  }
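
        // Example of the sanitization above (illustrative values): a caller
        // who sets write_buffer_size to 4<<10 has it raised to the 64<<10
        // minimum, and a max_open_files of 1000000 is lowered to 50000.  A
        // NULL info_log or block_cache is replaced by a file logger in the db
        // directory and an 8MB LRU cache; the DBImpl constructor detects the
        // substitution (owns_info_log_/owns_cache_) and deletes them itself.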
   115  
   116  DBImpl::DBImpl(const Options& raw_options, const std::string& dbname)
   117      : env_(raw_options.env),
   118        internal_comparator_(raw_options.comparator),
   119        internal_filter_policy_(raw_options.filter_policy),
   120        options_(SanitizeOptions(dbname, &internal_comparator_,
   121                                 &internal_filter_policy_, raw_options)),
   122        owns_info_log_(options_.info_log != raw_options.info_log),
   123        owns_cache_(options_.block_cache != raw_options.block_cache),
   124        dbname_(dbname),
   125        db_lock_(NULL),
   126        shutting_down_(NULL),
   127        bg_cv_(&mutex_),
   128        mem_(new MemTable(internal_comparator_)),
   129        imm_(NULL),
   130        logfile_(NULL),
   131        logfile_number_(0),
   132        log_(NULL),
   133        seed_(0),
   134        tmp_batch_(new WriteBatch),
   135        bg_compaction_scheduled_(false),
   136        manual_compaction_(NULL),
   137        consecutive_compaction_errors_(0) {
   138    mem_->Ref();
   139    has_imm_.Release_Store(NULL);
   140  
   141    // Reserve ten files or so for other uses and give the rest to TableCache.
   142    const int table_cache_size = options_.max_open_files - kNumNonTableCacheFiles;
   143    table_cache_ = new TableCache(dbname_, &options_, table_cache_size);
   144  
   145    versions_ = new VersionSet(dbname_, &options_, table_cache_,
   146                               &internal_comparator_);
   147  }
   148  
   149  DBImpl::~DBImpl() {
   150    // Wait for background work to finish
   151    mutex_.Lock();
   152    shutting_down_.Release_Store(this);  // Any non-NULL value is ok
   153    while (bg_compaction_scheduled_) {
   154      bg_cv_.Wait();
   155    }
   156    mutex_.Unlock();
   157  
   158    if (db_lock_ != NULL) {
   159      env_->UnlockFile(db_lock_);
   160    }
   161  
   162    delete versions_;
   163    if (mem_ != NULL) mem_->Unref();
   164    if (imm_ != NULL) imm_->Unref();
   165    delete tmp_batch_;
   166    delete log_;
   167    delete logfile_;
   168    delete table_cache_;
   169  
   170    if (owns_info_log_) {
   171      delete options_.info_log;
   172    }
   173    if (owns_cache_) {
   174      delete options_.block_cache;
   175    }
   176  }
   177  
   178  Status DBImpl::NewDB() {
   179    VersionEdit new_db;
   180    new_db.SetComparatorName(user_comparator()->Name());
   181    new_db.SetLogNumber(0);
   182    new_db.SetNextFile(2);
   183    new_db.SetLastSequence(0);
   184  
   185    const std::string manifest = DescriptorFileName(dbname_, 1);
   186    WritableFile* file;
   187    Status s = env_->NewWritableFile(manifest, &file);
   188    if (!s.ok()) {
   189      return s;
   190    }
   191    {
   192      log::Writer log(file);
   193      std::string record;
   194      new_db.EncodeTo(&record);
   195      s = log.AddRecord(record);
   196      if (s.ok()) {
   197        s = file->Close();
   198      }
   199    }
   200    delete file;
   201    if (s.ok()) {
   202      // Make "CURRENT" file that points to the new manifest file.
   203      s = SetCurrentFile(env_, dbname_, 1);
   204    } else {
   205      env_->DeleteFile(manifest);
   206    }
   207    return s;
   208  }
   209  
   210  void DBImpl::MaybeIgnoreError(Status* s) const {
   211    if (s->ok() || options_.paranoid_checks) {
   212      // No change needed
   213    } else {
   214      Log(options_.info_log, "Ignoring error %s", s->ToString().c_str());
   215      *s = Status::OK();
   216    }
   217  }
   218  
   219  void DBImpl::DeleteObsoleteFiles() {
   220    // Make a set of all of the live files
   221    std::set<uint64_t> live = pending_outputs_;
   222    versions_->AddLiveFiles(&live);
   223  
   224    std::vector<std::string> filenames;
   225    env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose
   226    uint64_t number;
   227    FileType type;
   228    for (size_t i = 0; i < filenames.size(); i++) {
   229      if (ParseFileName(filenames[i], &number, &type)) {
   230        bool keep = true;
   231        switch (type) {
   232          case kLogFile:
   233            keep = ((number >= versions_->LogNumber()) ||
   234                    (number == versions_->PrevLogNumber()));
   235            break;
   236          case kDescriptorFile:
   237            // Keep my manifest file, and any newer incarnations' manifests
   238            // (in case there is a race that allows other incarnations)
   239            keep = (number >= versions_->ManifestFileNumber());
   240            break;
   241          case kTableFile:
   242            keep = (live.find(number) != live.end());
   243            break;
   244          case kTempFile:
   245            // Any temp files that are currently being written to must
   246            // be recorded in pending_outputs_, which is inserted into "live"
   247            keep = (live.find(number) != live.end());
   248            break;
   249          case kCurrentFile:
   250          case kDBLockFile:
   251          case kInfoLogFile:
   252            keep = true;
   253            break;
   254        }
   255  
   256        if (!keep) {
   257          if (type == kTableFile) {
   258            table_cache_->Evict(number);
   259          }
   260          Log(options_.info_log, "Delete type=%d #%lld\n",
   261              int(type),
   262              static_cast<unsigned long long>(number));
   263          env_->DeleteFile(dbname_ + "/" + filenames[i]);
   264        }
   265      }
   266    }
   267  }
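
        // For instance (hypothetical numbers): with LogNumber() == 12,
        // PrevLogNumber() == 0, and ManifestFileNumber() == 14, the loop
        // above keeps 000012.log and newer logs, keeps MANIFEST-000014 and
        // newer manifests, and deletes older logs and manifests along with
        // any table file whose number is neither live nor pending.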
   268  
   269  Status DBImpl::Recover(VersionEdit* edit) {
   270    mutex_.AssertHeld();
   271  
   272    // Ignore error from CreateDir since the creation of the DB is
   273    // committed only when the descriptor is created, and this directory
   274    // may already exist from a previous failed creation attempt.
   275    env_->CreateDir(dbname_);
   276    assert(db_lock_ == NULL);
   277    Status s = env_->LockFile(LockFileName(dbname_), &db_lock_);
   278    if (!s.ok()) {
   279      return s;
   280    }
   281  
   282    if (!env_->FileExists(CurrentFileName(dbname_))) {
   283      if (options_.create_if_missing) {
   284        s = NewDB();
   285        if (!s.ok()) {
   286          return s;
   287        }
   288      } else {
   289        return Status::InvalidArgument(
   290            dbname_, "does not exist (create_if_missing is false)");
   291      }
   292    } else {
   293      if (options_.error_if_exists) {
   294        return Status::InvalidArgument(
   295            dbname_, "exists (error_if_exists is true)");
   296      }
   297    }
   298  
   299    s = versions_->Recover();
   300    if (s.ok()) {
   301      SequenceNumber max_sequence(0);
   302  
   303      // Recover from all newer log files than the ones named in the
   304      // descriptor (new log files may have been added by the previous
   305      // incarnation without registering them in the descriptor).
   306      //
   307      // Note that PrevLogNumber() is no longer used, but we pay
   308      // attention to it in case we are recovering a database
   309      // produced by an older version of leveldb.
   310      const uint64_t min_log = versions_->LogNumber();
   311      const uint64_t prev_log = versions_->PrevLogNumber();
   312      std::vector<std::string> filenames;
   313      s = env_->GetChildren(dbname_, &filenames);
   314      if (!s.ok()) {
   315        return s;
   316      }
   317      std::set<uint64_t> expected;
   318      versions_->AddLiveFiles(&expected);
   319      uint64_t number;
   320      FileType type;
   321      std::vector<uint64_t> logs;
   322      for (size_t i = 0; i < filenames.size(); i++) {
   323        if (ParseFileName(filenames[i], &number, &type)) {
   324          expected.erase(number);
   325          if (type == kLogFile && ((number >= min_log) || (number == prev_log)))
   326            logs.push_back(number);
   327        }
   328      }
   329      if (!expected.empty()) {
   330        char buf[50];
   331        snprintf(buf, sizeof(buf), "%d missing files; e.g.",
   332                 static_cast<int>(expected.size()));
   333        return Status::Corruption(buf, TableFileName(dbname_, *(expected.begin())));
   334      }
   335  
   336      // Recover in the order in which the logs were generated
   337      std::sort(logs.begin(), logs.end());
   338      for (size_t i = 0; i < logs.size(); i++) {
   339        s = RecoverLogFile(logs[i], edit, &max_sequence);
   340  
   341        // The previous incarnation may not have written any MANIFEST
   342        // records after allocating this log number.  So we manually
   343        // update the file number allocation counter in VersionSet.
   344        versions_->MarkFileNumberUsed(logs[i]);
   345      }
   346  
   347      if (s.ok()) {
   348        if (versions_->LastSequence() < max_sequence) {
   349          versions_->SetLastSequence(max_sequence);
   350        }
   351      }
   352    }
   353  
   354    return s;
   355  }
   356  
   357  Status DBImpl::RecoverLogFile(uint64_t log_number,
   358                                VersionEdit* edit,
   359                                SequenceNumber* max_sequence) {
   360    struct LogReporter : public log::Reader::Reporter {
   361      Env* env;
   362      Logger* info_log;
   363      const char* fname;
   364      Status* status;  // NULL if options_.paranoid_checks==false
   365      virtual void Corruption(size_t bytes, const Status& s) {
   366        Log(info_log, "%s%s: dropping %d bytes; %s",
   367            (this->status == NULL ? "(ignoring error) " : ""),
   368            fname, static_cast<int>(bytes), s.ToString().c_str());
   369        if (this->status != NULL && this->status->ok()) *this->status = s;
   370      }
   371    };
   372  
   373    mutex_.AssertHeld();
   374  
   375    // Open the log file
   376    std::string fname = LogFileName(dbname_, log_number);
   377    SequentialFile* file;
   378    Status status = env_->NewSequentialFile(fname, &file);
   379    if (!status.ok()) {
   380      MaybeIgnoreError(&status);
   381      return status;
   382    }
   383  
   384    // Create the log reader.
   385    LogReporter reporter;
   386    reporter.env = env_;
   387    reporter.info_log = options_.info_log;
   388    reporter.fname = fname.c_str();
   389    reporter.status = (options_.paranoid_checks ? &status : NULL);
   390    // We intentionally make log::Reader do checksumming even if
   391    // paranoid_checks==false so that corruptions cause entire commits
   392    // to be skipped instead of propagating bad information (like overly
   393    // large sequence numbers).
   394    log::Reader reader(file, &reporter, true/*checksum*/,
   395                       0/*initial_offset*/);
   396    Log(options_.info_log, "Recovering log #%llu",
   397        (unsigned long long) log_number);
   398  
   399    // Read all the records and add to a memtable
   400    std::string scratch;
   401    Slice record;
   402    WriteBatch batch;
   403    MemTable* mem = NULL;
   404    while (reader.ReadRecord(&record, &scratch) &&
   405           status.ok()) {
   406      if (record.size() < 12) {
   407        reporter.Corruption(
   408            record.size(), Status::Corruption("log record too small"));
   409        continue;
   410      }
   411      WriteBatchInternal::SetContents(&batch, record);
   412  
   413      if (mem == NULL) {
   414        mem = new MemTable(internal_comparator_);
   415        mem->Ref();
   416      }
   417      status = WriteBatchInternal::InsertInto(&batch, mem);
   418      MaybeIgnoreError(&status);
   419      if (!status.ok()) {
   420        break;
   421      }
   422      const SequenceNumber last_seq =
   423          WriteBatchInternal::Sequence(&batch) +
   424          WriteBatchInternal::Count(&batch) - 1;
   425      if (last_seq > *max_sequence) {
   426        *max_sequence = last_seq;
   427      }
   428  
   429      if (mem->ApproximateMemoryUsage() > options_.write_buffer_size) {
   430        status = WriteLevel0Table(mem, edit, NULL);
   431        if (!status.ok()) {
   432          // Reflect errors immediately so that conditions like full
   433          // file-systems cause the DB::Open() to fail.
   434          break;
   435        }
   436        mem->Unref();
   437        mem = NULL;
   438      }
   439    }
   440  
   441    if (status.ok() && mem != NULL) {
   442      status = WriteLevel0Table(mem, edit, NULL);
   443      // Reflect errors immediately so that conditions like full
   444      // file-systems cause the DB::Open() to fail.
   445    }
   446  
   447    if (mem != NULL) mem->Unref();
   448    delete file;
   449    return status;
   450  }
   451  
   452  Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit,
   453                                  Version* base) {
   454    mutex_.AssertHeld();
   455    const uint64_t start_micros = env_->NowMicros();
   456    FileMetaData meta;
   457    meta.number = versions_->NewFileNumber();
   458    pending_outputs_.insert(meta.number);
   459    Iterator* iter = mem->NewIterator();
   460    Log(options_.info_log, "Level-0 table #%llu: started",
   461        (unsigned long long) meta.number);
   462  
   463    Status s;
   464    {
   465      mutex_.Unlock();
   466      s = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta);
   467      mutex_.Lock();
   468    }
   469  
   470    Log(options_.info_log, "Level-0 table #%llu: %lld bytes %s",
   471        (unsigned long long) meta.number,
   472        (unsigned long long) meta.file_size,
   473        s.ToString().c_str());
   474    delete iter;
   475    pending_outputs_.erase(meta.number);
   476  
   477  
   478    // Note that if file_size is zero, the file has been deleted and
   479    // should not be added to the manifest.
   480    int level = 0;
   481    if (s.ok() && meta.file_size > 0) {
   482      const Slice min_user_key = meta.smallest.user_key();
   483      const Slice max_user_key = meta.largest.user_key();
   484      if (base != NULL) {
   485        level = base->PickLevelForMemTableOutput(min_user_key, max_user_key);
   486      }
   487      edit->AddFile(level, meta.number, meta.file_size,
   488                    meta.smallest, meta.largest);
   489    }
   490  
   491    CompactionStats stats;
   492    stats.micros = env_->NowMicros() - start_micros;
   493    stats.bytes_written = meta.file_size;
   494    stats_[level].Add(stats);
   495    return s;
   496  }
   497  
   498  Status DBImpl::CompactMemTable() {
   499    mutex_.AssertHeld();
   500    assert(imm_ != NULL);
   501  
   502    // Save the contents of the memtable as a new Table
   503    VersionEdit edit;
   504    Version* base = versions_->current();
   505    base->Ref();
   506    Status s = WriteLevel0Table(imm_, &edit, base);
   507    base->Unref();
   508  
   509    if (s.ok() && shutting_down_.Acquire_Load()) {
   510      s = Status::IOError("Deleting DB during memtable compaction");
   511    }
   512  
   513    // Replace immutable memtable with the generated Table
   514    if (s.ok()) {
   515      edit.SetPrevLogNumber(0);
   516      edit.SetLogNumber(logfile_number_);  // Earlier logs no longer needed
   517      s = versions_->LogAndApply(&edit, &mutex_);
   518    }
   519  
   520    if (s.ok()) {
   521      // Commit to the new state
   522      imm_->Unref();
   523      imm_ = NULL;
   524      has_imm_.Release_Store(NULL);
   525      DeleteObsoleteFiles();
   526    }
   527  
   528    return s;
   529  }
   530  
   531  void DBImpl::CompactRange(const Slice* begin, const Slice* end) {
   532    int max_level_with_files = 1;
   533    {
   534      MutexLock l(&mutex_);
   535      Version* base = versions_->current();
   536      for (int level = 1; level < config::kNumLevels; level++) {
   537        if (base->OverlapInLevel(level, begin, end)) {
   538          max_level_with_files = level;
   539        }
   540      }
   541    }
   542    TEST_CompactMemTable(); // TODO(sanjay): Skip if memtable does not overlap
   543    for (int level = 0; level < max_level_with_files; level++) {
   544      TEST_CompactRange(level, begin, end);
   545    }
   546  }
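
        // Usage sketch (client code, not part of this file): NULL bounds
        // mean "unbounded" on that side, so compacting everything is:
        //
        //   db->CompactRange(NULL, NULL);
        //
        //   leveldb::Slice begin("a"), end("n");
        //   db->CompactRange(&begin, &end);  // only the range ["a", "n"]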
   547  
   548  void DBImpl::TEST_CompactRange(int level, const Slice* begin, const Slice* end) {
   549    assert(level >= 0);
   550    assert(level + 1 < config::kNumLevels);
   551  
   552    InternalKey begin_storage, end_storage;
   553  
   554    ManualCompaction manual;
   555    manual.level = level;
   556    manual.done = false;
   557    if (begin == NULL) {
   558      manual.begin = NULL;
   559    } else {
   560      begin_storage = InternalKey(*begin, kMaxSequenceNumber, kValueTypeForSeek);
   561      manual.begin = &begin_storage;
   562    }
   563    if (end == NULL) {
   564      manual.end = NULL;
   565    } else {
   566      end_storage = InternalKey(*end, 0, static_cast<ValueType>(0));
   567      manual.end = &end_storage;
   568    }
   569  
   570    MutexLock l(&mutex_);
   571    while (!manual.done) {
   572      while (manual_compaction_ != NULL) {
   573        bg_cv_.Wait();
   574      }
   575      manual_compaction_ = &manual;
   576      MaybeScheduleCompaction();
   577      while (manual_compaction_ == &manual) {
   578        bg_cv_.Wait();
   579      }
   580    }
   581  }
   582  
   583  Status DBImpl::TEST_CompactMemTable() {
   584    // NULL batch means just wait for earlier writes to be done
   585    Status s = Write(WriteOptions(), NULL);
   586    if (s.ok()) {
   587      // Wait until the compaction completes
   588      MutexLock l(&mutex_);
   589      while (imm_ != NULL && bg_error_.ok()) {
   590        bg_cv_.Wait();
   591      }
   592      if (imm_ != NULL) {
   593        s = bg_error_;
   594      }
   595    }
   596    return s;
   597  }
   598  
   599  void DBImpl::MaybeScheduleCompaction() {
   600    mutex_.AssertHeld();
   601    if (bg_compaction_scheduled_) {
   602      // Already scheduled
   603    } else if (shutting_down_.Acquire_Load()) {
   604      // DB is being deleted; no more background compactions
   605    } else if (imm_ == NULL &&
   606               manual_compaction_ == NULL &&
   607               !versions_->NeedsCompaction()) {
   608      // No work to be done
   609    } else {
   610      bg_compaction_scheduled_ = true;
   611      env_->Schedule(&DBImpl::BGWork, this);
   612    }
   613  }
   614  
   615  void DBImpl::BGWork(void* db) {
   616    reinterpret_cast<DBImpl*>(db)->BackgroundCall();
   617  }
   618  
   619  void DBImpl::BackgroundCall() {
   620    MutexLock l(&mutex_);
   621    assert(bg_compaction_scheduled_);
   622    if (!shutting_down_.Acquire_Load()) {
   623      Status s = BackgroundCompaction();
   624      if (s.ok()) {
   625        // Success
   626        consecutive_compaction_errors_ = 0;
   627      } else if (shutting_down_.Acquire_Load()) {
   628        // Error most likely due to shutdown; do not wait
   629      } else {
   630        // Wait a little bit before retrying background compaction in
   631        // case this is an environmental problem and we do not want to
   632        // chew up resources for failed compactions for the duration of
   633        // the problem.
   634        bg_cv_.SignalAll();  // In case a waiter can proceed despite the error
   635        Log(options_.info_log, "Waiting after background compaction error: %s",
   636            s.ToString().c_str());
   637        mutex_.Unlock();
   638        ++consecutive_compaction_errors_;
   639        int seconds_to_sleep = 1;
   640        for (int i = 0; i < 3 && i < consecutive_compaction_errors_ - 1; ++i) {
   641          seconds_to_sleep *= 2;
   642        }
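              // With these bounds the sleep doubles per consecutive error:
              // 1s after the first error, then 2s, 4s, and at most 8s for
              // every error thereafter.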
   643        env_->SleepForMicroseconds(seconds_to_sleep * 1000000);
   644        mutex_.Lock();
   645      }
   646    }
   647  
   648    bg_compaction_scheduled_ = false;
   649  
   650    // Previous compaction may have produced too many files in a level,
   651    // so reschedule another compaction if needed.
   652    MaybeScheduleCompaction();
   653    bg_cv_.SignalAll();
   654  }
   655  
   656  Status DBImpl::BackgroundCompaction() {
   657    mutex_.AssertHeld();
   658  
   659    if (imm_ != NULL) {
   660      return CompactMemTable();
   661    }
   662  
   663    Compaction* c;
   664    bool is_manual = (manual_compaction_ != NULL);
   665    InternalKey manual_end;
   666    if (is_manual) {
   667      ManualCompaction* m = manual_compaction_;
   668      c = versions_->CompactRange(m->level, m->begin, m->end);
   669      m->done = (c == NULL);
   670      if (c != NULL) {
   671        manual_end = c->input(0, c->num_input_files(0) - 1)->largest;
   672      }
   673      Log(options_.info_log,
   674          "Manual compaction at level-%d from %s .. %s; will stop at %s\n",
   675          m->level,
   676          (m->begin ? m->begin->DebugString().c_str() : "(begin)"),
   677          (m->end ? m->end->DebugString().c_str() : "(end)"),
   678          (m->done ? "(end)" : manual_end.DebugString().c_str()));
   679    } else {
   680      c = versions_->PickCompaction();
   681    }
   682  
   683    Status status;
   684    if (c == NULL) {
   685      // Nothing to do
   686    } else if (!is_manual && c->IsTrivialMove()) {
   687      // Move file to next level
   688      assert(c->num_input_files(0) == 1);
   689      FileMetaData* f = c->input(0, 0);
   690      c->edit()->DeleteFile(c->level(), f->number);
   691      c->edit()->AddFile(c->level() + 1, f->number, f->file_size,
   692                         f->smallest, f->largest);
   693      status = versions_->LogAndApply(c->edit(), &mutex_);
   694      VersionSet::LevelSummaryStorage tmp;
   695      Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n",
   696          static_cast<unsigned long long>(f->number),
   697          c->level() + 1,
   698          static_cast<unsigned long long>(f->file_size),
   699          status.ToString().c_str(),
   700          versions_->LevelSummary(&tmp));
   701    } else {
   702      CompactionState* compact = new CompactionState(c);
   703      status = DoCompactionWork(compact);
   704      CleanupCompaction(compact);
   705      c->ReleaseInputs();
   706      DeleteObsoleteFiles();
   707    }
   708    delete c;
   709  
   710    if (status.ok()) {
   711      // Done
   712    } else if (shutting_down_.Acquire_Load()) {
   713      // Ignore compaction errors found during shutting down
   714    } else {
   715      Log(options_.info_log,
   716          "Compaction error: %s", status.ToString().c_str());
   717      if (options_.paranoid_checks && bg_error_.ok()) {
   718        bg_error_ = status;
   719      }
   720    }
   721  
   722    if (is_manual) {
   723      ManualCompaction* m = manual_compaction_;
   724      if (!status.ok()) {
   725        m->done = true;
   726      }
   727      if (!m->done) {
   728        // We only compacted part of the requested range.  Update *m
   729        // to the range that is left to be compacted.
   730        m->tmp_storage = manual_end;
   731        m->begin = &m->tmp_storage;
   732      }
   733      manual_compaction_ = NULL;
   734    }
   735    return status;
   736  }
   737  
   738  void DBImpl::CleanupCompaction(CompactionState* compact) {
   739    mutex_.AssertHeld();
   740    if (compact->builder != NULL) {
   741      // May happen if we get a shutdown call in the middle of compaction
   742      compact->builder->Abandon();
   743      delete compact->builder;
   744    } else {
   745      assert(compact->outfile == NULL);
   746    }
   747    delete compact->outfile;
   748    for (size_t i = 0; i < compact->outputs.size(); i++) {
   749      const CompactionState::Output& out = compact->outputs[i];
   750      pending_outputs_.erase(out.number);
   751    }
   752    delete compact;
   753  }
   754  
   755  Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) {
   756    assert(compact != NULL);
   757    assert(compact->builder == NULL);
   758    uint64_t file_number;
   759    {
   760      mutex_.Lock();
   761      file_number = versions_->NewFileNumber();
   762      pending_outputs_.insert(file_number);
   763      CompactionState::Output out;
   764      out.number = file_number;
   765      out.smallest.Clear();
   766      out.largest.Clear();
   767      compact->outputs.push_back(out);
   768      mutex_.Unlock();
   769    }
   770  
   771    // Make the output file
   772    std::string fname = TableFileName(dbname_, file_number);
   773    Status s = env_->NewWritableFile(fname, &compact->outfile);
   774    if (s.ok()) {
   775      compact->builder = new TableBuilder(options_, compact->outfile);
   776    }
   777    return s;
   778  }
   779  
   780  Status DBImpl::FinishCompactionOutputFile(CompactionState* compact,
   781                                            Iterator* input) {
   782    assert(compact != NULL);
   783    assert(compact->outfile != NULL);
   784    assert(compact->builder != NULL);
   785  
   786    const uint64_t output_number = compact->current_output()->number;
   787    assert(output_number != 0);
   788  
   789    // Check for iterator errors
   790    Status s = input->status();
   791    const uint64_t current_entries = compact->builder->NumEntries();
   792    if (s.ok()) {
   793      s = compact->builder->Finish();
   794    } else {
   795      compact->builder->Abandon();
   796    }
   797    const uint64_t current_bytes = compact->builder->FileSize();
   798    compact->current_output()->file_size = current_bytes;
   799    compact->total_bytes += current_bytes;
   800    delete compact->builder;
   801    compact->builder = NULL;
   802  
   803    // Finish and check for file errors
   804    if (s.ok()) {
   805      s = compact->outfile->Sync();
   806    }
   807    if (s.ok()) {
   808      s = compact->outfile->Close();
   809    }
   810    delete compact->outfile;
   811    compact->outfile = NULL;
   812  
   813    if (s.ok() && current_entries > 0) {
   814      // Verify that the table is usable
   815      Iterator* iter = table_cache_->NewIterator(ReadOptions(),
   816                                                 output_number,
   817                                                 current_bytes);
   818      s = iter->status();
   819      delete iter;
   820      if (s.ok()) {
   821        Log(options_.info_log,
   822            "Generated table #%llu: %lld keys, %lld bytes",
   823            (unsigned long long) output_number,
   824            (unsigned long long) current_entries,
   825            (unsigned long long) current_bytes);
   826      }
   827    }
   828    return s;
   829  }
   830  
   831  
   832  Status DBImpl::InstallCompactionResults(CompactionState* compact) {
   833    mutex_.AssertHeld();
   834    Log(options_.info_log,  "Compacted %d@%d + %d@%d files => %lld bytes",
   835        compact->compaction->num_input_files(0),
   836        compact->compaction->level(),
   837        compact->compaction->num_input_files(1),
   838        compact->compaction->level() + 1,
   839        static_cast<long long>(compact->total_bytes));
   840  
   841    // Add compaction outputs
   842    compact->compaction->AddInputDeletions(compact->compaction->edit());
   843    const int level = compact->compaction->level();
   844    for (size_t i = 0; i < compact->outputs.size(); i++) {
   845      const CompactionState::Output& out = compact->outputs[i];
   846      compact->compaction->edit()->AddFile(
   847          level + 1,
   848          out.number, out.file_size, out.smallest, out.largest);
   849    }
   850    return versions_->LogAndApply(compact->compaction->edit(), &mutex_);
   851  }
   852  
   853  Status DBImpl::DoCompactionWork(CompactionState* compact) {
   854    const uint64_t start_micros = env_->NowMicros();
   855    int64_t imm_micros = 0;  // Micros spent doing imm_ compactions
   856  
   857    Log(options_.info_log,  "Compacting %d@%d + %d@%d files",
   858        compact->compaction->num_input_files(0),
   859        compact->compaction->level(),
   860        compact->compaction->num_input_files(1),
   861        compact->compaction->level() + 1);
   862  
   863    assert(versions_->NumLevelFiles(compact->compaction->level()) > 0);
   864    assert(compact->builder == NULL);
   865    assert(compact->outfile == NULL);
   866    if (snapshots_.empty()) {
   867      compact->smallest_snapshot = versions_->LastSequence();
   868    } else {
   869      compact->smallest_snapshot = snapshots_.oldest()->number_;
   870    }
   871  
   872    // Release mutex while we're actually doing the compaction work
   873    mutex_.Unlock();
   874  
   875    Iterator* input = versions_->MakeInputIterator(compact->compaction);
   876    input->SeekToFirst();
   877    Status status;
   878    ParsedInternalKey ikey;
   879    std::string current_user_key;
   880    bool has_current_user_key = false;
   881    SequenceNumber last_sequence_for_key = kMaxSequenceNumber;
   882    for (; input->Valid() && !shutting_down_.Acquire_Load(); ) {
   883      // Prioritize immutable compaction work
   884      if (has_imm_.NoBarrier_Load() != NULL) {
   885        const uint64_t imm_start = env_->NowMicros();
   886        mutex_.Lock();
   887        if (imm_ != NULL) {
   888          CompactMemTable();
   889          bg_cv_.SignalAll();  // Wake up MakeRoomForWrite() if necessary
   890        }
   891        mutex_.Unlock();
   892        imm_micros += (env_->NowMicros() - imm_start);
   893      }
   894  
   895      Slice key = input->key();
   896      if (compact->compaction->ShouldStopBefore(key) &&
   897          compact->builder != NULL) {
   898        status = FinishCompactionOutputFile(compact, input);
   899        if (!status.ok()) {
   900          break;
   901        }
   902      }
   903  
   904      // Handle key/value, add to state, etc.
   905      bool drop = false;
   906      if (!ParseInternalKey(key, &ikey)) {
   907        // Do not hide error keys
   908        current_user_key.clear();
   909        has_current_user_key = false;
   910        last_sequence_for_key = kMaxSequenceNumber;
   911      } else {
   912        if (!has_current_user_key ||
   913            user_comparator()->Compare(ikey.user_key,
   914                                       Slice(current_user_key)) != 0) {
   915          // First occurrence of this user key
   916          current_user_key.assign(ikey.user_key.data(), ikey.user_key.size());
   917          has_current_user_key = true;
   918          last_sequence_for_key = kMaxSequenceNumber;
   919        }
   920  
   921        if (last_sequence_for_key <= compact->smallest_snapshot) {
   922          // Hidden by a newer entry for the same user key
   923          drop = true;    // (A)
   924        } else if (ikey.type == kTypeDeletion &&
   925                   ikey.sequence <= compact->smallest_snapshot &&
   926                   compact->compaction->IsBaseLevelForKey(ikey.user_key)) {
   927          // For this user key:
   928          // (1) there is no data in higher levels
   929          // (2) data in lower levels will have larger sequence numbers
   930          // (3) data in layers that are being compacted here and have
   931          //     smaller sequence numbers will be dropped in the next
   932          //     few iterations of this loop (by rule (A) above).
   933          // Therefore this deletion marker is obsolete and can be dropped.
   934          drop = true;
   935        }
   936  
   937        last_sequence_for_key = ikey.sequence;
   938      }
   939  #if 0
   940      Log(options_.info_log,
   941          "  Compact: %s, seq %d, type: %d %d, drop: %d, is_base: %d, "
   942          "%d smallest_snapshot: %d",
   943          ikey.user_key.ToString().c_str(),
   944          (int)ikey.sequence, ikey.type, kTypeValue, drop,
   945          compact->compaction->IsBaseLevelForKey(ikey.user_key),
   946          (int)last_sequence_for_key, (int)compact->smallest_snapshot);
   947  #endif
   948  
   949      if (!drop) {
   950        // Open output file if necessary
   951        if (compact->builder == NULL) {
   952          status = OpenCompactionOutputFile(compact);
   953          if (!status.ok()) {
   954            break;
   955          }
   956        }
   957        if (compact->builder->NumEntries() == 0) {
   958          compact->current_output()->smallest.DecodeFrom(key);
   959        }
   960        compact->current_output()->largest.DecodeFrom(key);
   961        compact->builder->Add(key, input->value());
   962  
   963        // Close output file if it is big enough
   964        if (compact->builder->FileSize() >=
   965            compact->compaction->MaxOutputFileSize()) {
   966          status = FinishCompactionOutputFile(compact, input);
   967          if (!status.ok()) {
   968            break;
   969          }
   970        }
   971      }
   972  
   973      input->Next();
   974    }
   975  
   976    if (status.ok() && shutting_down_.Acquire_Load()) {
   977      status = Status::IOError("Deleting DB during compaction");
   978    }
   979    if (status.ok() && compact->builder != NULL) {
   980      status = FinishCompactionOutputFile(compact, input);
   981    }
   982    if (status.ok()) {
   983      status = input->status();
   984    }
   985    delete input;
   986    input = NULL;
   987  
   988    CompactionStats stats;
   989    stats.micros = env_->NowMicros() - start_micros - imm_micros;
   990    for (int which = 0; which < 2; which++) {
   991      for (int i = 0; i < compact->compaction->num_input_files(which); i++) {
   992        stats.bytes_read += compact->compaction->input(which, i)->file_size;
   993      }
   994    }
   995    for (size_t i = 0; i < compact->outputs.size(); i++) {
   996      stats.bytes_written += compact->outputs[i].file_size;
   997    }
   998  
   999    mutex_.Lock();
  1000    stats_[compact->compaction->level() + 1].Add(stats);
  1001  
  1002    if (status.ok()) {
  1003      status = InstallCompactionResults(compact);
  1004    }
  1005    VersionSet::LevelSummaryStorage tmp;
  1006    Log(options_.info_log,
  1007        "compacted to: %s", versions_->LevelSummary(&tmp));
  1008    return status;
  1009  }
  1010  
  1011  namespace {
  1012  struct IterState {
  1013    port::Mutex* mu;
  1014    Version* version;
  1015    MemTable* mem;
  1016    MemTable* imm;
  1017  };
  1018  
  1019  static void CleanupIteratorState(void* arg1, void* arg2) {
  1020    IterState* state = reinterpret_cast<IterState*>(arg1);
  1021    state->mu->Lock();
  1022    state->mem->Unref();
  1023    if (state->imm != NULL) state->imm->Unref();
  1024    state->version->Unref();
  1025    state->mu->Unlock();
  1026    delete state;
  1027  }
  1028  }  // namespace
  1029  
  1030  Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
  1031                                        SequenceNumber* latest_snapshot,
  1032                                        uint32_t* seed) {
  1033    IterState* cleanup = new IterState;
  1034    mutex_.Lock();
  1035    *latest_snapshot = versions_->LastSequence();
  1036  
  1037    // Collect together all needed child iterators
  1038    std::vector<Iterator*> list;
  1039    list.push_back(mem_->NewIterator());
  1040    mem_->Ref();
  1041    if (imm_ != NULL) {
  1042      list.push_back(imm_->NewIterator());
  1043      imm_->Ref();
  1044    }
  1045    versions_->current()->AddIterators(options, &list);
  1046    Iterator* internal_iter =
  1047        NewMergingIterator(&internal_comparator_, &list[0], list.size());
  1048    versions_->current()->Ref();
  1049  
  1050    cleanup->mu = &mutex_;
  1051    cleanup->mem = mem_;
  1052    cleanup->imm = imm_;
  1053    cleanup->version = versions_->current();
  1054    internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, NULL);
  1055  
  1056    *seed = ++seed_;
  1057    mutex_.Unlock();
  1058    return internal_iter;
  1059  }
  1060  
  1061  Iterator* DBImpl::TEST_NewInternalIterator() {
  1062    SequenceNumber ignored;
  1063    uint32_t ignored_seed;
  1064    return NewInternalIterator(ReadOptions(), &ignored, &ignored_seed);
  1065  }
  1066  
  1067  int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() {
  1068    MutexLock l(&mutex_);
  1069    return versions_->MaxNextLevelOverlappingBytes();
  1070  }
  1071  
  1072  Status DBImpl::Get(const ReadOptions& options,
  1073                     const Slice& key,
  1074                     std::string* value) {
  1075    Status s;
  1076    MutexLock l(&mutex_);
  1077    SequenceNumber snapshot;
  1078    if (options.snapshot != NULL) {
  1079      snapshot = reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_;
  1080    } else {
  1081      snapshot = versions_->LastSequence();
  1082    }
  1083  
  1084    MemTable* mem = mem_;
  1085    MemTable* imm = imm_;
  1086    Version* current = versions_->current();
  1087    mem->Ref();
  1088    if (imm != NULL) imm->Ref();
  1089    current->Ref();
  1090  
  1091    bool have_stat_update = false;
  1092    Version::GetStats stats;
  1093  
  1094    // Unlock while reading from files and memtables
  1095    {
  1096      mutex_.Unlock();
  1097      // First look in the memtable, then in the immutable memtable (if any).
  1098      LookupKey lkey(key, snapshot);
  1099      if (mem->Get(lkey, value, &s)) {
  1100        // Done
  1101      } else if (imm != NULL && imm->Get(lkey, value, &s)) {
  1102        // Done
  1103      } else {
  1104        s = current->Get(options, lkey, value, &stats);
  1105        have_stat_update = true;
  1106      }
  1107      mutex_.Lock();
  1108    }
  1109  
  1110    if (have_stat_update && current->UpdateStats(stats)) {
  1111      MaybeScheduleCompaction();
  1112    }
  1113    mem->Unref();
  1114    if (imm != NULL) imm->Unref();
  1115    current->Unref();
  1116    return s;
  1117  }
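
        // Usage sketch (client code): Get() consults the memtable, then the
        // immutable memtable, then the sstables, so it always sees the
        // newest value at or below the chosen snapshot:
        //
        //   std::string value;
        //   leveldb::Status s = db->Get(leveldb::ReadOptions(), "key", &value);
        //   if (s.IsNotFound()) { /* no entry for "key" */ }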
  1118  
  1119  Iterator* DBImpl::NewIterator(const ReadOptions& options) {
  1120    SequenceNumber latest_snapshot;
  1121    uint32_t seed;
  1122    Iterator* iter = NewInternalIterator(options, &latest_snapshot, &seed);
  1123    return NewDBIterator(
  1124        this, user_comparator(), iter,
  1125        (options.snapshot != NULL
  1126         ? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
  1127         : latest_snapshot),
  1128        seed);
  1129  }
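
        // Usage sketch (client code; the caller owns the iterator and must
        // delete it before the DB):
        //
        //   leveldb::Iterator* it = db->NewIterator(leveldb::ReadOptions());
        //   for (it->SeekToFirst(); it->Valid(); it->Next()) {
        //     // inspect it->key() / it->value()
        //   }
        //   assert(it->status().ok());  // surface any scan error
        //   delete it;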
  1130  
  1131  void DBImpl::RecordReadSample(Slice key) {
  1132    MutexLock l(&mutex_);
  1133    if (versions_->current()->RecordReadSample(key)) {
  1134      MaybeScheduleCompaction();
  1135    }
  1136  }
  1137  
  1138  const Snapshot* DBImpl::GetSnapshot() {
  1139    MutexLock l(&mutex_);
  1140    return snapshots_.New(versions_->LastSequence());
  1141  }
  1142  
  1143  void DBImpl::ReleaseSnapshot(const Snapshot* s) {
  1144    MutexLock l(&mutex_);
  1145    snapshots_.Delete(reinterpret_cast<const SnapshotImpl*>(s));
  1146  }
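
        // Usage sketch (client code): a snapshot pins the sequence number
        // used for reads, giving a frozen view of the database:
        //
        //   leveldb::ReadOptions opts;
        //   opts.snapshot = db->GetSnapshot();
        //   // ... reads through opts ignore later writes ...
        //   db->ReleaseSnapshot(opts.snapshot);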
  1147  
  1148  // Convenience methods
  1149  Status DBImpl::Put(const WriteOptions& o, const Slice& key, const Slice& val) {
  1150    return DB::Put(o, key, val);
  1151  }
  1152  
  1153  Status DBImpl::Delete(const WriteOptions& options, const Slice& key) {
  1154    return DB::Delete(options, key);
  1155  }
  1156  
  1157  Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
  1158    Writer w(&mutex_);
  1159    w.batch = my_batch;
  1160    w.sync = options.sync;
  1161    w.done = false;
  1162  
  1163    MutexLock l(&mutex_);
  1164    writers_.push_back(&w);
  1165    while (!w.done && &w != writers_.front()) {
  1166      w.cv.Wait();
  1167    }
  1168    if (w.done) {
  1169      return w.status;
  1170    }
  1171  
  1172    // May temporarily unlock and wait.
  1173    Status status = MakeRoomForWrite(my_batch == NULL);
  1174    uint64_t last_sequence = versions_->LastSequence();
  1175    Writer* last_writer = &w;
  1176    if (status.ok() && my_batch != NULL) {  // NULL batch is for compactions
  1177      WriteBatch* updates = BuildBatchGroup(&last_writer);
  1178      WriteBatchInternal::SetSequence(updates, last_sequence + 1);
  1179      last_sequence += WriteBatchInternal::Count(updates);
  1180  
  1181      // Add to log and apply to memtable.  We can release the lock
  1182      // during this phase since &w is currently responsible for logging
  1183      // and protects against concurrent loggers and concurrent writes
  1184      // into mem_.
  1185      {
  1186        mutex_.Unlock();
  1187        status = log_->AddRecord(WriteBatchInternal::Contents(updates));
  1188        if (status.ok() && options.sync) {
  1189          status = logfile_->Sync();
  1190        }
  1191        if (status.ok()) {
  1192          status = WriteBatchInternal::InsertInto(updates, mem_);
  1193        }
  1194        mutex_.Lock();
  1195      }
  1196      if (updates == tmp_batch_) tmp_batch_->Clear();
  1197  
  1198      versions_->SetLastSequence(last_sequence);
  1199    }
  1200  
  1201    while (true) {
  1202      Writer* ready = writers_.front();
  1203      writers_.pop_front();
  1204      if (ready != &w) {
  1205        ready->status = status;
  1206        ready->done = true;
  1207        ready->cv.Signal();
  1208      }
  1209      if (ready == last_writer) break;
  1210    }
  1211  
  1212    // Notify new head of write queue
  1213    if (!writers_.empty()) {
  1214      writers_.front()->cv.Signal();
  1215    }
  1216  
  1217    return status;
  1218  }
  1219  
  1220  // REQUIRES: Writer list must be non-empty
  1221  // REQUIRES: First writer must have a non-NULL batch
  1222  WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) {
  1223    assert(!writers_.empty());
  1224    Writer* first = writers_.front();
  1225    WriteBatch* result = first->batch;
  1226    assert(result != NULL);
  1227  
  1228    size_t size = WriteBatchInternal::ByteSize(first->batch);
  1229  
  1230    // Allow the group to grow up to a maximum size, but if the
  1231    // original write is small, limit the growth so we do not slow
  1232    // down the small write too much.
  1233    size_t max_size = 1 << 20;
  1234    if (size <= (128<<10)) {
  1235      max_size = size + (128<<10);
  1236    }
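          // Concretely: a 4KB leading write caps the group at 4KB + 128KB,
          // while a leading write larger than 128KB may grow to the full 1MB.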
  1237  
  1238    *last_writer = first;
  1239    std::deque<Writer*>::iterator iter = writers_.begin();
  1240    ++iter;  // Advance past "first"
  1241    for (; iter != writers_.end(); ++iter) {
  1242      Writer* w = *iter;
  1243      if (w->sync && !first->sync) {
  1244        // Do not include a sync write into a batch handled by a non-sync write.
  1245        break;
  1246      }
  1247  
  1248      if (w->batch != NULL) {
  1249        size += WriteBatchInternal::ByteSize(w->batch);
  1250        if (size > max_size) {
  1251          // Do not make batch too big
  1252          break;
  1253        }
  1254  
  1255        // Append to *result
  1256        if (result == first->batch) {
  1257          // Switch to temporary batch instead of disturbing caller's batch
  1258          result = tmp_batch_;
  1259          assert(WriteBatchInternal::Count(result) == 0);
  1260          WriteBatchInternal::Append(result, first->batch);
  1261        }
  1262        WriteBatchInternal::Append(result, w->batch);
  1263      }
  1264      *last_writer = w;
  1265    }
  1266    return result;
  1267  }
  1268  
  1269  // REQUIRES: mutex_ is held
  1270  // REQUIRES: this thread is currently at the front of the writer queue
  1271  Status DBImpl::MakeRoomForWrite(bool force) {
  1272    mutex_.AssertHeld();
  1273    assert(!writers_.empty());
  1274    bool allow_delay = !force;
  1275    Status s;
  1276    while (true) {
  1277      if (!bg_error_.ok()) {
  1278        // Yield previous error
  1279        s = bg_error_;
  1280        break;
  1281      } else if (
  1282          allow_delay &&
  1283          versions_->NumLevelFiles(0) >= config::kL0_SlowdownWritesTrigger) {
  1284        // We are getting close to hitting a hard limit on the number of
  1285        // L0 files.  Rather than delaying a single write by several
  1286        // seconds when we hit the hard limit, start delaying each
  1287        // individual write by 1ms to reduce latency variance.  Also,
  1288        // this delay hands over some CPU to the compaction thread in
  1289        // case it is sharing the same core as the writer.
  1290        mutex_.Unlock();
  1291        env_->SleepForMicroseconds(1000);
  1292        allow_delay = false;  // Do not delay a single write more than once
  1293        mutex_.Lock();
  1294      } else if (!force &&
  1295                 (mem_->ApproximateMemoryUsage() <= options_.write_buffer_size)) {
  1296        // There is room in current memtable
  1297        break;
  1298      } else if (imm_ != NULL) {
  1299        // We have filled up the current memtable, but the previous
  1300        // one is still being compacted, so we wait.
  1301        Log(options_.info_log, "Current memtable full; waiting...\n");
  1302        bg_cv_.Wait();
  1303      } else if (versions_->NumLevelFiles(0) >= config::kL0_StopWritesTrigger) {
  1304        // There are too many level-0 files.
  1305        Log(options_.info_log, "Too many L0 files; waiting...\n");
  1306        bg_cv_.Wait();
  1307      } else {
  1308        // Attempt to switch to a new memtable and trigger compaction of old
  1309        assert(versions_->PrevLogNumber() == 0);
  1310        uint64_t new_log_number = versions_->NewFileNumber();
  1311        WritableFile* lfile = NULL;
  1312        s = env_->NewWritableFile(LogFileName(dbname_, new_log_number), &lfile);
  1313        if (!s.ok()) {
  1314          // Avoid chewing through file number space in a tight loop.
  1315          versions_->ReuseFileNumber(new_log_number);
  1316          break;
  1317        }
  1318        delete log_;
  1319        delete logfile_;
  1320        logfile_ = lfile;
  1321        logfile_number_ = new_log_number;
  1322        log_ = new log::Writer(lfile);
  1323        imm_ = mem_;
  1324        has_imm_.Release_Store(imm_);
  1325        mem_ = new MemTable(internal_comparator_);
  1326        mem_->Ref();
  1327        force = false;   // Do not force another compaction if we have room
  1328        MaybeScheduleCompaction();
  1329      }
  1330    }
  1331    return s;
  1332  }
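
        // The level-0 triggers consulted above default, in the stock leveldb
        // tree, to config::kL0_SlowdownWritesTrigger == 8 (start the 1ms
        // delay) and config::kL0_StopWritesTrigger == 12 (block writes
        // entirely), so write throughput degrades gradually as level-0 files
        // accumulate.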
  1333  
  1334  bool DBImpl::GetProperty(const Slice& property, std::string* value) {
  1335    value->clear();
  1336  
  1337    MutexLock l(&mutex_);
  1338    Slice in = property;
  1339    Slice prefix("leveldb.");
  1340    if (!in.starts_with(prefix)) return false;
  1341    in.remove_prefix(prefix.size());
  1342  
  1343    if (in.starts_with("num-files-at-level")) {
  1344      in.remove_prefix(strlen("num-files-at-level"));
  1345      uint64_t level;
  1346      bool ok = ConsumeDecimalNumber(&in, &level) && in.empty();
  1347      if (!ok || level >= config::kNumLevels) {
  1348        return false;
  1349      } else {
  1350        char buf[100];
  1351        snprintf(buf, sizeof(buf), "%d",
  1352                 versions_->NumLevelFiles(static_cast<int>(level)));
  1353        *value = buf;
  1354        return true;
  1355      }
  1356    } else if (in == "stats") {
  1357      char buf[200];
  1358      snprintf(buf, sizeof(buf),
  1359               "                               Compactions\n"
  1360               "Level  Files Size(MB) Time(sec) Read(MB) Write(MB)\n"
  1361               "--------------------------------------------------\n"
  1362               );
  1363      value->append(buf);
  1364      for (int level = 0; level < config::kNumLevels; level++) {
  1365        int files = versions_->NumLevelFiles(level);
  1366        if (stats_[level].micros > 0 || files > 0) {
  1367          snprintf(
  1368              buf, sizeof(buf),
  1369              "%3d %8d %8.0f %9.0f %8.0f %9.0f\n",
  1370              level,
  1371              files,
  1372              versions_->NumLevelBytes(level) / 1048576.0,
  1373              stats_[level].micros / 1e6,
  1374              stats_[level].bytes_read / 1048576.0,
  1375              stats_[level].bytes_written / 1048576.0);
  1376          value->append(buf);
  1377        }
  1378      }
  1379      return true;
  1380    } else if (in == "sstables") {
  1381      *value = versions_->current()->DebugString();
  1382      return true;
  1383    }
  1384  
  1385    return false;
  1386  }
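
        // Usage sketch (client code): only the properties handled above are
        // recognized; unknown names return false:
        //
        //   std::string out;
        //   db->GetProperty("leveldb.stats", &out);               // table above
        //   db->GetProperty("leveldb.num-files-at-level0", &out); // file count
        //   db->GetProperty("leveldb.sstables", &out);            // per-file dump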
  1387  
  1388  void DBImpl::GetApproximateSizes(
  1389      const Range* range, int n,
  1390      uint64_t* sizes) {
  1391    // TODO(opt): better implementation
  1392    Version* v;
  1393    {
  1394      MutexLock l(&mutex_);
  1395      versions_->current()->Ref();
  1396      v = versions_->current();
  1397    }
  1398  
  1399    for (int i = 0; i < n; i++) {
  1400      // Convert user_key into a corresponding internal key.
  1401      InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek);
  1402      InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek);
  1403      uint64_t start = versions_->ApproximateOffsetOf(v, k1);
  1404      uint64_t limit = versions_->ApproximateOffsetOf(v, k2);
  1405      sizes[i] = (limit >= start ? limit - start : 0);
  1406    }
  1407  
  1408    {
  1409      MutexLock l(&mutex_);
  1410      v->Unref();
  1411    }
  1412  }
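
        // Usage sketch (client code): the result is derived from sstable
        // offsets, so data still in the memtable contributes nothing:
        //
        //   leveldb::Range r("a", "z");
        //   uint64_t size;
        //   db->GetApproximateSizes(&r, 1, &size);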
  1413  
  1414  // Default implementations of convenience methods that subclasses of DB
  1415  // can call if they wish
  1416  Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value) {
  1417    WriteBatch batch;
  1418    batch.Put(key, value);
  1419    return Write(opt, &batch);
  1420  }
  1421  
  1422  Status DB::Delete(const WriteOptions& opt, const Slice& key) {
  1423    WriteBatch batch;
  1424    batch.Delete(key);
  1425    return Write(opt, &batch);
  1426  }
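
        // Usage sketch (client code): the same single-key helpers above can
        // be composed into one atomic update with an explicit batch:
        //
        //   leveldb::WriteBatch batch;
        //   batch.Delete("old-key");
        //   batch.Put("new-key", "value");
        //   leveldb::Status s = db->Write(leveldb::WriteOptions(), &batch);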
  1427  
  1428  DB::~DB() { }
  1429  
  1430  Status DB::Open(const Options& options, const std::string& dbname,
  1431                  DB** dbptr) {
  1432    *dbptr = NULL;
  1433  
  1434    DBImpl* impl = new DBImpl(options, dbname);
  1435    impl->mutex_.Lock();
  1436    VersionEdit edit;
  1437    Status s = impl->Recover(&edit); // Handles create_if_missing, error_if_exists
  1438    if (s.ok()) {
  1439      uint64_t new_log_number = impl->versions_->NewFileNumber();
  1440      WritableFile* lfile;
  1441      s = options.env->NewWritableFile(LogFileName(dbname, new_log_number),
  1442                                       &lfile);
  1443      if (s.ok()) {
  1444        edit.SetLogNumber(new_log_number);
  1445        impl->logfile_ = lfile;
  1446        impl->logfile_number_ = new_log_number;
  1447        impl->log_ = new log::Writer(lfile);
  1448        s = impl->versions_->LogAndApply(&edit, &impl->mutex_);
  1449      }
  1450      if (s.ok()) {
  1451        impl->DeleteObsoleteFiles();
  1452        impl->MaybeScheduleCompaction();
  1453      }
  1454    }
  1455    impl->mutex_.Unlock();
  1456    if (s.ok()) {
  1457      *dbptr = impl;
  1458    } else {
  1459      delete impl;
  1460    }
  1461    return s;
  1462  }
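
        // Usage sketch (client code): the only public way to construct a
        // DBImpl is through this factory:
        //
        //   leveldb::DB* db;
        //   leveldb::Options options;
        //   options.create_if_missing = true;
        //   leveldb::Status s = leveldb::DB::Open(options, "/tmp/testdb", &db);
        //   // ... use db ...
        //   delete db;  // closes the database and releases the LOCK file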
  1463  
  1464  Snapshot::~Snapshot() {
  1465  }
  1466  
  1467  Status DestroyDB(const std::string& dbname, const Options& options) {
  1468    Env* env = options.env;
  1469    std::vector<std::string> filenames;
  1470    // Ignore error in case directory does not exist
  1471    env->GetChildren(dbname, &filenames);
  1472    if (filenames.empty()) {
  1473      return Status::OK();
  1474    }
  1475  
  1476    FileLock* lock;
  1477    const std::string lockname = LockFileName(dbname);
  1478    Status result = env->LockFile(lockname, &lock);
  1479    if (result.ok()) {
  1480      uint64_t number;
  1481      FileType type;
  1482      for (size_t i = 0; i < filenames.size(); i++) {
  1483        if (ParseFileName(filenames[i], &number, &type) &&
  1484            type != kDBLockFile) {  // Lock file will be deleted at end
  1485          Status del = env->DeleteFile(dbname + "/" + filenames[i]);
  1486          if (result.ok() && !del.ok()) {
  1487            result = del;
  1488          }
  1489        }
  1490      }
  1491      env->UnlockFile(lock);  // Ignore error since state is already gone
  1492      env->DeleteFile(lockname);
  1493      env->DeleteDir(dbname);  // Ignore error in case dir contains other files
  1494    }
  1495    return result;
  1496  }
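
        // Usage sketch (client code): DestroyDB removes all files for the
        // named database; it must not be called while the DB is open:
        //
        //   leveldb::Status s =
        //       leveldb::DestroyDB("/tmp/testdb", leveldb::Options());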
  1497  
  1498  }  // namespace leveldb