github.com/cellofellow/gopkg@v0.0.0-20140722061823-eec0544a62ad/database/leveldb.chai2010/src/repair.cc (about)

     1  // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style license that can be
     3  // found in the LICENSE file. See the AUTHORS file for names of contributors.
     4  //
     5  // We recover the contents of the descriptor from the other files we find.
     6  // (1) Any log files are first converted to tables
     7  // (2) We scan every table to compute
     8  //     (a) smallest/largest for the table
     9  //     (b) largest sequence number in the table
    10  // (3) We generate descriptor contents:
    11  //      - log number is set to zero
    12  //      - next-file-number is set to 1 + largest file number we found
    13  //      - last-sequence-number is set to largest sequence# found across
    14  //        all tables (see 2b)
    15  //      - compaction pointers are cleared
    16  //      - every table file is added at level 0
    17  //
    18  // Possible optimization 1:
    19  //   (a) Compute total size and use to pick appropriate max-level M
    20  //   (b) Sort tables by largest sequence# in the table
    21  //   (c) For each table: if it overlaps earlier table, place in level-0,
    22  //       else place in level-M.
    23  // Possible optimization 2:
    24  //   Store per-table metadata (smallest, largest, largest-seq#, ...)
    25  //   in the table's meta section to speed up ScanTable.
    26  
    27  #include "db/builder.h"
    28  #include "db/db_impl.h"
    29  #include "db/dbformat.h"
    30  #include "db/filename.h"
    31  #include "db/log_reader.h"
    32  #include "db/log_writer.h"
    33  #include "db/memtable.h"
    34  #include "db/table_cache.h"
    35  #include "db/version_edit.h"
    36  #include "db/write_batch_internal.h"
    37  #include "leveldb/comparator.h"
    38  #include "leveldb/db.h"
    39  #include "leveldb/env.h"
    40  
    41  namespace leveldb {
    42  
    43  namespace {
    44  
    45  class Repairer {
    46   public:
    47    Repairer(const std::string& dbname, const Options& options)
    48        : dbname_(dbname),
    49          env_(options.env),
    50          icmp_(options.comparator),
    51          ipolicy_(options.filter_policy),
    52          options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)),
    53          owns_info_log_(options_.info_log != options.info_log),
    54          owns_cache_(options_.block_cache != options.block_cache),
    55          next_file_number_(1) {
    56      // TableCache can be small since we expect each table to be opened once.
    57      table_cache_ = new TableCache(dbname_, &options_, 10);
    58    }
    59  
    60    ~Repairer() {
    61      delete table_cache_;
    62      if (owns_info_log_) {
    63        delete options_.info_log;
    64      }
    65      if (owns_cache_) {
    66        delete options_.block_cache;
    67      }
    68    }
    69  
    70    Status Run() {
    71      Status status = FindFiles();
    72      if (status.ok()) {
    73        ConvertLogFilesToTables();
    74        ExtractMetaData();
    75        status = WriteDescriptor();
    76      }
    77      if (status.ok()) {
    78        unsigned long long bytes = 0;
    79        for (size_t i = 0; i < tables_.size(); i++) {
    80          bytes += tables_[i].meta.file_size;
    81        }
    82        Log(options_.info_log,
    83            "**** Repaired leveldb %s; "
    84            "recovered %d files; %llu bytes. "
    85            "Some data may have been lost. "
    86            "****",
    87            dbname_.c_str(),
    88            static_cast<int>(tables_.size()),
    89            bytes);
    90      }
    91      return status;
    92    }
    93  
    94   private:
  // Per-table results collected by ScanTable().
  struct TableInfo {
    // File number, file size, and smallest/largest keys of the table.
    FileMetaData meta;
    // Largest sequence number found among the table's parsable keys.
    SequenceNumber max_sequence;
  };
    99  
  // NOTE: declaration order matters; the constructor's initializer list
  // relies on icmp_ and ipolicy_ existing before options_ is built.
  std::string const dbname_;             // Database directory being repaired
  Env* const env_;                       // Taken from the caller's Options
  InternalKeyComparator const icmp_;     // Built from options.comparator
  InternalFilterPolicy const ipolicy_;   // Built from options.filter_policy
  Options const options_;                // Result of SanitizeOptions()
  bool owns_info_log_;                   // Destructor deletes options_.info_log
  bool owns_cache_;                      // Destructor deletes options_.block_cache
  TableCache* table_cache_;              // Allocated in ctor, deleted in dtor
  VersionEdit edit_;                     // Filled in by WriteDescriptor()

  std::vector<std::string> manifests_;   // Descriptor file names (FindFiles)
  std::vector<uint64_t> table_numbers_;  // Table file numbers to be scanned
  std::vector<uint64_t> logs_;           // Log file numbers (FindFiles)
  std::vector<TableInfo> tables_;        // Tables that ScanTable() accepted
  uint64_t next_file_number_;            // 1 + largest file number seen so far
   115  
   116    Status FindFiles() {
   117      std::vector<std::string> filenames;
   118      Status status = env_->GetChildren(dbname_, &filenames);
   119      if (!status.ok()) {
   120        return status;
   121      }
   122      if (filenames.empty()) {
   123        return Status::IOError(dbname_, "repair found no files");
   124      }
   125  
   126      uint64_t number;
   127      FileType type;
   128      for (size_t i = 0; i < filenames.size(); i++) {
   129        if (ParseFileName(filenames[i], &number, &type)) {
   130          if (type == kDescriptorFile) {
   131            manifests_.push_back(filenames[i]);
   132          } else {
   133            if (number + 1 > next_file_number_) {
   134              next_file_number_ = number + 1;
   135            }
   136            if (type == kLogFile) {
   137              logs_.push_back(number);
   138            } else if (type == kTableFile) {
   139              table_numbers_.push_back(number);
   140            } else {
   141              // Ignore other files
   142            }
   143          }
   144        }
   145      }
   146      return status;
   147    }
   148  
   149    void ConvertLogFilesToTables() {
   150      for (size_t i = 0; i < logs_.size(); i++) {
   151        std::string logname = LogFileName(dbname_, logs_[i]);
   152        Status status = ConvertLogToTable(logs_[i]);
   153        if (!status.ok()) {
   154          Log(options_.info_log, "Log #%llu: ignoring conversion error: %s",
   155              (unsigned long long) logs_[i],
   156              status.ToString().c_str());
   157        }
   158        ArchiveFile(logname);
   159      }
   160    }
   161  
   162    Status ConvertLogToTable(uint64_t log) {
   163      struct LogReporter : public log::Reader::Reporter {
   164        Env* env;
   165        Logger* info_log;
   166        uint64_t lognum;
   167        virtual void Corruption(size_t bytes, const Status& s) {
   168          // We print error messages for corruption, but continue repairing.
   169          Log(info_log, "Log #%llu: dropping %d bytes; %s",
   170              (unsigned long long) lognum,
   171              static_cast<int>(bytes),
   172              s.ToString().c_str());
   173        }
   174      };
   175  
   176      // Open the log file
   177      std::string logname = LogFileName(dbname_, log);
   178      SequentialFile* lfile;
   179      Status status = env_->NewSequentialFile(logname, &lfile);
   180      if (!status.ok()) {
   181        return status;
   182      }
   183  
   184      // Create the log reader.
   185      LogReporter reporter;
   186      reporter.env = env_;
   187      reporter.info_log = options_.info_log;
   188      reporter.lognum = log;
   189      // We intentially make log::Reader do checksumming so that
   190      // corruptions cause entire commits to be skipped instead of
   191      // propagating bad information (like overly large sequence
   192      // numbers).
   193      log::Reader reader(lfile, &reporter, false/*do not checksum*/,
   194                         0/*initial_offset*/);
   195  
   196      // Read all the records and add to a memtable
   197      std::string scratch;
   198      Slice record;
   199      WriteBatch batch;
   200      MemTable* mem = new MemTable(icmp_);
   201      mem->Ref();
   202      int counter = 0;
   203      while (reader.ReadRecord(&record, &scratch)) {
   204        if (record.size() < 12) {
   205          reporter.Corruption(
   206              record.size(), Status::Corruption("log record too small"));
   207          continue;
   208        }
   209        WriteBatchInternal::SetContents(&batch, record);
   210        status = WriteBatchInternal::InsertInto(&batch, mem);
   211        if (status.ok()) {
   212          counter += WriteBatchInternal::Count(&batch);
   213        } else {
   214          Log(options_.info_log, "Log #%llu: ignoring %s",
   215              (unsigned long long) log,
   216              status.ToString().c_str());
   217          status = Status::OK();  // Keep going with rest of file
   218        }
   219      }
   220      delete lfile;
   221  
   222      // Do not record a version edit for this conversion to a Table
   223      // since ExtractMetaData() will also generate edits.
   224      FileMetaData meta;
   225      meta.number = next_file_number_++;
   226      Iterator* iter = mem->NewIterator();
   227      status = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta);
   228      delete iter;
   229      mem->Unref();
   230      mem = NULL;
   231      if (status.ok()) {
   232        if (meta.file_size > 0) {
   233          table_numbers_.push_back(meta.number);
   234        }
   235      }
   236      Log(options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s",
   237          (unsigned long long) log,
   238          counter,
   239          (unsigned long long) meta.number,
   240          status.ToString().c_str());
   241      return status;
   242    }
   243  
   244    void ExtractMetaData() {
   245      std::vector<TableInfo> kept;
   246      for (size_t i = 0; i < table_numbers_.size(); i++) {
   247        TableInfo t;
   248        t.meta.number = table_numbers_[i];
   249        Status status = ScanTable(&t);
   250        if (!status.ok()) {
   251          std::string fname = TableFileName(dbname_, table_numbers_[i]);
   252          Log(options_.info_log, "Table #%llu: ignoring %s",
   253              (unsigned long long) table_numbers_[i],
   254              status.ToString().c_str());
   255          ArchiveFile(fname);
   256        } else {
   257          tables_.push_back(t);
   258        }
   259      }
   260    }
   261  
   262    Status ScanTable(TableInfo* t) {
   263      std::string fname = TableFileName(dbname_, t->meta.number);
   264      int counter = 0;
   265      Status status = env_->GetFileSize(fname, &t->meta.file_size);
   266      if (!status.ok()) {
   267        fname = SSTTableFileName(dbname_, t->meta.number);
   268        Status s2 = env_->GetFileSize(fname, &t->meta.file_size);
   269        if (s2.ok())
   270          status = Status::OK();
   271      }
   272      if (status.ok()) {
   273        Iterator* iter = table_cache_->NewIterator(
   274            ReadOptions(), t->meta.number, t->meta.file_size);
   275        bool empty = true;
   276        ParsedInternalKey parsed;
   277        t->max_sequence = 0;
   278        for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
   279          Slice key = iter->key();
   280          if (!ParseInternalKey(key, &parsed)) {
   281            Log(options_.info_log, "Table #%llu: unparsable key %s",
   282                (unsigned long long) t->meta.number,
   283                EscapeString(key).c_str());
   284            continue;
   285          }
   286  
   287          counter++;
   288          if (empty) {
   289            empty = false;
   290            t->meta.smallest.DecodeFrom(key);
   291          }
   292          t->meta.largest.DecodeFrom(key);
   293          if (parsed.sequence > t->max_sequence) {
   294            t->max_sequence = parsed.sequence;
   295          }
   296        }
   297        if (!iter->status().ok()) {
   298          status = iter->status();
   299        }
   300        delete iter;
   301      }
   302      // If there was trouble opening an .sst file this will report that the .ldb
   303      // file was not found, which is kind of lame but shouldn't happen often.
   304      Log(options_.info_log, "Table #%llu: %d entries %s",
   305          (unsigned long long) t->meta.number,
   306          counter,
   307          status.ToString().c_str());
   308      return status;
   309    }
   310  
   311    Status WriteDescriptor() {
   312      std::string tmp = TempFileName(dbname_, 1);
   313      WritableFile* file;
   314      Status status = env_->NewWritableFile(tmp, &file);
   315      if (!status.ok()) {
   316        return status;
   317      }
   318  
   319      SequenceNumber max_sequence = 0;
   320      for (size_t i = 0; i < tables_.size(); i++) {
   321        if (max_sequence < tables_[i].max_sequence) {
   322          max_sequence = tables_[i].max_sequence;
   323        }
   324      }
   325  
   326      edit_.SetComparatorName(icmp_.user_comparator()->Name());
   327      edit_.SetLogNumber(0);
   328      edit_.SetNextFile(next_file_number_);
   329      edit_.SetLastSequence(max_sequence);
   330  
   331      for (size_t i = 0; i < tables_.size(); i++) {
   332        // TODO(opt): separate out into multiple levels
   333        const TableInfo& t = tables_[i];
   334        edit_.AddFile(0, t.meta.number, t.meta.file_size,
   335                      t.meta.smallest, t.meta.largest);
   336      }
   337  
   338      //fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str());
   339      {
   340        log::Writer log(file);
   341        std::string record;
   342        edit_.EncodeTo(&record);
   343        status = log.AddRecord(record);
   344      }
   345      if (status.ok()) {
   346        status = file->Close();
   347      }
   348      delete file;
   349      file = NULL;
   350  
   351      if (!status.ok()) {
   352        env_->DeleteFile(tmp);
   353      } else {
   354        // Discard older manifests
   355        for (size_t i = 0; i < manifests_.size(); i++) {
   356          ArchiveFile(dbname_ + "/" + manifests_[i]);
   357        }
   358  
   359        // Install new manifest
   360        status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1));
   361        if (status.ok()) {
   362          status = SetCurrentFile(env_, dbname_, 1);
   363        } else {
   364          env_->DeleteFile(tmp);
   365        }
   366      }
   367      return status;
   368    }
   369  
   370    void ArchiveFile(const std::string& fname) {
   371      // Move into another directory.  E.g., for
   372      //    dir/foo
   373      // rename to
   374      //    dir/lost/foo
   375      const char* slash = strrchr(fname.c_str(), '/');
   376      std::string new_dir;
   377      if (slash != NULL) {
   378        new_dir.assign(fname.data(), slash - fname.data());
   379      }
   380      new_dir.append("/lost");
   381      env_->CreateDir(new_dir);  // Ignore error
   382      std::string new_file = new_dir;
   383      new_file.append("/");
   384      new_file.append((slash == NULL) ? fname.c_str() : slash + 1);
   385      Status s = env_->RenameFile(fname, new_file);
   386      Log(options_.info_log, "Archiving %s: %s\n",
   387          fname.c_str(), s.ToString().c_str());
   388    }
   389  };
   390  }  // namespace
   391  
   392  Status RepairDB(const std::string& dbname, const Options& options) {
   393    Repairer repairer(dbname, options);
   394    return repairer.Run();
   395  }
   396  
   397  }  // namespace leveldb