// Origin: github.com/cellofellow/gopkg@v0.0.0-20140722061823-eec0544a62ad/database/leveldb.chai2010/src/repair.cc
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// We recover the contents of the descriptor from the other files we find.
// (1) Any log files are first converted to tables
// (2) We scan every table to compute
//     (a) smallest/largest for the table
//     (b) largest sequence number in the table
// (3) We generate descriptor contents:
//      - log number is set to zero
//      - next-file-number is set to 1 + largest file number we found
//      - last-sequence-number is set to largest sequence# found across
//        all tables (see 2b)
//      - compaction pointers are cleared
//      - every table file is added at level 0
//
// Possible optimization 1:
//   (a) Compute total size and use to pick appropriate max-level M
//   (b) Sort tables by largest sequence# in the table
//   (c) For each table: if it overlaps earlier table, place in level-0,
//       else place in level-M.
// Possible optimization 2:
//   Store per-table metadata (smallest, largest, largest-seq#, ...)
//   in the table's meta section to speed up ScanTable.
26 27 #include "db/builder.h" 28 #include "db/db_impl.h" 29 #include "db/dbformat.h" 30 #include "db/filename.h" 31 #include "db/log_reader.h" 32 #include "db/log_writer.h" 33 #include "db/memtable.h" 34 #include "db/table_cache.h" 35 #include "db/version_edit.h" 36 #include "db/write_batch_internal.h" 37 #include "leveldb/comparator.h" 38 #include "leveldb/db.h" 39 #include "leveldb/env.h" 40 41 namespace leveldb { 42 43 namespace { 44 45 class Repairer { 46 public: 47 Repairer(const std::string& dbname, const Options& options) 48 : dbname_(dbname), 49 env_(options.env), 50 icmp_(options.comparator), 51 ipolicy_(options.filter_policy), 52 options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)), 53 owns_info_log_(options_.info_log != options.info_log), 54 owns_cache_(options_.block_cache != options.block_cache), 55 next_file_number_(1) { 56 // TableCache can be small since we expect each table to be opened once. 57 table_cache_ = new TableCache(dbname_, &options_, 10); 58 } 59 60 ~Repairer() { 61 delete table_cache_; 62 if (owns_info_log_) { 63 delete options_.info_log; 64 } 65 if (owns_cache_) { 66 delete options_.block_cache; 67 } 68 } 69 70 Status Run() { 71 Status status = FindFiles(); 72 if (status.ok()) { 73 ConvertLogFilesToTables(); 74 ExtractMetaData(); 75 status = WriteDescriptor(); 76 } 77 if (status.ok()) { 78 unsigned long long bytes = 0; 79 for (size_t i = 0; i < tables_.size(); i++) { 80 bytes += tables_[i].meta.file_size; 81 } 82 Log(options_.info_log, 83 "**** Repaired leveldb %s; " 84 "recovered %d files; %llu bytes. " 85 "Some data may have been lost. 
" 86 "****", 87 dbname_.c_str(), 88 static_cast<int>(tables_.size()), 89 bytes); 90 } 91 return status; 92 } 93 94 private: 95 struct TableInfo { 96 FileMetaData meta; 97 SequenceNumber max_sequence; 98 }; 99 100 std::string const dbname_; 101 Env* const env_; 102 InternalKeyComparator const icmp_; 103 InternalFilterPolicy const ipolicy_; 104 Options const options_; 105 bool owns_info_log_; 106 bool owns_cache_; 107 TableCache* table_cache_; 108 VersionEdit edit_; 109 110 std::vector<std::string> manifests_; 111 std::vector<uint64_t> table_numbers_; 112 std::vector<uint64_t> logs_; 113 std::vector<TableInfo> tables_; 114 uint64_t next_file_number_; 115 116 Status FindFiles() { 117 std::vector<std::string> filenames; 118 Status status = env_->GetChildren(dbname_, &filenames); 119 if (!status.ok()) { 120 return status; 121 } 122 if (filenames.empty()) { 123 return Status::IOError(dbname_, "repair found no files"); 124 } 125 126 uint64_t number; 127 FileType type; 128 for (size_t i = 0; i < filenames.size(); i++) { 129 if (ParseFileName(filenames[i], &number, &type)) { 130 if (type == kDescriptorFile) { 131 manifests_.push_back(filenames[i]); 132 } else { 133 if (number + 1 > next_file_number_) { 134 next_file_number_ = number + 1; 135 } 136 if (type == kLogFile) { 137 logs_.push_back(number); 138 } else if (type == kTableFile) { 139 table_numbers_.push_back(number); 140 } else { 141 // Ignore other files 142 } 143 } 144 } 145 } 146 return status; 147 } 148 149 void ConvertLogFilesToTables() { 150 for (size_t i = 0; i < logs_.size(); i++) { 151 std::string logname = LogFileName(dbname_, logs_[i]); 152 Status status = ConvertLogToTable(logs_[i]); 153 if (!status.ok()) { 154 Log(options_.info_log, "Log #%llu: ignoring conversion error: %s", 155 (unsigned long long) logs_[i], 156 status.ToString().c_str()); 157 } 158 ArchiveFile(logname); 159 } 160 } 161 162 Status ConvertLogToTable(uint64_t log) { 163 struct LogReporter : public log::Reader::Reporter { 164 Env* env; 165 
Logger* info_log; 166 uint64_t lognum; 167 virtual void Corruption(size_t bytes, const Status& s) { 168 // We print error messages for corruption, but continue repairing. 169 Log(info_log, "Log #%llu: dropping %d bytes; %s", 170 (unsigned long long) lognum, 171 static_cast<int>(bytes), 172 s.ToString().c_str()); 173 } 174 }; 175 176 // Open the log file 177 std::string logname = LogFileName(dbname_, log); 178 SequentialFile* lfile; 179 Status status = env_->NewSequentialFile(logname, &lfile); 180 if (!status.ok()) { 181 return status; 182 } 183 184 // Create the log reader. 185 LogReporter reporter; 186 reporter.env = env_; 187 reporter.info_log = options_.info_log; 188 reporter.lognum = log; 189 // We intentially make log::Reader do checksumming so that 190 // corruptions cause entire commits to be skipped instead of 191 // propagating bad information (like overly large sequence 192 // numbers). 193 log::Reader reader(lfile, &reporter, false/*do not checksum*/, 194 0/*initial_offset*/); 195 196 // Read all the records and add to a memtable 197 std::string scratch; 198 Slice record; 199 WriteBatch batch; 200 MemTable* mem = new MemTable(icmp_); 201 mem->Ref(); 202 int counter = 0; 203 while (reader.ReadRecord(&record, &scratch)) { 204 if (record.size() < 12) { 205 reporter.Corruption( 206 record.size(), Status::Corruption("log record too small")); 207 continue; 208 } 209 WriteBatchInternal::SetContents(&batch, record); 210 status = WriteBatchInternal::InsertInto(&batch, mem); 211 if (status.ok()) { 212 counter += WriteBatchInternal::Count(&batch); 213 } else { 214 Log(options_.info_log, "Log #%llu: ignoring %s", 215 (unsigned long long) log, 216 status.ToString().c_str()); 217 status = Status::OK(); // Keep going with rest of file 218 } 219 } 220 delete lfile; 221 222 // Do not record a version edit for this conversion to a Table 223 // since ExtractMetaData() will also generate edits. 
224 FileMetaData meta; 225 meta.number = next_file_number_++; 226 Iterator* iter = mem->NewIterator(); 227 status = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta); 228 delete iter; 229 mem->Unref(); 230 mem = NULL; 231 if (status.ok()) { 232 if (meta.file_size > 0) { 233 table_numbers_.push_back(meta.number); 234 } 235 } 236 Log(options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s", 237 (unsigned long long) log, 238 counter, 239 (unsigned long long) meta.number, 240 status.ToString().c_str()); 241 return status; 242 } 243 244 void ExtractMetaData() { 245 std::vector<TableInfo> kept; 246 for (size_t i = 0; i < table_numbers_.size(); i++) { 247 TableInfo t; 248 t.meta.number = table_numbers_[i]; 249 Status status = ScanTable(&t); 250 if (!status.ok()) { 251 std::string fname = TableFileName(dbname_, table_numbers_[i]); 252 Log(options_.info_log, "Table #%llu: ignoring %s", 253 (unsigned long long) table_numbers_[i], 254 status.ToString().c_str()); 255 ArchiveFile(fname); 256 } else { 257 tables_.push_back(t); 258 } 259 } 260 } 261 262 Status ScanTable(TableInfo* t) { 263 std::string fname = TableFileName(dbname_, t->meta.number); 264 int counter = 0; 265 Status status = env_->GetFileSize(fname, &t->meta.file_size); 266 if (!status.ok()) { 267 fname = SSTTableFileName(dbname_, t->meta.number); 268 Status s2 = env_->GetFileSize(fname, &t->meta.file_size); 269 if (s2.ok()) 270 status = Status::OK(); 271 } 272 if (status.ok()) { 273 Iterator* iter = table_cache_->NewIterator( 274 ReadOptions(), t->meta.number, t->meta.file_size); 275 bool empty = true; 276 ParsedInternalKey parsed; 277 t->max_sequence = 0; 278 for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { 279 Slice key = iter->key(); 280 if (!ParseInternalKey(key, &parsed)) { 281 Log(options_.info_log, "Table #%llu: unparsable key %s", 282 (unsigned long long) t->meta.number, 283 EscapeString(key).c_str()); 284 continue; 285 } 286 287 counter++; 288 if (empty) { 289 empty = false; 290 
t->meta.smallest.DecodeFrom(key); 291 } 292 t->meta.largest.DecodeFrom(key); 293 if (parsed.sequence > t->max_sequence) { 294 t->max_sequence = parsed.sequence; 295 } 296 } 297 if (!iter->status().ok()) { 298 status = iter->status(); 299 } 300 delete iter; 301 } 302 // If there was trouble opening an .sst file this will report that the .ldb 303 // file was not found, which is kind of lame but shouldn't happen often. 304 Log(options_.info_log, "Table #%llu: %d entries %s", 305 (unsigned long long) t->meta.number, 306 counter, 307 status.ToString().c_str()); 308 return status; 309 } 310 311 Status WriteDescriptor() { 312 std::string tmp = TempFileName(dbname_, 1); 313 WritableFile* file; 314 Status status = env_->NewWritableFile(tmp, &file); 315 if (!status.ok()) { 316 return status; 317 } 318 319 SequenceNumber max_sequence = 0; 320 for (size_t i = 0; i < tables_.size(); i++) { 321 if (max_sequence < tables_[i].max_sequence) { 322 max_sequence = tables_[i].max_sequence; 323 } 324 } 325 326 edit_.SetComparatorName(icmp_.user_comparator()->Name()); 327 edit_.SetLogNumber(0); 328 edit_.SetNextFile(next_file_number_); 329 edit_.SetLastSequence(max_sequence); 330 331 for (size_t i = 0; i < tables_.size(); i++) { 332 // TODO(opt): separate out into multiple levels 333 const TableInfo& t = tables_[i]; 334 edit_.AddFile(0, t.meta.number, t.meta.file_size, 335 t.meta.smallest, t.meta.largest); 336 } 337 338 //fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str()); 339 { 340 log::Writer log(file); 341 std::string record; 342 edit_.EncodeTo(&record); 343 status = log.AddRecord(record); 344 } 345 if (status.ok()) { 346 status = file->Close(); 347 } 348 delete file; 349 file = NULL; 350 351 if (!status.ok()) { 352 env_->DeleteFile(tmp); 353 } else { 354 // Discard older manifests 355 for (size_t i = 0; i < manifests_.size(); i++) { 356 ArchiveFile(dbname_ + "/" + manifests_[i]); 357 } 358 359 // Install new manifest 360 status = env_->RenameFile(tmp, 
DescriptorFileName(dbname_, 1)); 361 if (status.ok()) { 362 status = SetCurrentFile(env_, dbname_, 1); 363 } else { 364 env_->DeleteFile(tmp); 365 } 366 } 367 return status; 368 } 369 370 void ArchiveFile(const std::string& fname) { 371 // Move into another directory. E.g., for 372 // dir/foo 373 // rename to 374 // dir/lost/foo 375 const char* slash = strrchr(fname.c_str(), '/'); 376 std::string new_dir; 377 if (slash != NULL) { 378 new_dir.assign(fname.data(), slash - fname.data()); 379 } 380 new_dir.append("/lost"); 381 env_->CreateDir(new_dir); // Ignore error 382 std::string new_file = new_dir; 383 new_file.append("/"); 384 new_file.append((slash == NULL) ? fname.c_str() : slash + 1); 385 Status s = env_->RenameFile(fname, new_file); 386 Log(options_.info_log, "Archiving %s: %s\n", 387 fname.c_str(), s.ToString().c_str()); 388 } 389 }; 390 } // namespace 391 392 Status RepairDB(const std::string& dbname, const Options& options) { 393 Repairer repairer(dbname, options); 394 return repairer.Run(); 395 } 396 397 } // namespace leveldb