github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/c-deps/libroach/db.cc

// Copyright 2014 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

#include "db.h"
#include <algorithm>
#include <iostream>
#include <rocksdb/convenience.h>
#include <rocksdb/perf_context.h>
#include <rocksdb/sst_file_writer.h>
#include <rocksdb/table.h>
#include <rocksdb/utilities/checkpoint.h>
#include <stdarg.h>
#include "batch.h"
#include "cache.h"
#include "comparator.h"
#include "defines.h"
#include "encoding.h"
#include "engine.h"
#include "env_manager.h"
#include "eventlistener.h"
#include "fmt.h"
#include "getter.h"
#include "godefs.h"
#include "incremental_iterator.h"
#include "iterator.h"
#include "merge.h"
#include "options.h"
#include "protos/roachpb/errors.pb.h"
#include "row_counter.h"
#include "snapshot.h"
#include "stack_trace.h"
#include "status.h"
#include "table_props.h"
#include "timestamp.h"

using namespace cockroach;

namespace cockroach {

DBKey ToDBKey(const rocksdb::Slice& s) {
  DBKey key;
  memset(&key, 0, sizeof(key));
  rocksdb::Slice tmp;
  if (DecodeKey(s, &tmp, &key.wall_time, &key.logical)) {
    key.key = ToDBSlice(tmp);
  }
  return key;
}

ScopedStats::ScopedStats(DBIterator* iter)
    : iter_(iter),
      internal_delete_skipped_count_base_(
          rocksdb::get_perf_context()->internal_delete_skipped_count) {
  if (iter_->stats != nullptr) {
    rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableTimeExceptForMutex);
  }
}
ScopedStats::~ScopedStats() {
  if (iter_->stats != nullptr) {
    iter_->stats->internal_delete_skipped_count +=
        (rocksdb::get_perf_context()->internal_delete_skipped_count -
         internal_delete_skipped_count_base_);
    rocksdb::SetPerfLevel(rocksdb::PerfLevel::kDisable);
  }
}

void BatchSSTablesForCompaction(const std::vector<rocksdb::SstFileMetaData>& sst,
                                rocksdb::Slice start_key, rocksdb::Slice end_key,
                                uint64_t target_size, std::vector<rocksdb::Range>* ranges) {
  int prev = -1;  // index of the last compacted sst
  uint64_t size = 0;
  for (int i = 0; i < sst.size(); ++i) {
    size += sst[i].size;
    if (size < target_size && (i + 1) < sst.size()) {
      // We haven't reached the target size or the end of the sstables
      // to compact.
      continue;
    }

    rocksdb::Slice start;
    if (prev == -1) {
      // This is the first compaction.
      start = start_key;
    } else {
      // This is a compaction in the middle or end of the requested
      // key range. The start key for the compaction is the largest
      // key of the previously compacted sstable.
      start = rocksdb::Slice(sst[prev].largestkey);
    }

    rocksdb::Slice end;
    if ((i + 1) == sst.size()) {
      // This is the last compaction.
      end = end_key;
    } else {
      // This is a compaction at the start or in the middle of the
      // requested key range. The end key is the largest key in the
      // current sstable.
      end = rocksdb::Slice(sst[i].largestkey);
    }

    ranges->emplace_back(rocksdb::Range(start, end));

    prev = i;
    size = 0;
  }
}
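
// A minimal worked example of the batching above (sizes and keys are
// hypothetical, not taken from a real store): given bottom-level sstables
//
//   ["a".."f", 100MB]  ["f".."k", 40MB]  ["k".."p", 90MB]  ["p".."z", 10MB]
//
// and target_size = 128MB, the first two sstables together reach the target,
// so the emitted ranges are (start_key .. "k") and ("k" .. end_key). Each
// range begins at the largest key of the previously batched sstable, so
// consecutive ranges abut without overlapping.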

}  // namespace cockroach

namespace {

DBIterState DBIterGetState(DBIterator* iter) {
  DBIterState state = {};
  state.valid = iter->rep->Valid();
  state.status = ToDBStatus(iter->rep->status());

  if (state.valid) {
    rocksdb::Slice key;
    state.valid = DecodeKey(iter->rep->key(), &key, &state.key.wall_time, &state.key.logical);
    if (state.valid) {
      state.key.key = ToDBSlice(key);
      state.value = ToDBSlice(iter->rep->value());
    }
  }

  return state;
}
}  // namespace

namespace cockroach {

// In OSS mode, DBOpenHookOSS only verifies that no extra options are specified.
rocksdb::Status DBOpenHookOSS(std::shared_ptr<rocksdb::Logger> info_log, const std::string& db_dir,
                              const DBOptions db_opts, EnvManager* env_mgr) {
  if (db_opts.extra_options.len != 0) {
    return rocksdb::Status::InvalidArgument("encryption options are not supported in OSS builds");
  }
  return rocksdb::Status::OK();
}

}  // namespace cockroach

static DBOpenHook* db_open_hook = DBOpenHookOSS;

void DBSetOpenHook(void* hook) { db_open_hook = (DBOpenHook*)hook; }

DBStatus DBOpen(DBEngine** db, DBSlice dir, DBOptions db_opts) {
  rocksdb::Options options = DBMakeOptions(db_opts);

  const std::string additional_options = ToString(db_opts.rocksdb_options);
  if (!additional_options.empty()) {
    // TODO(peter): Investigate using rocksdb::LoadOptionsFromFile if
    // "additional_options" starts with "@". The challenge is that
    // LoadOptionsFromFile gives us a DBOptions and
    // ColumnFamilyOptions with no ability to supply "base" options
    // and no ability to determine what options were specified in the
    // file, which could cause "defaults" to override the options
    // returned by DBMakeOptions. We might need to fix this upstream.
    rocksdb::Status status = rocksdb::GetOptionsFromString(options, additional_options, &options);
    if (!status.ok()) {
      return ToDBStatus(status);
    }
  }

  const std::string db_dir = ToString(dir);

  // Use the default options.env as the base. It points to Env::Default, which
  // does not need to be deleted.
  std::unique_ptr<cockroach::EnvManager> env_mgr(new cockroach::EnvManager(options.env));

  if (dir.len == 0) {
    // In-memory database: use a MemEnv as the base Env.
    auto memenv = rocksdb::NewMemEnv(rocksdb::Env::Default());
    // Register it for deletion.
    env_mgr->TakeEnvOwnership(memenv);
    // Create a root directory to suppress error messages that RocksDB would
    // print if it had to create the DB directory itself.
    memenv->CreateDir("/");
    // Make it the env that all other Envs must wrap.
    env_mgr->base_env = memenv;
    // Make it the env for rocksdb.
    env_mgr->db_env = memenv;
  }

  // Create the file registry. It uses the base_env to access the registry file.
  auto file_registry =
      std::unique_ptr<FileRegistry>(new FileRegistry(env_mgr->base_env, db_dir, db_opts.read_only));

  if (db_opts.use_file_registry) {
    // We're using the file registry.
    auto status = file_registry->Load();
    if (!status.ok()) {
      return ToDBStatus(status);
    }

    status = file_registry->CheckNoRegistryFile();
    if (!status.ok()) {
      // We have a file registry; this means we've used encryption flags before
      // and are tracking all files on disk. Running without encryption (extra_options
      // empty) will bypass the file registry and lose changes.
      // In this case, we have multiple possibilities:
      // - no extra_options: this fails here
      // - extra_options:
      //   - OSS: this fails in the OSS hook (OSS does not understand extra_options)
      //   - CCL: fails if the options do not parse properly
      if (db_opts.extra_options.len == 0) {
        return ToDBStatus(rocksdb::Status::InvalidArgument(
            "encryption was used on this store before, but no encryption flags specified. You need "
            "a CCL build and must fully specify the --enterprise-encryption flag"));
      }
    }

    // EnvManager takes ownership of the file registry.
    env_mgr->file_registry.swap(file_registry);
  } else {
    // File registry format not enabled: check whether we have a registry file (we shouldn't).
    // The file_registry is not passed to anyone; it is deleted when it goes out of scope.
    auto status = file_registry->CheckNoRegistryFile();
    if (!status.ok()) {
      return ToDBStatus(status);
    }
  }

  // Call hooks to handle db_opts.extra_options.
  auto hook_status = db_open_hook(options.info_log, db_dir, db_opts, env_mgr.get());
  if (!hook_status.ok()) {
    return ToDBStatus(hook_status);
  }

  // Register a listener for tracking RocksDB stats.
  std::shared_ptr<DBEventListener> event_listener(new DBEventListener);
  options.listeners.emplace_back(event_listener);

  // Point rocksdb to the env to use.
  options.env = env_mgr->db_env;

  rocksdb::DB* db_ptr;
  rocksdb::Status status;
  if (db_opts.read_only) {
    status = rocksdb::DB::OpenForReadOnly(options, db_dir, &db_ptr);
  } else {
    status = rocksdb::DB::Open(options, db_dir, &db_ptr);
  }

  if (!status.ok()) {
    return ToDBStatus(status);
  }
  *db = new DBImpl(db_ptr, std::move(env_mgr),
                   db_opts.cache != nullptr ? db_opts.cache->rep : nullptr, event_listener);
  return kSuccess;
}

DBStatus DBCreateCheckpoint(DBEngine* db, DBSlice dir) {
  const std::string cp_dir = ToString(dir);

  rocksdb::Checkpoint* cp_ptr;
  auto status = rocksdb::Checkpoint::Create(db->rep, &cp_ptr);
  if (!status.ok()) {
    return ToDBStatus(status);
  }
  // NB: passing 0 for log_size_for_flush forces a WAL sync, i.e. makes sure
  // that the checkpoint is up to date.
  status = cp_ptr->CreateCheckpoint(cp_dir, 0 /* log_size_for_flush */);
  delete (cp_ptr);
  return ToDBStatus(status);
}

DBStatus DBDestroy(DBSlice dir) {
  rocksdb::Options options;
  return ToDBStatus(rocksdb::DestroyDB(ToString(dir), options));
}

DBStatus DBClose(DBEngine* db) {
  DBStatus status = db->AssertPreClose();
  if (status.data == nullptr) {
    delete db;
  }
  return status;
}

DBStatus DBFlush(DBEngine* db) {
  rocksdb::FlushOptions options;
  options.wait = true;
  return ToDBStatus(db->rep->Flush(options));
}

DBStatus DBSyncWAL(DBEngine* db) {
#ifdef _WIN32
  // On Windows, DB::SyncWAL() is not implemented due to the fact that
  // `WinWritableFile` is not thread safe. To get around that, the only other
  // methods that can be used to ensure that a sync is triggered are to either
  // flush the memtables or perform a write with `WriteOptions.sync=true`. See
  // https://github.com/facebook/rocksdb/wiki/RocksDB-FAQ for more details.
  // Please also see #17442 for more discussion on the topic.

  // In order to force a sync we issue a write-batch containing
  // LogData with 'sync=true'. The LogData forces a write to the WAL
  // but otherwise doesn't add anything to the memtable or sstables.
  rocksdb::WriteBatch batch;
  batch.PutLogData("");
  rocksdb::WriteOptions options;
  options.sync = true;
  return ToDBStatus(db->rep->Write(options, &batch));
#else
  return ToDBStatus(db->rep->FlushWAL(true /* sync */));
#endif
}

DBStatus DBCompact(DBEngine* db) {
  return DBCompactRange(db, DBSlice(), DBSlice(), true /* force_bottommost */);
}

DBStatus DBCompactRange(DBEngine* db, DBSlice start, DBSlice end, bool force_bottommost) {
  rocksdb::CompactRangeOptions options;
  // By default, RocksDB doesn't recompact the bottom level (unless
  // there is a compaction filter, which we don't use). However,
  // recompacting the bottom layer is necessary to pick up changes to
  // settings like bloom filter configurations, and to fully reclaim
  // space after dropping, truncating, or migrating tables.
  if (force_bottommost) {
    options.bottommost_level_compaction = rocksdb::BottommostLevelCompaction::kForce;
  }
  // By default, RocksDB treats manual compaction requests as
  // operating exclusively, preventing normal automatic compactions
  // from running. This can block writes to the database, as L0 will
  // fill up with SSTables that are not allowed to compact to L1.
  options.exclusive_manual_compaction = false;

  // Compacting the entire database in a single shot can use a
  // significant amount of additional (temporary) disk space. Instead,
  // we loop over the sstables in the lowest level and initiate
  // compactions on smaller ranges of keys. The resulting compacted
  // database is the same size, but the temporary disk space needed
  // for the compaction is dramatically reduced.
  std::vector<rocksdb::LiveFileMetaData> all_metadata;
  std::vector<rocksdb::LiveFileMetaData> metadata;
  db->rep->GetLiveFilesMetaData(&all_metadata);

  const std::string start_key(ToString(start));
  const std::string end_key(ToString(end));

  int max_level = 0;
  for (int i = 0; i < all_metadata.size(); i++) {
    // Skip any SSTables which fall outside the specified range, if a
    // range was specified.
    if ((!start_key.empty() && all_metadata[i].largestkey < start_key) ||
        (!end_key.empty() && all_metadata[i].smallestkey >= end_key)) {
      continue;
    }
    if (max_level < all_metadata[i].level) {
      max_level = all_metadata[i].level;
    }
    // Gather the set of SSTables to compact.
    metadata.push_back(all_metadata[i]);
  }
  all_metadata.clear();

  if (max_level != db->rep->NumberLevels() - 1) {
    // There are no sstables at the lowest level, so just compact the
    // specified key span, wholesale. Due to the
    // level_compaction_dynamic_level_bytes setting, this will only
    // happen on spans containing very little data.
    const rocksdb::Slice start_slice(start_key);
    const rocksdb::Slice end_slice(end_key);
    return ToDBStatus(db->rep->CompactRange(options, !start_key.empty() ? &start_slice : nullptr,
                                            !end_key.empty() ? &end_slice : nullptr));
  }

  // A naive approach to selecting ranges to compact would be to
  // compact the ranges specified by the smallest and largest key in
  // each sstable of the bottom-most level. Unfortunately, the
  // sstables in the bottom-most level have vastly different
  // sizes. For example, starting with the following set of bottom-most
  // sstables:
  //
  //   100M[16] 89M 70M 66M 56M 54M 38M[2] 36M 23M 20M 17M 8M 6M 5M 2M 2K[4]
  //
  // If we compact the entire database in one call we can end up with:
  //
  //   100M[22] 77M 76M 50M
  //
  // If we use the naive approach (compact the range specified by
  // the smallest and largest keys):
  //
  //   100M[18] 92M 68M 62M 61M 50M 45M 39M 31M 29M[2] 24M 23M 18M 9M 8M[2] 7M
  //   2K[4]
  //
  // With the approach below:
  //
  //   100M[19] 80M 68M[2] 62M 61M 53M 45M 36M 31M
  //
  // The approach below is to loop over the bottom-most sstables in
  // sorted order and initiate a compact range every 128MB of data.

  // Gather up the bottom-most sstable metadata.
  std::vector<rocksdb::SstFileMetaData> sst;
  for (int i = 0; i < metadata.size(); i++) {
    if (metadata[i].level != max_level) {
      continue;
    }
    sst.push_back(metadata[i]);
  }
  // Sort the metadata by smallest key.
  std::sort(sst.begin(), sst.end(),
            [](const rocksdb::SstFileMetaData& a, const rocksdb::SstFileMetaData& b) -> bool {
              return a.smallestkey < b.smallestkey;
            });

  // Batch the bottom-most sstables into compactions of ~128MB.
  const uint64_t target_size = 128 << 20;
  std::vector<rocksdb::Range> ranges;
  BatchSSTablesForCompaction(sst, start_key, end_key, target_size, &ranges);

  for (auto r : ranges) {
    rocksdb::Status status = db->rep->CompactRange(options, r.start.empty() ? nullptr : &r.start,
                                                   r.limit.empty() ? nullptr : &r.limit);
    if (!status.ok()) {
      return ToDBStatus(status);
    }
  }

  return kSuccess;
}

DBStatus DBDisableAutoCompaction(DBEngine* db) {
  auto status = db->rep->SetOptions({{"disable_auto_compactions", "true"}});
  return ToDBStatus(status);
}

DBStatus DBEnableAutoCompaction(DBEngine* db) {
  auto status = db->rep->EnableAutoCompaction({db->rep->DefaultColumnFamily()});
  return ToDBStatus(status);
}

DBStatus DBApproximateDiskBytes(DBEngine* db, DBKey start, DBKey end, uint64_t* size) {
  const std::string start_key(EncodeKey(start));
  const std::string end_key(EncodeKey(end));
  const rocksdb::Range r(start_key, end_key);
  const uint8_t flags = rocksdb::DB::SizeApproximationFlags::INCLUDE_FILES;

  db->rep->GetApproximateSizes(&r, 1, size, flags);
  return kSuccess;
}

DBStatus DBPut(DBEngine* db, DBKey key, DBSlice value) { return db->Put(key, value); }

DBStatus DBMerge(DBEngine* db, DBKey key, DBSlice value) { return db->Merge(key, value); }

DBStatus DBGet(DBEngine* db, DBKey key, DBString* value) { return db->Get(key, value); }

DBStatus DBDelete(DBEngine* db, DBKey key) { return db->Delete(key); }

DBStatus DBSingleDelete(DBEngine* db, DBKey key) { return db->SingleDelete(key); }

DBStatus DBDeleteRange(DBEngine* db, DBKey start, DBKey end) { return db->DeleteRange(start, end); }

DBStatus DBDeleteIterRange(DBEngine* db, DBIterator* iter, DBKey start, DBKey end) {
  rocksdb::Iterator* const iter_rep = iter->rep.get();
  iter_rep->Seek(EncodeKey(start));
  const std::string end_key = EncodeKey(end);
  for (; iter_rep->Valid() && kComparator.Compare(iter_rep->key(), end_key) < 0; iter_rep->Next()) {
    DBStatus status = db->Delete(ToDBKey(iter_rep->key()));
    if (status.data != NULL) {
      return status;
    }
  }
  return kSuccess;
}

DBStatus DBCommitAndCloseBatch(DBEngine* db, bool sync) {
  DBStatus status = db->CommitBatch(sync);
  if (status.data == NULL) {
    DBClose(db);
  }
  return status;
}

DBStatus DBApplyBatchRepr(DBEngine* db, DBSlice repr, bool sync) {
  return db->ApplyBatchRepr(repr, sync);
}

DBSlice DBBatchRepr(DBEngine* db) { return db->BatchRepr(); }

DBEngine* DBNewSnapshot(DBEngine* db) { return new DBSnapshot(db); }

DBEngine* DBNewBatch(DBEngine* db, bool writeOnly) {
  if (writeOnly) {
    return new DBWriteOnlyBatch(db);
  }
  return new DBBatch(db);
}

DBStatus DBEnvWriteFile(DBEngine* db, DBSlice path, DBSlice contents) {
  return db->EnvWriteFile(path, contents);
}

DBStatus DBEnvOpenFile(DBEngine* db, DBSlice path, uint64_t bytes_per_sync,
                       DBWritableFile* file) {
  return db->EnvOpenFile(path, bytes_per_sync, (rocksdb::WritableFile**)file);
}

DBStatus DBEnvReadFile(DBEngine* db, DBSlice path, DBSlice* contents) {
  return db->EnvReadFile(path, contents);
}

DBStatus DBEnvCloseFile(DBEngine* db, DBWritableFile file) {
  return db->EnvCloseFile((rocksdb::WritableFile*)file);
}

DBStatus DBEnvSyncFile(DBEngine* db, DBWritableFile file) {
  return db->EnvSyncFile((rocksdb::WritableFile*)file);
}

DBStatus DBEnvAppendFile(DBEngine* db, DBWritableFile file, DBSlice contents) {
  return db->EnvAppendFile((rocksdb::WritableFile*)file, contents);
}

DBStatus DBEnvDeleteFile(DBEngine* db, DBSlice path) { return db->EnvDeleteFile(path); }

DBStatus DBEnvDeleteDirAndFiles(DBEngine* db, DBSlice dir) { return db->EnvDeleteDirAndFiles(dir); }

DBStatus DBEnvLinkFile(DBEngine* db, DBSlice oldname, DBSlice newname) {
  return db->EnvLinkFile(oldname, newname);
}

DBIterState DBCheckForKeyCollisions(DBIterator* existingIter, DBIterator* sstIter,
                                    MVCCStatsResult* skippedKVStats, DBString* write_intent) {
  DBIterState state = {};
  memset(skippedKVStats, 0, sizeof(*skippedKVStats));

  while (existingIter->rep->Valid() && sstIter->rep->Valid()) {
    rocksdb::Slice sstKey;
    rocksdb::Slice existingKey;
    DBTimestamp existing_ts = kZeroTimestamp;
    DBTimestamp sst_ts = kZeroTimestamp;
    if (!DecodeKey(sstIter->rep->key(), &sstKey, &sst_ts) ||
        !DecodeKey(existingIter->rep->key(), &existingKey, &existing_ts)) {
      state.valid = false;
      state.status = FmtStatus("unable to decode key");
      return state;
    }

    // Encountered an inline value or a write intent.
    if (existing_ts == kZeroTimestamp) {
      cockroach::storage::enginepb::MVCCMetadata meta;
      if (!meta.ParseFromArray(existingIter->rep->value().data(),
                               existingIter->rep->value().size())) {
        state.status = FmtStatus("failed to parse meta");
        state.valid = false;
        return state;
      }

      // Check for an inline value, as these are only used in non-user data.
      // This method is currently used by AddSSTable when performing an IMPORT
      // INTO. We do not expect to encounter any inline values, and thus we
      // report an error.
      if (meta.has_raw_bytes()) {
        state.status = FmtStatus("InlineError");
      } else if (meta.has_txn()) {
        // Check for a write intent.
        //
        // TODO(adityamaru): Currently, we raise a WriteIntentError on
        // encountering all intents. This is because we do not expect to
        // encounter many intents during IMPORT INTO as we lock the key space we
        // are importing into. Older write intents could however be found in the
        // target key space, which will require appropriate resolution logic.
        cockroach::roachpb::WriteIntentError err;
        cockroach::roachpb::Intent* intent = err.add_intents();
        intent->mutable_single_key_span()->set_key(existingIter->rep->key().data(),
                                                   existingIter->rep->key().size());
        intent->mutable_txn()->CopyFrom(meta.txn());

        *write_intent = ToDBString(err.SerializeAsString());
        state.status = FmtStatus("WriteIntentError");
      } else {
        state.status = FmtStatus("intent without transaction");
      }

      state.valid = false;
      return state;
    }

    DBKey targetKey;
    memset(&targetKey, 0, sizeof(targetKey));
    int compare = kComparator.Compare(existingKey, sstKey);
    if (compare == 0) {
      // If the colliding key is a tombstone in the existing data, and the
      // timestamp of the sst key is greater than or equal to the timestamp of
      // the tombstone, then this is not considered a collision. We move the
      // iterator over the existing data to the next potentially colliding key
      // (skipping all versions of the deleted key), and resume iteration.
      //
      // If the ts of the sst key is less than that of the tombstone it is
      // changing existing data, and we treat this as a collision.
      if (existingIter->rep->value().empty() && sst_ts >= existing_ts) {
        DBIterNext(existingIter, true /* skip_current_key_versions */);
        continue;
      }

      // If the ingested KV has an identical timestamp and value as the existing
      // data then we do not consider it to be a collision. We move the iterator
      // over the existing data to the next potentially colliding key (skipping
      // all versions of the current key), and resume iteration.
      bool has_equal_timestamp = existing_ts == sst_ts;
      bool has_equal_value =
          kComparator.Compare(existingIter->rep->value(), sstIter->rep->value()) == 0;
      if (has_equal_timestamp && has_equal_value) {
        // Even though we skip over the KVs described above, their stats have
        // already been accounted for, resulting in a problem of double-counting.
        // To solve this we send back the stats of these skipped KVs so that we
        // can subtract them later. This enables us to construct accurate
        // MVCCStats and prevents expensive recomputation in the future.
        const int64_t meta_key_size = sstKey.size() + 1;
        const int64_t meta_val_size = 0;
        int64_t total_bytes = meta_key_size + meta_val_size;

        // Update the skipped stats to account for the skipped meta key.
        skippedKVStats->live_bytes += total_bytes;
        skippedKVStats->live_count++;
        skippedKVStats->key_bytes += meta_key_size;
        skippedKVStats->val_bytes += meta_val_size;
        skippedKVStats->key_count++;

        // Update the stats to account for the skipped versioned key/value.
        total_bytes = sstIter->rep->value().size() + kMVCCVersionTimestampSize;
        skippedKVStats->live_bytes += total_bytes;
        skippedKVStats->key_bytes += kMVCCVersionTimestampSize;
        skippedKVStats->val_bytes += sstIter->rep->value().size();
        skippedKVStats->val_count++;

        DBIterNext(existingIter, true /* skip_current_key_versions */);
        continue;
      }

      state.valid = false;
      state.key.key = ToDBSlice(sstKey);
      state.status = FmtStatus("key collision");
      return state;
    } else if (compare < 0) {
      targetKey.key = ToDBSlice(sstKey);
      DBIterSeek(existingIter, targetKey);
    } else if (compare > 0) {
      targetKey.key = ToDBSlice(existingKey);
      DBIterSeek(sstIter, targetKey);
    }
  }

  state.valid = true;
  return state;
}

DBIterator* DBNewIter(DBEngine* db, DBIterOptions iter_options) {
  return db->NewIter(iter_options);
}

void DBIterDestroy(DBIterator* iter) { delete iter; }

IteratorStats DBIterStats(DBIterator* iter) {
  IteratorStats stats = {};
  if (iter->stats != nullptr) {
    stats = *iter->stats;
  }
  return stats;
}

DBIterState DBIterSeek(DBIterator* iter, DBKey key) {
  ScopedStats stats(iter);
  iter->rep->Seek(EncodeKey(key));
  return DBIterGetState(iter);
}

DBIterState DBIterSeekForPrev(DBIterator* iter, DBKey key) {
  ScopedStats stats(iter);
  iter->rep->SeekForPrev(EncodeKey(key));
  return DBIterGetState(iter);
}

DBIterState DBIterSeekToFirst(DBIterator* iter) {
  ScopedStats stats(iter);
  iter->rep->SeekToFirst();
  return DBIterGetState(iter);
}

DBIterState DBIterSeekToLast(DBIterator* iter) {
  ScopedStats stats(iter);
  iter->rep->SeekToLast();
  return DBIterGetState(iter);
}

DBIterState DBIterNext(DBIterator* iter, bool skip_current_key_versions) {
  ScopedStats stats(iter);
  // If we're skipping the current key versions, remember the key the
  // iterator was pointing at.
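  // (If Next() lands on another version of this same key, the remembered key
  // is used below to seek directly past all of its remaining versions.)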
  std::string old_key;
  if (skip_current_key_versions && iter->rep->Valid()) {
    rocksdb::Slice key;
    rocksdb::Slice ts;
    if (!SplitKey(iter->rep->key(), &key, &ts)) {
      DBIterState state = {0};
      state.valid = false;
      state.status = FmtStatus("failed to split mvcc key");
      return state;
    }
    old_key = key.ToString();
  }

  iter->rep->Next();

  if (skip_current_key_versions && iter->rep->Valid()) {
    rocksdb::Slice key;
    rocksdb::Slice ts;
    if (!SplitKey(iter->rep->key(), &key, &ts)) {
      DBIterState state = {0};
      state.valid = false;
      state.status = FmtStatus("failed to split mvcc key");
      return state;
    }
    if (old_key == key) {
      // We're pointed at a different version of the same key. Fall
      // back to seeking to the next key.
      old_key.append("\0", 1);
      DBKey db_key;
      db_key.key = ToDBSlice(old_key);
      db_key.wall_time = 0;
      db_key.logical = 0;
      iter->rep->Seek(EncodeKey(db_key));
    }
  }

  return DBIterGetState(iter);
}

DBIterState DBIterPrev(DBIterator* iter, bool skip_current_key_versions) {
  ScopedStats stats(iter);
  // If we're skipping the current key versions, remember the key the
  // iterator was pointing at.
  std::string old_key;
  if (skip_current_key_versions && iter->rep->Valid()) {
    rocksdb::Slice key;
    rocksdb::Slice ts;
    if (SplitKey(iter->rep->key(), &key, &ts)) {
      old_key = key.ToString();
    }
  }

  iter->rep->Prev();

  if (skip_current_key_versions && iter->rep->Valid()) {
    rocksdb::Slice key;
    rocksdb::Slice ts;
    if (SplitKey(iter->rep->key(), &key, &ts)) {
      if (old_key == key) {
        // We're pointed at a different version of the same key. Fall
        // back to seeking to the prev key. In this case, we seek to
        // the "metadata" key and then back up the iterator.
        DBKey db_key;
        db_key.key = ToDBSlice(old_key);
        db_key.wall_time = 0;
        db_key.logical = 0;
        iter->rep->Seek(EncodeKey(db_key));
        if (iter->rep->Valid()) {
          iter->rep->Prev();
        }
      }
    }
  }

  return DBIterGetState(iter);
}

void DBIterSetLowerBound(DBIterator* iter, DBKey key) { iter->SetLowerBound(key); }
void DBIterSetUpperBound(DBIterator* iter, DBKey key) { iter->SetUpperBound(key); }

DBStatus DBMerge(DBSlice existing, DBSlice update, DBString* new_value, bool full_merge) {
  new_value->len = 0;

  cockroach::storage::enginepb::MVCCMetadata meta;
  if (!meta.ParseFromArray(existing.data, existing.len)) {
    return ToDBString("corrupted existing value");
  }

  cockroach::storage::enginepb::MVCCMetadata update_meta;
  if (!update_meta.ParseFromArray(update.data, update.len)) {
    return ToDBString("corrupted update value");
  }

  if (!MergeValues(&meta, update_meta, full_merge, NULL)) {
    return ToDBString("incompatible merge values");
  }
  return MergeResult(&meta, new_value);
}

DBStatus DBMergeOne(DBSlice existing, DBSlice update, DBString* new_value) {
  return DBMerge(existing, update, new_value, true);
}

DBStatus DBPartialMergeOne(DBSlice existing, DBSlice update, DBString* new_value) {
  return DBMerge(existing, update, new_value, false);
}

// DBGetStats queries the given DBEngine for various operational stats and
// writes them to the provided DBStatsResult instance.
DBStatus DBGetStats(DBEngine* db, DBStatsResult* stats) {
  return db->GetStats(stats);
}

// `DBGetTickersAndHistograms` retrieves maps of all RocksDB tickers and histograms.
// It differs from `DBGetStats` by getting _every_ ticker and histogram, and by not
// getting anything else (DB properties, for example).
//
// In addition to freeing the `DBString`s in the result, the caller is also
// responsible for freeing `DBTickersAndHistogramsResult::tickers` and
// `DBTickersAndHistogramsResult::histograms`.
DBStatus DBGetTickersAndHistograms(DBEngine* db, DBTickersAndHistogramsResult* stats) {
  return db->GetTickersAndHistograms(stats);
}

DBString DBGetCompactionStats(DBEngine* db) { return db->GetCompactionStats(); }

DBStatus DBGetEnvStats(DBEngine* db, DBEnvStatsResult* stats) { return db->GetEnvStats(stats); }

DBStatus DBGetEncryptionRegistries(DBEngine* db, DBEncryptionRegistries* result) {
  return db->GetEncryptionRegistries(result);
}

DBSSTable* DBGetSSTables(DBEngine* db, int* n) { return db->GetSSTables(n); }

DBStatus DBGetSortedWALFiles(DBEngine* db, DBWALFile** files, int* n) {
  return db->GetSortedWALFiles(files, n);
}

DBString DBGetUserProperties(DBEngine* db) { return db->GetUserProperties(); }

DBStatus DBIngestExternalFiles(DBEngine* db, char** paths, size_t len, bool move_files) {
  std::vector<std::string> paths_vec;
  for (size_t i = 0; i < len; i++) {
    paths_vec.push_back(paths[i]);
  }

  rocksdb::IngestExternalFileOptions ingest_options;
  // If move_files is true and the env supports it, RocksDB will hard link.
  // Otherwise, it will copy.
  ingest_options.move_files = move_files;
  // If snapshot_consistency is true and there is an outstanding RocksDB
  // snapshot, a global sequence number is forced (see the allow_global_seqno
  // option).
  ingest_options.snapshot_consistency = true;
  // If a file is ingested over existing data (including the range tombstones
  // used by range snapshots) or if a RocksDB snapshot is outstanding when this
  // ingest runs, then after moving/copying the file, historically RocksDB would
  // edit it (overwrite some of the bytes) to have a global sequence number.
  // After https://github.com/facebook/rocksdb/pull/4172 this can be disabled
  // (with the mutable manifest/metadata tracking that instead). However it is
  // only safe to disable the seqno write if older versions of RocksDB (<5.16)
  // will not be used to read these SSTs; luckily we no longer need to
  // interoperate with such older versions.
  ingest_options.write_global_seqno = false;
  // RocksDB checks the option allow_global_seqno and, if it is false, returns
  // an error instead of ingesting a file that would require one. However it
  // does this check *even if it is not planning on writing a seqno* at all (and
  // we're not planning on writing any as per write_global_seqno above), so we
  // need to set allow_global_seqno to true.
  ingest_options.allow_global_seqno = true;
  // If there are mutations in the memtable for the keyrange covered by the file
  // being ingested, this option is checked. If true, the memtable is flushed
  // using a blocking, write-stalling flush and the ingest is run. If false, an
  // error is returned.
  //
  // We want to ingest, but we do not want a write-stall, so we initially set it
  // to false -- if our ingest fails, we'll do a manual, no-stall flush and wait
  // for it to finish before trying the ingest again.
  ingest_options.allow_blocking_flush = false;

  rocksdb::Status status = db->rep->IngestExternalFile(paths_vec, ingest_options);
  if (status.IsInvalidArgument()) {
    // TODO(dt): inspect status to see if it has the message
    //   `External file requires flush`
    // since the move_file and other errors also use kInvalidArgument.

    // It is possible we failed because the memtable required a flush but in the
    // options above, we set "allow_blocking_flush = false", preventing ingest
    // from running flush with allow_write_stall = true and halting foreground
    // traffic. Now that we know we need to flush, let's do one ourselves, with
    // allow_write_stall = false, and wait for it. After it finishes we can retry
    // the ingest.
    rocksdb::FlushOptions flush_options;
    flush_options.allow_write_stall = false;
    flush_options.wait = true;

    rocksdb::Status flush_status = db->rep->Flush(flush_options);
    if (!flush_status.ok()) {
      return ToDBStatus(flush_status);
    }

    // Hopefully on this second attempt we will not need to flush at all, but
    // just in case we do, we'll allow the write stall this time -- that way we
    // can ensure we actually get the ingestion done and move on. A stalling
    // flush is less than ideal, but since we just flushed, a) this shouldn't
    // happen often and b) if it does, it should be small and quick.
    ingest_options.allow_blocking_flush = true;
    status = db->rep->IngestExternalFile(paths_vec, ingest_options);
  }

  if (!status.ok()) {
    return ToDBStatus(status);
  }

  return kSuccess;
}

struct DBSstFileWriter {
  std::unique_ptr<rocksdb::Options> options;
  std::unique_ptr<rocksdb::Env> memenv;
  rocksdb::SstFileWriter rep;

  DBSstFileWriter(rocksdb::Options* o, rocksdb::Env* m)
      : options(o), memenv(m), rep(rocksdb::EnvOptions(), *o, o->comparator) {}
  virtual ~DBSstFileWriter() {}
};

DBSstFileWriter* DBSstFileWriterNew() {
  // TODO(dan): Right now, backup is the only user of this code, so that's what
  // the options are tuned for. If something else starts using it, we'll likely
  // have to add some configurability.

  rocksdb::BlockBasedTableOptions table_options;
  // Larger block size (4kb default) means smaller file at the expense of more
  // scanning during lookups.
  table_options.block_size = 32 * 1024;
  // The original LevelDB compatible format. We explicitly set the checksum too
  // to guard against silent version upconversion. See
  // https://github.com/facebook/rocksdb/blob/972f96b3fbae1a4675043bdf4279c9072ad69645/include/rocksdb/table.h#L198
  table_options.format_version = 0;
  table_options.checksum = rocksdb::kCRC32c;
  table_options.whole_key_filtering = false;
  // This makes the sstables produced by Pebble and RocksDB byte-by-byte identical, which is
  // useful for testing.
  table_options.index_shortening =
      rocksdb::BlockBasedTableOptions::IndexShorteningMode::kShortenSeparatorsAndSuccessor;

  rocksdb::Options* options = new rocksdb::Options();
  options->comparator = &kComparator;
  options->table_factory.reset(rocksdb::NewBlockBasedTableFactory(table_options));

  // Use the TablePropertiesCollector hook to store the min and max MVCC
  // timestamps present in each sstable in the metadata for that sstable. Used
  // by the time-bounded iterator optimization.
  options->table_properties_collector_factories.emplace_back(DBMakeTimeBoundCollector());
  // Automatically request compactions whenever an SST contains too many range
  // deletions.
  options->table_properties_collector_factories.emplace_back(DBMakeDeleteRangeCollector());

  std::unique_ptr<rocksdb::Env> memenv;
  memenv.reset(rocksdb::NewMemEnv(rocksdb::Env::Default()));
  options->env = memenv.get();

  return new DBSstFileWriter(options, memenv.release());
}
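
// For orientation, a rough sketch of how the DBSstFileWriter functions below
// are intended to be called (illustrative only; the real callers live on the
// Go side of the CGo boundary):
//
//   DBSstFileWriter* fw = DBSstFileWriterNew();
//   DBStatus st = DBSstFileWriterOpen(fw);
//   // ... DBSstFileWriterAdd(fw, key, value) repeatedly, in sorted key order ...
//   DBString data;
//   st = DBSstFileWriterFinish(fw, &data);  // caller frees data.data
//   DBSstFileWriterClose(fw);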

DBStatus DBSstFileWriterOpen(DBSstFileWriter* fw) {
  rocksdb::Status status = fw->rep.Open("sst");
  if (!status.ok()) {
    return ToDBStatus(status);
  }
  return kSuccess;
}

namespace {
DBStatus DBSstFileWriterAddRaw(DBSstFileWriter* fw, const rocksdb::Slice key,
                               const rocksdb::Slice val) {
  rocksdb::Status status = fw->rep.Put(key, val);
  if (!status.ok()) {
    return ToDBStatus(status);
  }

  return kSuccess;
}
}  // namespace

DBStatus DBSstFileWriterAdd(DBSstFileWriter* fw, DBKey key, DBSlice val) {
  return DBSstFileWriterAddRaw(fw, EncodeKey(key), ToSlice(val));
}

DBStatus DBSstFileWriterDelete(DBSstFileWriter* fw, DBKey key) {
  rocksdb::Status status = fw->rep.Delete(EncodeKey(key));
  if (!status.ok()) {
    return ToDBStatus(status);
  }
  return kSuccess;
}

DBStatus DBSstFileWriterDeleteRange(DBSstFileWriter* fw, DBKey start, DBKey end) {
  rocksdb::Status status = fw->rep.DeleteRange(EncodeKey(start), EncodeKey(end));
  if (!status.ok()) {
    return ToDBStatus(status);
  }
  return kSuccess;
}

DBStatus DBSstFileWriterCopyData(DBSstFileWriter* fw, DBString* data) {
  uint64_t file_size;
  rocksdb::Status status = fw->memenv->GetFileSize("sst", &file_size);
  if (!status.ok()) {
    return ToDBStatus(status);
  }
  if (file_size == 0) {
    return kSuccess;
  }

  const rocksdb::EnvOptions soptions;
  std::unique_ptr<rocksdb::SequentialFile> sst;
  status = fw->memenv->NewSequentialFile("sst", &sst, soptions);
  if (!status.ok()) {
    return ToDBStatus(status);
  }

  // scratch is eventually returned as the array part of data and freed by the
  // caller.
  char* scratch = static_cast<char*>(malloc(file_size));

  rocksdb::Slice sst_contents;
  status = sst->Read(file_size, &sst_contents, scratch);
  if (!status.ok()) {
    return ToDBStatus(status);
  }
  if (sst_contents.size() != file_size) {
    return FmtStatus("expected to read %" PRIu64 " bytes but got %zu", file_size,
                     sst_contents.size());
  }

  // The contract of the SequentialFile.Read call above is that it _might_ use
  // scratch as the backing data for sst_contents, but it also _might not_. If
  // it didn't, copy sst_contents into scratch, so we can unconditionally return
  // a DBString backed by scratch (which can then always be freed by the
  // caller). Note that this means the data is always copied exactly once,
  // either by Read or here.
  if (sst_contents.data() != scratch) {
    memcpy(scratch, sst_contents.data(), sst_contents.size());
  }
  data->data = scratch;
  data->len = sst_contents.size();

  return kSuccess;
}

DBStatus DBSstFileWriterTruncate(DBSstFileWriter* fw, DBString* data) {
  DBStatus status = DBSstFileWriterCopyData(fw, data);
  if (status.data != NULL) {
    return status;
  }
  return ToDBStatus(fw->memenv->Truncate("sst", 0));
}

DBStatus DBSstFileWriterFinish(DBSstFileWriter* fw, DBString* data) {
  rocksdb::Status status = fw->rep.Finish();
  if (!status.ok()) {
    return ToDBStatus(status);
  }

  return DBSstFileWriterCopyData(fw, data);
}

void DBSstFileWriterClose(DBSstFileWriter* fw) { delete fw; }

DBStatus DBLockFile(DBSlice filename, DBFileLock* lock) {
  return ToDBStatus(
      rocksdb::Env::Default()->LockFile(ToString(filename), (rocksdb::FileLock**)lock));
}

DBStatus DBUnlockFile(DBFileLock lock) {
  return ToDBStatus(rocksdb::Env::Default()->UnlockFile((rocksdb::FileLock*)lock));
}

DBStatus DBExportToSst(DBKey start, DBKey end, bool export_all_revisions,
                       uint64_t target_size, uint64_t max_size,
                       DBIterOptions iter_opts, DBEngine* engine, DBString* data,
                       DBString* write_intent, DBString* summary, DBString* resume) {
  DBSstFileWriter* writer = DBSstFileWriterNew();
  DBStatus status = DBSstFileWriterOpen(writer);
  if (status.data != NULL) {
    return status;
  }

  DBIncrementalIterator iter(engine, iter_opts, start, end, write_intent);

  roachpb::BulkOpSummary bulkop_summary;
  RowCounter row_counter(&bulkop_summary);

  bool skip_current_key_versions = !export_all_revisions;
  DBIterState state;
  const std::string end_key = EncodeKey(end);
  // cur_key is used when paginated is true and export_all_revisions is
  // true. If we're exporting all revisions and we're returning a paginated
  // SST then we need to keep track of when we've finished adding all of the
  // versions of a key to the writer.
  const bool paginated = target_size > 0;
  std::string cur_key;
  std::string resume_key;
  // Seek to the MVCC metadata key for the provided start key and let the
  // incremental iterator find the appropriate version.
  const DBKey seek_key = {.key = start.key};
  for (state = iter.seek(seek_key);; state = iter.next(skip_current_key_versions)) {
    if (state.status.data != NULL) {
      DBSstFileWriterClose(writer);
      return state.status;
    } else if (!state.valid || kComparator.Compare(iter.key(), end_key) >= 0) {
      break;
    }
    rocksdb::Slice decoded_key;
    int64_t wall_time = 0;
    int32_t logical_time = 0;

    if (!DecodeKey(iter.key(), &decoded_key, &wall_time, &logical_time)) {
      DBSstFileWriterClose(writer);
      return ToDBString("Unable to decode key");
    }

    const bool is_new_key = !export_all_revisions || decoded_key.compare(cur_key) != 0;
    if (paginated && export_all_revisions && is_new_key) {
      // Reuse the underlying buffer in cur_key.
      cur_key.clear();
      cur_key.reserve(decoded_key.size());
      cur_key.assign(decoded_key.data(), decoded_key.size());
    }

    // Skip tombstone (len=0) records when start time is zero (non-incremental)
    // and we are not exporting all versions.
    const bool is_skipping_deletes =
        start.wall_time == 0 && start.logical == 0 && !export_all_revisions;
    if (is_skipping_deletes && iter.value().size() == 0) {
      continue;
    }

    // Check to see if this is the first version of the key and adding it would
    // put us over the limit (we might already be over the limit).
    const int64_t cur_size = bulkop_summary.data_size();
    const bool reached_target_size = cur_size > 0 && cur_size >= target_size;
    if (paginated && is_new_key && reached_target_size) {
      resume_key.reserve(decoded_key.size());
      resume_key.assign(decoded_key.data(), decoded_key.size());
      break;
    }

    // Insert the key into the sst and update statistics.
    status = DBSstFileWriterAddRaw(writer, iter.key(), iter.value());
    if (status.data != NULL) {
      DBSstFileWriterClose(writer);
      return status;
    }

    if (!row_counter.Count(iter.key())) {
      return ToDBString("Error in row counter");
    }
    const int64_t new_size = cur_size + decoded_key.size() + iter.value().size();
    if (max_size > 0 && new_size > max_size) {
      return FmtStatus("export size (%" PRIi64 " bytes) exceeds max size (%" PRIi64 " bytes)",
                       new_size, max_size);
    }
    bulkop_summary.set_data_size(new_size);
  }
  *summary = ToDBString(bulkop_summary.SerializeAsString());

  if (bulkop_summary.data_size() == 0) {
    DBSstFileWriterClose(writer);
    return kSuccess;
  }

  auto res = DBSstFileWriterFinish(writer, data);
  DBSstFileWriterClose(writer);

  // If we're not returning an error, check to see if we need to return the resume key.
  if (res.data == NULL && resume_key.length() > 0) {
    *resume = ToDBString(resume_key);
  }

  return res;
}

DBStatus DBEnvOpenReadableFile(DBEngine* db, DBSlice path, DBReadableFile* file) {
  return db->EnvOpenReadableFile(path, (rocksdb::RandomAccessFile**)file);
}

DBStatus DBEnvReadAtFile(DBEngine* db, DBReadableFile file, DBSlice buffer, int64_t offset,
                         int* n) {
  return db->EnvReadAtFile((rocksdb::RandomAccessFile*)file, buffer, offset, n);
}

DBStatus DBEnvCloseReadableFile(DBEngine* db, DBReadableFile file) {
  return db->EnvCloseReadableFile((rocksdb::RandomAccessFile*)file);
}

DBStatus DBEnvOpenDirectory(DBEngine* db, DBSlice path, DBDirectory* file) {
  return db->EnvOpenDirectory(path, (rocksdb::Directory**)file);
}

DBStatus DBEnvSyncDirectory(DBEngine* db, DBDirectory file) {
  return db->EnvSyncDirectory((rocksdb::Directory*)file);
}

DBStatus DBEnvCloseDirectory(DBEngine* db, DBDirectory file) {
  return db->EnvCloseDirectory((rocksdb::Directory*)file);
}

DBStatus DBEnvRenameFile(DBEngine* db, DBSlice oldname, DBSlice newname) {
  return db->EnvRenameFile(oldname, newname);
}

DBStatus DBEnvCreateDir(DBEngine* db, DBSlice name) {
  return db->EnvCreateDir(name);
}

DBStatus DBEnvDeleteDir(DBEngine* db, DBSlice name) {
  return db->EnvDeleteDir(name);
}

DBListDirResults DBEnvListDir(DBEngine* db, DBSlice name) {
  DBListDirResults result;
  std::vector<std::string> contents;
  result.status = db->EnvListDir(name, &contents);
  result.n = contents.size();
  // We malloc the names so they can be deallocated by the caller using free().
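  // (Each name's data field is also heap-allocated by ToDBString below, so the
  // caller is expected to free every names[i].data in addition to the names
  // array itself.)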
  const int size = contents.size() * sizeof(DBString);
  result.names = reinterpret_cast<DBString*>(malloc(size));
  memset(result.names, 0, size);
  for (int i = 0; i < contents.size(); i++) {
    result.names[i] = ToDBString(rocksdb::Slice(contents[i].data(), contents[i].size()));
  }
  return result;
}

DBString DBDumpThreadStacks() {
  return ToDBString(DumpThreadStacks());
}