github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/c-deps/libroach/db.cc

github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/c-deps/libroach/db.cc (about)

     1  // Copyright 2014 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  #include "db.h"
    12  #include <algorithm>
    13  #include <iostream>
    14  #include <rocksdb/convenience.h>
    15  #include <rocksdb/perf_context.h>
    16  #include <rocksdb/sst_file_writer.h>
    17  #include <rocksdb/table.h>
    18  #include <rocksdb/utilities/checkpoint.h>
    19  #include <stdarg.h>
    20  #include "batch.h"
    21  #include "cache.h"
    22  #include "comparator.h"
    23  #include "defines.h"
    24  #include "encoding.h"
    25  #include "engine.h"
    26  #include "env_manager.h"
    27  #include "eventlistener.h"
    28  #include "fmt.h"
    29  #include "getter.h"
    30  #include "godefs.h"
    31  #include "incremental_iterator.h"
    32  #include "iterator.h"
    33  #include "merge.h"
    34  #include "options.h"
    35  #include "protos/roachpb/errors.pb.h"
    36  #include "row_counter.h"
    37  #include "snapshot.h"
    38  #include "stack_trace.h"
    39  #include "status.h"
    40  #include "table_props.h"
    41  #include "timestamp.h"
    42  
    43  using namespace cockroach;
    44  
    45  namespace cockroach {
    46  
    47  DBKey ToDBKey(const rocksdb::Slice& s) {
    48    DBKey key;
    49    memset(&key, 0, sizeof(key));
    50    rocksdb::Slice tmp;
    51    if (DecodeKey(s, &tmp, &key.wall_time, &key.logical)) {
    52      key.key = ToDBSlice(tmp);
    53    }
    54    return key;
    55  }
    56  
    57  ScopedStats::ScopedStats(DBIterator* iter)
    58      : iter_(iter),
    59        internal_delete_skipped_count_base_(
    60            rocksdb::get_perf_context()->internal_delete_skipped_count) {
    61    if (iter_->stats != nullptr) {
    62      rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableTimeExceptForMutex);
    63    }
    64  }
    65  ScopedStats::~ScopedStats() {
    66    if (iter_->stats != nullptr) {
    67      iter_->stats->internal_delete_skipped_count +=
    68          (rocksdb::get_perf_context()->internal_delete_skipped_count -
    69           internal_delete_skipped_count_base_);
    70      rocksdb::SetPerfLevel(rocksdb::PerfLevel::kDisable);
    71    }
    72  }
    73  
    74  void BatchSSTablesForCompaction(const std::vector<rocksdb::SstFileMetaData>& sst,
    75                                  rocksdb::Slice start_key, rocksdb::Slice end_key,
    76                                  uint64_t target_size, std::vector<rocksdb::Range>* ranges) {
    77    int prev = -1;  // index of the last compacted sst
    78    uint64_t size = 0;
    79    for (int i = 0; i < sst.size(); ++i) {
    80      size += sst[i].size;
    81      if (size < target_size && (i + 1) < sst.size()) {
    82        // We haven't reached the target size or the end of the sstables
    83        // to compact.
    84        continue;
    85      }
    86  
    87      rocksdb::Slice start;
    88      if (prev == -1) {
    89        // This is the first compaction.
    90        start = start_key;
    91      } else {
    92        // This is a compaction in the middle or end of the requested
    93        // key range. The start key for the compaction is the largest
    94        // key from the previous compacted.
    95        start = rocksdb::Slice(sst[prev].largestkey);
    96      }
    97  
    98      rocksdb::Slice end;
    99      if ((i + 1) == sst.size()) {
   100        // This is the last compaction.
   101        end = end_key;
   102      } else {
   103        // This is a compaction at the start or in the middle of the
   104        // requested key range. The end key is the largest key in the
   105        // current sstable.
   106        end = rocksdb::Slice(sst[i].largestkey);
   107      }
   108  
   109      ranges->emplace_back(rocksdb::Range(start, end));
   110  
   111      prev = i;
   112      size = 0;
   113    }
   114  }
   115  
   116  }  // namespace cockroach
   117  
   118  namespace {
   119  
   120  DBIterState DBIterGetState(DBIterator* iter) {
   121    DBIterState state = {};
   122    state.valid = iter->rep->Valid();
   123    state.status = ToDBStatus(iter->rep->status());
   124  
   125    if (state.valid) {
   126      rocksdb::Slice key;
   127      state.valid = DecodeKey(iter->rep->key(), &key, &state.key.wall_time, &state.key.logical);
   128      if (state.valid) {
   129        state.key.key = ToDBSlice(key);
   130        state.value = ToDBSlice(iter->rep->value());
   131      }
   132    }
   133  
   134    return state;
   135  }
   136  }  // namespace
   137  
   138  namespace cockroach {
   139  
   140  // DBOpenHookOSS mode only verifies that no extra options are specified.
   141  rocksdb::Status DBOpenHookOSS(std::shared_ptr<rocksdb::Logger> info_log, const std::string& db_dir,
   142                                const DBOptions db_opts, EnvManager* env_mgr) {
   143    if (db_opts.extra_options.len != 0) {
   144      return rocksdb::Status::InvalidArgument("encryption options are not supported in OSS builds");
   145    }
   146    return rocksdb::Status::OK();
   147  }
   148  
   149  }  // namespace cockroach
   150  
   151  static DBOpenHook* db_open_hook = DBOpenHookOSS;
   152  
   153  void DBSetOpenHook(void* hook) { db_open_hook = (DBOpenHook*)hook; }
   154  
   155  DBStatus DBOpen(DBEngine** db, DBSlice dir, DBOptions db_opts) {
   156    rocksdb::Options options = DBMakeOptions(db_opts);
   157  
   158    const std::string additional_options = ToString(db_opts.rocksdb_options);
   159    if (!additional_options.empty()) {
   160      // TODO(peter): Investigate using rocksdb::LoadOptionsFromFile if
   161      // "additional_options" starts with "@". The challenge is that
   162      // LoadOptionsFromFile gives us a DBOptions and
   163      // ColumnFamilyOptions with no ability to supply "base" options
   164      // and no ability to determine what options were specified in the
   165      // file which could cause "defaults" to override the options
   166      // returned by DBMakeOptions. We might need to fix this upstream.
   167      rocksdb::Status status = rocksdb::GetOptionsFromString(options, additional_options, &options);
   168      if (!status.ok()) {
   169        return ToDBStatus(status);
   170      }
   171    }
   172  
   173    const std::string db_dir = ToString(dir);
   174  
   175    // Make the default options.env the default. It points to Env::Default which does not
   176    // need to be deleted.
   177    std::unique_ptr<cockroach::EnvManager> env_mgr(new cockroach::EnvManager(options.env));
   178  
   179    if (dir.len == 0) {
   180      // In-memory database: use a MemEnv as the base Env.
   181      auto memenv = rocksdb::NewMemEnv(rocksdb::Env::Default());
   182      // Register it for deletion.
   183      env_mgr->TakeEnvOwnership(memenv);
   184      // Create a root directory to suppress error messages that RocksDB would
   185      // print if it had to create the DB directory itself.
   186      memenv->CreateDir("/");
   187      // Make it the env that all other Envs must wrap.
   188      env_mgr->base_env = memenv;
   189      // Make it the env for rocksdb.
   190      env_mgr->db_env = memenv;
   191    }
   192  
   193    // Create the file registry. It uses the base_env to access the registry file.
   194    auto file_registry =
   195        std::unique_ptr<FileRegistry>(new FileRegistry(env_mgr->base_env, db_dir, db_opts.read_only));
   196  
   197    if (db_opts.use_file_registry) {
   198      // We're using the file registry.
   199      auto status = file_registry->Load();
   200      if (!status.ok()) {
   201        return ToDBStatus(status);
   202      }
   203  
   204      status = file_registry->CheckNoRegistryFile();
   205      if (!status.ok()) {
   206        // We have a file registry, this means we've used encryption flags before
   207        // and are tracking all files on disk. Running without encryption (extra_options empty)
   208        // will bypass the file registry and lose changes.
   209        // In this case, we have multiple possibilities:
   210        // - no extra_options: this fails here
   211        // - extra_options:
   212        //   - OSS: this fails in the OSS hook (OSS does not understand extra_options)
   213        //   - CCL: fails if the options do not parse properly
   214        if (db_opts.extra_options.len == 0) {
   215          return ToDBStatus(rocksdb::Status::InvalidArgument(
   216              "encryption was used on this store before, but no encryption flags specified. You need "
   217              "a CCL build and must fully specify the --enterprise-encryption flag"));
   218        }
   219      }
   220  
   221      // EnvManager takes ownership of the file registry.
   222      env_mgr->file_registry.swap(file_registry);
   223    } else {
   224      // File registry format not enabled: check whether we have a registry file (we shouldn't).
   225      // The file_registry is not passed to anyone, it is deleted when it goes out of scope.
   226      auto status = file_registry->CheckNoRegistryFile();
   227      if (!status.ok()) {
   228        return ToDBStatus(status);
   229      }
   230    }
   231  
   232    // Call hooks to handle db_opts.extra_options.
   233    auto hook_status = db_open_hook(options.info_log, db_dir, db_opts, env_mgr.get());
   234    if (!hook_status.ok()) {
   235      return ToDBStatus(hook_status);
   236    }
   237  
   238    // Register listener for tracking RocksDB stats.
   239    std::shared_ptr<DBEventListener> event_listener(new DBEventListener);
   240    options.listeners.emplace_back(event_listener);
   241  
   242    // Point rocksdb to the env to use.
   243    options.env = env_mgr->db_env;
   244  
   245    rocksdb::DB* db_ptr;
   246    rocksdb::Status status;
   247    if (db_opts.read_only) {
   248      status = rocksdb::DB::OpenForReadOnly(options, db_dir, &db_ptr);
   249    } else {
   250      status = rocksdb::DB::Open(options, db_dir, &db_ptr);
   251    }
   252  
   253    if (!status.ok()) {
   254      return ToDBStatus(status);
   255    }
   256    *db = new DBImpl(db_ptr, std::move(env_mgr),
   257                     db_opts.cache != nullptr ? db_opts.cache->rep : nullptr, event_listener);
   258    return kSuccess;
   259  }
   260  
   261  DBStatus DBCreateCheckpoint(DBEngine* db, DBSlice dir) {
   262    const std::string cp_dir = ToString(dir);
   263  
   264    rocksdb::Checkpoint* cp_ptr;
   265    auto status = rocksdb::Checkpoint::Create(db->rep, &cp_ptr);
   266    if (!status.ok()) {
   267      return ToDBStatus(status);
   268    }
   269    // NB: passing 0 for log_size_for_flush forces a WAL sync, i.e. makes sure
   270    // that the checkpoint is up to date.
   271    status = cp_ptr->CreateCheckpoint(cp_dir, 0 /* log_size_for_flush */);
   272    delete (cp_ptr);
   273    return ToDBStatus(status);
   274  }
   275  
   276  DBStatus DBDestroy(DBSlice dir) {
   277    rocksdb::Options options;
   278    return ToDBStatus(rocksdb::DestroyDB(ToString(dir), options));
   279  }
   280  
   281  DBStatus DBClose(DBEngine* db) {
   282    DBStatus status = db->AssertPreClose();
   283    if (status.data == nullptr) {
   284      delete db;
   285    }
   286    return status;
   287  }
   288  
   289  DBStatus DBFlush(DBEngine* db) {
   290    rocksdb::FlushOptions options;
   291    options.wait = true;
   292    return ToDBStatus(db->rep->Flush(options));
   293  }
   294  
   295  DBStatus DBSyncWAL(DBEngine* db) {
   296  #ifdef _WIN32
   297    // On Windows, DB::SyncWAL() is not implemented due to fact that
   298    // `WinWritableFile` is not thread safe. To get around that, the only other
   299    // methods that can be used to ensure that a sync is triggered is to either
   300    // flush the memtables or perform a write with `WriteOptions.sync=true`. See
   301    // https://github.com/facebook/rocksdb/wiki/RocksDB-FAQ for more details.
   302    // Please also see #17442 for more discussion on the topic.
   303  
   304    // In order to force a sync we issue a write-batch containing
   305    // LogData with 'sync=true'. The LogData forces a write to the WAL
   306    // but otherwise doesn't add anything to the memtable or sstables.
   307    rocksdb::WriteBatch batch;
   308    batch.PutLogData("");
   309    rocksdb::WriteOptions options;
   310    options.sync = true;
   311    return ToDBStatus(db->rep->Write(options, &batch));
   312  #else
   313    return ToDBStatus(db->rep->FlushWAL(true /* sync */));
   314  #endif
   315  }
   316  
   317  DBStatus DBCompact(DBEngine* db) {
   318    return DBCompactRange(db, DBSlice(), DBSlice(), true /* force_bottommost */);
   319  }
   320  
   321  DBStatus DBCompactRange(DBEngine* db, DBSlice start, DBSlice end, bool force_bottommost) {
   322    rocksdb::CompactRangeOptions options;
   323    // By default, RocksDB doesn't recompact the bottom level (unless
   324    // there is a compaction filter, which we don't use). However,
   325    // recompacting the bottom layer is necessary to pick up changes to
   326    // settings like bloom filter configurations, and to fully reclaim
   327    // space after dropping, truncating, or migrating tables.
   328    if (force_bottommost) {
   329      options.bottommost_level_compaction = rocksdb::BottommostLevelCompaction::kForce;
   330    }
   331    // By default, RocksDB treats manual compaction requests as
   332    // operating exclusively, preventing normal automatic compactions
   333    // from running. This can block writes to the database, as L0
   334    // SSTables will become full without being allowed to compact to L1.
   335    options.exclusive_manual_compaction = false;
   336  
   337    // Compacting the entire database in a single-shot can use a
   338    // significant amount of additional (temporary) disk space. Instead,
   339    // we loop over the sstables in the lowest level and initiate
   340    // compactions on smaller ranges of keys. The resulting compacted
   341    // database is the same size, but the temporary disk space needed
   342    // for the compaction is dramatically reduced.
   343    std::vector<rocksdb::LiveFileMetaData> all_metadata;
   344    std::vector<rocksdb::LiveFileMetaData> metadata;
   345    db->rep->GetLiveFilesMetaData(&all_metadata);
   346  
   347    const std::string start_key(ToString(start));
   348    const std::string end_key(ToString(end));
   349  
   350    int max_level = 0;
   351    for (int i = 0; i < all_metadata.size(); i++) {
   352      // Skip any SSTables which fall outside the specified range, if a
   353      // range was specified.
   354      if ((!start_key.empty() && all_metadata[i].largestkey < start_key) ||
   355          (!end_key.empty() && all_metadata[i].smallestkey >= end_key)) {
   356        continue;
   357      }
   358      if (max_level < all_metadata[i].level) {
   359        max_level = all_metadata[i].level;
   360      }
   361      // Gather the set of SSTables to compact.
   362      metadata.push_back(all_metadata[i]);
   363    }
   364    all_metadata.clear();
   365  
   366    if (max_level != db->rep->NumberLevels() - 1) {
   367      // There are no sstables at the lowest level, so just compact the
   368      // specified key span, wholesale. Due to the
   369      // level_compaction_dynamic_level_bytes setting, this will only
   370      // happen on spans containing very little data.
   371      const rocksdb::Slice start_slice(start_key);
   372      const rocksdb::Slice end_slice(end_key);
   373      return ToDBStatus(db->rep->CompactRange(options, !start_key.empty() ? &start_slice : nullptr,
   374                                              !end_key.empty() ? &end_slice : nullptr));
   375    }
   376  
   377    // A naive approach to selecting ranges to compact would be to
   378    // compact the ranges specified by the smallest and largest key in
   379    // each sstable of the bottom-most level. Unfortunately, the
   380    // sstables in the bottom-most level have vastly different
   381    // sizes. For example, starting with the following set of bottom-most
   382    // sstables:
   383    //
   384    //   100M[16] 89M 70M 66M 56M 54M 38M[2] 36M 23M 20M 17M 8M 6M 5M 2M 2K[4]
   385    //
   386    // If we compact the entire database in one call we can end up with:
   387    //
   388    //   100M[22] 77M 76M 50M
   389    //
   390    // If we use the naive approach (compact the range specified by
   391    // the smallest and largest keys):
   392    //
   393    //   100M[18] 92M 68M 62M 61M 50M 45M 39M 31M 29M[2] 24M 23M 18M 9M 8M[2] 7M
   394    //   2K[4]
   395    //
   396    // With the approach below:
   397    //
   398    //   100M[19] 80M 68M[2] 62M 61M 53M 45M 36M 31M
   399    //
   400    // The approach below is to loop over the bottom-most sstables in
   401    // sorted order and initiate a compact range every 128MB of data.
   402  
   403    // Gather up the bottom-most sstable metadata.
   404    std::vector<rocksdb::SstFileMetaData> sst;
   405    for (int i = 0; i < metadata.size(); i++) {
   406      if (metadata[i].level != max_level) {
   407        continue;
   408      }
   409      sst.push_back(metadata[i]);
   410    }
   411    // Sort the metadata by smallest key.
   412    std::sort(sst.begin(), sst.end(),
   413              [](const rocksdb::SstFileMetaData& a, const rocksdb::SstFileMetaData& b) -> bool {
   414                return a.smallestkey < b.smallestkey;
   415              });
   416  
   417    // Batch the bottom-most sstables into compactions of ~128MB.
   418    const uint64_t target_size = 128 << 20;
   419    std::vector<rocksdb::Range> ranges;
   420    BatchSSTablesForCompaction(sst, start_key, end_key, target_size, &ranges);
   421  
   422    for (auto r : ranges) {
   423      rocksdb::Status status = db->rep->CompactRange(options, r.start.empty() ? nullptr : &r.start,
   424                                                     r.limit.empty() ? nullptr : &r.limit);
   425      if (!status.ok()) {
   426        return ToDBStatus(status);
   427      }
   428    }
   429  
   430    return kSuccess;
   431  }
   432  
   433  DBStatus DBDisableAutoCompaction(DBEngine* db) {
   434    auto status = db->rep->SetOptions({{"disable_auto_compactions", "true"}});
   435    return ToDBStatus(status);
   436  }
   437  
   438  DBStatus DBEnableAutoCompaction(DBEngine* db) {
   439    auto status = db->rep->EnableAutoCompaction({db->rep->DefaultColumnFamily()});
   440    return ToDBStatus(status);
   441  }
   442  
   443  DBStatus DBApproximateDiskBytes(DBEngine* db, DBKey start, DBKey end, uint64_t* size) {
   444    const std::string start_key(EncodeKey(start));
   445    const std::string end_key(EncodeKey(end));
   446    const rocksdb::Range r(start_key, end_key);
   447    const uint8_t flags = rocksdb::DB::SizeApproximationFlags::INCLUDE_FILES;
   448  
   449    db->rep->GetApproximateSizes(&r, 1, size, flags);
   450    return kSuccess;
   451  }
   452  
   453  DBStatus DBPut(DBEngine* db, DBKey key, DBSlice value) { return db->Put(key, value); }
   454  
   455  DBStatus DBMerge(DBEngine* db, DBKey key, DBSlice value) { return db->Merge(key, value); }
   456  
   457  DBStatus DBGet(DBEngine* db, DBKey key, DBString* value) { return db->Get(key, value); }
   458  
   459  DBStatus DBDelete(DBEngine* db, DBKey key) { return db->Delete(key); }
   460  
   461  DBStatus DBSingleDelete(DBEngine* db, DBKey key) { return db->SingleDelete(key); }
   462  
   463  DBStatus DBDeleteRange(DBEngine* db, DBKey start, DBKey end) { return db->DeleteRange(start, end); }
   464  
   465  DBStatus DBDeleteIterRange(DBEngine* db, DBIterator* iter, DBKey start, DBKey end) {
   466    rocksdb::Iterator* const iter_rep = iter->rep.get();
   467    iter_rep->Seek(EncodeKey(start));
   468    const std::string end_key = EncodeKey(end);
   469    for (; iter_rep->Valid() && kComparator.Compare(iter_rep->key(), end_key) < 0; iter_rep->Next()) {
   470      DBStatus status = db->Delete(ToDBKey(iter_rep->key()));
   471      if (status.data != NULL) {
   472        return status;
   473      }
   474    }
   475    return kSuccess;
   476  }
   477  
   478  DBStatus DBCommitAndCloseBatch(DBEngine* db, bool sync) {
   479    DBStatus status = db->CommitBatch(sync);
   480    if (status.data == NULL) {
   481      DBClose(db);
   482    }
   483    return status;
   484  }
   485  
   486  DBStatus DBApplyBatchRepr(DBEngine* db, DBSlice repr, bool sync) {
   487    return db->ApplyBatchRepr(repr, sync);
   488  }
   489  
   490  DBSlice DBBatchRepr(DBEngine* db) { return db->BatchRepr(); }
   491  
   492  DBEngine* DBNewSnapshot(DBEngine* db) { return new DBSnapshot(db); }
   493  
   494  DBEngine* DBNewBatch(DBEngine* db, bool writeOnly) {
   495    if (writeOnly) {
   496      return new DBWriteOnlyBatch(db);
   497    }
   498    return new DBBatch(db);
   499  }
   500  
   501  DBStatus DBEnvWriteFile(DBEngine* db, DBSlice path, DBSlice contents) {
   502    return db->EnvWriteFile(path, contents);
   503  }
   504  
   505  DBStatus DBEnvOpenFile(DBEngine* db, DBSlice path,  uint64_t bytes_per_sync,
   506                         DBWritableFile* file) {
   507    return db->EnvOpenFile(path, bytes_per_sync, (rocksdb::WritableFile**)file);
   508  }
   509  
   510  DBStatus DBEnvReadFile(DBEngine* db, DBSlice path, DBSlice* contents) {
   511    return db->EnvReadFile(path, contents);
   512  }
   513  
   514  DBStatus DBEnvCloseFile(DBEngine* db, DBWritableFile file) {
   515    return db->EnvCloseFile((rocksdb::WritableFile*)file);
   516  }
   517  
   518  DBStatus DBEnvSyncFile(DBEngine* db, DBWritableFile file) {
   519    return db->EnvSyncFile((rocksdb::WritableFile*)file);
   520  }
   521  
   522  DBStatus DBEnvAppendFile(DBEngine* db, DBWritableFile file, DBSlice contents) {
   523    return db->EnvAppendFile((rocksdb::WritableFile*)file, contents);
   524  }
   525  
   526  DBStatus DBEnvDeleteFile(DBEngine* db, DBSlice path) { return db->EnvDeleteFile(path); }
   527  
   528  DBStatus DBEnvDeleteDirAndFiles(DBEngine* db, DBSlice dir) { return db->EnvDeleteDirAndFiles(dir); }
   529  
   530  DBStatus DBEnvLinkFile(DBEngine* db, DBSlice oldname, DBSlice newname) {
   531    return db->EnvLinkFile(oldname, newname);
   532  }
   533  
   534  DBIterState DBCheckForKeyCollisions(DBIterator* existingIter, DBIterator* sstIter,
   535                                      MVCCStatsResult* skippedKVStats, DBString* write_intent) {
   536    DBIterState state = {};
   537    memset(skippedKVStats, 0, sizeof(*skippedKVStats));
   538  
   539    while (existingIter->rep->Valid() && sstIter->rep->Valid()) {
   540      rocksdb::Slice sstKey;
   541      rocksdb::Slice existingKey;
   542      DBTimestamp existing_ts = kZeroTimestamp;
   543      DBTimestamp sst_ts = kZeroTimestamp;
   544      if (!DecodeKey(sstIter->rep->key(), &sstKey, &sst_ts) ||
   545          !DecodeKey(existingIter->rep->key(), &existingKey, &existing_ts)) {
   546        state.valid = false;
   547        state.status = FmtStatus("unable to decode key");
   548        return state;
   549      }
   550  
   551      // Encountered an inline value or a write intent.
   552      if (existing_ts == kZeroTimestamp) {
   553        cockroach::storage::enginepb::MVCCMetadata meta;
   554        if (!meta.ParseFromArray(existingIter->rep->value().data(),
   555                                 existingIter->rep->value().size())) {
   556          state.status = FmtStatus("failed to parse meta");
   557          state.valid = false;
   558          return state;
   559        }
   560  
   561        // Check for an inline value, as these are only used in non-user data.
   562        // This method is currently used by AddSSTable when performing an IMPORT
   563        // INTO. We do not expect to encounter any inline values, and thus we
   564        // report an error.
   565        if (meta.has_raw_bytes()) {
   566          state.status = FmtStatus("InlineError");
   567        } else if (meta.has_txn()) {
   568          // Check for a write intent.
   569          //
   570          // TODO(adityamaru): Currently, we raise a WriteIntentError on
   571          // encountering all intents. This is because, we do not expect to
   572          // encounter many intents during IMPORT INTO as we lock the key space we
   573          // are importing into. Older write intents could however be found in the
   574          // target key space, which will require appropriate resolution logic.
   575          cockroach::roachpb::WriteIntentError err;
   576          cockroach::roachpb::Intent* intent = err.add_intents();
   577          intent->mutable_single_key_span()->set_key(existingIter->rep->key().data(),
   578                                                     existingIter->rep->key().size());
   579          intent->mutable_txn()->CopyFrom(meta.txn());
   580  
   581          *write_intent = ToDBString(err.SerializeAsString());
   582          state.status = FmtStatus("WriteIntentError");
   583        } else {
   584          state.status = FmtStatus("intent without transaction");
   585        }
   586  
   587        state.valid = false;
   588        return state;
   589      }
   590  
   591      DBKey targetKey;
   592      memset(&targetKey, 0, sizeof(targetKey));
   593      int compare = kComparator.Compare(existingKey, sstKey);
   594      if (compare == 0) {
   595        // If the colliding key is a tombstone in the existing data, and the
   596        // timestamp of the sst key is greater than or equal to the timestamp of
   597        // the tombstone, then this is not considered a collision. We move the
   598        // iterator over the existing data to the next potentially colliding key
   599        // (skipping all versions of the deleted key), and resume iteration.
   600        //
   601        // If the ts of the sst key is less than that of the tombstone it is
   602        // changing existing data, and we treat this as a collision.
   603        if (existingIter->rep->value().empty() && sst_ts >= existing_ts) {
   604          DBIterNext(existingIter, true /* skip_current_key_versions */);
   605          continue;
   606        }
   607  
   608        // If the ingested KV has an identical timestamp and value as the existing
   609        // data then we do not consider it to be a collision. We move the iterator
   610        // over the existing data to the next potentially colliding key (skipping
   611        // all versions of the current key), and resume iteration.
   612        bool has_equal_timestamp = existing_ts == sst_ts;
   613        bool has_equal_value =
   614            kComparator.Compare(existingIter->rep->value(), sstIter->rep->value()) == 0;
   615        if (has_equal_timestamp && has_equal_value) {
   616          // Even though we skip over the KVs described above, their stats have
   617          // already been accounted for resulting in a problem of double-counting.
   618          // To solve this we send back the stats of these skipped KVs so that we
   619          // can subtract them later. This enables us to construct accurate
   620          // MVCCStats and prevents expensive recomputation in the future.
   621          const int64_t meta_key_size = sstKey.size() + 1;
   622          const int64_t meta_val_size = 0;
   623          int64_t total_bytes = meta_key_size + meta_val_size;
   624  
   625          // Update the skipped stats to account fot the skipped meta key.
   626          skippedKVStats->live_bytes += total_bytes;
   627          skippedKVStats->live_count++;
   628          skippedKVStats->key_bytes += meta_key_size;
   629          skippedKVStats->val_bytes += meta_val_size;
   630          skippedKVStats->key_count++;
   631  
   632          // Update the stats to account for the skipped versioned key/value.
   633          total_bytes = sstIter->rep->value().size() + kMVCCVersionTimestampSize;
   634          skippedKVStats->live_bytes += total_bytes;
   635          skippedKVStats->key_bytes += kMVCCVersionTimestampSize;
   636          skippedKVStats->val_bytes += sstIter->rep->value().size();
   637          skippedKVStats->val_count++;
   638  
   639          DBIterNext(existingIter, true /* skip_current_key_versions */);
   640          continue;
   641        }
   642  
   643        state.valid = false;
   644        state.key.key = ToDBSlice(sstKey);
   645        state.status = FmtStatus("key collision");
   646        return state;
   647      } else if (compare < 0) {
   648        targetKey.key = ToDBSlice(sstKey);
   649        DBIterSeek(existingIter, targetKey);
   650      } else if (compare > 0) {
   651        targetKey.key = ToDBSlice(existingKey);
   652        DBIterSeek(sstIter, targetKey);
   653      }
   654    }
   655  
   656    state.valid = true;
   657    return state;
   658  }
   659  
   660  DBIterator* DBNewIter(DBEngine* db, DBIterOptions iter_options) {
   661    return db->NewIter(iter_options);
   662  }
   663  
   664  void DBIterDestroy(DBIterator* iter) { delete iter; }
   665  
   666  IteratorStats DBIterStats(DBIterator* iter) {
   667    IteratorStats stats = {};
   668    if (iter->stats != nullptr) {
   669      stats = *iter->stats;
   670    }
   671    return stats;
   672  }
   673  
   674  DBIterState DBIterSeek(DBIterator* iter, DBKey key) {
   675    ScopedStats stats(iter);
   676    iter->rep->Seek(EncodeKey(key));
   677    return DBIterGetState(iter);
   678  }
   679  
   680  DBIterState DBIterSeekForPrev(DBIterator* iter, DBKey key) {
   681    ScopedStats stats(iter);
   682    iter->rep->SeekForPrev(EncodeKey(key));
   683    return DBIterGetState(iter);
   684  }
   685  
   686  DBIterState DBIterSeekToFirst(DBIterator* iter) {
   687    ScopedStats stats(iter);
   688    iter->rep->SeekToFirst();
   689    return DBIterGetState(iter);
   690  }
   691  
   692  DBIterState DBIterSeekToLast(DBIterator* iter) {
   693    ScopedStats stats(iter);
   694    iter->rep->SeekToLast();
   695    return DBIterGetState(iter);
   696  }
   697  
   698  DBIterState DBIterNext(DBIterator* iter, bool skip_current_key_versions) {
   699    ScopedStats stats(iter);
   700    // If we're skipping the current key versions, remember the key the
   701    // iterator was pointing out.
   702    std::string old_key;
   703    if (skip_current_key_versions && iter->rep->Valid()) {
   704      rocksdb::Slice key;
   705      rocksdb::Slice ts;
   706      if (!SplitKey(iter->rep->key(), &key, &ts)) {
   707        DBIterState state = {0};
   708        state.valid = false;
   709        state.status = FmtStatus("failed to split mvcc key");
   710        return state;
   711      }
   712      old_key = key.ToString();
   713    }
   714  
   715    iter->rep->Next();
   716  
   717    if (skip_current_key_versions && iter->rep->Valid()) {
   718      rocksdb::Slice key;
   719      rocksdb::Slice ts;
   720      if (!SplitKey(iter->rep->key(), &key, &ts)) {
   721        DBIterState state = {0};
   722        state.valid = false;
   723        state.status = FmtStatus("failed to split mvcc key");
   724        return state;
   725      }
   726      if (old_key == key) {
   727        // We're pointed at a different version of the same key. Fall
   728        // back to seeking to the next key.
   729        old_key.append("\0", 1);
   730        DBKey db_key;
   731        db_key.key = ToDBSlice(old_key);
   732        db_key.wall_time = 0;
   733        db_key.logical = 0;
   734        iter->rep->Seek(EncodeKey(db_key));
   735      }
   736    }
   737  
   738    return DBIterGetState(iter);
   739  }
   740  
   741  DBIterState DBIterPrev(DBIterator* iter, bool skip_current_key_versions) {
   742    ScopedStats stats(iter);
   743    // If we're skipping the current key versions, remember the key the
   744    // iterator was pointed out.
   745    std::string old_key;
   746    if (skip_current_key_versions && iter->rep->Valid()) {
   747      rocksdb::Slice key;
   748      rocksdb::Slice ts;
   749      if (SplitKey(iter->rep->key(), &key, &ts)) {
   750        old_key = key.ToString();
   751      }
   752    }
   753  
   754    iter->rep->Prev();
   755  
   756    if (skip_current_key_versions && iter->rep->Valid()) {
   757      rocksdb::Slice key;
   758      rocksdb::Slice ts;
   759      if (SplitKey(iter->rep->key(), &key, &ts)) {
   760        if (old_key == key) {
   761          // We're pointed at a different version of the same key. Fall
   762          // back to seeking to the prev key. In this case, we seek to
   763          // the "metadata" key and that back up the iterator.
   764          DBKey db_key;
   765          db_key.key = ToDBSlice(old_key);
   766          db_key.wall_time = 0;
   767          db_key.logical = 0;
   768          iter->rep->Seek(EncodeKey(db_key));
   769          if (iter->rep->Valid()) {
   770            iter->rep->Prev();
   771          }
   772        }
   773      }
   774    }
   775  
   776    return DBIterGetState(iter);
   777  }
   778  
   779  void DBIterSetLowerBound(DBIterator* iter, DBKey key) { iter->SetLowerBound(key); }
   780  void DBIterSetUpperBound(DBIterator* iter, DBKey key) { iter->SetUpperBound(key); }
   781  
   782  DBStatus DBMerge(DBSlice existing, DBSlice update, DBString* new_value, bool full_merge) {
   783    new_value->len = 0;
   784  
   785    cockroach::storage::enginepb::MVCCMetadata meta;
   786    if (!meta.ParseFromArray(existing.data, existing.len)) {
   787      return ToDBString("corrupted existing value");
   788    }
   789  
   790    cockroach::storage::enginepb::MVCCMetadata update_meta;
   791    if (!update_meta.ParseFromArray(update.data, update.len)) {
   792      return ToDBString("corrupted update value");
   793    }
   794  
   795    if (!MergeValues(&meta, update_meta, full_merge, NULL)) {
   796      return ToDBString("incompatible merge values");
   797    }
   798    return MergeResult(&meta, new_value);
   799  }
   800  
   801  DBStatus DBMergeOne(DBSlice existing, DBSlice update, DBString* new_value) {
   802    return DBMerge(existing, update, new_value, true);
   803  }
   804  
   805  DBStatus DBPartialMergeOne(DBSlice existing, DBSlice update, DBString* new_value) {
   806    return DBMerge(existing, update, new_value, false);
   807  }
   808  
   809  // DBGetStats queries the given DBEngine for various operational stats and
   810  // write them to the provided DBStatsResult instance.
   811  DBStatus DBGetStats(DBEngine* db, DBStatsResult* stats) {
   812    return db->GetStats(stats);
   813  }
   814  
   815  // `DBGetTickersAndHistograms` retrieves maps of all RocksDB tickers and histograms.
   816  // It differs from `DBGetStats` by getting _every_ ticker and histogram, and by not
   817  // getting anything else (DB properties, for example).
   818  //
   819  // In addition to freeing the `DBString`s in the result, the caller is also
   820  // responsible for freeing `DBTickersAndHistogramsResult::tickers` and
   821  // `DBTickersAndHistogramsResult::histograms`.
   822  DBStatus DBGetTickersAndHistograms(DBEngine* db, DBTickersAndHistogramsResult* stats) {
   823    return db->GetTickersAndHistograms(stats);
   824  }
   825  
   826  DBString DBGetCompactionStats(DBEngine* db) { return db->GetCompactionStats(); }
   827  
   828  DBStatus DBGetEnvStats(DBEngine* db, DBEnvStatsResult* stats) { return db->GetEnvStats(stats); }
   829  
   830  DBStatus DBGetEncryptionRegistries(DBEngine* db, DBEncryptionRegistries* result) {
   831    return db->GetEncryptionRegistries(result);
   832  }
   833  
   834  DBSSTable* DBGetSSTables(DBEngine* db, int* n) { return db->GetSSTables(n); }
   835  
   836  DBStatus DBGetSortedWALFiles(DBEngine* db, DBWALFile** files, int* n) {
   837    return db->GetSortedWALFiles(files, n);
   838  }
   839  
   840  DBString DBGetUserProperties(DBEngine* db) { return db->GetUserProperties(); }
   841  
   842  DBStatus DBIngestExternalFiles(DBEngine* db, char** paths, size_t len, bool move_files) {
   843    std::vector<std::string> paths_vec;
   844    for (size_t i = 0; i < len; i++) {
   845      paths_vec.push_back(paths[i]);
   846    }
   847  
   848    rocksdb::IngestExternalFileOptions ingest_options;
   849    // If move_files is true and the env supports it, RocksDB will hard link.
   850    // Otherwise, it will copy.
   851    ingest_options.move_files = move_files;
   852    // If snapshot_consistency is true and there is an outstanding RocksDB
   853    // snapshot, a global sequence number is forced (see the allow_global_seqno
   854    // option).
   855    ingest_options.snapshot_consistency = true;
   856    // If a file is ingested over existing data (including the range tombstones
   857    // used by range snapshots) or if a RocksDB snapshot is outstanding when this
   858    // ingest runs, then after moving/copying the file, historically RocksDB would
   859    // edit it (overwrite some of the bytes) to have a global sequence number.
   860    // After https://github.com/facebook/rocksdb/pull/4172 this can be disabled
   861    // (with the mutable manifest/metadata tracking that instead). However it is
   862    // only safe to disable the seqno write if older versions of RocksDB (<5.16)
   863    // will not be used to read these SSTs; luckily we no longer need to
   864    // interoperate with such older versions.
   865    ingest_options.write_global_seqno = false;
   866    // RocksDB checks the option allow_global_seqno and, if it is false, returns
   867    // an error instead of ingesting a file that would require one. However it
   868    // does this check *even if it is not planning on writing seqno* at all (and
   869    // we're not planning on writing any as per write_global_seqno above), so we
   870    // need to set allow_global_seqno to true.
   871    ingest_options.allow_global_seqno = true;
   872    // If there are mutations in the memtable for the keyrange covered by the file
   873    // being ingested, this option is checked. If true, the memtable is flushed
   874    // using a blocking, write-stalling flush and the ingest run. If false, an
   875    // error is returned.
   876    //
   877    // We want to ingest, but we do not want a write-stall, so we initially set it
   878    // to false -- if our ingest fails, we'll do a manual, no-stall flush and wait
   879    // for it to finish before trying the ingest again.
   880    ingest_options.allow_blocking_flush = false;
   881  
   882    rocksdb::Status status = db->rep->IngestExternalFile(paths_vec, ingest_options);
   883    if (status.IsInvalidArgument()) {
   884      // TODO(dt): inspect status to see if it has the message
   885      //          `External file requires flush`
   886      //           since the move_file and other errors also use kInvalidArgument.
   887  
   888      // It is possible we failed because the memtable required a flush but in the
   889      // options above, we set "allow_blocking_flush = false" preventing ingest
   890      // from running flush with allow_write_stall = true and halting foreground
   891      // traffic. Now that we know we need to flush, let's do one ourselves, with
   892      // allow_write_stall = false and wait for it. After it finishes we can retry
   893      // the ingest.
   894      rocksdb::FlushOptions flush_options;
   895      flush_options.allow_write_stall = false;
   896      flush_options.wait = true;
   897  
   898      rocksdb::Status flush_status = db->rep->Flush(flush_options);
   899      if (!flush_status.ok()) {
   900        return ToDBStatus(flush_status);
   901      }
   902  
   903      // Hopefully on this second attempt we will not need to flush at all, but
   904      // just in case we do, we'll allow the write stall this time -- that way we
   905      // can ensure we actually get the ingestion done and move on. A stalling
   906      // flush is be less than ideal, but since we just flushed, a) this shouldn't
   907      // happen often and b) if it does, it should be small and quick.
   908      ingest_options.allow_blocking_flush = true;
   909      status = db->rep->IngestExternalFile(paths_vec, ingest_options);
   910    }
   911  
   912    if (!status.ok()) {
   913      return ToDBStatus(status);
   914    }
   915  
   916    return kSuccess;
   917  }
   918  
   919  struct DBSstFileWriter {
   920    std::unique_ptr<rocksdb::Options> options;
   921    std::unique_ptr<rocksdb::Env> memenv;
   922    rocksdb::SstFileWriter rep;
   923  
   924    DBSstFileWriter(rocksdb::Options* o, rocksdb::Env* m)
   925        : options(o), memenv(m), rep(rocksdb::EnvOptions(), *o, o->comparator) {}
   926    virtual ~DBSstFileWriter() {}
   927  };
   928  
   929  DBSstFileWriter* DBSstFileWriterNew() {
   930    // TODO(dan): Right now, backup is the only user of this code, so that's what
   931    // the options are tuned for. If something else starts using it, we'll likely
   932    // have to add some configurability.
   933  
   934    rocksdb::BlockBasedTableOptions table_options;
   935    // Larger block size (4kb default) means smaller file at the expense of more
   936    // scanning during lookups.
   937    table_options.block_size = 32 * 1024;
   938    // The original LevelDB compatible format. We explicitly set the checksum too
   939    // to guard against the silent version upconversion. See
   940    // https://github.com/facebook/rocksdb/blob/972f96b3fbae1a4675043bdf4279c9072ad69645/include/rocksdb/table.h#L198
   941    table_options.format_version = 0;
   942    table_options.checksum = rocksdb::kCRC32c;
   943    table_options.whole_key_filtering = false;
   944    // This makes the sstables produced by Pebble and RocksDB byte-by-byte identical, which is
   945    // useful for testing.
   946    table_options.index_shortening =
   947        rocksdb::BlockBasedTableOptions::IndexShorteningMode::kShortenSeparatorsAndSuccessor;
   948  
   949    rocksdb::Options* options = new rocksdb::Options();
   950    options->comparator = &kComparator;
   951    options->table_factory.reset(rocksdb::NewBlockBasedTableFactory(table_options));
   952  
   953    // Use the TablePropertiesCollector hook to store the min and max MVCC
   954    // timestamps present in each sstable in the metadata for that sstable. Used
   955    // by the time bounded iterator optimization.
   956    options->table_properties_collector_factories.emplace_back(DBMakeTimeBoundCollector());
   957    // Automatically request compactions whenever an SST contains too many range
   958    // deletions.
   959    options->table_properties_collector_factories.emplace_back(DBMakeDeleteRangeCollector());
   960  
   961    std::unique_ptr<rocksdb::Env> memenv;
   962    memenv.reset(rocksdb::NewMemEnv(rocksdb::Env::Default()));
   963    options->env = memenv.get();
   964  
   965    return new DBSstFileWriter(options, memenv.release());
   966  }
   967  
   968  DBStatus DBSstFileWriterOpen(DBSstFileWriter* fw) {
   969    rocksdb::Status status = fw->rep.Open("sst");
   970    if (!status.ok()) {
   971      return ToDBStatus(status);
   972    }
   973    return kSuccess;
   974  }
   975  
   976  namespace {
   977  DBStatus DBSstFileWriterAddRaw(DBSstFileWriter* fw, const rocksdb::Slice key,
   978                                 const rocksdb::Slice val) {
   979    rocksdb::Status status = fw->rep.Put(key, val);
   980    if (!status.ok()) {
   981      return ToDBStatus(status);
   982    }
   983  
   984    return kSuccess;
   985  }
   986  }  // namespace
   987  
   988  DBStatus DBSstFileWriterAdd(DBSstFileWriter* fw, DBKey key, DBSlice val) {
   989    return DBSstFileWriterAddRaw(fw, EncodeKey(key), ToSlice(val));
   990  }
   991  
   992  DBStatus DBSstFileWriterDelete(DBSstFileWriter* fw, DBKey key) {
   993    rocksdb::Status status = fw->rep.Delete(EncodeKey(key));
   994    if (!status.ok()) {
   995      return ToDBStatus(status);
   996    }
   997    return kSuccess;
   998  }
   999  
  1000  DBStatus DBSstFileWriterDeleteRange(DBSstFileWriter* fw, DBKey start, DBKey end) {
  1001    rocksdb::Status status = fw->rep.DeleteRange(EncodeKey(start), EncodeKey(end));
  1002    if (!status.ok()) {
  1003      return ToDBStatus(status);
  1004    }
  1005    return kSuccess;
  1006  }
  1007  
  1008  DBStatus DBSstFileWriterCopyData(DBSstFileWriter* fw, DBString* data) {
  1009    uint64_t file_size;
  1010    rocksdb::Status status = fw->memenv->GetFileSize("sst", &file_size);
  1011    if (!status.ok()) {
  1012      return ToDBStatus(status);
  1013    }
  1014    if (file_size == 0) {
  1015      return kSuccess;
  1016    }
  1017  
  1018    const rocksdb::EnvOptions soptions;
  1019    std::unique_ptr<rocksdb::SequentialFile> sst;
  1020    status = fw->memenv->NewSequentialFile("sst", &sst, soptions);
  1021    if (!status.ok()) {
  1022      return ToDBStatus(status);
  1023    }
  1024  
  1025    // scratch is eventually returned as the array part of data and freed by the
  1026    // caller.
  1027    char* scratch = static_cast<char*>(malloc(file_size));
  1028  
  1029    rocksdb::Slice sst_contents;
  1030    status = sst->Read(file_size, &sst_contents, scratch);
  1031    if (!status.ok()) {
  1032      return ToDBStatus(status);
  1033    }
  1034    if (sst_contents.size() != file_size) {
  1035          return FmtStatus("expected to read %" PRIu64 " bytes but got %zu", file_size,
  1036                       sst_contents.size());
  1037    }
  1038  
  1039    // The contract of the SequentialFile.Read call above is that it _might_ use
  1040    // scratch as the backing data for sst_contents, but it also _might not_. If
  1041    // it didn't, copy sst_contents into scratch, so we can unconditionally return
  1042    // a DBString backed by scratch (which can then always be freed by the
  1043    // caller). Note that this means the data is always copied exactly once,
  1044    // either by Read or here.
  1045    if (sst_contents.data() != scratch) {
  1046      memcpy(scratch, sst_contents.data(), sst_contents.size());
  1047    }
  1048    data->data = scratch;
  1049    data->len = sst_contents.size();
  1050  
  1051    return kSuccess;
  1052  }
  1053  
  1054  DBStatus DBSstFileWriterTruncate(DBSstFileWriter* fw, DBString* data) {
  1055    DBStatus status = DBSstFileWriterCopyData(fw, data);
  1056    if (status.data != NULL) {
  1057      return status;
  1058    }
  1059    return ToDBStatus(fw->memenv->Truncate("sst", 0));
  1060  }
  1061  
  1062  DBStatus DBSstFileWriterFinish(DBSstFileWriter* fw, DBString* data) {
  1063    rocksdb::Status status = fw->rep.Finish();
  1064    if (!status.ok()) {
  1065      return ToDBStatus(status);
  1066    }
  1067  
  1068    return DBSstFileWriterCopyData(fw, data);
  1069  }
  1070  
  1071  void DBSstFileWriterClose(DBSstFileWriter* fw) { delete fw; }
  1072  
  1073  DBStatus DBLockFile(DBSlice filename, DBFileLock* lock) {
  1074    return ToDBStatus(
  1075        rocksdb::Env::Default()->LockFile(ToString(filename), (rocksdb::FileLock**)lock));
  1076  }
  1077  
  1078  DBStatus DBUnlockFile(DBFileLock lock) {
  1079    return ToDBStatus(rocksdb::Env::Default()->UnlockFile((rocksdb::FileLock*)lock));
  1080  }
  1081  
  1082  DBStatus DBExportToSst(DBKey start, DBKey end, bool export_all_revisions,
  1083                         uint64_t target_size, uint64_t max_size,
  1084                         DBIterOptions iter_opts, DBEngine* engine, DBString* data,
  1085                         DBString* write_intent, DBString* summary, DBString* resume) {
  1086    DBSstFileWriter* writer = DBSstFileWriterNew();
  1087    DBStatus status = DBSstFileWriterOpen(writer);
  1088    if (status.data != NULL) {
  1089      return status;
  1090    }
  1091  
  1092    DBIncrementalIterator iter(engine, iter_opts, start, end, write_intent);
  1093  
  1094    roachpb::BulkOpSummary bulkop_summary;
  1095    RowCounter row_counter(&bulkop_summary);
  1096  
  1097    bool skip_current_key_versions = !export_all_revisions;
  1098    DBIterState state;
  1099    const std::string end_key = EncodeKey(end);
  1100    // cur_key is used when paginated is true and export_all_revisions is
  1101    // true. If we're exporting all revisions and we're returning a paginated
  1102    // SST then we need to keep track of when we've finished adding all of the
  1103    // versions of a key to the writer.
  1104    const bool paginated = target_size > 0;
  1105    std::string cur_key;
  1106    std::string resume_key;
  1107    // Seek to the MVCC metadata key for the provided start key and let the
  1108    // incremental iterator find the appropriate version.
  1109    const DBKey seek_key = {.key = start.key};
  1110    for (state = iter.seek(seek_key);; state = iter.next(skip_current_key_versions)) {
  1111      if (state.status.data != NULL) {
  1112        DBSstFileWriterClose(writer);
  1113        return state.status;
  1114      } else if (!state.valid || kComparator.Compare(iter.key(), end_key) >= 0) {
  1115        break;
  1116      }
  1117      rocksdb::Slice decoded_key;
  1118      int64_t wall_time = 0;
  1119      int32_t logical_time = 0;
  1120  
  1121      if (!DecodeKey(iter.key(), &decoded_key, &wall_time, &logical_time)) {
  1122        DBSstFileWriterClose(writer);
  1123        return ToDBString("Unable to decode key");
  1124      }
  1125  
  1126      const bool is_new_key = !export_all_revisions || decoded_key.compare(cur_key) != 0;
  1127      if (paginated && export_all_revisions && is_new_key) {
  1128        // Reuse the underlying buffer in cur_key.
  1129        cur_key.clear();
  1130        cur_key.reserve(decoded_key.size());
  1131        cur_key.assign(decoded_key.data(), decoded_key.size());
  1132      }
  1133  
  1134      // Skip tombstone (len=0) records when start time is zero (non-incremental)
  1135      // and we are not exporting all versions.
  1136      const bool is_skipping_deletes =
  1137          start.wall_time == 0 && start.logical == 0 && !export_all_revisions;
  1138      if (is_skipping_deletes && iter.value().size() == 0) {
  1139        continue;
  1140      }
  1141  
  1142      // Check to see if this is the first version of key and adding it would
  1143      // put us over the limit (we might already be over the limit).
  1144      const int64_t cur_size = bulkop_summary.data_size();
  1145      const bool reached_target_size = cur_size > 0 && cur_size >= target_size;
  1146      if (paginated && is_new_key && reached_target_size) {
  1147        resume_key.reserve(decoded_key.size());
  1148        resume_key.assign(decoded_key.data(), decoded_key.size());
  1149        break;
  1150      }
  1151  
  1152      // Insert key into sst and update statistics.
  1153      status = DBSstFileWriterAddRaw(writer, iter.key(), iter.value());
  1154      if (status.data != NULL) {
  1155        DBSstFileWriterClose(writer);
  1156        return status;
  1157      }
  1158  
  1159      if (!row_counter.Count(iter.key())) {
  1160        return ToDBString("Error in row counter");
  1161      }
  1162      const int64_t new_size = cur_size + decoded_key.size() + iter.value().size();
  1163      if (max_size > 0 && new_size > max_size) {
  1164        return FmtStatus("export size (%" PRIi64 " bytes) exceeds max size (%" PRIi64 " bytes)",
  1165                         new_size, max_size);
  1166      }
  1167      bulkop_summary.set_data_size(new_size);
  1168    }
  1169    *summary = ToDBString(bulkop_summary.SerializeAsString());
  1170  
  1171    if (bulkop_summary.data_size() == 0) {
  1172      DBSstFileWriterClose(writer);
  1173      return kSuccess;
  1174    }
  1175  
  1176    auto res = DBSstFileWriterFinish(writer, data);
  1177    DBSstFileWriterClose(writer);
  1178  
  1179    // If we're not returning an error, check to see if we need to return the resume key.
  1180    if (res.data == NULL && resume_key.length() > 0) {
  1181      *resume = ToDBString(resume_key);
  1182    }
  1183  
  1184    return res;
  1185  }
  1186  
  1187  DBStatus DBEnvOpenReadableFile(DBEngine* db, DBSlice path, DBReadableFile* file) {
  1188    return db->EnvOpenReadableFile(path, (rocksdb::RandomAccessFile**)file);
  1189  }
  1190  
  1191  DBStatus DBEnvReadAtFile(DBEngine* db, DBReadableFile file, DBSlice buffer, int64_t offset,
  1192                           int* n) {
  1193    return db->EnvReadAtFile((rocksdb::RandomAccessFile*)file, buffer, offset, n);
  1194  }
  1195  
  1196  DBStatus DBEnvCloseReadableFile(DBEngine* db, DBReadableFile file) {
  1197    return db->EnvCloseReadableFile((rocksdb::RandomAccessFile*)file);
  1198  }
  1199  
  1200  DBStatus DBEnvOpenDirectory(DBEngine* db, DBSlice path, DBDirectory* file) {
  1201    return db->EnvOpenDirectory(path, (rocksdb::Directory**)file);
  1202  }
  1203  
  1204  DBStatus DBEnvSyncDirectory(DBEngine* db, DBDirectory file) {
  1205    return db->EnvSyncDirectory((rocksdb::Directory*)file);
  1206  }
  1207  
  1208  DBStatus DBEnvCloseDirectory(DBEngine* db, DBDirectory file) {
  1209    return db->EnvCloseDirectory((rocksdb::Directory*)file);
  1210  }
  1211  
  1212  DBStatus DBEnvRenameFile(DBEngine* db, DBSlice oldname, DBSlice newname) {
  1213    return db->EnvRenameFile(oldname, newname);
  1214  }
  1215  
  1216  DBStatus DBEnvCreateDir(DBEngine* db, DBSlice name) {
  1217    return db->EnvCreateDir(name);
  1218  }
  1219  
  1220  DBStatus DBEnvDeleteDir(DBEngine* db, DBSlice name) {
  1221    return db->EnvDeleteDir(name);
  1222  }
  1223  
  1224  DBListDirResults DBEnvListDir(DBEngine* db, DBSlice name) {
  1225    DBListDirResults result;
  1226    std::vector<std::string> contents;
  1227    result.status = db->EnvListDir(name, &contents);
  1228    result.n = contents.size();
  1229    // We malloc the names so it can be deallocated by the caller using free().
  1230    const int size = contents.size() * sizeof(DBString);
  1231    result.names = reinterpret_cast<DBString*>(malloc(size));
  1232    memset(result.names, 0, size);
  1233    for (int i = 0; i < contents.size(); i++) {
  1234      result.names[i] = ToDBString(rocksdb::Slice(contents[i].data(), contents[i].size()));
  1235    }
  1236    return result;
  1237  }
  1238  
  1239  DBString DBDumpThreadStacks() {
  1240    return ToDBString(DumpThreadStacks());
  1241  }