github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/c-deps/libroach/engine.cc (about)

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  #include "engine.h"
    12  #include "db.h"
    13  #include "encoding.h"
    14  #include "env_manager.h"
    15  #include "fmt.h"
    16  #include "getter.h"
    17  #include "iterator.h"
    18  #include "protos/storage/enginepb/rocksdb.pb.h"
    19  #include "status.h"
    20  
    21  using namespace cockroach;
    22  
    23  DBEngine::~DBEngine() {}
    24  
    25  DBStatus DBEngine::AssertPreClose() { return kSuccess; }
    26  
    27  DBSSTable* DBEngine::GetSSTables(int* n) {
    28    std::vector<rocksdb::LiveFileMetaData> metadata;
    29    rep->GetLiveFilesMetaData(&metadata);
    30    *n = metadata.size();
    31    // We malloc the result so it can be deallocated by the caller using free().
    32    const int size = metadata.size() * sizeof(DBSSTable);
    33    DBSSTable* tables = reinterpret_cast<DBSSTable*>(malloc(size));
    34    memset(tables, 0, size);
    35    for (int i = 0; i < metadata.size(); i++) {
    36      tables[i].level = metadata[i].level;
    37      tables[i].size = metadata[i].size;
    38  
    39      rocksdb::Slice tmp;
    40      if (DecodeKey(metadata[i].smallestkey, &tmp, &tables[i].start_key.wall_time,
    41                    &tables[i].start_key.logical)) {
    42        // This is a bit ugly because we want DBKey.key to be copied and
    43        // not refer to the memory in metadata[i].smallestkey.
    44        DBString str = ToDBString(tmp);
    45        tables[i].start_key.key = DBSlice{str.data, str.len};
    46      }
    47      if (DecodeKey(metadata[i].largestkey, &tmp, &tables[i].end_key.wall_time,
    48                    &tables[i].end_key.logical)) {
    49        DBString str = ToDBString(tmp);
    50        tables[i].end_key.key = DBSlice{str.data, str.len};
    51      }
    52    }
    53    return tables;
    54  }
    55  
    56  DBStatus DBEngine::GetSortedWALFiles(DBWALFile** out_files, int* n) {
    57    rocksdb::VectorLogPtr files;
    58    rocksdb::Status s = rep->GetSortedWalFiles(files);
    59    if (!s.ok()) {
    60      return ToDBStatus(s);
    61    }
    62    *n = files.size();
    63    // We calloc the result so it can be deallocated by the caller using free().
    64    *out_files = reinterpret_cast<DBWALFile*>(calloc(files.size(), sizeof(DBWALFile)));
    65    for (int i = 0; i < files.size(); i++) {
    66      (*out_files)[i].log_number = files[i]->LogNumber();
    67      (*out_files)[i].size = files[i]->SizeFileBytes();
    68    }
    69    return kSuccess;
    70  }
    71  
    72  DBString DBEngine::GetUserProperties() {
    73    rocksdb::TablePropertiesCollection props;
    74    rocksdb::Status status = rep->GetPropertiesOfAllTables(&props);
    75  
    76    cockroach::storage::enginepb::SSTUserPropertiesCollection all;
    77    if (!status.ok()) {
    78      all.set_error(status.ToString());
    79      return ToDBString(all.SerializeAsString());
    80    }
    81  
    82    for (auto i = props.begin(); i != props.end(); i++) {
    83      cockroach::storage::enginepb::SSTUserProperties* sst = all.add_sst();
    84      sst->set_path(i->first);
    85      auto userprops = i->second->user_collected_properties;
    86  
    87      auto ts_min = userprops.find("crdb.ts.min");
    88      if (ts_min != userprops.end() && !ts_min->second.empty()) {
    89        if (!DecodeTimestamp(rocksdb::Slice(ts_min->second), sst->mutable_ts_min())) {
    90          fmt::SStringPrintf(
    91              all.mutable_error(), "unable to decode crdb.ts.min value '%s' in table %s",
    92              rocksdb::Slice(ts_min->second).ToString(true).c_str(), sst->path().c_str());
    93          break;
    94        }
    95      }
    96  
    97      auto ts_max = userprops.find("crdb.ts.max");
    98      if (ts_max != userprops.end() && !ts_max->second.empty()) {
    99        if (!DecodeTimestamp(rocksdb::Slice(ts_max->second), sst->mutable_ts_max())) {
   100          fmt::SStringPrintf(
   101              all.mutable_error(), "unable to decode crdb.ts.max value '%s' in table %s",
   102              rocksdb::Slice(ts_max->second).ToString(true).c_str(), sst->path().c_str());
   103          break;
   104        }
   105      }
   106    }
   107    return ToDBString(all.SerializeAsString());
   108  }
   109  
   110  namespace cockroach {
   111  
// Constructs a DBImpl around the RocksDB instance `r`.
//
// The base DBEngine is handed `r` together with the address of this object's
// iters_count member; iters_count itself is only set to 0 at the end of the
// initializer list. The env manager is moved in, while the block cache and
// event listener shared_ptrs are copied.
// NOTE(review): rep_deleter(r) presumably takes ownership of `r` so it is
// released with this object -- confirm against the header.
DBImpl::DBImpl(rocksdb::DB* r, std::unique_ptr<EnvManager> e, std::shared_ptr<rocksdb::Cache> bc,
               std::shared_ptr<DBEventListener> event_listener)
    : DBEngine(r, &iters_count),
      env_mgr(std::move(e)),
      rep_deleter(r),
      block_cache(bc),
      event_listener(event_listener),
      iters_count(0) {}
   120  
   121  DBImpl::~DBImpl() {
   122    const rocksdb::Options& opts = rep->GetOptions();
   123    const std::shared_ptr<rocksdb::Statistics>& s = opts.statistics;
   124    rocksdb::Info(opts.info_log, "bloom filter utility:    %0.1f%%",
   125                  (100.0 * s->getTickerCount(rocksdb::BLOOM_FILTER_PREFIX_USEFUL)) /
   126                      s->getTickerCount(rocksdb::BLOOM_FILTER_PREFIX_CHECKED));
   127  }
   128  
   129  DBStatus DBImpl::AssertPreClose() {
   130    const int64_t n = iters_count.load();
   131    if (n == 0) {
   132      return kSuccess;
   133    }
   134    return FmtStatus("%" PRId64 " leaked iterators", n);
   135  }
   136  
   137  DBStatus DBImpl::Put(DBKey key, DBSlice value) {
   138    rocksdb::WriteOptions options;
   139    return ToDBStatus(rep->Put(options, EncodeKey(key), ToSlice(value)));
   140  }
   141  
   142  DBStatus DBImpl::Merge(DBKey key, DBSlice value) {
   143    rocksdb::WriteOptions options;
   144    return ToDBStatus(rep->Merge(options, EncodeKey(key), ToSlice(value)));
   145  }
   146  
   147  DBStatus DBImpl::Get(DBKey key, DBString* value) {
   148    rocksdb::ReadOptions read_opts;
   149    DBGetter base(rep, read_opts, EncodeKey(key));
   150    return base.Get(value);
   151  }
   152  
   153  DBStatus DBImpl::Delete(DBKey key) {
   154    rocksdb::WriteOptions options;
   155    return ToDBStatus(rep->Delete(options, EncodeKey(key)));
   156  }
   157  
   158  DBStatus DBImpl::SingleDelete(DBKey key) {
   159    rocksdb::WriteOptions options;
   160    return ToDBStatus(rep->SingleDelete(options, EncodeKey(key)));
   161  }
   162  
   163  DBStatus DBImpl::DeleteRange(DBKey start, DBKey end) {
   164    rocksdb::WriteOptions options;
   165    return ToDBStatus(
   166        rep->DeleteRange(options, rep->DefaultColumnFamily(), EncodeKey(start), EncodeKey(end)));
   167  }
   168  
   169  DBStatus DBImpl::CommitBatch(bool sync) { return FmtStatus("unsupported"); }
   170  
   171  DBStatus DBImpl::ApplyBatchRepr(DBSlice repr, bool sync) {
   172    rocksdb::WriteBatch batch(ToString(repr));
   173    rocksdb::WriteOptions options;
   174    options.sync = sync;
   175    return ToDBStatus(rep->Write(options, &batch));
   176  }
   177  
   178  DBSlice DBImpl::BatchRepr() { return ToDBSlice("unsupported"); }
   179  
   180  DBIterator* DBImpl::NewIter(DBIterOptions iter_opts) {
   181    DBIterator* iter = new DBIterator(iters, iter_opts);
   182    iter->rep.reset(rep->NewIterator(iter->read_opts));
   183    return iter;
   184  }
   185  
   186  // GetStats retrieves a subset of RocksDB stats that are relevant to
   187  // CockroachDB.
   188  DBStatus DBImpl::GetStats(DBStatsResult* stats) {
   189    const rocksdb::Options& opts = rep->GetOptions();
   190    const std::shared_ptr<rocksdb::Statistics>& s = opts.statistics;
   191  
   192    uint64_t memtable_total_size;
   193    rep->GetIntProperty("rocksdb.cur-size-all-mem-tables", &memtable_total_size);
   194  
   195    uint64_t table_readers_mem_estimate;
   196    rep->GetIntProperty("rocksdb.estimate-table-readers-mem", &table_readers_mem_estimate);
   197  
   198    uint64_t pending_compaction_bytes_estimate;
   199    rep->GetIntProperty("rocksdb.estimate-pending-compaction-bytes",
   200                        &pending_compaction_bytes_estimate);
   201  
   202    std::string l0_file_count_str;
   203    rep->GetProperty("rocksdb.num-files-at-level0", &l0_file_count_str);
   204  
   205    stats->block_cache_hits = (int64_t)s->getTickerCount(rocksdb::BLOCK_CACHE_HIT);
   206    stats->block_cache_misses = (int64_t)s->getTickerCount(rocksdb::BLOCK_CACHE_MISS);
   207    stats->block_cache_usage = (int64_t)block_cache->GetUsage();
   208    stats->block_cache_pinned_usage = (int64_t)block_cache->GetPinnedUsage();
   209    stats->bloom_filter_prefix_checked =
   210        (int64_t)s->getTickerCount(rocksdb::BLOOM_FILTER_PREFIX_CHECKED);
   211    stats->bloom_filter_prefix_useful =
   212        (int64_t)s->getTickerCount(rocksdb::BLOOM_FILTER_PREFIX_USEFUL);
   213    stats->memtable_total_size = memtable_total_size;
   214    stats->flushes = (int64_t)event_listener->GetFlushes();
   215    stats->flush_bytes = (int64_t)s->getTickerCount(rocksdb::FLUSH_WRITE_BYTES);
   216    stats->compactions = (int64_t)event_listener->GetCompactions();
   217    stats->compact_read_bytes =
   218        (int64_t)s->getTickerCount(rocksdb::COMPACT_READ_BYTES);
   219    stats->compact_write_bytes =
   220        (int64_t)s->getTickerCount(rocksdb::COMPACT_WRITE_BYTES);
   221    stats->table_readers_mem_estimate = table_readers_mem_estimate;
   222    stats->pending_compaction_bytes_estimate = pending_compaction_bytes_estimate;
   223    stats->l0_file_count = std::atoi(l0_file_count_str.c_str());
   224    return kSuccess;
   225  }
   226  
   227  // `GetTickersAndHistograms` retrieves maps of all RocksDB tickers and histograms.
   228  // It differs from `GetStats` by getting _every_ ticker and histogram, and by not
   229  // getting anything else (DB properties, for example).
   230  //
   231  // In addition to freeing the `DBString`s in the result, the caller is also
   232  // responsible for freeing `DBTickersAndHistogramsResult::tickers` and
   233  // `DBTickersAndHistogramsResult::histograms`.
   234  DBStatus DBImpl::GetTickersAndHistograms(DBTickersAndHistogramsResult* stats) {
   235    const rocksdb::Options& opts = rep->GetOptions();
   236    const std::shared_ptr<rocksdb::Statistics>& s = opts.statistics;
   237    stats->tickers_len = rocksdb::TickersNameMap.size();
   238    // We malloc the result so it can be deallocated by the caller using free().
   239    stats->tickers = static_cast<TickerInfo*>(malloc(stats->tickers_len * sizeof(TickerInfo)));
   240    if (stats->tickers == nullptr) {
   241      return FmtStatus("malloc failed");
   242    }
   243    for (size_t i = 0; i < stats->tickers_len; ++i) {
   244      stats->tickers[i].name = ToDBString(rocksdb::TickersNameMap[i].second);
   245      stats->tickers[i].value = s->getTickerCount(static_cast<uint32_t>(i));
   246    }
   247  
   248    stats->histograms_len = rocksdb::HistogramsNameMap.size();
   249    // We malloc the result so it can be deallocated by the caller using free().
   250    stats->histograms =
   251        static_cast<HistogramInfo*>(malloc(stats->histograms_len * sizeof(HistogramInfo)));
   252    if (stats->histograms == nullptr) {
   253      return FmtStatus("malloc failed");
   254    }
   255    for (size_t i = 0; i < stats->histograms_len; ++i) {
   256      stats->histograms[i].name = ToDBString(rocksdb::HistogramsNameMap[i].second);
   257      rocksdb::HistogramData data;
   258      s->histogramData(static_cast<uint32_t>(i), &data);
   259      stats->histograms[i].mean = data.average;
   260      stats->histograms[i].p50 = data.median;
   261      stats->histograms[i].p95 = data.percentile95;
   262      stats->histograms[i].p99 = data.percentile99;
   263      stats->histograms[i].max = data.max;
   264      stats->histograms[i].count = data.count;
   265      stats->histograms[i].sum = data.sum;
   266    }
   267    return kSuccess;
   268  }
   269  
   270  DBString DBImpl::GetCompactionStats() {
   271    std::string tmp;
   272    rep->GetProperty("rocksdb.cfstats-no-file-histogram", &tmp);
   273    return ToDBString(tmp);
   274  }
   275  
   276  DBStatus DBImpl::GetEnvStats(DBEnvStatsResult* stats) {
   277    // Always initialize the fields.
   278    stats->encryption_status = DBString();
   279    stats->total_files = stats->total_bytes = stats->active_key_files = stats->active_key_bytes = 0;
   280    stats->encryption_type = 0;
   281  
   282    if (env_mgr->env_stats_handler == nullptr || env_mgr->file_registry == nullptr) {
   283      // We can't compute these if we don't have a file registry or stats handler.
   284      // This happens in OSS mode or when encryption has not been turned on.
   285      return kSuccess;
   286    }
   287  
   288    // Get encryption algorithm.
   289    stats->encryption_type = env_mgr->env_stats_handler->GetActiveStoreKeyType();
   290  
   291    // Get encryption status.
   292    std::string encryption_status;
   293    auto status = env_mgr->env_stats_handler->GetEncryptionStats(&encryption_status);
   294    if (!status.ok()) {
   295      return ToDBStatus(status);
   296    }
   297  
   298    stats->encryption_status = ToDBString(encryption_status);
   299  
   300    // Get file statistics.
   301    FileStats file_stats(env_mgr.get());
   302    status = file_stats.GetFiles(rep);
   303    if (!status.ok()) {
   304      return ToDBStatus(status);
   305    }
   306  
   307    // Get current active key ID.
   308    auto active_key_id = env_mgr->env_stats_handler->GetActiveDataKeyID();
   309  
   310    // Request stats for the Data env only.
   311    status = file_stats.GetStatsForEnvAndKey(enginepb::Data, active_key_id, stats);
   312    if (!status.ok()) {
   313      return ToDBStatus(status);
   314    }
   315  
   316    return kSuccess;
   317  }
   318  
   319  DBStatus DBImpl::GetEncryptionRegistries(DBEncryptionRegistries* result) {
   320    // Always initialize the fields.
   321    result->file_registry = DBString();
   322    result->key_registry = DBString();
   323  
   324    if (env_mgr->env_stats_handler == nullptr || env_mgr->file_registry == nullptr) {
   325      // We can't compute these if we don't have a file registry or stats handler.
   326      // This happens in OSS mode or when encryption has not been turned on.
   327      return kSuccess;
   328    }
   329  
   330    auto file_registry = env_mgr->file_registry->GetFileRegistry();
   331    if (file_registry == nullptr) {
   332      return ToDBStatus(rocksdb::Status::InvalidArgument("file registry has not been loaded"));
   333    }
   334  
   335    std::string serialized_file_registry;
   336    if (!file_registry->SerializeToString(&serialized_file_registry)) {
   337      return ToDBStatus(rocksdb::Status::InvalidArgument("failed to serialize file registry proto"));
   338    }
   339  
   340    std::string serialized_key_registry;
   341    auto status = env_mgr->env_stats_handler->GetEncryptionRegistry(&serialized_key_registry);
   342    if (!status.ok()) {
   343      return ToDBStatus(status);
   344    }
   345  
   346    result->file_registry = ToDBString(serialized_file_registry);
   347    result->key_registry = ToDBString(serialized_key_registry);
   348  
   349    return kSuccess;
   350  }
   351  
   352  // EnvWriteFile writes the given data as a new "file" in the given engine.
   353  DBStatus DBImpl::EnvWriteFile(DBSlice path, DBSlice contents) {
   354    rocksdb::Status s;
   355  
   356    const rocksdb::EnvOptions soptions;
   357    std::unique_ptr<rocksdb::WritableFile> destfile;
   358    s = this->rep->GetEnv()->NewWritableFile(ToString(path), &destfile, soptions);
   359    if (!s.ok()) {
   360      return ToDBStatus(s);
   361    }
   362  
   363    s = destfile->Append(ToSlice(contents));
   364    if (!s.ok()) {
   365      return ToDBStatus(s);
   366    }
   367  
   368    return kSuccess;
   369  }
   370  
   371  // EnvOpenFile opens a new file in the given engine.
   372  DBStatus DBImpl::EnvOpenFile(DBSlice path, uint64_t bytes_per_sync, rocksdb::WritableFile** file) {
   373    rocksdb::Status status;
   374    rocksdb::EnvOptions soptions;
   375    soptions.bytes_per_sync = bytes_per_sync;
   376    std::unique_ptr<rocksdb::WritableFile> rocksdb_file;
   377  
   378    // Create the file.
   379    status = this->rep->GetEnv()->NewWritableFile(ToString(path), &rocksdb_file, soptions);
   380    if (!status.ok()) {
   381      return ToDBStatus(status);
   382    }
   383    *file = rocksdb_file.release();
   384    return kSuccess;
   385  }
   386  
   387  // EnvReadFile reads the content of the given filename.
   388  DBStatus DBImpl::EnvReadFile(DBSlice path, DBSlice* contents) {
   389    rocksdb::Status status;
   390    std::string data;
   391  
   392    status = ReadFileToString(this->rep->GetEnv(), ToString(path), &data);
   393    if (!status.ok()) {
   394      if (status.IsNotFound()) {
   395        return FmtStatus("No such file or directory");
   396      }
   397      return ToDBStatus(status);
   398    }
   399    contents->data = static_cast<char*>(malloc(data.size()));
   400    contents->len = data.size();
   401    memcpy(contents->data, data.c_str(), data.size());
   402    return kSuccess;
   403  }
   404  
   405  // CloseFile closes the given file in the given engine.
   406  DBStatus DBImpl::EnvCloseFile(rocksdb::WritableFile* file) {
   407    rocksdb::Status status = file->Close();
   408    delete file;
   409    return ToDBStatus(status);
   410  }
   411  
   412  // EnvAppendFile appends the given data to the file in the given engine.
   413  DBStatus DBImpl::EnvAppendFile(rocksdb::WritableFile* file, DBSlice contents) {
   414    rocksdb::Status status = file->Append(ToSlice(contents));
   415    return ToDBStatus(status);
   416  }
   417  
   418  // EnvSyncFile synchronously writes the data of the file to the disk.
   419  DBStatus DBImpl::EnvSyncFile(rocksdb::WritableFile* file) {
   420    rocksdb::Status status = file->Sync();
   421    return ToDBStatus(status);
   422  }
   423  
   424  // EnvDeleteFile deletes the file with the given filename.
   425  DBStatus DBImpl::EnvDeleteFile(DBSlice path) {
   426    rocksdb::Status status = this->rep->GetEnv()->DeleteFile(ToString(path));
   427    if (status.IsNotFound()) {
   428      return FmtStatus("No such file or directory");
   429    }
   430    return ToDBStatus(status);
   431  }
   432  
   433  // EnvDeleteDirAndFiles deletes the directory with the given dir name and any
   434  // files it contains but not subdirectories.
   435  DBStatus DBImpl::EnvDeleteDirAndFiles(DBSlice dir) {
   436    rocksdb::Status status;
   437  
   438    std::vector<std::string> files;
   439    this->rep->GetEnv()->GetChildren(ToString(dir), &files);
   440    for (auto& file : files) {
   441      if (file != "." && file != "..") {
   442        this->rep->GetEnv()->DeleteFile(ToString(dir) + "/" + file);
   443      }
   444    }
   445  
   446    status = this->rep->GetEnv()->DeleteDir(ToString(dir));
   447    if (status.IsNotFound()) {
   448      return FmtStatus("No such file or directory");
   449    }
   450    return ToDBStatus(status);
   451  }
   452  
   453  // EnvLinkFile creates 'newname' as a hard link to 'oldname'.
   454  DBStatus DBImpl::EnvLinkFile(DBSlice oldname, DBSlice newname) {
   455    return ToDBStatus(this->rep->GetEnv()->LinkFile(ToString(oldname), ToString(newname)));
   456  }
   457  
   458  DBStatus DBImpl::EnvOpenReadableFile(DBSlice path, rocksdb::RandomAccessFile** file) {
   459    rocksdb::Status status;
   460    const rocksdb::EnvOptions soptions;
   461    std::unique_ptr<rocksdb::RandomAccessFile> rocksdb_file;
   462  
   463    status = this->rep->GetEnv()->NewRandomAccessFile(ToString(path), &rocksdb_file, soptions);
   464    if (!status.ok()) {
   465      return ToDBStatus(status);
   466    }
   467    *file = rocksdb_file.release();
   468    return kSuccess;
   469  }
   470  
   471  DBStatus DBImpl::EnvCloseReadableFile(rocksdb::RandomAccessFile* file) {
   472    delete file;
   473    return kSuccess;
   474  }
   475  
   476  DBStatus DBImpl::EnvReadAtFile(rocksdb::RandomAccessFile* file, DBSlice buffer, int64_t offset,
   477                                 int* n) {
   478    size_t max_bytes_to_read = buffer.len;
   479    char* scratch = buffer.data;
   480    rocksdb::Slice result;
   481    auto status = file->Read(offset, max_bytes_to_read, &result, scratch);
   482    *n = result.size();
   483    return ToDBStatus(status);
   484  }
   485  
   486  DBStatus DBImpl::EnvOpenDirectory(DBSlice path, rocksdb::Directory** file) {
   487    rocksdb::Status status;
   488    std::unique_ptr<rocksdb::Directory> rocksdb_dir;
   489  
   490    status = this->rep->GetEnv()->NewDirectory(ToString(path), &rocksdb_dir);
   491    if (!status.ok()) {
   492      return ToDBStatus(status);
   493    }
   494    *file = rocksdb_dir.release();
   495    return kSuccess;
   496  }
   497  
   498  DBStatus DBImpl::EnvSyncDirectory(rocksdb::Directory* file) { return ToDBStatus(file->Fsync()); }
   499  
   500  DBStatus DBImpl::EnvCloseDirectory(rocksdb::Directory* file) {
   501    delete file;
   502    return kSuccess;
   503  }
   504  
   505  DBStatus DBImpl::EnvRenameFile(DBSlice oldname, DBSlice newname) {
   506    return ToDBStatus(this->rep->GetEnv()->RenameFile(ToString(oldname), ToString(newname)));
   507  }
   508  
   509  DBStatus DBImpl::EnvCreateDir(DBSlice name) {
   510    return ToDBStatus(this->rep->GetEnv()->CreateDirIfMissing(ToString(name)));
   511  }
   512  
   513  DBStatus DBImpl::EnvDeleteDir(DBSlice name) {
   514    return ToDBStatus(this->rep->GetEnv()->DeleteDir(ToString(name)));
   515  }
   516  
   517  DBStatus DBImpl::EnvListDir(DBSlice name, std::vector<std::string>* result) {
   518    return ToDBStatus(this->rep->GetEnv()->GetChildren(ToString(name), result));
   519  }
   520  
   521  }  // namespace cockroach