github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/c-deps/libroach/engine.cc (about) 1 // Copyright 2018 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 #include "engine.h" 12 #include "db.h" 13 #include "encoding.h" 14 #include "env_manager.h" 15 #include "fmt.h" 16 #include "getter.h" 17 #include "iterator.h" 18 #include "protos/storage/enginepb/rocksdb.pb.h" 19 #include "status.h" 20 21 using namespace cockroach; 22 23 DBEngine::~DBEngine() {} 24 25 DBStatus DBEngine::AssertPreClose() { return kSuccess; } 26 27 DBSSTable* DBEngine::GetSSTables(int* n) { 28 std::vector<rocksdb::LiveFileMetaData> metadata; 29 rep->GetLiveFilesMetaData(&metadata); 30 *n = metadata.size(); 31 // We malloc the result so it can be deallocated by the caller using free(). 32 const int size = metadata.size() * sizeof(DBSSTable); 33 DBSSTable* tables = reinterpret_cast<DBSSTable*>(malloc(size)); 34 memset(tables, 0, size); 35 for (int i = 0; i < metadata.size(); i++) { 36 tables[i].level = metadata[i].level; 37 tables[i].size = metadata[i].size; 38 39 rocksdb::Slice tmp; 40 if (DecodeKey(metadata[i].smallestkey, &tmp, &tables[i].start_key.wall_time, 41 &tables[i].start_key.logical)) { 42 // This is a bit ugly because we want DBKey.key to be copied and 43 // not refer to the memory in metadata[i].smallestkey. 44 DBString str = ToDBString(tmp); 45 tables[i].start_key.key = DBSlice{str.data, str.len}; 46 } 47 if (DecodeKey(metadata[i].largestkey, &tmp, &tables[i].end_key.wall_time, 48 &tables[i].end_key.logical)) { 49 DBString str = ToDBString(tmp); 50 tables[i].end_key.key = DBSlice{str.data, str.len}; 51 } 52 } 53 return tables; 54 } 55 56 DBStatus DBEngine::GetSortedWALFiles(DBWALFile** out_files, int* n) { 57 rocksdb::VectorLogPtr files; 58 rocksdb::Status s = rep->GetSortedWalFiles(files); 59 if (!s.ok()) { 60 return ToDBStatus(s); 61 } 62 *n = files.size(); 63 // We calloc the result so it can be deallocated by the caller using free(). 64 *out_files = reinterpret_cast<DBWALFile*>(calloc(files.size(), sizeof(DBWALFile))); 65 for (int i = 0; i < files.size(); i++) { 66 (*out_files)[i].log_number = files[i]->LogNumber(); 67 (*out_files)[i].size = files[i]->SizeFileBytes(); 68 } 69 return kSuccess; 70 } 71 72 DBString DBEngine::GetUserProperties() { 73 rocksdb::TablePropertiesCollection props; 74 rocksdb::Status status = rep->GetPropertiesOfAllTables(&props); 75 76 cockroach::storage::enginepb::SSTUserPropertiesCollection all; 77 if (!status.ok()) { 78 all.set_error(status.ToString()); 79 return ToDBString(all.SerializeAsString()); 80 } 81 82 for (auto i = props.begin(); i != props.end(); i++) { 83 cockroach::storage::enginepb::SSTUserProperties* sst = all.add_sst(); 84 sst->set_path(i->first); 85 auto userprops = i->second->user_collected_properties; 86 87 auto ts_min = userprops.find("crdb.ts.min"); 88 if (ts_min != userprops.end() && !ts_min->second.empty()) { 89 if (!DecodeTimestamp(rocksdb::Slice(ts_min->second), sst->mutable_ts_min())) { 90 fmt::SStringPrintf( 91 all.mutable_error(), "unable to decode crdb.ts.min value '%s' in table %s", 92 rocksdb::Slice(ts_min->second).ToString(true).c_str(), sst->path().c_str()); 93 break; 94 } 95 } 96 97 auto ts_max = userprops.find("crdb.ts.max"); 98 if (ts_max != userprops.end() && !ts_max->second.empty()) { 99 if (!DecodeTimestamp(rocksdb::Slice(ts_max->second), sst->mutable_ts_max())) { 100 fmt::SStringPrintf( 101 all.mutable_error(), "unable to decode crdb.ts.max value '%s' in table %s", 102 rocksdb::Slice(ts_max->second).ToString(true).c_str(), sst->path().c_str()); 103 break; 104 } 105 } 106 } 107 return ToDBString(all.SerializeAsString()); 108 } 109 110 namespace cockroach { 111 112 DBImpl::DBImpl(rocksdb::DB* r, std::unique_ptr<EnvManager> e, std::shared_ptr<rocksdb::Cache> bc, 113 std::shared_ptr<DBEventListener> event_listener) 114 : DBEngine(r, &iters_count), 115 env_mgr(std::move(e)), 116 rep_deleter(r), 117 block_cache(bc), 118 event_listener(event_listener), 119 iters_count(0) {} 120 121 DBImpl::~DBImpl() { 122 const rocksdb::Options& opts = rep->GetOptions(); 123 const std::shared_ptr<rocksdb::Statistics>& s = opts.statistics; 124 rocksdb::Info(opts.info_log, "bloom filter utility: %0.1f%%", 125 (100.0 * s->getTickerCount(rocksdb::BLOOM_FILTER_PREFIX_USEFUL)) / 126 s->getTickerCount(rocksdb::BLOOM_FILTER_PREFIX_CHECKED)); 127 } 128 129 DBStatus DBImpl::AssertPreClose() { 130 const int64_t n = iters_count.load(); 131 if (n == 0) { 132 return kSuccess; 133 } 134 return FmtStatus("%" PRId64 " leaked iterators", n); 135 } 136 137 DBStatus DBImpl::Put(DBKey key, DBSlice value) { 138 rocksdb::WriteOptions options; 139 return ToDBStatus(rep->Put(options, EncodeKey(key), ToSlice(value))); 140 } 141 142 DBStatus DBImpl::Merge(DBKey key, DBSlice value) { 143 rocksdb::WriteOptions options; 144 return ToDBStatus(rep->Merge(options, EncodeKey(key), ToSlice(value))); 145 } 146 147 DBStatus DBImpl::Get(DBKey key, DBString* value) { 148 rocksdb::ReadOptions read_opts; 149 DBGetter base(rep, read_opts, EncodeKey(key)); 150 return base.Get(value); 151 } 152 153 DBStatus DBImpl::Delete(DBKey key) { 154 rocksdb::WriteOptions options; 155 return ToDBStatus(rep->Delete(options, EncodeKey(key))); 156 } 157 158 DBStatus DBImpl::SingleDelete(DBKey key) { 159 rocksdb::WriteOptions options; 160 return ToDBStatus(rep->SingleDelete(options, EncodeKey(key))); 161 } 162 163 DBStatus DBImpl::DeleteRange(DBKey start, DBKey end) { 164 rocksdb::WriteOptions options; 165 return ToDBStatus( 166 rep->DeleteRange(options, rep->DefaultColumnFamily(), EncodeKey(start), EncodeKey(end))); 167 } 168 169 DBStatus DBImpl::CommitBatch(bool sync) { return FmtStatus("unsupported"); } 170 171 DBStatus DBImpl::ApplyBatchRepr(DBSlice repr, bool sync) { 172 rocksdb::WriteBatch batch(ToString(repr)); 173 rocksdb::WriteOptions options; 174 options.sync = sync; 175 return ToDBStatus(rep->Write(options, &batch)); 176 } 177 178 DBSlice DBImpl::BatchRepr() { return ToDBSlice("unsupported"); } 179 180 DBIterator* DBImpl::NewIter(DBIterOptions iter_opts) { 181 DBIterator* iter = new DBIterator(iters, iter_opts); 182 iter->rep.reset(rep->NewIterator(iter->read_opts)); 183 return iter; 184 } 185 186 // GetStats retrieves a subset of RocksDB stats that are relevant to 187 // CockroachDB. 188 DBStatus DBImpl::GetStats(DBStatsResult* stats) { 189 const rocksdb::Options& opts = rep->GetOptions(); 190 const std::shared_ptr<rocksdb::Statistics>& s = opts.statistics; 191 192 uint64_t memtable_total_size; 193 rep->GetIntProperty("rocksdb.cur-size-all-mem-tables", &memtable_total_size); 194 195 uint64_t table_readers_mem_estimate; 196 rep->GetIntProperty("rocksdb.estimate-table-readers-mem", &table_readers_mem_estimate); 197 198 uint64_t pending_compaction_bytes_estimate; 199 rep->GetIntProperty("rocksdb.estimate-pending-compaction-bytes", 200 &pending_compaction_bytes_estimate); 201 202 std::string l0_file_count_str; 203 rep->GetProperty("rocksdb.num-files-at-level0", &l0_file_count_str); 204 205 stats->block_cache_hits = (int64_t)s->getTickerCount(rocksdb::BLOCK_CACHE_HIT); 206 stats->block_cache_misses = (int64_t)s->getTickerCount(rocksdb::BLOCK_CACHE_MISS); 207 stats->block_cache_usage = (int64_t)block_cache->GetUsage(); 208 stats->block_cache_pinned_usage = (int64_t)block_cache->GetPinnedUsage(); 209 stats->bloom_filter_prefix_checked = 210 (int64_t)s->getTickerCount(rocksdb::BLOOM_FILTER_PREFIX_CHECKED); 211 stats->bloom_filter_prefix_useful = 212 (int64_t)s->getTickerCount(rocksdb::BLOOM_FILTER_PREFIX_USEFUL); 213 stats->memtable_total_size = memtable_total_size; 214 stats->flushes = (int64_t)event_listener->GetFlushes(); 215 stats->flush_bytes = (int64_t)s->getTickerCount(rocksdb::FLUSH_WRITE_BYTES); 216 stats->compactions = (int64_t)event_listener->GetCompactions(); 217 stats->compact_read_bytes = 218 (int64_t)s->getTickerCount(rocksdb::COMPACT_READ_BYTES); 219 stats->compact_write_bytes = 220 (int64_t)s->getTickerCount(rocksdb::COMPACT_WRITE_BYTES); 221 stats->table_readers_mem_estimate = table_readers_mem_estimate; 222 stats->pending_compaction_bytes_estimate = pending_compaction_bytes_estimate; 223 stats->l0_file_count = std::atoi(l0_file_count_str.c_str()); 224 return kSuccess; 225 } 226 227 // `GetTickersAndHistograms` retrieves maps of all RocksDB tickers and histograms. 228 // It differs from `GetStats` by getting _every_ ticker and histogram, and by not 229 // getting anything else (DB properties, for example). 230 // 231 // In addition to freeing the `DBString`s in the result, the caller is also 232 // responsible for freeing `DBTickersAndHistogramsResult::tickers` and 233 // `DBTickersAndHistogramsResult::histograms`. 234 DBStatus DBImpl::GetTickersAndHistograms(DBTickersAndHistogramsResult* stats) { 235 const rocksdb::Options& opts = rep->GetOptions(); 236 const std::shared_ptr<rocksdb::Statistics>& s = opts.statistics; 237 stats->tickers_len = rocksdb::TickersNameMap.size(); 238 // We malloc the result so it can be deallocated by the caller using free(). 239 stats->tickers = static_cast<TickerInfo*>(malloc(stats->tickers_len * sizeof(TickerInfo))); 240 if (stats->tickers == nullptr) { 241 return FmtStatus("malloc failed"); 242 } 243 for (size_t i = 0; i < stats->tickers_len; ++i) { 244 stats->tickers[i].name = ToDBString(rocksdb::TickersNameMap[i].second); 245 stats->tickers[i].value = s->getTickerCount(static_cast<uint32_t>(i)); 246 } 247 248 stats->histograms_len = rocksdb::HistogramsNameMap.size(); 249 // We malloc the result so it can be deallocated by the caller using free(). 250 stats->histograms = 251 static_cast<HistogramInfo*>(malloc(stats->histograms_len * sizeof(HistogramInfo))); 252 if (stats->histograms == nullptr) { 253 return FmtStatus("malloc failed"); 254 } 255 for (size_t i = 0; i < stats->histograms_len; ++i) { 256 stats->histograms[i].name = ToDBString(rocksdb::HistogramsNameMap[i].second); 257 rocksdb::HistogramData data; 258 s->histogramData(static_cast<uint32_t>(i), &data); 259 stats->histograms[i].mean = data.average; 260 stats->histograms[i].p50 = data.median; 261 stats->histograms[i].p95 = data.percentile95; 262 stats->histograms[i].p99 = data.percentile99; 263 stats->histograms[i].max = data.max; 264 stats->histograms[i].count = data.count; 265 stats->histograms[i].sum = data.sum; 266 } 267 return kSuccess; 268 } 269 270 DBString DBImpl::GetCompactionStats() { 271 std::string tmp; 272 rep->GetProperty("rocksdb.cfstats-no-file-histogram", &tmp); 273 return ToDBString(tmp); 274 } 275 276 DBStatus DBImpl::GetEnvStats(DBEnvStatsResult* stats) { 277 // Always initialize the fields. 278 stats->encryption_status = DBString(); 279 stats->total_files = stats->total_bytes = stats->active_key_files = stats->active_key_bytes = 0; 280 stats->encryption_type = 0; 281 282 if (env_mgr->env_stats_handler == nullptr || env_mgr->file_registry == nullptr) { 283 // We can't compute these if we don't have a file registry or stats handler. 284 // This happens in OSS mode or when encryption has not been turned on. 285 return kSuccess; 286 } 287 288 // Get encryption algorithm. 289 stats->encryption_type = env_mgr->env_stats_handler->GetActiveStoreKeyType(); 290 291 // Get encryption status. 292 std::string encryption_status; 293 auto status = env_mgr->env_stats_handler->GetEncryptionStats(&encryption_status); 294 if (!status.ok()) { 295 return ToDBStatus(status); 296 } 297 298 stats->encryption_status = ToDBString(encryption_status); 299 300 // Get file statistics. 301 FileStats file_stats(env_mgr.get()); 302 status = file_stats.GetFiles(rep); 303 if (!status.ok()) { 304 return ToDBStatus(status); 305 } 306 307 // Get current active key ID. 308 auto active_key_id = env_mgr->env_stats_handler->GetActiveDataKeyID(); 309 310 // Request stats for the Data env only. 311 status = file_stats.GetStatsForEnvAndKey(enginepb::Data, active_key_id, stats); 312 if (!status.ok()) { 313 return ToDBStatus(status); 314 } 315 316 return kSuccess; 317 } 318 319 DBStatus DBImpl::GetEncryptionRegistries(DBEncryptionRegistries* result) { 320 // Always initialize the fields. 321 result->file_registry = DBString(); 322 result->key_registry = DBString(); 323 324 if (env_mgr->env_stats_handler == nullptr || env_mgr->file_registry == nullptr) { 325 // We can't compute these if we don't have a file registry or stats handler. 326 // This happens in OSS mode or when encryption has not been turned on. 327 return kSuccess; 328 } 329 330 auto file_registry = env_mgr->file_registry->GetFileRegistry(); 331 if (file_registry == nullptr) { 332 return ToDBStatus(rocksdb::Status::InvalidArgument("file registry has not been loaded")); 333 } 334 335 std::string serialized_file_registry; 336 if (!file_registry->SerializeToString(&serialized_file_registry)) { 337 return ToDBStatus(rocksdb::Status::InvalidArgument("failed to serialize file registry proto")); 338 } 339 340 std::string serialized_key_registry; 341 auto status = env_mgr->env_stats_handler->GetEncryptionRegistry(&serialized_key_registry); 342 if (!status.ok()) { 343 return ToDBStatus(status); 344 } 345 346 result->file_registry = ToDBString(serialized_file_registry); 347 result->key_registry = ToDBString(serialized_key_registry); 348 349 return kSuccess; 350 } 351 352 // EnvWriteFile writes the given data as a new "file" in the given engine. 353 DBStatus DBImpl::EnvWriteFile(DBSlice path, DBSlice contents) { 354 rocksdb::Status s; 355 356 const rocksdb::EnvOptions soptions; 357 std::unique_ptr<rocksdb::WritableFile> destfile; 358 s = this->rep->GetEnv()->NewWritableFile(ToString(path), &destfile, soptions); 359 if (!s.ok()) { 360 return ToDBStatus(s); 361 } 362 363 s = destfile->Append(ToSlice(contents)); 364 if (!s.ok()) { 365 return ToDBStatus(s); 366 } 367 368 return kSuccess; 369 } 370 371 // EnvOpenFile opens a new file in the given engine. 372 DBStatus DBImpl::EnvOpenFile(DBSlice path, uint64_t bytes_per_sync, rocksdb::WritableFile** file) { 373 rocksdb::Status status; 374 rocksdb::EnvOptions soptions; 375 soptions.bytes_per_sync = bytes_per_sync; 376 std::unique_ptr<rocksdb::WritableFile> rocksdb_file; 377 378 // Create the file. 379 status = this->rep->GetEnv()->NewWritableFile(ToString(path), &rocksdb_file, soptions); 380 if (!status.ok()) { 381 return ToDBStatus(status); 382 } 383 *file = rocksdb_file.release(); 384 return kSuccess; 385 } 386 387 // EnvReadFile reads the content of the given filename. 388 DBStatus DBImpl::EnvReadFile(DBSlice path, DBSlice* contents) { 389 rocksdb::Status status; 390 std::string data; 391 392 status = ReadFileToString(this->rep->GetEnv(), ToString(path), &data); 393 if (!status.ok()) { 394 if (status.IsNotFound()) { 395 return FmtStatus("No such file or directory"); 396 } 397 return ToDBStatus(status); 398 } 399 contents->data = static_cast<char*>(malloc(data.size())); 400 contents->len = data.size(); 401 memcpy(contents->data, data.c_str(), data.size()); 402 return kSuccess; 403 } 404 405 // CloseFile closes the given file in the given engine. 406 DBStatus DBImpl::EnvCloseFile(rocksdb::WritableFile* file) { 407 rocksdb::Status status = file->Close(); 408 delete file; 409 return ToDBStatus(status); 410 } 411 412 // EnvAppendFile appends the given data to the file in the given engine. 413 DBStatus DBImpl::EnvAppendFile(rocksdb::WritableFile* file, DBSlice contents) { 414 rocksdb::Status status = file->Append(ToSlice(contents)); 415 return ToDBStatus(status); 416 } 417 418 // EnvSyncFile synchronously writes the data of the file to the disk. 419 DBStatus DBImpl::EnvSyncFile(rocksdb::WritableFile* file) { 420 rocksdb::Status status = file->Sync(); 421 return ToDBStatus(status); 422 } 423 424 // EnvDeleteFile deletes the file with the given filename. 425 DBStatus DBImpl::EnvDeleteFile(DBSlice path) { 426 rocksdb::Status status = this->rep->GetEnv()->DeleteFile(ToString(path)); 427 if (status.IsNotFound()) { 428 return FmtStatus("No such file or directory"); 429 } 430 return ToDBStatus(status); 431 } 432 433 // EnvDeleteDirAndFiles deletes the directory with the given dir name and any 434 // files it contains but not subdirectories. 435 DBStatus DBImpl::EnvDeleteDirAndFiles(DBSlice dir) { 436 rocksdb::Status status; 437 438 std::vector<std::string> files; 439 this->rep->GetEnv()->GetChildren(ToString(dir), &files); 440 for (auto& file : files) { 441 if (file != "." && file != "..") { 442 this->rep->GetEnv()->DeleteFile(ToString(dir) + "/" + file); 443 } 444 } 445 446 status = this->rep->GetEnv()->DeleteDir(ToString(dir)); 447 if (status.IsNotFound()) { 448 return FmtStatus("No such file or directory"); 449 } 450 return ToDBStatus(status); 451 } 452 453 // EnvLinkFile creates 'newname' as a hard link to 'oldname'. 454 DBStatus DBImpl::EnvLinkFile(DBSlice oldname, DBSlice newname) { 455 return ToDBStatus(this->rep->GetEnv()->LinkFile(ToString(oldname), ToString(newname))); 456 } 457 458 DBStatus DBImpl::EnvOpenReadableFile(DBSlice path, rocksdb::RandomAccessFile** file) { 459 rocksdb::Status status; 460 const rocksdb::EnvOptions soptions; 461 std::unique_ptr<rocksdb::RandomAccessFile> rocksdb_file; 462 463 status = this->rep->GetEnv()->NewRandomAccessFile(ToString(path), &rocksdb_file, soptions); 464 if (!status.ok()) { 465 return ToDBStatus(status); 466 } 467 *file = rocksdb_file.release(); 468 return kSuccess; 469 } 470 471 DBStatus DBImpl::EnvCloseReadableFile(rocksdb::RandomAccessFile* file) { 472 delete file; 473 return kSuccess; 474 } 475 476 DBStatus DBImpl::EnvReadAtFile(rocksdb::RandomAccessFile* file, DBSlice buffer, int64_t offset, 477 int* n) { 478 size_t max_bytes_to_read = buffer.len; 479 char* scratch = buffer.data; 480 rocksdb::Slice result; 481 auto status = file->Read(offset, max_bytes_to_read, &result, scratch); 482 *n = result.size(); 483 return ToDBStatus(status); 484 } 485 486 DBStatus DBImpl::EnvOpenDirectory(DBSlice path, rocksdb::Directory** file) { 487 rocksdb::Status status; 488 std::unique_ptr<rocksdb::Directory> rocksdb_dir; 489 490 status = this->rep->GetEnv()->NewDirectory(ToString(path), &rocksdb_dir); 491 if (!status.ok()) { 492 return ToDBStatus(status); 493 } 494 *file = rocksdb_dir.release(); 495 return kSuccess; 496 } 497 498 DBStatus DBImpl::EnvSyncDirectory(rocksdb::Directory* file) { return ToDBStatus(file->Fsync()); } 499 500 DBStatus DBImpl::EnvCloseDirectory(rocksdb::Directory* file) { 501 delete file; 502 return kSuccess; 503 } 504 505 DBStatus DBImpl::EnvRenameFile(DBSlice oldname, DBSlice newname) { 506 return ToDBStatus(this->rep->GetEnv()->RenameFile(ToString(oldname), ToString(newname))); 507 } 508 509 DBStatus DBImpl::EnvCreateDir(DBSlice name) { 510 return ToDBStatus(this->rep->GetEnv()->CreateDirIfMissing(ToString(name))); 511 } 512 513 DBStatus DBImpl::EnvDeleteDir(DBSlice name) { 514 return ToDBStatus(this->rep->GetEnv()->DeleteDir(ToString(name))); 515 } 516 517 DBStatus DBImpl::EnvListDir(DBSlice name, std::vector<std::string>* result) { 518 return ToDBStatus(this->rep->GetEnv()->GetChildren(ToString(name), result)); 519 } 520 521 } // namespace cockroach