github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/c-deps/libroach/options.cc

// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

#include "options.h"
#include <limits>
#include <rocksdb/env.h>
#include <rocksdb/filter_policy.h>
#include <rocksdb/slice_transform.h>
#include <rocksdb/table.h>
#include "cache.h"
#include "comparator.h"
#include "db.h"
#include "encoding.h"
#include "godefs.h"
#include "merge.h"
#include "protos/util/log/log.pb.h"
#include "table_props.h"

namespace cockroach {

namespace {

class DBPrefixExtractor : public rocksdb::SliceTransform {
 public:
  DBPrefixExtractor() {}

  virtual const char* Name() const { return "cockroach_prefix_extractor"; }

  // MVCC keys are encoded as <user-key>/<timestamp>. Extract the <user-key>
  // prefix, which allows for more efficient iteration over the keys matching
  // a particular <user-key>. Specifically, the <user-key> will be added to
  // the per-table bloom filters and will be used to skip tables which do not
  // contain the <user-key>.
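  //
  // As a hypothetical illustration (the exact byte-level encoding lives in
  // encoding.h): two versions <k1, ts=10> and <k1, ts=20> of a user key k1
  // both transform to the same prefix, so a seek for k1 can consult the
  // per-table bloom filters once and skip any sstable that contains no
  // version of k1.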
  virtual rocksdb::Slice Transform(const rocksdb::Slice& src) const { return KeyPrefix(src); }

  virtual bool InDomain(const rocksdb::Slice& src) const { return true; }
};

// The DBLogger is a rocksdb::Logger that calls back into Go code for formatted logging.
class DBLogger : public rocksdb::Logger {
 public:
  DBLogger(bool use_primary_log) : use_primary_log_(use_primary_log) {}

  virtual void Logv(const rocksdb::InfoLogLevel log_level, const char* format,
                    va_list ap) override {
    int go_log_level = util::log::Severity::UNKNOWN;  // the compiler tells us to initialize it
    switch (log_level) {
    case rocksdb::DEBUG_LEVEL:
      // There is no DEBUG severity. Just give it INFO severity, then.
      go_log_level = util::log::Severity::INFO;
      break;
    case rocksdb::INFO_LEVEL:
      go_log_level = util::log::Severity::INFO;
      break;
    case rocksdb::WARN_LEVEL:
      go_log_level = util::log::Severity::WARNING;
      break;
    case rocksdb::ERROR_LEVEL:
      go_log_level = util::log::Severity::ERROR;
      break;
    case rocksdb::FATAL_LEVEL:
      go_log_level = util::log::Severity::FATAL;
      break;
    case rocksdb::HEADER_LEVEL:
      // There is no HEADER severity. Just give it INFO severity, then.
      go_log_level = util::log::Severity::INFO;
      break;
    case rocksdb::NUM_INFO_LOG_LEVELS:
      assert(false);
      return;
    }

    // First try with a small fixed-size buffer.
    char space[1024];

    // It's possible for methods that use a va_list to invalidate the data in
    // it upon use. The fix is to make a copy of the structure before using it
    // and use that copy instead.
    va_list backup_ap;
    va_copy(backup_ap, ap);
    int result = vsnprintf(space, sizeof(space), format, backup_ap);
    va_end(backup_ap);

    if ((result >= 0) && (static_cast<size_t>(result) < sizeof(space))) {
      rocksDBLog(use_primary_log_, go_log_level, space, result);
      return;
    }

    // Repeatedly increase the buffer size until the message fits.
    int length = sizeof(space);
    while (true) {
      if (result < 0) {
        // Older vsnprintf implementations return a negative value on
        // overflow rather than the required size, so just try doubling the
        // buffer size.
        length *= 2;
      } else {
        // We need exactly "result+1" characters.
        length = result + 1;
      }
      char* buf = new char[length];

      // Restore the va_list before we use it again.
      va_copy(backup_ap, ap);
      result = vsnprintf(buf, length, format, backup_ap);
      va_end(backup_ap);

      if ((result >= 0) && (result < length)) {
        // It fit.
        rocksDBLog(use_primary_log_, go_log_level, buf, result);
        delete[] buf;
        return;
      }
      delete[] buf;
    }
  }

  virtual void LogHeader(const char* format, va_list ap) override {
    // RocksDB's `Logger::LogHeader()` implementation forgot to call the
    // `Logv()` overload that takes severity info. Until it's fixed we can
    // override their implementation.
    Logv(rocksdb::InfoLogLevel::HEADER_LEVEL, format, ap);
  }

  virtual void Logv(const char* format, va_list ap) override {
    // The RocksDB API tries to force us to separate the severity check (above
    // function) from the actual logging (this function) by making this
    // function pure virtual. However, when calling into Go, we need to
    // provide the severity level to both the severity level check function
    // (`rocksDBV`) and the actual logging function (`rocksDBLog`). So we do
    // all the work in the function that has the severity level and expect
    // this function to never be called.
    assert(false);
  }

 private:
  const bool use_primary_log_;
};

}  // namespace

rocksdb::Logger* NewDBLogger(bool use_primary_log) { return new DBLogger(use_primary_log); }
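// The returned logger is installed below via options.info_log in
// DBMakeOptions; the actual writing happens in rocksDBLog on the Go side.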

rocksdb::Options DBMakeOptions(DBOptions db_opts) {
  // Use the rocksdb options builder to configure the base options
  // using our memtable budget.
  rocksdb::Options options;
  // Increase parallelism for compactions and flushes based on the
  // number of cpus. Always use at least 2 threads, otherwise
  // compactions and flushes may fight with each other.
  options.IncreaseParallelism(std::max(db_opts.num_cpu, 2));
  // Disable subcompactions since they're a less stable feature, and not
  // necessary for our workload, where frequent fsyncs naturally prevent
  // foreground writes from getting too far ahead of compactions.
  options.max_subcompactions = 1;
  options.comparator = &kComparator;
  options.create_if_missing = !db_opts.must_exist;
  options.info_log.reset(NewDBLogger(false /* use_primary_log */));
  options.merge_operator.reset(NewMergeOperator());
  options.prefix_extractor.reset(new DBPrefixExtractor);
  options.statistics = rocksdb::CreateDBStatistics();
  options.max_open_files = db_opts.max_open_files;
  options.compaction_pri = rocksdb::kMinOverlappingRatio;
  // Periodically sync SST writes to smooth out disk usage. Not performing
  // such syncs can be faster but can cause performance blips when the OS
  // decides it needs to flush data.
  options.bytes_per_sync = 512 << 10;  // 512 KB
  // Enabling `strict_bytes_per_sync` prevents the situation where an SST is
  // generated fast enough that the async writeback submissions fall behind.
  // It enforces that we wait for any previous `bytes_per_sync` sync to finish
  // before issuing any future sync. That way we prevent situations where a
  // huge amount of data gets written out all at once upon finishing a file
  // (the final sync covers all the data, not just a range of size
  // `bytes_per_sync`).
  options.strict_bytes_per_sync = true;
  // Do not sync the WAL periodically. We already sync it on every write by
  // calling `FlushWAL(true)` on non-temp stores. On the temp store we do not
  // intend to ever sync the WAL, so setting this to zero is fine there too.
  options.wal_bytes_per_sync = 0;

  // On ext4 and xfs, at least, `fallocate()`ing a large empty WAL is not
  // enough to avoid inode writeback on every `fdatasync()`. Although
  // `fallocate()` can preallocate space and preset the file size, it marks
  // the preallocated "extents" as unwritten in the inode to guarantee readers
  // cannot be exposed to data belonging to others. Every time `fdatasync()`
  // happens, an inode writeback happens for the update to split an unwritten
  // extent and mark part of it as written.
  //
  // Setting `recycle_log_file_num > 0` circumvents this as it'll eventually
  // reuse WALs where extents are already all marked as written. When the DB
  // opens, the first WAL will have its space preallocated as unwritten
  // extents, so it will still incur frequent inode writebacks. The second WAL
  // will as well, since the first WAL cannot be recycled until the first
  // flush completes. From the third WAL onwards, however, we will have a
  // previously written WAL readily available to recycle.
  //
  // We could pick a higher value if we see memtable flush backing up, or if
  // we start using column families (the WAL changes every time any column
  // family initiates a flush, and the WAL cannot be reused until that flush
  // completes).
  options.recycle_log_file_num = 1;

  // The size in which reads should be performed during compaction. The
  // internets claim this can speed up compactions, though the RocksDB docs
  // say it is only useful on spinning disks. Experimentally it has had no
  // effect.
  // options.compaction_readahead_size = 2 << 20;

  // Do not create bloom filters for the last level (i.e. the largest level
  // which contains data in the LSM store). Setting this option reduces the
  // size of the bloom filters by 10x. This is significant given that bloom
  // filters require 1.25 bytes (10 bits) per key, which can translate into
  // gigabytes of memory given typical key and value sizes. The downside is
  // that bloom filters will only be usable on the higher levels, but that
  // seems acceptable. We typically see read amplification of 5-6x on
  // clusters (i.e. there are 5-6 levels of sstables), which means we'll
  // achieve 80-90% of the benefit of having bloom filters on every level for
  // only 10% of the memory cost.
  options.optimize_filters_for_hits = true;

  // We periodically report stats ourselves, and by default the info logger
  // swallows log messages.
  options.stats_dump_period_sec = 0;

  // Use the TablePropertiesCollector hook to store the min and max MVCC
  // timestamps present in each sstable in the metadata for that sstable.
  options.table_properties_collector_factories.emplace_back(DBMakeTimeBoundCollector());

  // Automatically request compactions whenever an SST contains too many range
  // deletions.
  options.table_properties_collector_factories.emplace_back(DBMakeDeleteRangeCollector());

  // The write buffer size is the size of the in-memory structure that will be
  // flushed to create L0 files.
  options.write_buffer_size = 64 << 20;  // 64 MB
  // How much memory should be allotted to memtables? Note that this is a
  // peak setting; steady state should be lower. We set this relatively high
  // to account for bursts of writes (e.g. due to a deletion of a large range
  // of keys). In particular, we want this to be somewhat larger than the
  // typical range size so that deletion of a range worth of keys does not
  // cause write stalls.
  options.max_write_buffer_number = 4;
  // Number of files to trigger L0 compaction. We set this low so that we
  // quickly move files out of L0, as each L0 file increases read
  // amplification.
  options.level0_file_num_compaction_trigger = 2;
  // Soft limit on the number of L0 files. Writes are slowed down when this
  // number is reached. Bulk-ingestion can add lots of files suddenly, so
  // setting this much higher should avoid spurious slowdowns to writes.
  // TODO(dt): if/when we dynamically tune for bulk-ingestion, we could leave
  // this at 20 and only raise it during ingest jobs.
  options.level0_slowdown_writes_trigger = 950;
  // Maximum number of L0 files. Writes are stopped at this point. This is
  // set significantly higher than level0_slowdown_writes_trigger to avoid
  // completely blocking writes.
  // TODO(dt): if/when we dynamically tune for bulk-ingestion, we could leave
  // this at 30 and only raise it during ingest.
  options.level0_stop_writes_trigger = 1000;
  // Maximum estimated pending compaction bytes before slowing writes. The
  // default is 64 GB, but that can be hit easily during bulk-ingestion since
  // it is based on assumptions about relative level sizes that do not hold
  // when adding data directly. Additionally, some system-critical writes in
  // cockroach (node-liveness) simply cannot be slow or they will fail and
  // cause unavailability, so back-pressuring may *cause* unavailability
  // instead of gracefully slowing to some stable equilibrium that avoids it.
  // As such, we set these so they are impossible to hit.
  options.soft_pending_compaction_bytes_limit = std::numeric_limits<uint64_t>::max();
  options.hard_pending_compaction_bytes_limit = std::numeric_limits<uint64_t>::max();
  // Flush write buffers to L0 as soon as they are full. A higher value could
  // be beneficial if there are duplicate records in each of the individual
  // write buffers, but perf testing hasn't shown any benefit so far.
  options.min_write_buffer_number_to_merge = 1;
  // Enable dynamic level sizing, which reduces both size and write
  // amplification. This causes RocksDB to pick the target size of each level
  // dynamically.
  options.level_compaction_dynamic_level_bytes = true;
  // Follow the RocksDB recommendation to configure the size of L1 to be the
  // same as the estimated size of L0.
  options.max_bytes_for_level_base = 64 << 20;  // 64 MB
  options.max_bytes_for_level_multiplier = 10;
  // Target the base file size (L1) as 4 MB. Each additional level grows the
  // file size by 2x. With max_bytes_for_level_base set to 64 MB, this
  // translates into the following target level and file sizes for each
  // level:
  //
  //        level-size  file-size  max-files
  //    L1:      64 MB       4 MB         16
  //    L2:     640 MB       8 MB         80
  //    L3:    6.25 GB      16 MB        400
  //    L4:    62.5 GB      32 MB       2000
  //    L5:     625 GB      64 MB      10000
  //    L6:     6.1 TB     128 MB      50000
  //
  // Due to the use of level_compaction_dynamic_level_bytes most data will be
  // in L6. The number of files will be approximately
  // total-data-size / 128 MB.
  //
  // We don't want the target file size to be too large, otherwise individual
  // compactions become more expensive. We don't want the target file size to
  // be too small or else we get an overabundance of sstables.
  options.target_file_size_base = 4 << 20;  // 4 MB
  options.target_file_size_multiplier = 2;
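  // To spell out one row of the table above: L2's level-size is
  // max_bytes_for_level_base * max_bytes_for_level_multiplier
  // = 64 MB * 10 = 640 MB, its file-size is target_file_size_base *
  // target_file_size_multiplier = 4 MB * 2 = 8 MB, and so its max-files is
  // 640 MB / 8 MB = 80.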
  options.manual_wal_flush = true;

  // Because we open a long-running rocksdb instance, we do not want the
  // manifest file to grow unbounded. Assuming each manifest entry is about
  // 1 KB, this allows for 128K entries. This could account for several hours
  // to a few months of runtime without rolling, depending on the workload.
  options.max_manifest_file_size = 128 << 20;  // 128 MB

  rocksdb::BlockBasedTableOptions table_options;
  if (db_opts.cache != nullptr) {
    table_options.block_cache = db_opts.cache->rep;

    // Reserve 1 memtable worth of memory from the cache. Under high load
    // situations we'll be using somewhat more than 1 memtable, but usually
    // not significantly more unless there is an I/O throughput problem.
    //
    // We ensure that at least 1 MB is allocated for the block cache. Some
    // unit tests expect to see a non-zero block cache hit rate, but they use
    // a cache that is small enough that all of it would otherwise be reserved
    // for the memtable.
    std::lock_guard<std::mutex> guard(db_opts.cache->mu);
    const int64_t capacity = db_opts.cache->rep->GetCapacity();
    const int64_t new_capacity = std::max<int64_t>(1 << 20, capacity - options.write_buffer_size);
    db_opts.cache->rep->SetCapacity(new_capacity);
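    // For example (hypothetical cache sizes): a 128 MB cache shrinks to
    // 128 MB - 64 MB = 64 MB, while a 32 MB cache would go negative and is
    // clamped to the 1 MB floor instead.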
  }

  // Passing false for use_block_based_builder creates a per-file (sstable)
  // filter instead of a per-block filter. The per-file filter can be
  // consulted before going to the index, which saves an index lookup. The
  // cost is 4 bytes per key in memory during compactions, which seems a small
  // price to pay.
  table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, false /* !block_based */));
  table_options.format_version = 2;

  // Increasing block_size decreases memory usage at the cost of increased
  // read amplification. When reading a key-value pair from a table file,
  // RocksDB loads an entire block into memory. The RocksDB default is 4 KB.
  // This sets it to 32 KB.
  table_options.block_size = 32 << 10;
  // Disable whole_key_filtering, which adds a bloom filter entry for the
  // "whole key", doubling the size of our bloom filters. It is used to speed
  // up Get operations, which we don't use.
  table_options.whole_key_filtering = false;
  options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(table_options));
  return options;
}

}  // namespace cockroach
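// ---------------------------------------------------------------------------
// Illustrative usage sketch, not part of the original file. It assumes the
// DBOptions fields shown here (num_cpu, max_open_files, must_exist, cache)
// match their declarations in db.h; treat the exact field set as an
// assumption.
//
//   DBOptions db_opts = {};
//   db_opts.num_cpu = 4;           // IncreaseParallelism(max(4, 2))
//   db_opts.max_open_files = 1024;
//   db_opts.must_exist = false;    // create_if_missing = true
//   db_opts.cache = nullptr;       // skip the block-cache capacity adjustment
//   rocksdb::Options options = cockroach::DBMakeOptions(db_opts);
//   rocksdb::DB* db;
//   rocksdb::Status status = rocksdb::DB::Open(options, "/path/to/store", &db);
// ---------------------------------------------------------------------------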