github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/c-deps/libroach/options.cc (about)

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  #include "options.h"
    12  #include <limits>
    13  #include <rocksdb/env.h>
    14  #include <rocksdb/filter_policy.h>
    15  #include <rocksdb/slice_transform.h>
    16  #include <rocksdb/table.h>
    17  #include "cache.h"
    18  #include "comparator.h"
    19  #include "db.h"
    20  #include "encoding.h"
    21  #include "godefs.h"
    22  #include "merge.h"
    23  #include "protos/util/log/log.pb.h"
    24  #include "table_props.h"
    25  
    26  namespace cockroach {
    27  
    28  namespace {
    29  
    30  class DBPrefixExtractor : public rocksdb::SliceTransform {
    31   public:
    32    DBPrefixExtractor() {}
    33  
    34    virtual const char* Name() const { return "cockroach_prefix_extractor"; }
    35  
    36    // MVCC keys are encoded as <user-key>/<timestamp>. Extract the <user-key>
    37    // prefix which will allow for more efficient iteration over the keys
    38    // matching a particular <user-key>. Specifically, the <user-key> will be
    39    // added to the per table bloom filters and will be used to skip tables
    40    // which do not contain the <user-key>.
    41    virtual rocksdb::Slice Transform(const rocksdb::Slice& src) const { return KeyPrefix(src); }
    42  
    43    virtual bool InDomain(const rocksdb::Slice& src) const { return true; }
    44  };
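
        // A minimal sketch of how a prefix extractor is consumed on the read
        // path (illustrative only; the real iterator setup lives elsewhere in
        // libroach, and `db` / `mvcc_key` here are assumed to exist):
        //
        //   rocksdb::ReadOptions read_opts;
        //   // Confine iteration to keys sharing Transform(mvcc_key) so RocksDB
        //   // can consult the per-table prefix bloom filters.
        //   read_opts.prefix_same_as_start = true;
        //   std::unique_ptr<rocksdb::Iterator> iter(db->NewIterator(read_opts));
        //   for (iter->Seek(mvcc_key); iter->Valid(); iter->Next()) {
        //     // Every key seen here shares the <user-key> prefix of mvcc_key.
        //   }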
    45  
    46  // The DBLogger is a rocksdb::Logger that calls back into Go code for formatted logging.
    47  class DBLogger : public rocksdb::Logger {
    48   public:
    49    DBLogger(bool use_primary_log) : use_primary_log_(use_primary_log) {}
    50  
    51    virtual void Logv(const rocksdb::InfoLogLevel log_level, const char* format,
    52                      va_list ap) override {
    53      int go_log_level = util::log::Severity::UNKNOWN;  // compiler tells us to initialize it
    54      switch (log_level) {
    55      case rocksdb::DEBUG_LEVEL:
    56        // There is no DEBUG severity. Just give it INFO severity, then.
    57        go_log_level = util::log::Severity::INFO;
    58        break;
    59      case rocksdb::INFO_LEVEL:
    60        go_log_level = util::log::Severity::INFO;
    61        break;
    62      case rocksdb::WARN_LEVEL:
    63        go_log_level = util::log::Severity::WARNING;
    64        break;
    65      case rocksdb::ERROR_LEVEL:
    66        go_log_level = util::log::Severity::ERROR;
    67        break;
    68      case rocksdb::FATAL_LEVEL:
    69        go_log_level = util::log::Severity::FATAL;
    70        break;
    71      case rocksdb::HEADER_LEVEL:
    72        // There is no HEADER severity. Just give it INFO severity, then.
    73        go_log_level = util::log::Severity::INFO;
    74        break;
    75      case rocksdb::NUM_INFO_LOG_LEVELS:
    76        assert(false);
    77        return;
    78      }
    79  
    80      // First try with a small fixed size buffer.
    81      char space[1024];
    82  
    83      // It's possible for methods that use a va_list to invalidate the data in
    84      // it upon use. The fix is to make a copy of the structure before using it
    85      // and use that copy instead.
    86      va_list backup_ap;
    87      va_copy(backup_ap, ap);
    88      int result = vsnprintf(space, sizeof(space), format, backup_ap);
    89      va_end(backup_ap);
    90  
    91      if ((result >= 0) && (result < sizeof(space))) {
    92        rocksDBLog(use_primary_log_, go_log_level, space, result);
    93        return;
    94      }
    95  
    96      // Repeatedly increase buffer size until it fits.
    97      int length = sizeof(space);
    98      while (true) {
    99        if (result < 0) {
   100          // Older behavior: just try doubling the buffer size.
   101          length *= 2;
   102        } else {
   103          // We need exactly "result+1" characters.
   104          length = result + 1;
   105        }
   106        char* buf = new char[length];
   107  
   108        // Restore the va_list before we use it again
   109        va_copy(backup_ap, ap);
   110        result = vsnprintf(buf, length, format, backup_ap);
   111        va_end(backup_ap);
   112  
   113        if ((result >= 0) && (result < length)) {
   114          // It fit
   115          rocksDBLog(use_primary_log_, go_log_level, buf, result);
   116          delete[] buf;
   117          return;
   118        }
   119        delete[] buf;
   120      }
   121    }
   122  
   123    virtual void LogHeader(const char* format, va_list ap) override {
   124      // RocksDB's `Logger::LogHeader()` implementation forgot to call the `Logv()` overload
   125      // that takes severity info. Until it's fixed we can override their implementation.
   126      Logv(rocksdb::InfoLogLevel::HEADER_LEVEL, format, ap);
   127    }
   128  
   129    virtual void Logv(const char* format, va_list ap) override {
   130      // The RocksDB API tries to force us to separate the severity check (above function)
   131      // from the actual logging (this function) by making this function pure virtual.
   132      // However, when calling into Go, we need to provide severity level to both the severity
   133      // level check function (`rocksDBV`) and the actual logging function (`rocksDBLog`). So,
   134      // we do all the work in the function that has severity level and then expect this
   135      // function to never be called.
   136      assert(false);
   137    }
   138  
   139   private:
   140    const bool use_primary_log_;
   141  };
   142  
   143  }  // namespace
   144  
   145  rocksdb::Logger* NewDBLogger(bool use_primary_log) { return new DBLogger(use_primary_log); }
   146  
   147  rocksdb::Options DBMakeOptions(DBOptions db_opts) {
    148  // Construct the base RocksDB options explicitly rather than relying on
    149  // the RocksDB options builder.
   150    rocksdb::Options options;
   151    // Increase parallelism for compactions and flushes based on the
   152    // number of cpus. Always use at least 2 threads, otherwise
   153    // compactions and flushes may fight with each other.
   154    options.IncreaseParallelism(std::max(db_opts.num_cpu, 2));
   155    // Disable subcompactions since they're a less stable feature, and not
   156    // necessary for our workload, where frequent fsyncs naturally prevent
   157    // foreground writes from getting too far ahead of compactions.
   158    options.max_subcompactions = 1;
   159    options.comparator = &kComparator;
   160    options.create_if_missing = !db_opts.must_exist;
   161    options.info_log.reset(NewDBLogger(false /* use_primary_log */));
   162    options.merge_operator.reset(NewMergeOperator());
   163    options.prefix_extractor.reset(new DBPrefixExtractor);
   164    options.statistics = rocksdb::CreateDBStatistics();
   165    options.max_open_files = db_opts.max_open_files;
   166    options.compaction_pri = rocksdb::kMinOverlappingRatio;
   167    // Periodically sync SST writes to smooth out disk usage. Not performing such
   168    // syncs can be faster but can cause performance blips when the OS decides it
   169    // needs to flush data.
   170    options.bytes_per_sync = 512 << 10;  // 512 KB
   171    // Enabling `strict_bytes_per_sync` prevents the situation where an SST is
   172    // generated fast enough that the async writeback submissions fall behind.
   173    // It enforces we wait for any previous `bytes_per_sync` sync to finish before
   174    // issuing any future sync. That way we prevent situations where a huge amount
   175    // of data gets written out all at once upon finishing a file (the final sync
   176    // covers all the data, not just a range of size `bytes_per_sync`).
   177    options.strict_bytes_per_sync = true;
    178  // Do not sync the WAL periodically. We already sync it on every write by
    179  // calling `FlushWAL(true)` on non-temp stores. On the temp store we never
    180  // intend to sync the WAL, so setting this to zero is fine there too.
   181    options.wal_bytes_per_sync = 0;
   182  
   183    // On ext4 and xfs, at least, `fallocate()`ing a large empty WAL is not enough
   184    // to avoid inode writeback on every `fdatasync()`. Although `fallocate()` can
   185    // preallocate space and preset the file size, it marks the preallocated
   186    // "extents" as unwritten in the inode to guarantee readers cannot be exposed
   187    // to data belonging to others. Every time `fdatasync()` happens, an inode
   188    // writeback happens for the update to split an unwritten extent and mark part
   189    // of it as written.
   190    //
   191    // Setting `recycle_log_file_num > 0` circumvents this as it'll eventually
   192    // reuse WALs where extents are already all marked as written. When the DB
   193    // opens, the first WAL will have its space preallocated as unwritten extents,
   194    // so will still incur frequent inode writebacks. The second WAL will as well
   195    // since the first WAL cannot be recycled until the first flush completes.
   196    // From the third WAL onwards, however, we will have a previously written WAL
   197    // readily available to recycle.
   198    //
   199    // We could pick a higher value if we see memtable flush backing up, or if we
   200    // start using column families (WAL changes every time any column family
   201    // initiates a flush, and WAL cannot be reused until that flush completes).
   202    options.recycle_log_file_num = 1;
   203  
    204  // The size in which reads should be performed for compaction. The
   205    // internets claim this can speed up compactions, though RocksDB
   206    // docs say it is only useful on spinning disks. Experimentally it
   207    // has had no effect.
   208    // options.compaction_readahead_size = 2 << 20;
   209  
   210    // Do not create bloom filters for the last level (i.e. the largest
   211    // level which contains data in the LSM store). Setting this option
   212    // reduces the size of the bloom filters by 10x. This is significant
   213    // given that bloom filters require 1.25 bytes (10 bits) per key
   214    // which can translate into gigabytes of memory given typical key
   215    // and value sizes. The downside is that bloom filters will only be
   216    // usable on the higher levels, but that seems acceptable. We
   217    // typically see read amplification of 5-6x on clusters (i.e. there
   218    // are 5-6 levels of sstables) which means we'll achieve 80-90% of
   219    // the benefit of having bloom filters on every level for only 10%
   220    // of the memory cost.
   221    options.optimize_filters_for_hits = true;
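         // Rough arithmetic behind the trade-off above (an illustration, not a
         // measured figure): at 10 bits (1.25 bytes) per key, one billion keys
         // would need ~1.25 GB of bloom filter memory; with dynamic level
         // sizing roughly 90% of keys sit in the bottom level, so skipping that
         // level's filters cuts the cost to roughly 125 MB.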
   222  
   223    // We periodically report stats ourselves and by default the info
   224    // logger swallows log messages.
   225    options.stats_dump_period_sec = 0;
   226  
   227    // Use the TablePropertiesCollector hook to store the min and max MVCC
   228    // timestamps present in each sstable in the metadata for that sstable.
   229    options.table_properties_collector_factories.emplace_back(DBMakeTimeBoundCollector());
   230  
   231    // Automatically request compactions whenever an SST contains too many range
   232    // deletions.
   233    options.table_properties_collector_factories.emplace_back(DBMakeDeleteRangeCollector());
   234  
   235    // The write buffer size is the size of the in memory structure that
   236    // will be flushed to create L0 files.
   237    options.write_buffer_size = 64 << 20;  // 64 MB
   238    // How much memory should be allotted to memtables? Note that this
   239    // is a peak setting, steady state should be lower. We set this
   240    // relatively high to account for bursts of writes (e.g. due to a
   241    // deletion of a large range of keys). In particular, we want this
    242  // to be somewhat larger than the typical range size so that
   243    // deletion of a range worth of keys does not cause write stalls.
   244    options.max_write_buffer_number = 4;
   245    // Number of files to trigger L0 compaction. We set this low so that
   246    // we quickly move files out of L0 as each L0 file increases read
   247    // amplification.
   248    options.level0_file_num_compaction_trigger = 2;
   249    // Soft limit on number of L0 files. Writes are slowed down when
   250    // this number is reached. Bulk-ingestion can add lots of files
   251    // suddenly, so setting this much higher should avoid spurious
   252    // slowdowns to writes.
   253    // TODO(dt): if/when we dynamically tune for bulk-ingestion, we
   254    // could leave this at 20 and only raise it during ingest jobs.
   255    options.level0_slowdown_writes_trigger = 950;
   256    // Maximum number of L0 files. Writes are stopped at this
   257    // point. This is set significantly higher than
   258    // level0_slowdown_writes_trigger to avoid completely blocking
   259    // writes.
   260    // TODO(dt): if/when we dynamically tune for bulk-ingestion, we
   261    // could leave this at 30 and only raise it during ingest.
   262    options.level0_stop_writes_trigger = 1000;
   263    // Maximum estimated pending compaction bytes before slowing writes.
    264  // The default is 64 GB but that can be hit easily during bulk-ingestion
    265  // since it is based on assumptions about relative level sizes that do not
    266  // hold when adding data directly. Additionally, some system-critical writes
    267  // in cockroach (node-liveness) just cannot be slow or they will fail and
   268    // cause unavailability, so back-pressuring may *cause* unavailability,
   269    // instead of gracefully slowing to some stable equilibrium to avoid it. As
   270    // such, we want these set so they are impossible to hit.
   271    options.soft_pending_compaction_bytes_limit = std::numeric_limits<uint64_t>::max();
   272    options.hard_pending_compaction_bytes_limit = std::numeric_limits<uint64_t>::max();
   273    // Flush write buffers to L0 as soon as they are full. A higher
   274    // value could be beneficial if there are duplicate records in each
   275    // of the individual write buffers, but perf testing hasn't shown
   276    // any benefit so far.
   277    options.min_write_buffer_number_to_merge = 1;
   278    // Enable dynamic level sizing which reduces both size and write
   279    // amplification. This causes RocksDB to pick the target size of
   280    // each level dynamically.
   281    options.level_compaction_dynamic_level_bytes = true;
   282    // Follow the RocksDB recommendation to configure the size of L1 to
   283    // be the same as the estimated size of L0.
   284    options.max_bytes_for_level_base = 64 << 20;  // 64 MB
   285    options.max_bytes_for_level_multiplier = 10;
   286    // Target the base file size (L1) as 4 MB. Each additional level
    287  // grows the file size by a factor of 2. With max_bytes_for_level_base set to 64
   288    // MB, this translates into the following target level and file
   289    // sizes for each level:
   290    //
   291    //       level-size  file-size  max-files
   292    //   L1:      64 MB       4 MB         16
   293    //   L2:     640 MB       8 MB         80
   294    //   L3:    6.25 GB      16 MB        400
   295    //   L4:    62.5 GB      32 MB       2000
   296    //   L5:     625 GB      64 MB      10000
   297    //   L6:     6.1 TB     128 MB      50000
   298    //
   299    // Due to the use of level_compaction_dynamic_level_bytes most data
   300    // will be in L6. The number of files will be approximately
   301    // total-data-size / 128 MB.
   302    //
   303    // We don't want the target file size to be too large, otherwise
   304    // individual compactions become more expensive. We don't want the
   305    // target file size to be too small or else we get an overabundance
   306    // of sstables.
   307    options.target_file_size_base = 4 << 20;  // 4 MB
   308    options.target_file_size_multiplier = 2;
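         // For reference, the table above follows from these settings (a sketch of
         // the arithmetic, not additional configuration):
         //
         //   level-size(n) = max_bytes_for_level_base * max_bytes_for_level_multiplier^(n-1)
         //   file-size(n)  = target_file_size_base * target_file_size_multiplier^(n-1)
         //   max-files(n)  = level-size(n) / file-size(n)
         //
         //   e.g. L3: 64 MB * 10^2 = 6.25 GB, 4 MB * 2^2 = 16 MB, 6400 MB / 16 MB = 400 files.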
   309    options.manual_wal_flush = true;
   310  
   311    // Because we open a long running rocksdb instance, we do not want the
   312    // manifest file to grow unbounded. Assuming each manifest entry is about 1
   313    // KB, this allows for 128 K entries. This could account for several hours to
    314  // a few months of runtime without rolling, depending on the workload.
   315    options.max_manifest_file_size = 128 << 20;  // 128 MB
   316  
   317    rocksdb::BlockBasedTableOptions table_options;
   318    if (db_opts.cache != nullptr) {
   319      table_options.block_cache = db_opts.cache->rep;
   320  
   321      // Reserve 1 memtable worth of memory from the cache. Under high
   322      // load situations we'll be using somewhat more than 1 memtable,
   323      // but usually not significantly more unless there is an I/O
   324      // throughput problem.
   325      //
   326      // We ensure that at least 1MB is allocated for the block cache.
   327      // Some unit tests expect to see a non-zero block cache hit rate,
   328      // but they use a cache that is small enough that all of it would
   329      // otherwise be reserved for the memtable.
   330      std::lock_guard<std::mutex> guard(db_opts.cache->mu);
   331      const int64_t capacity = db_opts.cache->rep->GetCapacity();
   332      const int64_t new_capacity = std::max<int64_t>(1 << 20, capacity - options.write_buffer_size);
   333      db_opts.cache->rep->SetCapacity(new_capacity);
   334    }
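         // For example (arithmetic only, assuming a 128 MB cache was configured):
         // 128 MB minus the 64 MB write_buffer_size above leaves a 64 MB block
         // cache, while a 32 MB cache would be clamped by the std::max above to
         // keep at least 1 MB for the block cache.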
   335  
    336  // Passing false for use_block_based_builder creates a per-file
    337  // (sstable) filter instead of a per-block filter. The per-file
    338  // filter can be consulted before going to the index, which saves an
    339  // index lookup. The cost is 4 bytes per key of memory during
    340  // compactions, which seems a small price to pay.
   341    table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, false /* !block_based */));
   342    table_options.format_version = 2;
   343  
   344    // Increasing block_size decreases memory usage at the cost of
   345    // increased read amplification. When reading a key-value pair from
   346    // a table file, RocksDB loads an entire block into memory. The
   347    // RocksDB default is 4KB. This sets it to 32KB.
   348    table_options.block_size = 32 << 10;
    349  // Disable whole_key_filtering, which adds a bloom filter entry for
    350  // the "whole key" and doubles the size of our bloom filters. It only
    351  // speeds up Get operations, which we don't use.
   352    table_options.whole_key_filtering = false;
   353    options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(table_options));
   354    return options;
   355  }
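
         // A minimal usage sketch (illustrative only; the real call sites live
         // elsewhere in libroach on the DB open path, and the DBOptions values
         // below are assumptions):
         //
         //   DBOptions db_opts = {};
         //   db_opts.num_cpu = 4;
         //   db_opts.must_exist = false;
         //   db_opts.max_open_files = 1024;
         //   rocksdb::Options options = cockroach::DBMakeOptions(db_opts);
         //   rocksdb::DB* db = nullptr;
         //   rocksdb::Status status = rocksdb::DB::Open(options, "/path/to/store", &db);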
   356  
   357  }  // namespace cockroach