github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/c-deps/libroach/merge.cc

// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

#include "merge.h"
#include <numeric>
#include <rocksdb/env.h>
#include "db.h"
#include "protos/roachpb/data.pb.h"
#include "protos/roachpb/internal.pb.h"
#include "status.h"

namespace cockroach {

namespace {

const int kChecksumSize = 4;
const int kTagPos = kChecksumSize;
const int kHeaderSize = kTagPos + 1;

rocksdb::Slice ValueDataBytes(const std::string& val) {
  if (val.size() < kHeaderSize) {
    return rocksdb::Slice();
  }
  return rocksdb::Slice(val.data() + kHeaderSize, val.size() - kHeaderSize);
}

cockroach::roachpb::ValueType GetTag(const std::string& val) {
  if (val.size() < kHeaderSize) {
    return cockroach::roachpb::UNKNOWN;
  }
  return cockroach::roachpb::ValueType(val[kTagPos]);
}

void SetTag(std::string* val, cockroach::roachpb::ValueType tag) { (*val)[kTagPos] = tag; }

WARN_UNUSED_RESULT bool ParseProtoFromValue(const std::string& val,
                                            google::protobuf::MessageLite* msg) {
  if (val.size() < kHeaderSize) {
    return false;
  }
  const rocksdb::Slice d = ValueDataBytes(val);
  return msg->ParseFromArray(d.data(), d.size());
}

void SerializeProtoToValue(std::string* val, const google::protobuf::MessageLite& msg) {
  val->resize(kHeaderSize);
  std::fill(val->begin(), val->end(), 0);
  SetTag(val, cockroach::roachpb::BYTES);
  msg.AppendToString(val);
}
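// The helpers above assume the roachpb.Value byte layout implied by the
// constants kChecksumSize, kTagPos, and kHeaderSize:
//
//   +----------------+-----------+----------------+
//   | checksum (4 B) | tag (1 B) | data (N bytes) |
//   +----------------+-----------+----------------+
//
// A minimal sketch of building such a value (the "msg" variable is
// hypothetical; note that SerializeProtoToValue leaves the checksum zeroed):
//
//   std::string val;
//   cockroach::roachpb::InternalTimeSeriesData msg;
//   SerializeProtoToValue(&val, msg);
//   assert(GetTag(val) == cockroach::roachpb::BYTES);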
// Method used to sort InternalTimeSeriesSamples.
bool TimeSeriesSampleOrdering(const cockroach::roachpb::InternalTimeSeriesSample* a,
                              const cockroach::roachpb::InternalTimeSeriesSample* b) {
  return a->offset() < b->offset();
}

// IsTimeSeriesData returns true if the given protobuf Value contains a
// TimeSeriesData message.
bool IsTimeSeriesData(const std::string& val) {
  return GetTag(val) == cockroach::roachpb::TIMESERIES;
}

void SerializeTimeSeriesToValue(std::string* val,
                                const cockroach::roachpb::InternalTimeSeriesData& ts) {
  SerializeProtoToValue(val, ts);
  SetTag(val, cockroach::roachpb::TIMESERIES);
}

// MergeTimeSeriesValues attempts to merge two Values which contain
// InternalTimeSeriesData messages. The messages cannot be merged if they have
// different start timestamps or sample durations. Returns true if the merge is
// successful.
WARN_UNUSED_RESULT bool MergeTimeSeriesValues(std::string* left, const std::string& right,
                                              bool full_merge, rocksdb::Logger* logger) {
  // Attempt to parse TimeSeriesData from both Values.
  cockroach::roachpb::InternalTimeSeriesData left_ts;
  cockroach::roachpb::InternalTimeSeriesData right_ts;
  if (!ParseProtoFromValue(*left, &left_ts)) {
    rocksdb::Warn(logger, "left InternalTimeSeriesData could not be parsed from bytes.");
    return false;
  }
  if (!ParseProtoFromValue(right, &right_ts)) {
    rocksdb::Warn(logger, "right InternalTimeSeriesData could not be parsed from bytes.");
    return false;
  }

  // Ensure that both InternalTimeSeriesData have the same timestamp and
  // sample_duration.
  if (left_ts.start_timestamp_nanos() != right_ts.start_timestamp_nanos()) {
    rocksdb::Warn(logger, "TimeSeries merge failed due to mismatched start timestamps.");
    return false;
  }
  if (left_ts.sample_duration_nanos() != right_ts.sample_duration_nanos()) {
    rocksdb::Warn(logger, "TimeSeries merge failed due to mismatched sample durations.");
    return false;
  }

  // Determine whether we are using row or columnar format by checking if
  // either operand has a "last" column.
  bool use_column_format = left_ts.last_size() > 0 || right_ts.last_size() > 0;

  // If this is only a partial merge, do not sort and combine - instead, just
  // quickly merge the two values together. Values will be processed later
  // after a full merge.
  if (!full_merge) {
    // If using columnar format, convert both operands even in a partial merge.
    // This is necessary to keep the order of merges stable.
    if (use_column_format) {
      convertToColumnar(&left_ts);
      convertToColumnar(&right_ts);
    }
    left_ts.MergeFrom(right_ts);
    SerializeTimeSeriesToValue(left, left_ts);
    return true;
  }

  if (use_column_format) {
    // Convert from row format to column format if necessary.
    convertToColumnar(&left_ts);
    convertToColumnar(&right_ts);

    // Find the minimum offset of the right collection, and find the highest
    // index in the left collection which is greater than or equal to that
    // minimum. This determines how many elements of the left collection will
    // need to be re-sorted and de-duplicated.
    auto min_offset = std::min_element(right_ts.offset().begin(), right_ts.offset().end());
    auto first_unsorted_index = std::distance(
        left_ts.offset().begin(),
        std::lower_bound(left_ts.offset().begin(), left_ts.offset().end(), *min_offset));
    left_ts.MergeFrom(right_ts);
    sortAndDeduplicateColumns(&left_ts, first_unsorted_index);
    SerializeTimeSeriesToValue(left, left_ts);
  } else {
    // Initialize new_ts and its primitive data fields. Values from the left and
    // right collections will be merged into the new collection.
    cockroach::roachpb::InternalTimeSeriesData new_ts;
    new_ts.set_start_timestamp_nanos(left_ts.start_timestamp_nanos());
    new_ts.set_sample_duration_nanos(left_ts.sample_duration_nanos());

    // Sort values in right_ts. Values in left_ts are assumed to be sorted
    // already.
    std::stable_sort(right_ts.mutable_samples()->pointer_begin(),
                     right_ts.mutable_samples()->pointer_end(), TimeSeriesSampleOrdering);

    // Merge sample values of left and right into new_ts.
    auto left_front = left_ts.samples().begin();
    auto left_end = left_ts.samples().end();
    auto right_front = right_ts.samples().begin();
    auto right_end = right_ts.samples().end();

    // Loop until samples from both sides have been exhausted.
    while (left_front != left_end || right_front != right_end) {
      // Select the lowest offset from either side.
      long next_offset;
      if (left_front == left_end) {
        next_offset = right_front->offset();
      } else if (right_front == right_end) {
        next_offset = left_front->offset();
      } else if (left_front->offset() <= right_front->offset()) {
        next_offset = left_front->offset();
      } else {
        next_offset = right_front->offset();
      }

      // Create an empty sample in the output collection.
      cockroach::roachpb::InternalTimeSeriesSample* ns = new_ts.add_samples();

      // Only the most recently merged value with a given sample offset is kept;
      // samples merged earlier at the same offset are discarded. We will now
      // parse through the left and right sample sets, finding the most recently
      // merged sample at the current offset.
      cockroach::roachpb::InternalTimeSeriesSample src;
      while (left_front != left_end && left_front->offset() == next_offset) {
        src = *left_front;
        left_front++;
      }
      while (right_front != right_end && right_front->offset() == next_offset) {
        src = *right_front;
        right_front++;
      }

      ns->CopyFrom(src);
    }

    // Serialize the new TimeSeriesData into the left value's byte field.
    SerializeTimeSeriesToValue(left, new_ts);
  }
  return true;
}
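// A worked example of the row-format merge above, with hypothetical offsets
// and values, where the left operand was merged before the right one:
//
//   left samples:  {offset: 1, sum: 10}, {offset: 2, sum: 20}
//   right samples: {offset: 2, sum: 25}, {offset: 3, sum: 30}
//
//   merged:        {offset: 1, sum: 10}, {offset: 2, sum: 25}, {offset: 3, sum: 30}
//
// The right operand's sample at offset 2 wins because it was merged more
// recently; the earlier sample at that offset is discarded.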
// ConsolidateTimeSeriesValue processes a single value which contains
// InternalTimeSeriesData messages. This method will sort the sample collection
// of the value, keeping only the last of any samples with duplicate offsets.
// This method is the single-value equivalent of MergeTimeSeriesValues, and is
// used in the case where the first value is merged into the key. Returns true
// if the merge is successful.
WARN_UNUSED_RESULT bool ConsolidateTimeSeriesValue(std::string* val, rocksdb::Logger* logger) {
  // Attempt to parse TimeSeriesData from the value.
  cockroach::roachpb::InternalTimeSeriesData val_ts;
  if (!ParseProtoFromValue(*val, &val_ts)) {
    rocksdb::Warn(logger, "InternalTimeSeriesData could not be parsed from bytes.");
    return false;
  }

  // Detect if the value is in columnar or row format. Columnar format is
  // detected by the presence of a non-zero-length offset field.
  if (val_ts.offset_size() > 0) {
    // It's possible that, due to partial merges, the value contains both
    // row-format and column-format data. Convert it all to columnar.
    convertToColumnar(&val_ts);
    sortAndDeduplicateColumns(&val_ts, 0);
  } else {
    std::stable_sort(val_ts.mutable_samples()->pointer_begin(),
                     val_ts.mutable_samples()->pointer_end(), TimeSeriesSampleOrdering);

    // Deduplicate values, keeping only the *last* sample merged with a given
    // offset.
    using sample = cockroach::roachpb::InternalTimeSeriesSample;
    auto it =
        std::unique(val_ts.mutable_samples()->rbegin(), val_ts.mutable_samples()->rend(),
                    [](const sample& a, const sample& b) { return a.offset() == b.offset(); });
    val_ts.mutable_samples()->DeleteSubrange(
        0, std::distance(val_ts.mutable_samples()->begin(), it.base()));
  }

  // Serialize the new TimeSeriesData into the value's byte field.
  SerializeTimeSeriesToValue(val, val_ts);
  return true;
}

class DBMergeOperator : public rocksdb::MergeOperator {
  virtual const char* Name() const { return "cockroach_merge_operator"; }

  virtual bool FullMerge(const rocksdb::Slice& key, const rocksdb::Slice* existing_value,
                         const std::deque<std::string>& operand_list, std::string* new_value,
                         rocksdb::Logger* logger) const WARN_UNUSED_RESULT {
    // TODO(pmattis): Taken from the old merger code, below are some
    // details about how errors returned by the merge operator are
    // handled. Need to test various error scenarios and decide on
    // desired behavior. Clear the key and it's gone. Corrupt it
    // properly and RocksDB might refuse to work with it at all until
    // you clear it manually, which may also not be what we want. The
    // problem with merges is that RocksDB won't really carry them out
    // while we have a chance to talk back to clients.
    //
    // If we indicate failure (*success = false), then the call to the
    // merger via rocksdb_merge will not return an error, but simply
    // remove or truncate the offending key (at least when the settings
    // specify that missing keys should be created; otherwise a
    // corruption error will be returned, but likely only after the next
    // read of the key). In effect, there is no propagation of error
    // information to the client.
    cockroach::storage::enginepb::MVCCMetadata meta;
    if (existing_value != NULL) {
      if (!meta.ParseFromArray(existing_value->data(), existing_value->size())) {
        // Corrupted existing value.
        rocksdb::Warn(logger, "corrupted existing value");
        return false;
      }
    }

    for (int i = 0; i < operand_list.size(); i++) {
      if (!MergeOne(&meta, operand_list[i], true, logger)) {
        return false;
      }
    }

    if (!meta.SerializeToString(new_value)) {
      rocksdb::Warn(logger, "serialization error");
      return false;
    }
    return true;
  }

  virtual bool PartialMergeMulti(const rocksdb::Slice& key,
                                 const std::deque<rocksdb::Slice>& operand_list,
                                 std::string* new_value,
                                 rocksdb::Logger* logger) const WARN_UNUSED_RESULT {
    cockroach::storage::enginepb::MVCCMetadata meta;

    for (int i = 0; i < operand_list.size(); i++) {
      if (!MergeOne(&meta, operand_list[i], false, logger)) {
        return false;
      }
    }

    if (!meta.SerializeToString(new_value)) {
      rocksdb::Warn(logger, "serialization error");
      return false;
    }
    return true;
  }

 private:
  bool MergeOne(cockroach::storage::enginepb::MVCCMetadata* meta,
                const rocksdb::Slice& operand, bool full_merge,
                rocksdb::Logger* logger) const WARN_UNUSED_RESULT {
    cockroach::storage::enginepb::MVCCMetadata operand_meta;
    if (!operand_meta.ParseFromArray(operand.data(), operand.size())) {
      rocksdb::Warn(logger, "corrupted operand value");
      return false;
    }
    return MergeValues(meta, operand_meta, full_merge, logger);
  }
};

}  // namespace
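// A minimal sketch of wiring DBMergeOperator into a RocksDB instance via the
// NewMergeOperator factory defined at the bottom of this file (the
// rocksdb::Options fields are the stock RocksDB API; "db_path" is a
// hypothetical path):
//
//   rocksdb::Options options;
//   options.merge_operator.reset(cockroach::NewMergeOperator());
//   rocksdb::DB* db;
//   rocksdb::Status s = rocksdb::DB::Open(options, "db_path", &db);
//
// With the operator installed, DB::Merge operands for a key are combined by
// PartialMergeMulti during compaction and resolved by FullMerge on read.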
// convertToColumnar detects time series data which is in the old row format,
// converting the data within into the new columnar format.
void convertToColumnar(cockroach::roachpb::InternalTimeSeriesData* data) {
  if (data->samples_size() > 0) {
    for (auto sample : data->samples()) {
      // While the row format contains other values (such as min and max), these
      // were not stored in actual usage. Furthermore, the new columnar format
      // has been designed to be "sparse", with high resolutions containing
      // values only for the "offset" and "last" columns. Thus, the other row
      // fields are ignored.
      data->add_offset(sample.offset());
      data->add_last(sample.sum());
    }
    data->clear_samples();
  }
}
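// A small worked example of the conversion above (hypothetical values):
//
//   row format:      samples = [{offset: 1, sum: 10}, {offset: 2, sum: 20}]
//   columnar format: offset  = [1, 2]
//                    last    = [10, 20]
//
// Each sample's "sum" becomes the columnar "last" value, and all other row
// fields are dropped.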
// sortAndDeduplicateColumns sorts all column fields of the time series data
// structure according to the values in the "offset" column. At the same time,
// duplicate offset values are removed - only the last instance of an offset in
// the collection is retained.
//
// "first_unsorted" is an optimization which only sorts data rows with an index
// greater than or equal to the supplied index. This is used because the
// supplied data is often the result of merging an already-sorted collection
// with a smaller unsorted collection, and thus only a portion of the end of the
// data needs to be sorted.
void sortAndDeduplicateColumns(cockroach::roachpb::InternalTimeSeriesData* data,
                               int first_unsorted) {
  // Create an auxiliary array of array indexes, and sort that array according
  // to the corresponding offset value in the data.offset() collection. This
  // yields the permutation of the current array indexes that will place the
  // offsets into sorted order.
  auto order = std::vector<int>(data->offset_size() - first_unsorted);
  std::iota(order.begin(), order.end(), first_unsorted);
  std::stable_sort(order.begin(), order.end(),
                   [&](const int a, const int b) { return data->offset(a) < data->offset(b); });

  // Remove any duplicates from the permutation, keeping the *last* element
  // merged for any given offset. Note the number of duplicates removed so that
  // the columns can be resized later.
  auto it = std::unique(order.rbegin(), order.rend(), [&](const int a, const int b) {
    return data->offset(a) == data->offset(b);
  });
  int duplicates = std::distance(order.begin(), it.base());
  order.erase(order.begin(), it.base());

  // Apply the permutation in the auxiliary array to all of the relevant column
  // arrays in the data set.
  for (int i = 0; i < order.size(); i++) {
    // "dest_idx" is the current index which is being operated on; for each
    // column, we will be replacing the value at this index with the correct
    // sorted-order value for that index.
    //
    // "src_idx" is the current location of the value that is being moved to
    // dest_idx, found by consulting the "order" auxiliary array. Its value
    // will be *swapped* with the current value at "dest_idx".
    //
    // Because we are swapping values, and because we iterate through
    // destinations from front to back, it is possible that the value that was
    // originally in "src_idx" has already been swapped to another location;
    // specifically, if "src_idx" is earlier than "dest_idx", then its value is
    // guaranteed to have been swapped. To find its current location, we
    // "follow" the indexes in the order array until we arrive at a src_idx
    // which is greater than the current dest_idx, which will be the correct
    // location of the source value.
    //
    // An example of this situation:
    //
    // initial:
    //   data = [3 1 4 2]
    //   order = [1 3 0 2]
    //
    // dest = 0
    //   src = order[0]       // 1
    //   data.swap(dest, src) // 0 <-> 1
    //   data == [1 3 4 2]
    //
    // dest = 1
    //   src = order[1]       // 3
    //   data.swap(dest, src) // 1 <-> 3
    //   data == [1 2 4 3]
    //
    // dest = 2
    //   src = order[2]       // 0
    //   // src < dest, so follow the trail
    //   src = order[src]     // 1
    //   // src < dest, so follow the trail
    //   src = order[src]     // 3
    //   data.swap(dest, src) // 2 <-> 3
    //   data == [1 2 3 4]
    int dest_idx = i + first_unsorted;
    int src_idx = order[i];
    while (src_idx < dest_idx) {
      src_idx = order[src_idx - first_unsorted];
    }
    // If the source is equal to the destination, then this value is already
    // at its correct sorted location.
    if (src_idx == dest_idx) {
      continue;
    }

    data->mutable_offset()->SwapElements(src_idx, dest_idx);
    data->mutable_last()->SwapElements(src_idx, dest_idx);

    // These columns are only present at resolutions generated as rollups. We
    // detect this by checking if there are any count columns present (the
    // choice of "count" is arbitrary, all of these columns will be present or
    // not).
    if (data->count_size() > 0) {
      data->mutable_count()->SwapElements(src_idx, dest_idx);
      data->mutable_sum()->SwapElements(src_idx, dest_idx);
      data->mutable_min()->SwapElements(src_idx, dest_idx);
      data->mutable_max()->SwapElements(src_idx, dest_idx);
      data->mutable_first()->SwapElements(src_idx, dest_idx);
      data->mutable_variance()->SwapElements(src_idx, dest_idx);
    }
  }

  // Resize each column to account for any duplicate values which were removed -
  // the swapping algorithm will have moved these to the very end of the
  // collection.
  auto new_size = data->offset_size() - duplicates;
  data->mutable_offset()->Truncate(new_size);
  data->mutable_last()->Truncate(new_size);
  if (data->count_size() > 0) {
    data->mutable_count()->Truncate(new_size);
    data->mutable_sum()->Truncate(new_size);
    data->mutable_min()->Truncate(new_size);
    data->mutable_max()->Truncate(new_size);
    data->mutable_first()->Truncate(new_size);
    data->mutable_variance()->Truncate(new_size);
  }
}
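// The reverse-iterator std::unique call above is what makes the *last*
// duplicate win: std::unique keeps the first element of each run in iteration
// order, so iterating from rbegin to rend keeps the element closest to the end
// of the collection. A worked example with hypothetical offsets, assuming
// first_unsorted == 0:
//
//   order (sorted by offset): [0 1 2]  with offsets [5 7 7]
//   after reverse unique:     [0 2]    with offsets [5 7]
//
// Index 1, the earlier-merged sample at offset 7, is the duplicate that gets
// erased; index 2 was appended later and is retained.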
WARN_UNUSED_RESULT bool MergeValues(cockroach::storage::enginepb::MVCCMetadata* left,
                                    const cockroach::storage::enginepb::MVCCMetadata& right,
                                    bool full_merge, rocksdb::Logger* logger) {
  if (left->has_raw_bytes()) {
    if (!right.has_raw_bytes()) {
      rocksdb::Warn(logger, "inconsistent value types for merge (left = bytes, right = ?)");
      return false;
    }

    // Replay Advisory: Because merge commands pass through raft, it is possible
    // for merging values to be "replayed". Currently, the only actual use of
    // the merge system is for time series data, which is safe against replay;
    // however, this property is not general for all potential mergeable types.
    // If a future need arises to merge another type of data, replay protection
    // will likely need to be a consideration.

    if (IsTimeSeriesData(left->raw_bytes()) || IsTimeSeriesData(right.raw_bytes())) {
      // If either operand is a time series, the other must be as well.
      if (!IsTimeSeriesData(left->raw_bytes()) || !IsTimeSeriesData(right.raw_bytes())) {
        rocksdb::Warn(logger, "inconsistent value types for merging time "
                              "series data (type(left) != type(right))");
        return false;
      }
      return MergeTimeSeriesValues(left->mutable_raw_bytes(), right.raw_bytes(), full_merge,
                                   logger);
    } else {
      const rocksdb::Slice rdata = ValueDataBytes(right.raw_bytes());
      left->mutable_raw_bytes()->append(rdata.data(), rdata.size());
    }
    return true;
  } else {
    left->mutable_raw_bytes()->assign(right.raw_bytes());
    if (right.has_merge_timestamp()) {
      left->mutable_merge_timestamp()->CopyFrom(right.merge_timestamp());
    }
    if (full_merge && IsTimeSeriesData(left->raw_bytes())) {
      if (!ConsolidateTimeSeriesValue(left->mutable_raw_bytes(), logger)) {
        return false;
      }
    }
    return true;
  }
}
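// For non-time-series byte values, the merge above is plain concatenation of
// the data portions: ValueDataBytes strips the right operand's header
// (checksum and tag) before it is appended, while the left value keeps its own
// header. A worked example with hypothetical payloads:
//
//   left->raw_bytes():  [header]["foo"]
//   right.raw_bytes():  [header]["bar"]
//   after merge:        [header]["foobar"]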
// MergeResult serializes the result MVCCMetadata value into a byte slice.
DBStatus MergeResult(cockroach::storage::enginepb::MVCCMetadata* meta, DBString* result) {
  // TODO(pmattis): Should recompute checksum here. Need a crc32
  // implementation and need to verify the checksumming is identical
  // to what is being done in Go. Zlib's crc32 should be sufficient.
  result->len = meta->ByteSize();
  result->data = static_cast<char*>(malloc(result->len));
  if (!meta->SerializeToArray(result->data, result->len)) {
    return ToDBString("serialization error");
  }
  return kSuccess;
}

rocksdb::MergeOperator* NewMergeOperator() { return new DBMergeOperator; }

}  // namespace cockroach
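// A minimal sketch of the checksum recomputation described in the TODO inside
// MergeResult, assuming zlib's crc32 is used as suggested there. This is not
// the implementation; the hypothetical ValueChecksum helper below only
// illustrates the shape, and whether it matches the Go-side checksum would
// still need to be verified:
//
//   #include <zlib.h>
//
//   uint32_t ValueChecksum(const std::string& val) {
//     // Checksum everything after the 4-byte checksum field itself,
//     // i.e. the tag byte plus the data bytes.
//     return crc32(0L, reinterpret_cast<const Bytef*>(val.data() + kChecksumSize),
//                  val.size() - kChecksumSize);
//   }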