github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/c-deps/libroach/merge.cc

// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

#include "merge.h"
#include <numeric>
#include <rocksdb/env.h>
#include "db.h"
#include "protos/roachpb/data.pb.h"
#include "protos/roachpb/internal.pb.h"
#include "status.h"

namespace cockroach {

namespace {

const int kChecksumSize = 4;
const int kTagPos = kChecksumSize;
const int kHeaderSize = kTagPos + 1;

rocksdb::Slice ValueDataBytes(const std::string& val) {
  if (val.size() < kHeaderSize) {
    return rocksdb::Slice();
  }
  return rocksdb::Slice(val.data() + kHeaderSize, val.size() - kHeaderSize);
}

cockroach::roachpb::ValueType GetTag(const std::string& val) {
  if (val.size() < kHeaderSize) {
    return cockroach::roachpb::UNKNOWN;
  }
  return cockroach::roachpb::ValueType(val[kTagPos]);
}

void SetTag(std::string* val, cockroach::roachpb::ValueType tag) { (*val)[kTagPos] = tag; }

WARN_UNUSED_RESULT bool ParseProtoFromValue(const std::string& val,
                                            google::protobuf::MessageLite* msg) {
  if (val.size() < kHeaderSize) {
    return false;
  }
  const rocksdb::Slice d = ValueDataBytes(val);
  return msg->ParseFromArray(d.data(), d.size());
}

void SerializeProtoToValue(std::string* val, const google::protobuf::MessageLite& msg) {
  val->resize(kHeaderSize);
  std::fill(val->begin(), val->end(), 0);
  SetTag(val, cockroach::roachpb::BYTES);
  msg.AppendToString(val);
}
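// The helpers above assume the roachpb.Value byte layout implied by the
// constants kChecksumSize, kTagPos, and kHeaderSize:
//
//   +----------------+-----------+----------------+
//   | checksum (4 B) | tag (1 B) | data (N bytes) |
//   +----------------+-----------+----------------+
//
// A minimal sketch of building such a value (the "msg" variable is
// hypothetical; note that SerializeProtoToValue leaves the checksum zeroed):
//
//   std::string val;
//   cockroach::roachpb::InternalTimeSeriesData msg;
//   SerializeProtoToValue(&val, msg);
//   assert(GetTag(val) == cockroach::roachpb::BYTES);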
// Method used to sort InternalTimeSeriesSamples.
bool TimeSeriesSampleOrdering(const cockroach::roachpb::InternalTimeSeriesSample* a,
                              const cockroach::roachpb::InternalTimeSeriesSample* b) {
  return a->offset() < b->offset();
}

// IsTimeSeriesData returns true if the given protobuf Value contains a
// TimeSeriesData message.
bool IsTimeSeriesData(const std::string& val) {
  return GetTag(val) == cockroach::roachpb::TIMESERIES;
}

void SerializeTimeSeriesToValue(std::string* val,
                                const cockroach::roachpb::InternalTimeSeriesData& ts) {
  SerializeProtoToValue(val, ts);
  SetTag(val, cockroach::roachpb::TIMESERIES);
}

// MergeTimeSeriesValues attempts to merge two Values which contain
// InternalTimeSeriesData messages. The messages cannot be merged if they have
// different start timestamps or sample durations. Returns true if the merge is
// successful.
WARN_UNUSED_RESULT bool MergeTimeSeriesValues(std::string* left, const std::string& right,
                                              bool full_merge, rocksdb::Logger* logger) {
  // Attempt to parse TimeSeriesData from both Values.
  cockroach::roachpb::InternalTimeSeriesData left_ts;
  cockroach::roachpb::InternalTimeSeriesData right_ts;
  if (!ParseProtoFromValue(*left, &left_ts)) {
    rocksdb::Warn(logger, "left InternalTimeSeriesData could not be parsed from bytes.");
    return false;
  }
  if (!ParseProtoFromValue(right, &right_ts)) {
    rocksdb::Warn(logger, "right InternalTimeSeriesData could not be parsed from bytes.");
    return false;
  }

  // Ensure that both InternalTimeSeriesData have the same timestamp and
  // sample_duration.
  if (left_ts.start_timestamp_nanos() != right_ts.start_timestamp_nanos()) {
    rocksdb::Warn(logger, "TimeSeries merge failed due to mismatched start timestamps.");
    return false;
  }
  if (left_ts.sample_duration_nanos() != right_ts.sample_duration_nanos()) {
    rocksdb::Warn(logger, "TimeSeries merge failed due to mismatched sample durations.");
    return false;
  }

  // Determine whether we are using row or columnar format by checking if
  // either operand has a "last" column.
  bool use_column_format = left_ts.last_size() > 0 || right_ts.last_size() > 0;

  // If this is only a partial merge, do not sort and combine - instead, just
  // quickly merge the two values together. Values will be processed later
  // after a full merge.
  if (!full_merge) {
    // If using columnar format, convert both operands even in a partial merge.
    // This is necessary to keep the order of merges stable.
    if (use_column_format) {
      convertToColumnar(&left_ts);
      convertToColumnar(&right_ts);
    }
    left_ts.MergeFrom(right_ts);
    SerializeTimeSeriesToValue(left, left_ts);
    return true;
  }

  if (use_column_format) {
    // Convert from row format to column format if necessary.
    convertToColumnar(&left_ts);
    convertToColumnar(&right_ts);

    // Find the minimum offset of the right collection, and find the highest
    // index in the left collection which is greater than or equal to that
    // minimum. This determines how many elements of the left collection will
    // need to be re-sorted and de-duplicated.
    auto min_offset = std::min_element(right_ts.offset().begin(), right_ts.offset().end());
    auto first_unsorted_index = std::distance(
        left_ts.offset().begin(),
        std::lower_bound(left_ts.offset().begin(), left_ts.offset().end(), *min_offset));
    left_ts.MergeFrom(right_ts);
    sortAndDeduplicateColumns(&left_ts, first_unsorted_index);
    SerializeTimeSeriesToValue(left, left_ts);
  } else {
    // Initialize new_ts and its primitive data fields. Values from the left and
    // right collections will be merged into the new collection.
    cockroach::roachpb::InternalTimeSeriesData new_ts;
    new_ts.set_start_timestamp_nanos(left_ts.start_timestamp_nanos());
    new_ts.set_sample_duration_nanos(left_ts.sample_duration_nanos());

    // Sort values in right_ts. Values in left_ts are assumed to be sorted
    // already.
    std::stable_sort(right_ts.mutable_samples()->pointer_begin(),
                     right_ts.mutable_samples()->pointer_end(), TimeSeriesSampleOrdering);

    // Merge sample values of left and right into new_ts.
    auto left_front = left_ts.samples().begin();
    auto left_end = left_ts.samples().end();
    auto right_front = right_ts.samples().begin();
    auto right_end = right_ts.samples().end();

    // Loop until samples from both sides have been exhausted.
    while (left_front != left_end || right_front != right_end) {
      // Select the lowest offset from either side.
      long next_offset;
      if (left_front == left_end) {
        next_offset = right_front->offset();
      } else if (right_front == right_end) {
        next_offset = left_front->offset();
      } else if (left_front->offset() <= right_front->offset()) {
        next_offset = left_front->offset();
      } else {
        next_offset = right_front->offset();
      }

      // Create an empty sample in the output collection.
      cockroach::roachpb::InternalTimeSeriesSample* ns = new_ts.add_samples();

      // Only the most recently merged value with a given sample offset is kept;
      // samples merged earlier at the same offset are discarded. We will now
      // parse through the left and right sample sets, finding the most recently
      // merged sample at the current offset.
      cockroach::roachpb::InternalTimeSeriesSample src;
      while (left_front != left_end && left_front->offset() == next_offset) {
        src = *left_front;
        left_front++;
      }
      while (right_front != right_end && right_front->offset() == next_offset) {
        src = *right_front;
        right_front++;
      }

      ns->CopyFrom(src);
    }

    // Serialize the new TimeSeriesData into the left value's byte field.
    SerializeTimeSeriesToValue(left, new_ts);
  }
  return true;
}
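// A worked example of the row-format merge above, with hypothetical offsets
// and values, where the left operand was merged before the right one:
//
//   left samples:  {offset: 1, sum: 10}, {offset: 2, sum: 20}
//   right samples: {offset: 2, sum: 25}, {offset: 3, sum: 30}
//
//   merged:        {offset: 1, sum: 10}, {offset: 2, sum: 25}, {offset: 3, sum: 30}
//
// The right operand's sample at offset 2 wins because it was merged more
// recently; the earlier sample at that offset is discarded.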
// ConsolidateTimeSeriesValue processes a single value which contains
// InternalTimeSeriesData messages. This method will sort the sample collection
// of the value, keeping only the last of any samples with duplicate offsets.
// This method is the single-value equivalent of MergeTimeSeriesValues, and is
// used in the case where the first value is merged into the key. Returns true
// if the merge is successful.
WARN_UNUSED_RESULT bool ConsolidateTimeSeriesValue(std::string* val, rocksdb::Logger* logger) {
  // Attempt to parse TimeSeriesData from the value.
  cockroach::roachpb::InternalTimeSeriesData val_ts;
  if (!ParseProtoFromValue(*val, &val_ts)) {
    rocksdb::Warn(logger, "InternalTimeSeriesData could not be parsed from bytes.");
    return false;
  }

  // Detect if the value is in columnar or row format. Columnar format is
  // detected by the presence of a non-zero-length offset field.
  if (val_ts.offset_size() > 0) {
    // It's possible that, due to partial merges, the value contains both
    // row-format and column-format data. Convert it all to columnar.
    convertToColumnar(&val_ts);
    sortAndDeduplicateColumns(&val_ts, 0);
  } else {
    std::stable_sort(val_ts.mutable_samples()->pointer_begin(),
                     val_ts.mutable_samples()->pointer_end(), TimeSeriesSampleOrdering);

    // Deduplicate values, keeping only the *last* sample merged with a given
    // offset.
    using sample = cockroach::roachpb::InternalTimeSeriesSample;
    auto it =
        std::unique(val_ts.mutable_samples()->rbegin(), val_ts.mutable_samples()->rend(),
                    [](const sample& a, const sample& b) { return a.offset() == b.offset(); });
    val_ts.mutable_samples()->DeleteSubrange(
        0, std::distance(val_ts.mutable_samples()->begin(), it.base()));
  }

  // Serialize the new TimeSeriesData into the value's byte field.
  SerializeTimeSeriesToValue(val, val_ts);
  return true;
}

class DBMergeOperator : public rocksdb::MergeOperator {
  virtual const char* Name() const { return "cockroach_merge_operator"; }

  virtual bool FullMerge(const rocksdb::Slice& key, const rocksdb::Slice* existing_value,
                         const std::deque<std::string>& operand_list, std::string* new_value,
                         rocksdb::Logger* logger) const WARN_UNUSED_RESULT {
    // TODO(pmattis): Taken from the old merger code, below are some
    // details about how errors returned by the merge operator are
    // handled. Need to test various error scenarios and decide on
    // desired behavior. Clear the key and it's gone. Corrupt it
    // properly and RocksDB might refuse to work with it at all until
    // you clear it manually, which may also not be what we want. The
    // problem with merges is that RocksDB won't really carry them out
    // while we have a chance to talk back to clients.
    //
    // If we indicate failure (*success = false), then the call to the
    // merger via rocksdb_merge will not return an error, but simply
    // remove or truncate the offending key (at least when the settings
    // specify that missing keys should be created; otherwise a
    // corruption error will be returned, but likely only after the next
    // read of the key). In effect, there is no propagation of error
    // information to the client.
    cockroach::storage::enginepb::MVCCMetadata meta;
    if (existing_value != NULL) {
      if (!meta.ParseFromArray(existing_value->data(), existing_value->size())) {
        // Corrupted existing value.
        rocksdb::Warn(logger, "corrupted existing value");
        return false;
      }
    }

    for (int i = 0; i < operand_list.size(); i++) {
      if (!MergeOne(&meta, operand_list[i], true, logger)) {
        return false;
      }
    }

    if (!meta.SerializeToString(new_value)) {
      rocksdb::Warn(logger, "serialization error");
      return false;
    }
    return true;
  }

  virtual bool PartialMergeMulti(const rocksdb::Slice& key,
                                 const std::deque<rocksdb::Slice>& operand_list,
                                 std::string* new_value,
                                 rocksdb::Logger* logger) const WARN_UNUSED_RESULT {
    cockroach::storage::enginepb::MVCCMetadata meta;

    for (int i = 0; i < operand_list.size(); i++) {
      if (!MergeOne(&meta, operand_list[i], false, logger)) {
        return false;
      }
    }

    if (!meta.SerializeToString(new_value)) {
      rocksdb::Warn(logger, "serialization error");
      return false;
    }
    return true;
  }

 private:
  bool MergeOne(cockroach::storage::enginepb::MVCCMetadata* meta,
                const rocksdb::Slice& operand, bool full_merge,
                rocksdb::Logger* logger) const WARN_UNUSED_RESULT {
    cockroach::storage::enginepb::MVCCMetadata operand_meta;
    if (!operand_meta.ParseFromArray(operand.data(), operand.size())) {
      rocksdb::Warn(logger, "corrupted operand value");
      return false;
    }
    return MergeValues(meta, operand_meta, full_merge, logger);
  }
};

}  // namespace
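// A minimal sketch of wiring DBMergeOperator into a RocksDB instance via the
// NewMergeOperator factory defined at the bottom of this file (the
// rocksdb::Options fields are the stock RocksDB API; "db_path" is a
// hypothetical path):
//
//   rocksdb::Options options;
//   options.merge_operator.reset(cockroach::NewMergeOperator());
//   rocksdb::DB* db;
//   rocksdb::Status s = rocksdb::DB::Open(options, "db_path", &db);
//
// With the operator installed, DB::Merge operands for a key are combined by
// PartialMergeMulti during compaction and resolved by FullMerge on read.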
// convertToColumnar detects time series data which is in the old row format,
// converting the data within into the new columnar format.
void convertToColumnar(cockroach::roachpb::InternalTimeSeriesData* data) {
  if (data->samples_size() > 0) {
    for (auto sample : data->samples()) {
      // While the row format contains other values (such as min and max), these
      // were not stored in actual usage. Furthermore, the new columnar format
      // has been designed to be "sparse", with high resolutions containing
      // values only for the "offset" and "last" columns. Thus, the other row
      // fields are ignored.
      data->add_offset(sample.offset());
      data->add_last(sample.sum());
    }
    data->clear_samples();
  }
}
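// A small worked example of the conversion above (hypothetical values):
//
//   row format:      samples = [{offset: 1, sum: 10}, {offset: 2, sum: 20}]
//   columnar format: offset  = [1, 2]
//                    last    = [10, 20]
//
// Each sample's "sum" becomes the columnar "last" value, and all other row
// fields are dropped.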
// sortAndDeduplicateColumns sorts all column fields of the time series data
// structure according to the values in the "offset" column. At the same time,
// duplicate offset values are removed - only the last instance of an offset in
// the collection is retained.
//
// "first_unsorted" is an optimization which only sorts data rows with an index
// greater than or equal to the supplied index. This is used because the
// supplied data is often the result of merging an already-sorted collection
// with a smaller unsorted collection, and thus only a portion of the end of the
// data needs to be sorted.
void sortAndDeduplicateColumns(cockroach::roachpb::InternalTimeSeriesData* data,
                               int first_unsorted) {
  // Create an auxiliary array of array indexes, and sort that array according
  // to the corresponding offset value in the data.offset() collection. This
  // yields the permutation of the current array indexes that will place the
  // offsets into sorted order.
  auto order = std::vector<int>(data->offset_size() - first_unsorted);
  std::iota(order.begin(), order.end(), first_unsorted);
  std::stable_sort(order.begin(), order.end(),
                   [&](const int a, const int b) { return data->offset(a) < data->offset(b); });

  // Remove any duplicates from the permutation, keeping the *last* element
  // merged for any given offset. Note the number of duplicates removed so that
  // the columns can be resized later.
  auto it = std::unique(order.rbegin(), order.rend(), [&](const int a, const int b) {
    return data->offset(a) == data->offset(b);
  });
  int duplicates = std::distance(order.begin(), it.base());
  order.erase(order.begin(), it.base());

  // Apply the permutation in the auxiliary array to all of the relevant column
  // arrays in the data set.
  for (int i = 0; i < order.size(); i++) {
    // "dest_idx" is the current index which is being operated on; for each
    // column, we will be replacing the value at this index with the correct
    // sorted-order value for that index.
    //
    // "src_idx" is the current location of the value that is being moved to
    // dest_idx, found by consulting the "order" auxiliary array. Its value
    // will be *swapped* with the current value at "dest_idx".
    //
    // Because we are swapping values, and because we iterate through
    // destinations from front to back, it is possible that the value that was
    // originally in "src_idx" has already been swapped to another location;
    // specifically, if "src_idx" is earlier than "dest_idx", then its value is
    // guaranteed to have been swapped. To find its current location, we
    // "follow" the indexes in the order array until we arrive at a src_idx
    // which is greater than the current dest_idx, which will be the correct
    // location of the source value.
    //
    // An example of this situation:
    //
    // initial:
    //   data = [3 1 4 2]
    //   order = [1 3 0 2]
    //
    // dest = 0
    //   src = order[0]       // 1
    //   data.swap(dest, src) // 0 <-> 1
    //   data == [1 3 4 2]
    //
    // dest = 1
    //   src = order[1]       // 3
    //   data.swap(dest, src) // 1 <-> 3
    //   data == [1 2 4 3]
    //
    // dest = 2
    //   src = order[2]       // 0
    //   // src < dest, so follow the trail
    //   src = order[src]     // 1
    //   // src < dest, so follow the trail
    //   src = order[src]     // 3
    //   data.swap(dest, src) // 2 <-> 3
    //   data == [1 2 3 4]
    int dest_idx = i + first_unsorted;
    int src_idx = order[i];
    while (src_idx < dest_idx) {
      src_idx = order[src_idx - first_unsorted];
    }
    // If the source is equal to the destination, then this value is already
    // at its correct sorted location.
    if (src_idx == dest_idx) {
      continue;
    }

    data->mutable_offset()->SwapElements(src_idx, dest_idx);
    data->mutable_last()->SwapElements(src_idx, dest_idx);

    // These columns are only present at resolutions generated as rollups. We
    // detect this by checking if there are any count columns present (the
    // choice of "count" is arbitrary, all of these columns will be present or
    // not).
    if (data->count_size() > 0) {
      data->mutable_count()->SwapElements(src_idx, dest_idx);
      data->mutable_sum()->SwapElements(src_idx, dest_idx);
      data->mutable_min()->SwapElements(src_idx, dest_idx);
      data->mutable_max()->SwapElements(src_idx, dest_idx);
      data->mutable_first()->SwapElements(src_idx, dest_idx);
      data->mutable_variance()->SwapElements(src_idx, dest_idx);
    }
  }

  // Resize each column to account for any duplicate values which were removed -
  // the swapping algorithm will have moved these to the very end of the
  // collection.
  auto new_size = data->offset_size() - duplicates;
  data->mutable_offset()->Truncate(new_size);
  data->mutable_last()->Truncate(new_size);
  if (data->count_size() > 0) {
    data->mutable_count()->Truncate(new_size);
    data->mutable_sum()->Truncate(new_size);
    data->mutable_min()->Truncate(new_size);
    data->mutable_max()->Truncate(new_size);
    data->mutable_first()->Truncate(new_size);
    data->mutable_variance()->Truncate(new_size);
  }
}
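// The reverse-iterator std::unique call above is what makes the *last*
// duplicate win: std::unique keeps the first element of each run in iteration
// order, so iterating from rbegin to rend keeps the element closest to the end
// of the collection. A worked example with hypothetical offsets, assuming
// first_unsorted == 0:
//
//   order (sorted by offset): [0 1 2]  with offsets [5 7 7]
//   after reverse unique:     [0 2]    with offsets [5 7]
//
// Index 1, the earlier-merged sample at offset 7, is the duplicate that gets
// erased; index 2 was appended later and is retained.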
WARN_UNUSED_RESULT bool MergeValues(cockroach::storage::enginepb::MVCCMetadata* left,
                                    const cockroach::storage::enginepb::MVCCMetadata& right,
                                    bool full_merge, rocksdb::Logger* logger) {
  if (left->has_raw_bytes()) {
    if (!right.has_raw_bytes()) {
      rocksdb::Warn(logger, "inconsistent value types for merge (left = bytes, right = ?)");
      return false;
    }

    // Replay Advisory: Because merge commands pass through raft, it is possible
    // for merging values to be "replayed". Currently, the only actual use of
    // the merge system is for time series data, which is safe against replay;
    // however, this property is not general for all potential mergeable types.
    // If a future need arises to merge another type of data, replay protection
    // will likely need to be a consideration.

    if (IsTimeSeriesData(left->raw_bytes()) || IsTimeSeriesData(right.raw_bytes())) {
      // If either operand is a time series, the other must be as well.
      if (!IsTimeSeriesData(left->raw_bytes()) || !IsTimeSeriesData(right.raw_bytes())) {
        rocksdb::Warn(logger, "inconsistent value types for merging time "
                              "series data (type(left) != type(right))");
        return false;
      }
      return MergeTimeSeriesValues(left->mutable_raw_bytes(), right.raw_bytes(), full_merge,
                                   logger);
    } else {
      const rocksdb::Slice rdata = ValueDataBytes(right.raw_bytes());
      left->mutable_raw_bytes()->append(rdata.data(), rdata.size());
    }
    return true;
  } else {
    left->mutable_raw_bytes()->assign(right.raw_bytes());
    if (right.has_merge_timestamp()) {
      left->mutable_merge_timestamp()->CopyFrom(right.merge_timestamp());
    }
    if (full_merge && IsTimeSeriesData(left->raw_bytes())) {
      if (!ConsolidateTimeSeriesValue(left->mutable_raw_bytes(), logger)) {
        return false;
      }
    }
    return true;
  }
}
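// For non-time-series byte values, the merge above is plain concatenation of
// the data portions: ValueDataBytes strips the right operand's header
// (checksum and tag) before it is appended, while the left value keeps its own
// header. A worked example with hypothetical payloads:
//
//   left->raw_bytes():  [header]["foo"]
//   right.raw_bytes():  [header]["bar"]
//   after merge:        [header]["foobar"]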
// MergeResult serializes the result MVCCMetadata value into a byte slice.
DBStatus MergeResult(cockroach::storage::enginepb::MVCCMetadata* meta, DBString* result) {
  // TODO(pmattis): Should recompute checksum here. Need a crc32
  // implementation and need to verify the checksumming is identical
  // to what is being done in Go. Zlib's crc32 should be sufficient.
  result->len = meta->ByteSize();
  result->data = static_cast<char*>(malloc(result->len));
  if (!meta->SerializeToArray(result->data, result->len)) {
    return ToDBString("serialization error");
  }
  return kSuccess;
}

rocksdb::MergeOperator* NewMergeOperator() { return new DBMergeOperator; }

}  // namespace cockroach
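// A minimal sketch of the checksum recomputation described in the TODO inside
// MergeResult, assuming zlib's crc32 is used as suggested there. This is not
// the implementation; the hypothetical ValueChecksum helper below only
// illustrates the shape, and whether it matches the Go-side checksum would
// still need to be verified:
//
//   #include <zlib.h>
//
//   uint32_t ValueChecksum(const std::string& val) {
//     // Checksum everything after the 4-byte checksum field itself,
//     // i.e. the tag byte plus the data bytes.
//     return crc32(0L, reinterpret_cast<const Bytef*>(val.data() + kChecksumSize),
//                  val.size() - kChecksumSize);
//   }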