github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/storage/pebble_merge.go

// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package storage

import (
	"io"
	"sort"

	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
	"github.com/cockroachdb/errors"
	"github.com/gogo/protobuf/proto"
)

// sortAndDeduplicateRows sorts the Samples field of the time series data
// structure according to the samples' `Offset`s. At the same time, samples
// with duplicate offset values are removed - only the last sample with a
// given offset in the collection is retained.
func sortAndDeduplicateRows(ts *roachpb.InternalTimeSeriesData) {
	// In the common case, appending the newer entries to the older entries
	// will result in an already ordered result, and there will be one sample
	// per offset. Optimize for that case.
	isSortedUniq := true
	for i := 1; i < len(ts.Samples); i++ {
		if ts.Samples[i-1].Offset >= ts.Samples[i].Offset {
			isSortedUniq = false
			break
		}
	}
	if isSortedUniq {
		return
	}

	// Create an auxiliary array of array indexes, and sort that array according
	// to the corresponding offset value in the ts.Samples collection. This
	// yields the permutation of the current array indexes that will place the
	// samples into sorted order. In order to guarantee only the last sample with
	// a duplicated offset is retained, we must do a stable sort.
	sortedSrcIdxs := make([]int, len(ts.Samples))
	for i := range sortedSrcIdxs {
		sortedSrcIdxs[i] = i
	}
	sort.SliceStable(sortedSrcIdxs, func(i, j int) bool {
		return ts.Samples[sortedSrcIdxs[i]].Offset < ts.Samples[sortedSrcIdxs[j]].Offset
	})

	// Remove any duplicates from the permutation, keeping the *last* element
	// merged for any given offset.
	uniqSortedSrcIdxs := make([]int, 0, len(ts.Samples))
	for destIdx := range sortedSrcIdxs {
		if destIdx == len(sortedSrcIdxs)-1 || ts.Samples[sortedSrcIdxs[destIdx]].Offset != ts.Samples[sortedSrcIdxs[destIdx+1]].Offset {
			uniqSortedSrcIdxs = append(uniqSortedSrcIdxs, sortedSrcIdxs[destIdx])
		}
	}

	origSamples := ts.Samples
	ts.Samples = make([]roachpb.InternalTimeSeriesSample, len(uniqSortedSrcIdxs))

	// Apply the permutation in the auxiliary array to all of the relevant column
	// arrays in the data set.
	for destIdx, srcIdx := range uniqSortedSrcIdxs {
		ts.Samples[destIdx] = origSamples[srcIdx]
	}
}
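
// Illustrative sketch (an editorial addition, not part of the original file):
// because the sort is stable and deduplication keeps the *last* entry for a
// duplicated offset, a sample appended later (i.e., from a newer operand)
// wins over an earlier one at the same offset. For example:
//
//	ts := &roachpb.InternalTimeSeriesData{
//		Samples: []roachpb.InternalTimeSeriesSample{
//			{Offset: 0, Sum: 1}, {Offset: 2, Sum: 2},
//			{Offset: 1, Sum: 3}, {Offset: 2, Sum: 4},
//		},
//	}
//	sortAndDeduplicateRows(ts)
//	// ts.Samples now holds offsets [0, 1, 2] with sums [1, 3, 4]:
//	// the later {Offset: 2, Sum: 4} replaced {Offset: 2, Sum: 2}.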

// sortAndDeduplicateColumns sorts all column fields of the time series data
// structure according to the `Offset` column. At the same time, duplicate
// offset values are removed - only the last instance of an offset in the
// collection is retained.
func sortAndDeduplicateColumns(ts *roachpb.InternalTimeSeriesData) {
	// In the common case, appending the newer entries to the older entries
	// will result in an already ordered result with no duplicated offsets.
	// Optimize for that case.
	isSortedUniq := true
	for i := 1; i < len(ts.Offset); i++ {
		if ts.Offset[i-1] >= ts.Offset[i] {
			isSortedUniq = false
			break
		}
	}
	if isSortedUniq {
		return
	}

	// Create an auxiliary array of array indexes, and sort that array according
	// to the corresponding offset value in the `ts.Offset` collection. This
	// yields the permutation of the current array indexes that will place the
	// offsets into sorted order. In order to guarantee only the last column
	// values corresponding to a duplicated offset are retained, we must do a
	// stable sort.
	sortedSrcIdxs := make([]int, len(ts.Offset))
	for i := range sortedSrcIdxs {
		sortedSrcIdxs[i] = i
	}
	sort.SliceStable(sortedSrcIdxs, func(i, j int) bool {
		return ts.Offset[sortedSrcIdxs[i]] < ts.Offset[sortedSrcIdxs[j]]
	})

	// Remove any duplicates from the permutation, keeping the *last* element
	// merged for any given offset.
	uniqSortedSrcIdxs := make([]int, 0, len(ts.Offset))
	for destIdx := range sortedSrcIdxs {
		if destIdx == len(sortedSrcIdxs)-1 || ts.Offset[sortedSrcIdxs[destIdx]] != ts.Offset[sortedSrcIdxs[destIdx+1]] {
			uniqSortedSrcIdxs = append(uniqSortedSrcIdxs, sortedSrcIdxs[destIdx])
		}
	}

	origOffset, origLast, origCount, origSum, origMin, origMax, origFirst, origVariance :=
		ts.Offset, ts.Last, ts.Count, ts.Sum, ts.Min, ts.Max, ts.First, ts.Variance
	ts.Offset = make([]int32, len(uniqSortedSrcIdxs))
	ts.Last = make([]float64, len(uniqSortedSrcIdxs))
	// These columns are only present at resolutions generated as rollups. We
	// detect this by checking whether any count columns are present (the
	// choice of "count" is arbitrary, as all of these columns will be present
	// or absent together).
	if len(origCount) > 0 {
		ts.Count = make([]uint32, len(uniqSortedSrcIdxs))
		ts.Sum = make([]float64, len(uniqSortedSrcIdxs))
		ts.Min = make([]float64, len(uniqSortedSrcIdxs))
		ts.Max = make([]float64, len(uniqSortedSrcIdxs))
		ts.First = make([]float64, len(uniqSortedSrcIdxs))
		ts.Variance = make([]float64, len(uniqSortedSrcIdxs))
	}

	// Apply the permutation in the auxiliary array to all of the relevant column
	// arrays in the data set.
	for destIdx, srcIdx := range uniqSortedSrcIdxs {
		ts.Offset[destIdx] = origOffset[srcIdx]
		ts.Last[destIdx] = origLast[srcIdx]

		if len(origCount) > 0 {
			ts.Count[destIdx] = origCount[srcIdx]
			ts.Sum[destIdx] = origSum[srcIdx]
			ts.Min[destIdx] = origMin[srcIdx]
			ts.Max[destIdx] = origMax[srcIdx]
			ts.First[destIdx] = origFirst[srcIdx]
			ts.Variance[destIdx] = origVariance[srcIdx]
		}
	}
}

// ensureColumnar detects time series data that is in the old row format and
// converts the row data into the new columnar format.
func ensureColumnar(ts *roachpb.InternalTimeSeriesData) {
	for _, sample := range ts.Samples {
		ts.Offset = append(ts.Offset, sample.Offset)
		ts.Last = append(ts.Last, sample.Sum)
	}
	ts.Samples = ts.Samples[:0]
}
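
// Illustrative sketch (an editorial addition, not part of the original file):
// ensureColumnar moves each row-format sample's Offset and Sum into the
// columnar Offset and Last slices, then truncates Samples. A value that is
// already columnar has an empty Samples slice, so calling ensureColumnar on
// it leaves the column data untouched. For example:
//
//	ts := &roachpb.InternalTimeSeriesData{
//		Samples: []roachpb.InternalTimeSeriesSample{{Offset: 5, Sum: 1.5}},
//	}
//	ensureColumnar(ts)
//	// ts.Offset == []int32{5}, ts.Last == []float64{1.5},
//	// len(ts.Samples) == 0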

// MVCCValueMerger implements the `ValueMerger` interface. It buffers
// deserialized values in a slice in the order specified by `oldToNew`.
// It determines the order of incoming operands by whether they were added
// with `MergeNewer()` or `MergeOlder()`, reversing the slice as necessary
// to ensure operands are always appended. It merges these deserialized
// operands when `Finish()` is called.
//
// It supports merging either all `roachpb.InternalTimeSeriesData` values
// or all non-timeseries values. Attempting to merge a mixture of timeseries
// and non-timeseries values results in an error.
type MVCCValueMerger struct {
	timeSeriesOps []roachpb.InternalTimeSeriesData
	rawByteOps    [][]byte
	oldestMergeTS hlc.LegacyTimestamp
	oldToNew      bool

	// Used to avoid heap allocations when passing a pointer to `Unmarshal()`.
	meta enginepb.MVCCMetadata
}

const (
	mvccChecksumSize = 4
	mvccTagPos       = mvccChecksumSize
	mvccHeaderSize   = mvccChecksumSize + 1
)

func (t *MVCCValueMerger) ensureOrder(oldToNew bool) {
	if oldToNew == t.oldToNew {
		return
	}
	// Only one of the two loop bodies should actually execute under error-free
	// conditions, i.e., either all operands are timeseries or all are
	// non-timeseries.
	for i := 0; i < len(t.timeSeriesOps)/2; i++ {
		t.timeSeriesOps[i], t.timeSeriesOps[len(t.timeSeriesOps)-1-i] = t.timeSeriesOps[len(t.timeSeriesOps)-1-i], t.timeSeriesOps[i]
	}
	for i := 0; i < len(t.rawByteOps)/2; i++ {
		t.rawByteOps[i], t.rawByteOps[len(t.rawByteOps)-1-i] = t.rawByteOps[len(t.rawByteOps)-1-i], t.rawByteOps[i]
	}
	t.oldToNew = oldToNew
}

func (t *MVCCValueMerger) deserializeMVCCValueAndAppend(value []byte) error {
	if err := protoutil.Unmarshal(value, &t.meta); err != nil {
		return errors.Errorf("corrupted operand value: %v", err)
	}
	if len(t.meta.RawBytes) < mvccHeaderSize {
		return errors.Errorf("operand value too short")
	}
	if t.meta.RawBytes[mvccTagPos] == byte(roachpb.ValueType_TIMESERIES) {
		if t.rawByteOps != nil {
			return errors.Errorf("inconsistent value types for timeseries merge")
		}
		t.timeSeriesOps = append(t.timeSeriesOps, roachpb.InternalTimeSeriesData{})
		ts := &t.timeSeriesOps[len(t.timeSeriesOps)-1]
		if err := protoutil.Unmarshal(t.meta.RawBytes[mvccHeaderSize:], ts); err != nil {
			return errors.Errorf("corrupted timeseries: %v", err)
		}
	} else {
		if t.timeSeriesOps != nil {
			return errors.Errorf("inconsistent value types for non-timeseries merge")
		}
		t.rawByteOps = append(t.rawByteOps, t.meta.RawBytes[mvccHeaderSize:])
	}
	// Save the timestamp of the oldest value since that is consistent with the
	// behavior of the C++ DBMergeOperator.
	if t.meta.MergeTimestamp != nil && (t.oldestMergeTS == hlc.LegacyTimestamp{} || !t.oldToNew) {
		t.oldestMergeTS = *t.meta.MergeTimestamp
	}
	return nil
}

// MergeNewer deserializes the value and appends it to the slice corresponding
// to its type (timeseries or non-timeseries). The slice is reversed if needed
// so that it is in old-to-new order.
func (t *MVCCValueMerger) MergeNewer(value []byte) error {
	t.ensureOrder(true /* oldToNew */)
	if err := t.deserializeMVCCValueAndAppend(value); err != nil {
		return err
	}
	return nil
}

// MergeOlder deserializes the value and appends it to the slice corresponding
// to its type (timeseries or non-timeseries). The slice is reversed if needed
// so that it is in new-to-old order.
func (t *MVCCValueMerger) MergeOlder(value []byte) error {
	t.ensureOrder(false /* oldToNew */)
	if err := t.deserializeMVCCValueAndAppend(value); err != nil {
		return err
	}
	return nil
}
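
// Illustrative sketch (an editorial addition, not part of the original file):
// because operands are always appended, alternating MergeOlder and MergeNewer
// calls trigger a reversal in ensureOrder so the buffer stays consistent.
// Given hypothetical serialized operands a (oldest), b, and c (newest):
//
//	var m MVCCValueMerger
//	_ = m.MergeNewer(b) // buffer old-to-new: [b]
//	_ = m.MergeOlder(a) // reversed to new-to-old, then append: [b, a]
//	_ = m.MergeNewer(c) // reversed back to old-to-new, then append: [a, b, c]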

// Finish combines the buffered values from all `Merge*()` calls and marshals
// the result. For non-timeseries values, the operands are simply concatenated
// from old to new. For timeseries values, the operands are sorted,
// deduplicated, and potentially migrated to columnar format. When
// deduplicating, only the latest sample for a given offset is retained.
func (t *MVCCValueMerger) Finish() ([]byte, io.Closer, error) {
	isColumnar := false
	if t.timeSeriesOps == nil && t.rawByteOps == nil {
		return nil, nil, errors.Errorf("empty merge unsupported")
	}
	t.ensureOrder(true /* oldToNew */)
	if t.timeSeriesOps == nil {
		// Concatenate non-timeseries operands from old to new.
		totalLen := 0
		for _, rawByteOp := range t.rawByteOps {
			totalLen += len(rawByteOp)
		}
		// See the motivating comment in mvcc.proto.
		var meta enginepb.MVCCMetadataSubsetForMergeSerialization
		meta.RawBytes = make([]byte, mvccHeaderSize, mvccHeaderSize+totalLen)
		meta.RawBytes[mvccTagPos] = byte(roachpb.ValueType_BYTES)
		for _, rawByteOp := range t.rawByteOps {
			meta.RawBytes = append(meta.RawBytes, rawByteOp...)
		}
		res, err := protoutil.Marshal(&meta)
		if err != nil {
			return nil, nil, err
		}
		return res, nil, nil
	}

	// TODO(ajkr): confirm it is the case that (1) today's CRDB always merges
	// timeseries values in columnar format, and (2) today's CRDB does not need
	// to be downgrade-compatible with any version that supports row format
	// only. Then we can drop support for row format entirely. It requires
	// significant cleanup effort as many tests target the row format.
	var merged roachpb.InternalTimeSeriesData
	merged.StartTimestampNanos = t.timeSeriesOps[0].StartTimestampNanos
	merged.SampleDurationNanos = t.timeSeriesOps[0].SampleDurationNanos
	for _, timeSeriesOp := range t.timeSeriesOps {
		if timeSeriesOp.StartTimestampNanos != merged.StartTimestampNanos {
			return nil, nil, errors.Errorf("start timestamp mismatch")
		}
		if timeSeriesOp.SampleDurationNanos != merged.SampleDurationNanos {
			return nil, nil, errors.Errorf("sample duration mismatch")
		}
		if !isColumnar && len(timeSeriesOp.Offset) > 0 {
			ensureColumnar(&merged)
			ensureColumnar(&timeSeriesOp)
			isColumnar = true
		} else if isColumnar {
			ensureColumnar(&timeSeriesOp)
		}
		proto.Merge(&merged, &timeSeriesOp)
	}
	if isColumnar {
		sortAndDeduplicateColumns(&merged)
	} else {
		sortAndDeduplicateRows(&merged)
	}
	tsBytes, err := protoutil.Marshal(&merged)
	if err != nil {
		return nil, nil, err
	}
	// See the motivating comment in mvcc.proto.
	var meta enginepb.MVCCMetadataSubsetForMergeSerialization
	if !(t.oldestMergeTS == hlc.LegacyTimestamp{}) {
		meta.MergeTimestamp = &t.oldestMergeTS
	}
	tsTag := byte(roachpb.ValueType_TIMESERIES)
	header := make([]byte, mvccHeaderSize)
	header[mvccTagPos] = tsTag
	meta.RawBytes = append(header, tsBytes...)
	res, err := protoutil.Marshal(&meta)
	if err != nil {
		return nil, nil, err
	}
	return res, nil, nil
}
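
// Illustrative sketch (an editorial addition, not part of the original file):
// a typical end-to-end merge as the storage engine would drive it, assuming
// `older` and `newer` are hypothetical marshaled enginepb.MVCCMetadata values
// whose RawBytes carry TIMESERIES-tagged payloads (errors elided for brevity;
// real callers should check them):
//
//	var m MVCCValueMerger
//	_ = m.MergeNewer(newer)
//	_ = m.MergeOlder(older)
//	res, closer, _ := m.Finish()
//	// res is a marshaled MVCCMetadataSubsetForMergeSerialization whose
//	// RawBytes holds the merged, sorted, deduplicated time series; closer
//	// is always nil in this implementation.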