github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/storage/pebble_merge.go (about)

     1  // Copyright 2019 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package storage
    12  
    13  import (
    14  	"io"
    15  	"sort"
    16  
    17  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    18  	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
    19  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    20  	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
    21  	"github.com/cockroachdb/errors"
    22  	"github.com/gogo/protobuf/proto"
    23  )
    24  
    25  // sortAndDeduplicateRows sorts all the samples field of the time series data
    26  // structure according to the samples' `Offset`s. At the same time, samples with
    27  // duplicate offset values are removed - only the last sample with a given offset
    28  // in the collection is retained.
    29  func sortAndDeduplicateRows(ts *roachpb.InternalTimeSeriesData) {
    30  	// In the common case, appending the newer entries to the older entries
    31  	// will result in an already ordered result, and there will be one sample
    32  	// per offset. Optimize for that case.
    33  	isSortedUniq := true
    34  	for i := 1; i < len(ts.Samples); i++ {
    35  		if ts.Samples[i-1].Offset >= ts.Samples[i].Offset {
    36  			isSortedUniq = false
    37  			break
    38  		}
    39  	}
    40  	if isSortedUniq {
    41  		return
    42  	}
    43  
    44  	// Create an auxiliary array of array indexes, and sort that array according
    45  	// to the corresponding offset value in the ts.Samples collection. This
    46  	// yields the permutation of the current array indexes that will place the
    47  	// samples into sorted order. In order to guarantee only the last sample with
    48  	// a duplicated offset is retained, we must do a stable sort.
    49  	sortedSrcIdxs := make([]int, len(ts.Samples))
    50  	for i := range sortedSrcIdxs {
    51  		sortedSrcIdxs[i] = i
    52  	}
    53  	sort.SliceStable(sortedSrcIdxs, func(i, j int) bool {
    54  		return ts.Samples[sortedSrcIdxs[i]].Offset < ts.Samples[sortedSrcIdxs[j]].Offset
    55  	})
    56  
    57  	// Remove any duplicates from the permutation, keeping the *last* element
    58  	// merged for any given offset.
    59  	uniqSortedSrcIdxs := make([]int, 0, len(ts.Samples))
    60  	for destIdx := range sortedSrcIdxs {
    61  		if destIdx == len(sortedSrcIdxs)-1 || ts.Samples[sortedSrcIdxs[destIdx]].Offset != ts.Samples[sortedSrcIdxs[destIdx+1]].Offset {
    62  			uniqSortedSrcIdxs = append(uniqSortedSrcIdxs, sortedSrcIdxs[destIdx])
    63  		}
    64  	}
    65  
    66  	origSamples := ts.Samples
    67  	ts.Samples = make([]roachpb.InternalTimeSeriesSample, len(uniqSortedSrcIdxs))
    68  
    69  	// Apply the permutation in the auxiliary array to all of the relevant column
    70  	// arrays in the data set.
    71  	for destIdx, srcIdx := range uniqSortedSrcIdxs {
    72  		ts.Samples[destIdx] = origSamples[srcIdx]
    73  	}
    74  }
    75  
    76  // sortAndDeduplicateColumns sorts all column fields of the time series data
    77  // structure according to the timeseries's `Offset` column. At the same time,
    78  // duplicate offset values are removed - only the last instance of an offset in
    79  // the collection is retained.
    80  func sortAndDeduplicateColumns(ts *roachpb.InternalTimeSeriesData) {
    81  	// In the common case, appending the newer entries to the older entries
    82  	// will result in an already ordered result with no duplicated offsets.
    83  	// Optimize for that case.
    84  	isSortedUniq := true
    85  	for i := 1; i < len(ts.Offset); i++ {
    86  		if ts.Offset[i-1] >= ts.Offset[i] {
    87  			isSortedUniq = false
    88  			break
    89  		}
    90  	}
    91  	if isSortedUniq {
    92  		return
    93  	}
    94  
    95  	// Create an auxiliary array of array indexes, and sort that array according
    96  	// to the corresponding offset value in the `ts.Offset` collection. This yields
    97  	// the permutation of the current array indexes that will place the offsets into
    98  	// sorted order. In order to guarantee only the last column values corresponding
    99  	// to a duplicated offset are retained, we must do a stable sort.
   100  	sortedSrcIdxs := make([]int, len(ts.Offset))
   101  	for i := range sortedSrcIdxs {
   102  		sortedSrcIdxs[i] = i
   103  	}
   104  	sort.SliceStable(sortedSrcIdxs, func(i, j int) bool {
   105  		return ts.Offset[sortedSrcIdxs[i]] < ts.Offset[sortedSrcIdxs[j]]
   106  	})
   107  
   108  	// Remove any duplicates from the permutation, keeping the *last* element
   109  	// merged for any given offset.
   110  	uniqSortedSrcIdxs := make([]int, 0, len(ts.Offset))
   111  	for destIdx := range sortedSrcIdxs {
   112  		if destIdx == len(sortedSrcIdxs)-1 || ts.Offset[sortedSrcIdxs[destIdx]] != ts.Offset[sortedSrcIdxs[destIdx+1]] {
   113  			uniqSortedSrcIdxs = append(uniqSortedSrcIdxs, sortedSrcIdxs[destIdx])
   114  		}
   115  	}
   116  
   117  	origOffset, origLast, origCount, origSum, origMin, origMax, origFirst, origVariance :=
   118  		ts.Offset, ts.Last, ts.Count, ts.Sum, ts.Min, ts.Max, ts.First, ts.Variance
   119  	ts.Offset = make([]int32, len(uniqSortedSrcIdxs))
   120  	ts.Last = make([]float64, len(uniqSortedSrcIdxs))
   121  	// These columns are only present at resolutions generated as rollups. We
   122  	// detect this by checking if there are any count columns present (the
   123  	// choice of "count" is arbitrary, all of these columns will be present or
   124  	// not).
   125  	if len(origCount) > 0 {
   126  		ts.Count = make([]uint32, len(uniqSortedSrcIdxs))
   127  		ts.Sum = make([]float64, len(uniqSortedSrcIdxs))
   128  		ts.Min = make([]float64, len(uniqSortedSrcIdxs))
   129  		ts.Max = make([]float64, len(uniqSortedSrcIdxs))
   130  		ts.First = make([]float64, len(uniqSortedSrcIdxs))
   131  		ts.Variance = make([]float64, len(uniqSortedSrcIdxs))
   132  	}
   133  
   134  	// Apply the permutation in the auxiliary array to all of the relevant column
   135  	// arrays in the data set.
   136  	for destIdx, srcIdx := range uniqSortedSrcIdxs {
   137  		ts.Offset[destIdx] = origOffset[srcIdx]
   138  		ts.Last[destIdx] = origLast[srcIdx]
   139  
   140  		if len(origCount) > 0 {
   141  			ts.Count[destIdx] = origCount[srcIdx]
   142  			ts.Sum[destIdx] = origSum[srcIdx]
   143  			ts.Min[destIdx] = origMin[srcIdx]
   144  			ts.Max[destIdx] = origMax[srcIdx]
   145  			ts.First[destIdx] = origFirst[srcIdx]
   146  			ts.Variance[destIdx] = origVariance[srcIdx]
   147  		}
   148  	}
   149  }
   150  
   151  // ensureColumnar detects time series data which is in the old row format,
   152  // converting the row data into the new columnar format.
   153  func ensureColumnar(ts *roachpb.InternalTimeSeriesData) {
   154  	for _, sample := range ts.Samples {
   155  		ts.Offset = append(ts.Offset, sample.Offset)
   156  		ts.Last = append(ts.Last, sample.Sum)
   157  	}
   158  	ts.Samples = ts.Samples[:0]
   159  }
   160  
// MVCCValueMerger implements the `ValueMerger` interface. It buffers
// deserialized values in a slice in order specified by `oldToNew`.
// It determines the order of incoming operands by whether they were added
// with `MergeNewer()` or `MergeOlder()`, reversing the slice as necessary
// to ensure operands are always appended. It merges these deserialized
// operands when `Finish()` is called.
//
// It supports merging either all `roachpb.InternalTimeSeriesData` values
// or all non-timeseries values. Attempting to merge a mixture of timeseries
// and non-timeseries values will result in an error.
//
// Fields:
//   - timeSeriesOps holds the decoded timeseries operands buffered so far. It
//     stays nil until the first timeseries operand is appended.
//   - rawByteOps holds the payloads (the bytes following the value header) of
//     the non-timeseries operands buffered so far. It stays nil until the
//     first non-timeseries operand is appended.
//   - oldestMergeTS is the merge timestamp recorded for the oldest operand
//     that carried one; it is the zero value when no operand carried one.
//   - oldToNew is true when the buffered operands are currently ordered from
//     oldest to newest and false when they are ordered from newest to oldest.
//   - meta is scratch space that each incoming operand is unmarshaled into.
type MVCCValueMerger struct {
	timeSeriesOps []roachpb.InternalTimeSeriesData
	rawByteOps    [][]byte
	oldestMergeTS hlc.LegacyTimestamp
	oldToNew      bool

	// Used to avoid heap allocations when passing pointer to `Unmarshal()`.
	meta enginepb.MVCCMetadata
}
   180  
// Layout of the header that precedes the payload in a value's `RawBytes`, as
// used by the merger in this file: mvccChecksumSize leading bytes (presumably
// a checksum, going by the name; the merger writes them as zeros in its
// output), followed by a single tag byte at index mvccTagPos that holds the
// `roachpb.ValueType`. The payload starts at index mvccHeaderSize.
const (
	mvccChecksumSize = 4
	mvccTagPos       = mvccChecksumSize
	mvccHeaderSize   = mvccChecksumSize + 1
)
   186  
   187  func (t *MVCCValueMerger) ensureOrder(oldToNew bool) {
   188  	if oldToNew == t.oldToNew {
   189  		return
   190  	}
   191  	// Only one of the two loop bodies should actually execute under error-free
   192  	// conditions, i.e., all operands are either timeseries or all are non-
   193  	// timeseries.
   194  	for i := 0; i < len(t.timeSeriesOps)/2; i++ {
   195  		t.timeSeriesOps[i], t.timeSeriesOps[len(t.timeSeriesOps)-1-i] = t.timeSeriesOps[len(t.timeSeriesOps)-1-i], t.timeSeriesOps[i]
   196  	}
   197  	for i := 0; i < len(t.rawByteOps)/2; i++ {
   198  		t.rawByteOps[i], t.rawByteOps[len(t.rawByteOps)-1-i] = t.rawByteOps[len(t.rawByteOps)-1-i], t.rawByteOps[i]
   199  	}
   200  	t.oldToNew = oldToNew
   201  }
   202  
   203  func (t *MVCCValueMerger) deserializeMVCCValueAndAppend(value []byte) error {
   204  	if err := protoutil.Unmarshal(value, &t.meta); err != nil {
   205  		return errors.Errorf("corrupted operand value: %v", err)
   206  	}
   207  	if len(t.meta.RawBytes) < mvccHeaderSize {
   208  		return errors.Errorf("operand value too short")
   209  	}
   210  	if t.meta.RawBytes[mvccTagPos] == byte(roachpb.ValueType_TIMESERIES) {
   211  		if t.rawByteOps != nil {
   212  			return errors.Errorf("inconsistent value types for timeseries merge")
   213  		}
   214  		t.timeSeriesOps = append(t.timeSeriesOps, roachpb.InternalTimeSeriesData{})
   215  		ts := &t.timeSeriesOps[len(t.timeSeriesOps)-1]
   216  		if err := protoutil.Unmarshal(t.meta.RawBytes[mvccHeaderSize:], ts); err != nil {
   217  			return errors.Errorf("corrupted timeseries: %v", err)
   218  		}
   219  	} else {
   220  		if t.timeSeriesOps != nil {
   221  			return errors.Errorf("inconsistent value types for non-timeseries merge")
   222  		}
   223  		t.rawByteOps = append(t.rawByteOps, t.meta.RawBytes[mvccHeaderSize:])
   224  	}
   225  	// Save the timestamp of the oldest value since that is consistent with the
   226  	// behavior of the C++ DBMergeOperator.
   227  	if t.meta.MergeTimestamp != nil && (t.oldestMergeTS == hlc.LegacyTimestamp{} || !t.oldToNew) {
   228  		t.oldestMergeTS = *t.meta.MergeTimestamp
   229  	}
   230  	return nil
   231  }
   232  
   233  // MergeNewer deserializes the value and appends it to the slice corresponding to its type
   234  // (timeseries or non-timeseries). The slice will be reversed if needed such that it is in
   235  // old-to-new order.
   236  func (t *MVCCValueMerger) MergeNewer(value []byte) error {
   237  	t.ensureOrder(true /* oldToNew */)
   238  	if err := t.deserializeMVCCValueAndAppend(value); err != nil {
   239  		return err
   240  	}
   241  	return nil
   242  }
   243  
   244  // MergeOlder deserializes the value and appends it to the slice corresponding to its type
   245  // (timeseries or non-timeseries). The slice will be reversed if needed such that it is in
   246  // new-to-old order.
   247  func (t *MVCCValueMerger) MergeOlder(value []byte) error {
   248  	t.ensureOrder(false /* oldToNew */)
   249  	if err := t.deserializeMVCCValueAndAppend(value); err != nil {
   250  		return err
   251  	}
   252  	return nil
   253  }
   254  
   255  // Finish combines the buffered values from all `Merge*()` calls and marshals the result.
   256  // In case of non-timeseries the values are simply concatenated from old to new. In case
   257  // of timeseries the values are sorted, deduplicated, and potentially migrated to columnar
   258  // format. When deduplicating, only the latest sample for a given offset is retained.
   259  func (t *MVCCValueMerger) Finish() ([]byte, io.Closer, error) {
   260  	isColumnar := false
   261  	if t.timeSeriesOps == nil && t.rawByteOps == nil {
   262  		return nil, nil, errors.Errorf("empty merge unsupported")
   263  	}
   264  	t.ensureOrder(true /* oldToNew */)
   265  	if t.timeSeriesOps == nil {
   266  		// Concatenate non-timeseries operands from old to new
   267  		totalLen := 0
   268  		for _, rawByteOp := range t.rawByteOps {
   269  			totalLen += len(rawByteOp)
   270  		}
   271  		// See the motivating comment in mvcc.proto.
   272  		var meta enginepb.MVCCMetadataSubsetForMergeSerialization
   273  		meta.RawBytes = make([]byte, mvccHeaderSize, mvccHeaderSize+totalLen)
   274  		meta.RawBytes[mvccTagPos] = byte(roachpb.ValueType_BYTES)
   275  		for _, rawByteOp := range t.rawByteOps {
   276  			meta.RawBytes = append(meta.RawBytes, rawByteOp...)
   277  		}
   278  		res, err := protoutil.Marshal(&meta)
   279  		if err != nil {
   280  			return nil, nil, err
   281  		}
   282  		return res, nil, nil
   283  	}
   284  
   285  	// TODO(ajkr): confirm it is the case that (1) today's CRDB always merges timeseries
   286  	// values in columnar format, and (2) today's CRDB does not need to be downgrade-
   287  	// compatible with any version that supports row format only. Then we can drop support
   288  	// for row format entirely. It requires significant cleanup effort as many tests target
   289  	// the row format.
   290  	var merged roachpb.InternalTimeSeriesData
   291  	merged.StartTimestampNanos = t.timeSeriesOps[0].StartTimestampNanos
   292  	merged.SampleDurationNanos = t.timeSeriesOps[0].SampleDurationNanos
   293  	for _, timeSeriesOp := range t.timeSeriesOps {
   294  		if timeSeriesOp.StartTimestampNanos != merged.StartTimestampNanos {
   295  			return nil, nil, errors.Errorf("start timestamp mismatch")
   296  		}
   297  		if timeSeriesOp.SampleDurationNanos != merged.SampleDurationNanos {
   298  			return nil, nil, errors.Errorf("sample duration mismatch")
   299  		}
   300  		if !isColumnar && len(timeSeriesOp.Offset) > 0 {
   301  			ensureColumnar(&merged)
   302  			ensureColumnar(&timeSeriesOp)
   303  			isColumnar = true
   304  		} else if isColumnar {
   305  			ensureColumnar(&timeSeriesOp)
   306  		}
   307  		proto.Merge(&merged, &timeSeriesOp)
   308  	}
   309  	if isColumnar {
   310  		sortAndDeduplicateColumns(&merged)
   311  	} else {
   312  		sortAndDeduplicateRows(&merged)
   313  	}
   314  	tsBytes, err := protoutil.Marshal(&merged)
   315  	if err != nil {
   316  		return nil, nil, err
   317  	}
   318  	// See the motivating comment in mvcc.proto.
   319  	var meta enginepb.MVCCMetadataSubsetForMergeSerialization
   320  	if !(t.oldestMergeTS == hlc.LegacyTimestamp{}) {
   321  		meta.MergeTimestamp = &t.oldestMergeTS
   322  	}
   323  	tsTag := byte(roachpb.ValueType_TIMESERIES)
   324  	header := make([]byte, mvccHeaderSize)
   325  	header[mvccTagPos] = tsTag
   326  	meta.RawBytes = append(header, tsBytes...)
   327  	res, err := protoutil.Marshal(&meta)
   328  	if err != nil {
   329  		return nil, nil, err
   330  	}
   331  	return res, nil, nil
   332  }