github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/ts/rollup.go

// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package ts

import (
	"context"
	"math"
	"sort"
	"unsafe"

	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/ts/tspb"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
)

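// rollupDatapoint is a single rollup sample: it summarizes every raw
// datapoint that falls into one rollup period, recording the first, last,
// minimum, maximum, sum, count, and variance of the underlying values.
//
// For example (values chosen purely for illustration), rolling the raw values
// 1, 3, and 5 up into a single period yields:
//
//	rollupDatapoint{
//		first:    1,
//		last:     5,
//		min:      1,
//		max:      5,
//		sum:      9,
//		count:    3,
//		variance: 8.0 / 3.0, // population variance of {1, 3, 5}
//	}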
type rollupDatapoint struct {
	timestampNanos int64
	first          float64
	last           float64
	min            float64
	max            float64
	sum            float64
	count          uint32
	variance       float64
}

// rollupData is a set of rollup datapoints for a single named time series
// from a single source.
type rollupData struct {
	name       string
	source     string
	datapoints []rollupDatapoint
}

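// toInternal converts a rollupData into the InternalTimeSeriesData slab
// format used for storage: datapoints are grouped into one slab per
// keyDuration-aligned start time, and each datapoint is stored column-wise at
// its offset within that slab. The provided keyDuration and sampleDuration
// are validated by tspb.VerifySlabAndSampleDuration before any conversion
// takes place.
//
// As a sketch (durations chosen purely for illustration), with a one-hour
// keyDuration and a ten-minute sampleDuration, datapoints at 00:10 and 01:40
// land in two separate slabs:
//
//	slabs, err := rd.toInternal(
//		time.Hour.Nanoseconds(), (10 * time.Minute).Nanoseconds(),
//	)
//	// slabs[0] starts at 00:00 and holds the 00:10 datapoint at offset 1.
//	// slabs[1] starts at 01:00 and holds the 01:40 datapoint at offset 4.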
func (rd *rollupData) toInternal(
	keyDuration, sampleDuration int64,
) ([]roachpb.InternalTimeSeriesData, error) {
	if err := tspb.VerifySlabAndSampleDuration(keyDuration, sampleDuration); err != nil {
		return nil, err
	}

	// This slice must be preallocated to avoid reallocation on `append` because
	// we maintain pointers to its elements in the map below.
	result := make([]roachpb.InternalTimeSeriesData, 0, len(rd.datapoints))
	// Map values are pointers so that the entries in the slice above can be
	// mutated in place.
	resultByKeyTime := make(map[int64]*roachpb.InternalTimeSeriesData)

	for _, dp := range rd.datapoints {
		// Determine which InternalTimeSeriesData this datapoint belongs to,
		// creating it if it has not already been created for a previous sample.
		keyTime := normalizeToPeriod(dp.timestampNanos, keyDuration)
		itsd, ok := resultByKeyTime[keyTime]
		if !ok {
			result = append(result, roachpb.InternalTimeSeriesData{
				StartTimestampNanos: keyTime,
				SampleDurationNanos: sampleDuration,
			})
			itsd = &result[len(result)-1]
			resultByKeyTime[keyTime] = itsd
		}

		itsd.Offset = append(itsd.Offset, itsd.OffsetForTimestamp(dp.timestampNanos))
		itsd.Last = append(itsd.Last, dp.last)
		itsd.First = append(itsd.First, dp.first)
		itsd.Min = append(itsd.Min, dp.min)
		itsd.Max = append(itsd.Max, dp.max)
		itsd.Count = append(itsd.Count, dp.count)
		itsd.Sum = append(itsd.Sum, dp.sum)
		itsd.Variance = append(itsd.Variance, dp.variance)
	}

	return result, nil
}

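// computeRollupsFromData splits the raw datapoints of a time series into
// contiguous groups spanning rollupPeriodNanos each and summarizes every
// group into a single rollupDatapoint, carrying over the series name and
// source.
//
// As a sketch (timestamps chosen purely for illustration), with a ten-second
// rollup period, datapoints at 1s, 3s, and 12s produce two rollup points, one
// for the period starting at 0s and one for the period starting at 10s:
//
//	rollup := computeRollupsFromData(data, (10 * time.Second).Nanoseconds())
//	// rollup.datapoints[0].timestampNanos == 0s  (summarizes 1s and 3s)
//	// rollup.datapoints[1].timestampNanos == 10s (summarizes 12s)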
func computeRollupsFromData(data tspb.TimeSeriesData, rollupPeriodNanos int64) rollupData {
	rollup := rollupData{
		name:   data.Name,
		source: data.Source,
	}

	createRollupPoint := func(timestamp int64, dataSlice []tspb.TimeSeriesDatapoint) {
		result := rollupDatapoint{
			timestampNanos: timestamp,
			max:            -math.MaxFloat64,
			min:            math.MaxFloat64,
		}
		for i, dp := range dataSlice {
			if i == 0 {
				result.first = dp.Value
			}
			result.last = dp.Value
			result.max = math.Max(result.max, dp.Value)
			result.min = math.Min(result.min, dp.Value)

			// Fold the new value into the running variance by combining the
			// accumulated sample with a single-element sample (see
			// computeParallelVariance).
			if result.count > 0 {
				result.variance = computeParallelVariance(
					parallelVarianceArgs{
						count:    result.count,
						average:  result.sum / float64(result.count),
						variance: result.variance,
					},
					parallelVarianceArgs{
						count:    1,
						average:  dp.Value,
						variance: 0,
					},
				)
			}

			result.count++
			result.sum += dp.Value
		}

		rollup.datapoints = append(rollup.datapoints, result)
	}

	dps := data.Datapoints
	for len(dps) > 0 {
		rollupTimestamp := normalizeToPeriod(dps[0].TimestampNanos, rollupPeriodNanos)
		endIdx := sort.Search(len(dps), func(i int) bool {
			return normalizeToPeriod(dps[i].TimestampNanos, rollupPeriodNanos) > rollupTimestamp
		})
		createRollupPoint(rollupTimestamp, dps[:endIdx])
		dps = dps[endIdx:]
	}

	return rollup
}

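// rollupTimeSeries computes and stores rollups for the given time series. For
// each series whose resolution has a target rollup resolution, it scans all
// data older than that resolution's threshold (derived from now), computes
// rollup datapoints at the target resolution, and writes them back via
// storeRollup. Queries are issued in limited batches so that memory usage
// stays within the bounds described by qmc.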
func (db *DB) rollupTimeSeries(
	ctx context.Context,
	timeSeriesList []timeSeriesResolutionInfo,
	now hlc.Timestamp,
	qmc QueryMemoryContext,
) error {
	thresholds := db.computeThresholds(now.WallTime)
	for _, timeSeries := range timeSeriesList {
		// Only process a rollup if this resolution has a target rollup resolution.
		targetResolution, hasRollup := timeSeries.Resolution.TargetRollupResolution()
		if !hasRollup {
			continue
		}

		// Query from the beginning of time up to the threshold for this resolution.
		threshold := thresholds[timeSeries.Resolution]

		// Create an initial targetSpan to find data for this series, starting at
		// the beginning of time and ending with the threshold time. Queries use
		// MaxSpanRequestKeys to limit the number of rows in memory at one time,
		// and will use ResumeSpan to issue additional queries if necessary.
		targetSpan := roachpb.Span{
			Key: MakeDataKey(timeSeries.Name, "" /* source */, timeSeries.Resolution, 0),
			EndKey: MakeDataKey(
				timeSeries.Name, "" /* source */, timeSeries.Resolution, threshold,
			),
		}

		// For each row, generate a rollup datapoint and add it to the correct
		// rollupData object.
		rollupDataMap := make(map[string]rollupData)

		account := qmc.workerMonitor.MakeBoundAccount()
		defer account.Close(ctx)

		childQmc := QueryMemoryContext{
			workerMonitor:      qmc.workerMonitor,
			resultAccount:      &account,
			QueryMemoryOptions: qmc.QueryMemoryOptions,
		}
		for querySpan := targetSpan; querySpan.Valid(); {
			var err error
			querySpan, err = db.queryAndComputeRollupsForSpan(
				ctx, timeSeries, querySpan, targetResolution, rollupDataMap, childQmc,
			)
			if err != nil {
				return err
			}
		}

		// Write the computed rollupDataMap to disk.
		var rollupDataSlice []rollupData
		for _, data := range rollupDataMap {
			rollupDataSlice = append(rollupDataSlice, data)
		}
		if err := db.storeRollup(ctx, targetResolution, rollupDataSlice); err != nil {
			return err
		}
	}
	return nil
}

// queryAndComputeRollupsForSpan queries time series data from the provided
// span, up to a maximum number of rows based on memory limits, computes
// rollup datapoints at the target resolution for the returned data, and
// accumulates them into rollupDataMap. It returns the resume span describing
// any portion of the span that has not yet been processed.
func (db *DB) queryAndComputeRollupsForSpan(
	ctx context.Context,
	series timeSeriesResolutionInfo,
	span roachpb.Span,
	targetResolution Resolution,
	rollupDataMap map[string]rollupData,
	qmc QueryMemoryContext,
) (roachpb.Span, error) {
	b := &kv.Batch{}
	b.Header.MaxSpanRequestKeys = qmc.GetMaxRollupSlabs(series.Resolution)
	b.Scan(span.Key, span.EndKey)
	if err := db.db.Run(ctx, b); err != nil {
		return roachpb.Span{}, err
	}

	// Convert result data into a map of source strings to ordered spans of
	// time series data.
	diskAccount := qmc.workerMonitor.MakeBoundAccount()
	defer diskAccount.Close(ctx)
	sourceSpans, err := convertKeysToSpans(ctx, b.Results[0].Rows, &diskAccount)
	if err != nil {
		return roachpb.Span{}, err
	}

	// For each source, iterate over the data span and compute
	// rollupDatapoints.
	for source, span := range sourceSpans {
		rollup, ok := rollupDataMap[source]
		if !ok {
			rollup = rollupData{
				name:   series.Name,
				source: source,
			}
			if err := qmc.resultAccount.Grow(ctx, int64(unsafe.Sizeof(rollup))); err != nil {
				return roachpb.Span{}, err
			}
		}

		var end timeSeriesSpanIterator
		for start := makeTimeSeriesSpanIterator(span); start.isValid(); start = end {
			rollupPeriod := targetResolution.SampleDuration()
			sampleTimestamp := normalizeToPeriod(start.timestamp, rollupPeriod)
			datapoint := rollupDatapoint{
				timestampNanos: sampleTimestamp,
				max:            -math.MaxFloat64,
				min:            math.MaxFloat64,
				first:          start.first(),
			}
			if err := qmc.resultAccount.Grow(ctx, int64(unsafe.Sizeof(datapoint))); err != nil {
				return roachpb.Span{}, err
			}
			for end = start; end.isValid() && normalizeToPeriod(end.timestamp, rollupPeriod) == sampleTimestamp; end.forward() {
				datapoint.last = end.last()
				datapoint.max = math.Max(datapoint.max, end.max())
				datapoint.min = math.Min(datapoint.min, end.min())

				// Chan et al. algorithm for computing parallel variance. This allows
				// the combination of two previously computed sample variances into a
				// variance for the combined sample; this is needed when further
				// downsampling previously downsampled variance values.
				if datapoint.count > 0 {
					datapoint.variance = computeParallelVariance(
						parallelVarianceArgs{
							count:    end.count(),
							average:  end.average(),
							variance: end.variance(),
						},
						parallelVarianceArgs{
							count:    datapoint.count,
							average:  datapoint.sum / float64(datapoint.count),
							variance: datapoint.variance,
						},
					)
				}

				datapoint.count += end.count()
				datapoint.sum += end.sum()
			}
			rollup.datapoints = append(rollup.datapoints, datapoint)
		}
		rollupDataMap[source] = rollup
	}
	return b.Results[0].ResumeSpanAsValue(), nil
}

// parallelVarianceArgs describes one previously summarized sample: its size,
// mean, and variance.
type parallelVarianceArgs struct {
	count    uint32
	average  float64
	variance float64
}

// computeParallelVariance computes the variance of the union of two
// previously summarized samples, using the Chan et al. algorithm for parallel
// variance. Combining two already-computed sample variances in this way is
// needed when further downsampling previously downsampled variance values.
// Note that it is exactly equivalent to the more widely used Welford's
// algorithm when either sample has a count of one.
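//
// As a worked example (values chosen purely for illustration), combining the
// sample {1, 3} (count 2, average 2, variance 1) with the sample {5}
// (count 1, average 5, variance 0) gives
//
//	(1*2 + 0*1 + (2-5)^2 * 2 * 1 / 3) / 3 = 8/3
//
// which matches the population variance of {1, 3, 5}.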
func computeParallelVariance(left, right parallelVarianceArgs) float64 {
	leftCount := float64(left.count)
	rightCount := float64(right.count)
	totalCount := leftCount + rightCount
	averageDelta := left.average - right.average
	leftSumOfSquareDeviations := left.variance * leftCount
	rightSumOfSquareDeviations := right.variance * rightCount
	totalSumOfSquareDeviations := leftSumOfSquareDeviations + rightSumOfSquareDeviations + (averageDelta*averageDelta)*rightCount*leftCount/totalCount
	return totalSumOfSquareDeviations / totalCount
}