go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/analysis/internal/testresults/stability/query_stability.go

     1  // Copyright 2024 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package stability implements the test stability analysis used by the
    16  // QueryStability RPC.
    17  package stability
    18  
    19  import (
    20  	"context"
    21  	"sort"
    22  	"time"
    23  
    24  	"cloud.google.com/go/spanner"
    25  
    26  	"go.chromium.org/luci/common/sync/parallel"
    27  	"go.chromium.org/luci/server/span"
    28  
    29  	spanutil "go.chromium.org/luci/analysis/internal/span"
    30  	"go.chromium.org/luci/analysis/internal/testresults"
    31  	"go.chromium.org/luci/analysis/pbutil"
    32  	pb "go.chromium.org/luci/analysis/proto/v1"
    33  )
    34  
    35  const (
    36  	// The maximum number of workers to run in parallel.
    37  	// Given 100 is the maximum number of test variants queried at once,
    38  	// it is desirable that maxWorkers * batchSize >= 100.
    39  	maxWorkers = 10
    40  
    41  	// The size of each batch (in test variants).
    42  	batchSize = 10
    43  )
    44  
    45  // QueryStabilityOptions specifies options for QueryStability().
    46  type QueryStabilityOptions struct {
    47  	// Project is the LUCI Project to query.
    48  	Project string
    49  	// SubRealms are the project-scoped realms (of the form "ci",
    50  	// NOT "chromium:ci") within the project to query.
    51  	SubRealms []string
    52  	// TestVariantPositions are the test variant positions to query.
    53  	TestVariantPositions []*pb.QueryTestVariantStabilityRequest_TestVariantPosition
    54  	// The test stability criteria to apply.
    55  	Criteria *pb.TestStabilityCriteria
    56  	// AsAtTime is the latest partition time to include in the results;
    57  	// outside of testing contexts, this should be the current time.
    58  	// QueryStability returns data for the 14-day period leading
    59  	// up to this time.
    60  	AsAtTime time.Time
    61  }
    62  
    63  // run represents all executions of a test variant in a particular
    64  // (lowest-level) ResultDB invocation.
    65  type run struct {
    66  	// Whether at least one non-skipped test result in the run was
    67  	// expected.
    68  	expected bool
    69  }
    70  
    71  // QueryStability queries the stability of nominated test variants.
    72  // Used to inform exoneration decisions.
    73  //
    74  // Must be called in a Spanner transactional context. The context must
    75  // support multiple reads (i.e. NOT spanner.Single()), as the request may
    76  // be batched over multiple reads.
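        //
        // Example usage (a hypothetical sketch; tvps and the criteria values are
        // illustrative only, and real callers obtain criteria from configuration):
        //
        //	txnCtx, cancel := span.ReadOnlyTransaction(ctx)
        //	defer cancel()
        //	analysis, err := QueryStability(txnCtx, QueryStabilityOptions{
        //		Project:              "chromium",
        //		SubRealms:            []string{"ci"},
        //		TestVariantPositions: tvps,
        //		Criteria: &pb.TestStabilityCriteria{
        //			FailureRate: &pb.TestStabilityCriteria_FailureRateCriteria{
        //				ConsecutiveFailureThreshold: 3,
        //				FailureThreshold:            6,
        //			},
        //			FlakeRate: &pb.TestStabilityCriteria_FlakeRateCriteria{
        //				MinWindow:          100,
        //				FlakeThreshold:     2,
        //				FlakeRateThreshold: 0.01,
        //			},
        //		},
        //		AsAtTime: time.Now(),
        //	})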
    77  func QueryStability(ctx context.Context, opts QueryStabilityOptions) ([]*pb.TestVariantStabilityAnalysis, error) {
    78  	batches := partitionQueryIntoBatches(opts.TestVariantPositions, batchSize)
    79  
    80  	err := parallel.WorkPool(maxWorkers, func(c chan<- func() error) {
    81  		for _, b := range batches {
    82  			// Assign batch to a local variable to ensure its current
    83  			// value is captured by function closures.
    84  			batch := b
    85  			c <- func() error {
    86  				var err error
    87  				batchOpts := opts
    88  				batchOpts.TestVariantPositions = batch.input
    89  				// queryStabilityShard ensures test variants appear
    90  				// in the output in the same order as they appear in the
    91  				// input.
    92  				batch.output, err = queryStabilityShard(ctx, batchOpts)
    93  				return err
    94  			}
    95  		}
    96  	})
    97  	if err != nil {
    98  		return nil, err
    99  	}
   100  
   101  	// The order of test variants in the output should be the
   102  	// same as in the input. Perform the inverse of what we did
   103  	// in batching.
   104  	analysis := make([]*pb.TestVariantStabilityAnalysis, 0, len(opts.TestVariantPositions))
   105  	for _, b := range batches {
   106  		analysis = append(analysis, b.output...)
   107  	}
   108  
   109  	return analysis, nil
   110  }
   111  
   112  type queryStabilityBatch struct {
   113  	input  []*pb.QueryTestVariantStabilityRequest_TestVariantPosition
   114  	output []*pb.TestVariantStabilityAnalysis
   115  }
   116  
   117  // partitionQueryIntoBatches partitions a list of test variant positions
   118  // into batches.
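        //
        // For example (illustrative): partitioning 25 test variant positions with
        // batchSize = 10 yields three batches of 10, 10 and 5 inputs respectively,
        // preserving the original order.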
   119  func partitionQueryIntoBatches(tvps []*pb.QueryTestVariantStabilityRequest_TestVariantPosition, batchSize int) []*queryStabilityBatch {
   120  	var batches []*queryStabilityBatch
   121  	batchInput := make([]*pb.QueryTestVariantStabilityRequest_TestVariantPosition, 0, batchSize)
   122  	for _, tvp := range tvps {
   123  		if len(batchInput) >= batchSize {
   124  			batches = append(batches, &queryStabilityBatch{
   125  				input: batchInput,
   126  			})
   127  			batchInput = make([]*pb.QueryTestVariantStabilityRequest_TestVariantPosition, 0, batchSize)
   128  		}
   129  		batchInput = append(batchInput, tvp)
   130  	}
   131  	if len(batchInput) > 0 {
   132  		batches = append(batches, &queryStabilityBatch{
   133  			input: batchInput,
   134  		})
   135  	}
   136  	return batches
   137  }
   138  
   139  // queryStabilityShard reads test stability statistics for test variants.
   140  // Must be called in a Spanner transactional context.
   141  func queryStabilityShard(ctx context.Context, opts QueryStabilityOptions) ([]*pb.TestVariantStabilityAnalysis, error) {
   142  	type changelist struct {
   143  		Host   string
   144  		Change int64
   145  	}
   146  	type testVariantPosition struct {
   147  		TestID              string
   148  		VariantHash         string
   149  		SourceRefHash       []byte
   150  		QuerySourcePosition int64
   151  		ExcludedChangelists []changelist
   152  	}
   153  
   154  	tvps := make([]testVariantPosition, 0, len(opts.TestVariantPositions))
   155  	for _, ptv := range opts.TestVariantPositions {
   156  		variantHash := ptv.VariantHash
   157  		if variantHash == "" {
   158  			variantHash = pbutil.VariantHash(ptv.Variant)
   159  		}
   160  
   161  		excludedCLs := make([]changelist, 0, len(ptv.Sources.Changelists))
   162  		for _, cl := range ptv.Sources.Changelists {
   163  			excludedCLs = append(excludedCLs, changelist{
   164  				Host:   testresults.CompressHost(cl.Host),
   165  				Change: cl.Change,
   166  			})
   167  		}
   168  
   169  		tvps = append(tvps, testVariantPosition{
   170  			TestID:              ptv.TestId,
   171  			VariantHash:         variantHash,
   172  			SourceRefHash:       pbutil.SourceRefHash(pbutil.SourceRefFromSources(ptv.Sources)),
   173  			QuerySourcePosition: pbutil.SourcePosition(ptv.Sources),
   174  			ExcludedChangelists: excludedCLs,
   175  		})
   176  	}
   177  
   178  	stmt := spanner.NewStatement(testStabilityQuery)
   179  	stmt.Params = map[string]any{
   180  		"project":              opts.Project,
   181  		"testVariantPositions": tvps,
   182  		"subRealms":            opts.SubRealms,
   183  		"asAtTime":             opts.AsAtTime,
   184  		"skip":                 int64(pb.TestResultStatus_SKIP),
   185  	}
   186  
   187  	results := make([]*pb.TestVariantStabilityAnalysis, 0, len(tvps))
   188  
   189  	index := 0
   190  	var b spanutil.Buffer
   191  	err := span.Query(ctx, stmt).Do(func(row *spanner.Row) error {
   192  		var (
   193  			testID, variantHash                               string
   194  			verdictsBefore, verdictsOnOrAfter                 []*sourceVerdict
   195  			sourcePositionBuckets                             []*sourcePositionBucket
   196  			runFlakyVerdictsBefore, runFlakyVerdictsOnOrAfter []*sourceVerdict
   197  		)
   198  
   199  		err := b.FromSpanner(
   200  			row,
   201  			&testID,
   202  			&variantHash,
   203  			&verdictsBefore, &verdictsOnOrAfter,
   204  			&sourcePositionBuckets,
   205  			&runFlakyVerdictsBefore, &runFlakyVerdictsOnOrAfter,
   206  		)
   207  		if err != nil {
   208  			return err
   209  		}
   210  
   211  		analysis := &pb.TestVariantStabilityAnalysis{}
   212  		if testID != tvps[index].TestID || variantHash != tvps[index].VariantHash {
   213  			// This should never happen, as the SQL statement is designed
   214  			// to return results in the same order as test variants requested.
   215  			panic("results in incorrect order")
   216  		}
   217  		sourcePosition := tvps[index].QuerySourcePosition
   218  
   219  		analysis.TestId = testID
   220  		analysis.Variant = opts.TestVariantPositions[index].Variant
   221  		analysis.VariantHash = opts.TestVariantPositions[index].VariantHash
   222  		analysis.FailureRate = applyFailureRateCriteria(verdictsBefore, verdictsOnOrAfter, sourcePosition, opts.Criteria.FailureRate)
   223  		analysis.FlakeRate = applyFlakeRateCriteria(sourcePositionBuckets, sourcePosition, runFlakyVerdictsBefore, runFlakyVerdictsOnOrAfter, opts.Criteria.FlakeRate)
   224  		results = append(results, analysis)
   225  		index++
   226  		return nil
   227  	})
   228  	if err != nil {
   229  		return nil, err
   230  	}
   231  	return results, nil
   232  }
   233  
   234  // truncateSourceVerdicts truncates verdicts such that the
   235  // total runs in the truncated slice is no greater than maxRuns.
   236  //
   237  // If the total number of runs in verdicts is already less
   238  // than or equal to maxRuns, no truncation occurs.
   239  //
   240  // As individual source verdicts can represent more than one
   241  // run, truncation may occur inside a source verdict (dropping
   242  // some of its runs) to achieve a total of maxRuns in
   243  // the returned slice.
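        //
        // For example (illustrative): given verdicts with 8 and 4 runs
        // respectively, truncating to maxRuns = 10 returns the first verdict
        // unchanged and the second truncated to 2 runs.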
   244  func truncateSourceVerdicts(verdicts []*sourceVerdict, maxRuns int) []*sourceVerdict {
   245  	var runs int
   246  	var result []*sourceVerdict
   247  	for _, verdict := range verdicts {
   248  		if runs >= maxRuns {
   249  			break
   250  		}
   251  
   252  		remainingRuns := maxRuns - runs
   253  		truncatedVerdict := truncateSourceVerdict(verdict, remainingRuns)
   254  		runs += int(truncatedVerdict.ExpectedRuns + truncatedVerdict.UnexpectedRuns)
   255  		result = append(result, truncatedVerdict)
   256  	}
   257  	return result
   258  }
   259  
   260  // truncateSourceVerdict truncates a verdict such that its
   261  // total runs is no greater than maxRuns.
   262  //
   263  // If multiple runs are to be removed from a verdict, the dropped
   264  // runs are balanced between the unexpected and expected runs
   265  // proportionately.
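        //
        // For example (illustrative): truncating a verdict with 3 unexpected and
        // 5 expected runs to maxRuns = 4 gives excessRuns = 4;
        // unexpectedRunsToRemove = 4*3/8 = 1 and expectedRunsToRemove = 3,
        // leaving 2 unexpected and 2 expected runs.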
   266  func truncateSourceVerdict(verdict *sourceVerdict, maxRuns int) *sourceVerdict {
   267  	// Copy the verdict as we may drop some of its runs.
   268  	vc := *verdict
   269  
   270  	excessRuns := (vc.ExpectedRuns + vc.UnexpectedRuns) - int64(maxRuns)
   271  	if excessRuns > 0 {
   272  		// Fairly share the runs to be removed between expected
   273  		// and unexpected runs. Round towards removing more
   274  		// expected runs than unexpected runs.
   275  		unexpectedRunsToRemove := excessRuns * vc.UnexpectedRuns / (vc.ExpectedRuns + vc.UnexpectedRuns)
   276  		expectedRunsToRemove := excessRuns - unexpectedRunsToRemove
   277  
   278  		vc.UnexpectedRuns -= unexpectedRunsToRemove
   279  		vc.ExpectedRuns -= expectedRunsToRemove
   280  	}
   281  	return &vc
   282  }
   283  
   284  // flattenSourceVerdictsToRuns transforms a list of source verdicts
   285  // to a sequence of runs.
   286  func flattenSourceVerdictsToRuns(verdicts []*sourceVerdict) []run {
   287  	var result []run
   288  	for _, verdict := range verdicts {
   289  		result = append(result, flattenSourceVerdictToRuns(verdict)...)
   290  	}
   291  	return result
   292  }
   293  
   294  // flattenSourceVerdictToRuns transforms a source verdict
   295  // into a sequence of runs. As the order of runs within a source verdict
   296  // is not known, they are put in an arbitrary (fair) order.
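        //
        // For example (illustrative): a verdict with 2 unexpected (U) and
        // 3 expected (E) runs flattens to the sequence U, E, E, U, E.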
   297  func flattenSourceVerdictToRuns(verdict *sourceVerdict) []run {
   298  	var result []run
   299  	unexpectedRuns := verdict.UnexpectedRuns
   300  	totalRuns := verdict.UnexpectedRuns + verdict.ExpectedRuns
   301  
   302  	var unexpectedOutput int64
   303  	var totalOutput int64
   304  	for totalOutput < totalRuns {
   305  		var expected bool
   306  
   307  		remainingUnexpectedRuns := (unexpectedRuns - unexpectedOutput)
   308  		remainingRuns := (totalRuns - totalOutput)
   309  
   310  		// What percentage of remaining runs are unexpected?
   311  		//
   312  		// In case of only unexpected runs, this will be 100.
   313  		// In case of only expected runs, this will be 0.
   314  		// Otherwise, it should be somewhere in the middle.
   315  		//
   316  		// Invariant: 0 <= remainingUnexpectedPercent <= 100.
   317  		remainingUnexpectedPercent := (remainingUnexpectedRuns * 100) / remainingRuns
   318  
   319  		// If we output an expected run now, what percentage
   320  		// of the runs output so far will be unexpected?
   321  		//
   322  		// Invariant: 0 <= unexpectedPercentIfOutputExpected < 100.
   323  		unexpectedPercentIfOutputExpected := (unexpectedOutput * 100) / (totalOutput + 1)
   324  
   325  		// Maintain fairness by alternating between expected
   326  		// and unexpected runs to keep the proportion of
   327  		// unexpected runs output so far and the proportion
   328  		// of unexpected runs remaining about equal.
   329  		//
   330  		// In case of only expected runs remaining, remainingUnexpectedPercent
   331  		// will equal zero so we will only output expected runs.
   332  		//
   333  		// In case of only unexpected runs remaining, remainingUnexpectedPercent
   334  		// will equal 100. As unexpectedPercentIfOutputExpected is
   335  		// always less than 100, we will only output unexpected runs.
   336  		if unexpectedPercentIfOutputExpected >= remainingUnexpectedPercent {
   337  			expected = true // output expected run.
   338  		} else {
   339  			expected = false // output unexpected run.
   340  		}
   341  
   342  		result = append(result, run{expected: expected})
   343  		if !expected {
   344  			unexpectedOutput++
   345  		}
   346  		totalOutput++
   347  	}
   348  	return result
   349  }
   350  
   351  func reverseVerdicts(verdicts []*sourceVerdict) []*sourceVerdict {
   352  	reversed := make([]*sourceVerdict, 0, len(verdicts))
   353  	for i := len(verdicts) - 1; i >= 0; i-- {
   354  		reversed = append(reversed, verdicts[i])
   355  	}
   356  	return reversed
   357  }
   358  
   359  func reverseRuns(runs []run) []run {
   360  	reversed := make([]run, 0, len(runs))
   361  	for i := len(runs) - 1; i >= 0; i-- {
   362  		reversed = append(reversed, runs[i])
   363  	}
   364  	return reversed
   365  }
   366  
   367  func splitOn(verdicts []*sourceVerdict, sourcePosition int64) (on, other []*sourceVerdict) {
   368  	on = make([]*sourceVerdict, 0, len(verdicts))
   369  	other = make([]*sourceVerdict, 0, len(verdicts))
   370  
   371  	for _, v := range verdicts {
   372  		if v.SourcePosition == sourcePosition {
   373  			on = append(on, v)
   374  		} else {
   375  			other = append(other, v)
   376  		}
   377  	}
   378  	return on, other
   379  }
   380  
   381  // filterSourceVerdictsForFailureRateCriteria filters source verdicts so that
   382  // at most one test run is present for each verdict obtained in presubmit.
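        //
        // For example (illustrative): a presubmit verdict with 1 unexpected and
        // 3 expected runs is reduced to a single unexpected run; postsubmit
        // verdicts pass through unchanged.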
   383  func filterSourceVerdictsForFailureRateCriteria(svs []*sourceVerdict) []*sourceVerdict {
   384  	result := make([]*sourceVerdict, 0, len(svs))
   385  	for _, sv := range svs {
   386  		item := &sourceVerdict{}
   387  		*item = *sv
   388  
   389  		if item.ChangelistChange.Valid && (item.UnexpectedRuns+item.ExpectedRuns) > 1 {
   390  			// For presubmit data, keep only one run, preferentially the unexpected run.
   391  			if item.UnexpectedRuns >= 1 {
   392  				item.UnexpectedRuns = 1
   393  				item.ExpectedRuns = 0
   394  			} else {
   395  				item.UnexpectedRuns = 0
   396  				item.ExpectedRuns = 1
   397  			}
   398  		}
   399  
   400  		result = append(result, item)
   401  	}
   402  	return result
   403  }
   404  
   405  // applyFailureRateCriteria applies the failure rate criteria to a test variant
   406  // at a given source position.
   407  //
   408  // beforeExamples is a list of (up to) 10 source verdicts with source position
   409  // just prior to the queried source position. The list shall be ordered with
   410  // the source verdict nearest the queried source position appearing first.
   411  //
   412  // onOrAfterExamples is a list of (up to) 10 source verdicts with source position
   413  // equal to, or just after, the queried source position. The list shall be
   414  // ordered with the source verdict nearest the queried source position
   415  // appearing first.
   416  //
   417  // Both sets of examples should have had the following filtering applied:
   418  //   - At most one source verdict per distinct CL (for source verdicts
   419  //     testing CLs; no such restrictions apply to postsubmit data).
   420  //   - Source verdicts must not be for the same CL as is being considered for
   421  //     exoneration (if any).
   422  //   - Source verdicts must not be for CLs authored by automation.
   423  //
   424  // criteria defines the failure rate thresholds to apply.
   425  func applyFailureRateCriteria(beforeExamples, onOrAfterExamples []*sourceVerdict, sourcePosition int64, criteria *pb.TestStabilityCriteria_FailureRateCriteria) *pb.TestVariantStabilityAnalysis_FailureRate {
   426  	// Limit source verdicts from presubmit to contributing at most 1 run each.
   427  	// This is to prevent a single repeatedly retried bad CL from having an
   428  	// oversized influence on the exoneration decision.
   429  	beforeExamples = filterSourceVerdictsForFailureRateCriteria(beforeExamples)
   430  	onOrAfterExamples = filterSourceVerdictsForFailureRateCriteria(onOrAfterExamples)
   431  
   432  	onExamples, afterExamples := splitOn(onOrAfterExamples, sourcePosition)
   433  
   434  	onExamples = truncateSourceVerdicts(onExamples, 10)
   435  	onRuns := flattenSourceVerdictsToRuns(onExamples)
   436  
   437  	// The window size is 10, and the window will always contain any runs on the queried
   438  	// source position. Additional runs may come from source positions before or after.
   439  	// Note: For the passed examples, the first example is the one nearest to the query
   440  	// position, so truncating keeps only the examples closest to the queried source position.
   441  	beforeExamples = truncateSourceVerdicts(beforeExamples, 10-len(onRuns))
   442  	afterExamples = truncateSourceVerdicts(afterExamples, 10-len(onRuns))
   443  
   444  	beforeRuns := flattenSourceVerdictsToRuns(beforeExamples)
   445  	afterRuns := flattenSourceVerdictsToRuns(afterExamples)
   446  
   447  	consecutive := consecutiveUnexpectedCount(reverseRuns(afterRuns), onRuns, beforeRuns)
   448  
   449  	// Put runs in chronological order:
   450  	// 0                 ....          len(runs)-1
   451  	// <--- more recent  <query position>  less recent --->
   452  	//
   453  	// We need to reverse afterRuns as it is sorted with the first element
   454  	// closest to the query position.
   455  	runs := append(append(reverseRuns(afterRuns), onRuns...), beforeRuns...)
   456  	maxFailuresInWindow := unexpectedRunsInWindow(runs, 10)
   457  
   458  	// Also put source verdicts in chronological order.
   459  	// 0                 ....          len(examples)-1
   460  	// <--- more recent  <query position>  less recent --->
   461  	examples := append(append(reverseVerdicts(afterExamples), onExamples...), beforeExamples...)
   462  
   463  	return &pb.TestVariantStabilityAnalysis_FailureRate{
   464  		IsMet: (consecutive >= int(criteria.ConsecutiveFailureThreshold) ||
   465  			maxFailuresInWindow >= int(criteria.FailureThreshold)),
   466  		UnexpectedTestRuns:            int32(maxFailuresInWindow),
   467  		ConsecutiveUnexpectedTestRuns: int32(consecutive),
   468  		RecentVerdicts:                toPBFailureRateRecentVerdict(examples),
   469  	}
   470  }
   471  
   472  // unexpectedRunsInWindow considers all sliding windows of size
   473  // windowSize over the slice runs, and returns the maximum number
   474  // of unexpected test runs in any such window.
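        //
        // For example (illustrative): for runs U U E U E E (U = unexpected,
        // E = expected) and windowSize = 3, the four windows contain 2, 2, 1
        // and 1 unexpected runs respectively, so the method returns 2.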
   475  func unexpectedRunsInWindow(runs []run, windowSize int) int {
   476  	// If the number of runs is less than the window size, consider
   477  	// the runs that remain as a single window.
   478  	if len(runs) < windowSize {
   479  		windowSize = len(runs)
   480  	}
   481  
   482  	unexpectedCount := 0
   483  	for i := 0; i < windowSize; i++ {
   484  		if !runs[i].expected {
   485  			unexpectedCount++
   486  		}
   487  	}
   488  	// Now unexpectedCount = COUNT_UNEXPECTED(runs[0:windowSize])
   489  
   490  	maxFailuresInWindow := unexpectedCount
   491  	for i := 1; i+windowSize-1 < len(runs); i++ {
   492  		// Slide the window one position.
   493  		if !runs[i-1].expected {
   494  			unexpectedCount--
   495  		}
   496  		if !runs[i+windowSize-1].expected {
   497  			unexpectedCount++
   498  		}
   499  		// Now unexpectedCount = COUNT_UNEXPECTED(runs[i:i+windowSize])
   500  
   501  		if unexpectedCount > maxFailuresInWindow {
   502  			maxFailuresInWindow = unexpectedCount
   503  		}
   504  	}
   505  	return maxFailuresInWindow
   506  }
   507  
   508  // consecutiveUnexpectedCount returns the number of consecutive unexpected runs
   509  // present from the start or end of a series of runs, where those consecutive
   510  // unexpected runs also include the query position.
   511  //
   512  // If the consecutive failures do not pass the query position, this method
   513  // returns 0.
   514  // If there are consecutive failures but none touch the start or end
   515  // of the runs slice, this method also returns 0.
   516  //
   517  // Example:
   518  //
   519  //	[U U U] [U U] [U E U E E U U E] = afterRuns, onRuns, beforeRuns
   520  //
   521  // The method returns 6, because there is a chain of 6 consecutive
   522  // failures starting at the front of the runs slice, and that chain
   523  // passes the query position. It also continues for one run
   524  // into the 'beforeRuns' slice.
   525  //
   526  // The following conventions apply to arguments:
   527  //   - the most recent runs (later source position) appear first
   528  //     in all runs slices.
   529  //   - onRuns represents runs exactly on the queried source position,
   530  //   - afterRuns represents runs with a source position greater than
   531  //     the queried source position
   532  //   - beforeRuns represents runs with a source position less than
   533  //     the queried source position
   534  func consecutiveUnexpectedCount(afterRuns, onRuns, beforeRuns []run) int {
   535  	for _, r := range onRuns {
   536  		if r.expected {
   537  			// There is an expected run on the queried source position.
   538  			// The failures cannot be consecutive up to and including
   539  			// the source position from either side.
   540  			return 0
   541  		}
   542  	}
   543  
   544  	// The number of consecutive unexpected runs in afterRuns, counted
   545  	// from the side nearest the queried source position.
   546  	afterRunsConsecutive := len(afterRuns)
   547  	for i := len(afterRuns) - 1; i >= 0; i-- {
   548  		if afterRuns[i].expected {
   549  			// We encountered an expected run.
   550  			afterRunsConsecutive = (len(afterRuns) - 1) - i
   551  			break
   552  		}
   553  	}
   554  
   555  	// The number of consecutive unexpected runs in beforeRuns, counted
   556  	// from the side nearest the queried source position.
   557  	beforeRunsConsecutive := len(beforeRuns)
   558  	for i := 0; i < len(beforeRuns); i++ {
   559  		if beforeRuns[i].expected {
   560  			// We encountered an expected run.
   561  			beforeRunsConsecutive = i
   562  			break
   563  		}
   564  	}
   565  
   566  	if len(afterRuns) == afterRunsConsecutive {
   567  		// All runs after the source position are unexpected.
   568  		// Additionally, we know all runs on the source position are unexpected.
   569  		return len(afterRuns) + len(onRuns) + beforeRunsConsecutive
   570  	}
   571  	if len(beforeRuns) == beforeRunsConsecutive {
   572  		// All runs before the source position are unexpected.
   573  		// Additionally, we know all runs on the source position are unexpected.
   574  		return len(beforeRuns) + len(onRuns) + afterRunsConsecutive
   575  	}
   576  	return 0
   577  }
   578  
   579  // applyFlakeRateCriteria applies the flake rate criteria to a test variant
   580  // at a given source position.
   581  //
   582  // buckets should be in ascending order by source position.
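        //
        // For example (illustrative): with criteria of minWindow = 100,
        // flakeThreshold = 2 and flakeRateThreshold = 0.01, a window containing
        // 150 source verdicts of which 3 are run-flaky yields a flake rate of
        // 0.02, so IsMet is true.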
   583  func applyFlakeRateCriteria(buckets []*sourcePositionBucket, querySourcePosition int64, beforeExamples, onOrAfterExamples []*sourceVerdict, criteria *pb.TestStabilityCriteria_FlakeRateCriteria) *pb.TestVariantStabilityAnalysis_FlakeRate {
   584  	// Query the source position +/- 1 week.
   585  	window := queryBuckets(buckets, querySourcePosition, 7*24*time.Hour)
   586  
   587  	runFlaky, total := countVerdicts(window)
   588  	if total < int64(criteria.MinWindow) {
   589  		// If the sample size is not large enough, revert to querying the full
   590  		// 14 days of data. This exists to improve performance on infrequently
   591  		// run tests at the cost of some recency.
   592  		window = buckets
   593  		runFlaky, total = countVerdicts(window)
   594  	}
   595  
   596  	// Examples arrive sorted such that those closest to the queried source
   597  	// position are first.
   598  	// Flip and combine them so that the most recent (latest source position)
   599  	// are first.
   600  	allExamples := append(reverseVerdicts(onOrAfterExamples), beforeExamples...)
   601  
   602  	// Find examples from the window considered.
   603  	var examples []*sourceVerdict
   604  	var startPosition int64
   605  	var endPosition int64
   606  	if len(window) > 0 {
   607  		startPosition = window[0].StartSourcePosition
   608  		endPosition = window[len(window)-1].EndSourcePosition
   609  
   610  		for _, e := range allExamples {
   611  			if startPosition <= e.SourcePosition && e.SourcePosition <= endPosition {
   612  				examples = append(examples, e)
   613  			}
   614  		}
   615  
   616  		if len(examples) > 10 {
   617  			examples = examples[:10]
   618  		}
   619  	}
   620  
   621  	flakeRate := 0.0
   622  	if total > 0 {
   623  		flakeRate = float64(runFlaky) / float64(total)
   624  	}
   625  
   626  	return &pb.TestVariantStabilityAnalysis_FlakeRate{
   627  		IsMet:            runFlaky >= int64(criteria.FlakeThreshold) && flakeRate >= criteria.FlakeRateThreshold,
   628  		RunFlakyVerdicts: int32(runFlaky),
   629  		TotalVerdicts:    int32(total),
   630  		FlakeExamples:    toPBFlakeRateVerdictExample(examples),
   631  		StartPosition:    startPosition,
   632  		EndPosition:      endPosition,
   633  	}
   634  }
   635  
   636  func countVerdicts(buckets []*sourcePositionBucket) (runFlaky, total int64) {
   637  	for _, b := range buckets {
   638  		runFlaky += b.RunFlakyVerdicts
   639  		total += b.TotalVerdicts
   640  	}
   641  	return runFlaky, total
   642  }
   643  
   644  // queryBuckets returns the slice of buckets that corresponds to
   645  // querying a time interval `interval` before and after a specified
   646  // source position, `querySourcePosition`.
   647  //
   648  // For example, query position 123456 +/- 1 week.
   649  //
   650  // To convert a time interval to a range of source positions,
   651  // this method computes an approximate time corresponding to each
   652  // source position. The time assigned to a source position
   653  // is the earliest partition time that source position (or a
   654  // later position) has been observed.
   655  //
   656  // buckets should be in ascending order by source position.
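        //
        // For example (illustrative): if the bucket containing the queried
        // source position was first observed at partition time T, the returned
        // slice spans all buckets whose (monotonically adjusted) earliest
        // partition time lies within [T-interval, T+interval].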
   657  func queryBuckets(buckets []*sourcePositionBucket, querySourcePosition int64, interval time.Duration) []*sourcePositionBucket {
   658  	if len(buckets) == 0 {
   659  		return buckets
   660  	}
   661  
   662  	// earliestSourcePositionAvailability[i] represents the earliest partition time
   663  	// observed for a test result with a source position at or after buckets[i].StartSourcePosition.
   664  	//
   665  	// Intuitively, it represents a best guess estimate about the time a source position
   666  	// was first available in the repository. It is also consistent in the sense that
   667  	// an earlier source position will never have a later time associated with it than
   668  	// a later source position.
   669  	earliestSourcePositionAvailability := make([]time.Time, len(buckets))
   670  	earliestTime := time.Date(9999, 12, 31, 23, 59, 59, 0, time.UTC)
   671  
   672  	// Start at the more recent (larger) source position and work backwards
   673  	// to the past.
   674  	for i := len(buckets) - 1; i >= 0; i-- {
   675  		b := buckets[i]
   676  		if b.EndSourcePosition < b.StartSourcePosition {
   677  			panic("end source position should be equal to or after start source position")
   678  		}
   679  		if i < len(buckets)-1 && !(buckets[i].EndSourcePosition < buckets[i+1].StartSourcePosition) {
   680  			panic("end source position of bucket should be before start source position of next bucket")
   681  		}
   682  		// Regardless of the earliest time a source position in this bucket was observed,
   683  		// if a bucket with a later source position had an earlier time, we should use that.
   684  		// This is because the later source positions build upon earlier source positions,
   685  		// so the earlier source positions must have been available at that time too.
   686  		if b.EarliestPartitionTime.Before(earliestTime) {
   687  			earliestTime = b.EarliestPartitionTime
   688  		}
   689  		earliestSourcePositionAvailability[i] = earliestTime
   690  	}
   691  
   692  	// Find the nearest bucket that includes, or is prior to, the queried source position.
   693  	queryIndex := 0
   694  	for i, b := range buckets {
   695  		if b.StartSourcePosition > querySourcePosition {
   696  			break
   697  		}
   698  		queryIndex = i
   699  	}
   700  
   701  	// The time approximately corresponding to the queried source position.
   702  	queryTime := earliestSourcePositionAvailability[queryIndex]
   703  	queryStartTime := queryTime.Add(-interval)
   704  	queryEndTime := queryTime.Add(interval)
   705  
   706  	startIndex := len(buckets)
   707  	for i, time := range earliestSourcePositionAvailability {
   708  		if !time.Before(queryStartTime) { // time >= queryStartTime
   709  			startIndex = i
   710  			break
   711  		}
   712  	}
   713  
   714  	endIndex := 0
   715  	for i := len(earliestSourcePositionAvailability) - 1; i >= 0; i-- {
   716  		time := earliestSourcePositionAvailability[i]
   717  		if !time.After(queryEndTime) { // time <= queryEndTime
   718  			endIndex = i
   719  			break
   720  		}
   721  	}
   722  
   723  	return buckets[startIndex : endIndex+1]
   724  }
   725  
   726  // sourceVerdict is used to store an example source verdict returned by
   727  // a Spanner query.
   728  type sourceVerdict struct {
   729  	SourcePosition int64
   730  	// Verdicts considered by the analysis have at most one CL tested,
   731  	// which is set below (if present).
   732  	ChangelistHost        spanner.NullString
   733  	ChangelistChange      spanner.NullInt64
   734  	ChangelistPatchset    spanner.NullInt64
   735  	ChangelistOwnerKind   spanner.NullString
   736  	IngestedInvocationIds []string
   737  	UnexpectedRuns        int64
   738  	ExpectedRuns          int64
   739  }
   740  
   741  func toPBFailureRateRecentVerdict(verdicts []*sourceVerdict) []*pb.TestVariantStabilityAnalysis_FailureRate_RecentVerdict {
   742  	results := make([]*pb.TestVariantStabilityAnalysis_FailureRate_RecentVerdict, 0, len(verdicts))
   743  	for _, v := range verdicts {
   744  		var changelists []*pb.Changelist
   745  		if v.ChangelistHost.Valid {
   746  			changelists = append(changelists, &pb.Changelist{
   747  				Host:      testresults.DecompressHost(v.ChangelistHost.StringVal),
   748  				Change:    v.ChangelistChange.Int64,
   749  				Patchset:  int32(v.ChangelistPatchset.Int64),
   750  				OwnerKind: testresults.OwnerKindFromDB(v.ChangelistOwnerKind.StringVal),
   751  			})
   752  		}
   753  
   754  		results = append(results, &pb.TestVariantStabilityAnalysis_FailureRate_RecentVerdict{
   755  			Position:       v.SourcePosition,
   756  			Changelists:    changelists,
   757  			Invocations:    sortStrings(v.IngestedInvocationIds),
   758  			UnexpectedRuns: int32(v.UnexpectedRuns),
   759  			TotalRuns:      int32(v.ExpectedRuns + v.UnexpectedRuns),
   760  		})
   761  	}
   762  	return results
   763  }
   764  
   765  func sortStrings(ids []string) []string {
   766  	idsCopy := make([]string, len(ids))
   767  	copy(idsCopy, ids)
   768  	sort.Strings(idsCopy)
   769  	return idsCopy
   770  }
   771  
   772  func toPBFlakeRateVerdictExample(verdicts []*sourceVerdict) []*pb.TestVariantStabilityAnalysis_FlakeRate_VerdictExample {
   773  	results := make([]*pb.TestVariantStabilityAnalysis_FlakeRate_VerdictExample, 0, len(verdicts))
   774  	for _, v := range verdicts {
   775  		var changelists []*pb.Changelist
   776  		if v.ChangelistHost.Valid {
   777  			changelists = append(changelists, &pb.Changelist{
   778  				Host:      testresults.DecompressHost(v.ChangelistHost.StringVal),
   779  				Change:    v.ChangelistChange.Int64,
   780  				Patchset:  int32(v.ChangelistPatchset.Int64),
   781  				OwnerKind: testresults.OwnerKindFromDB(v.ChangelistOwnerKind.StringVal),
   782  			})
   783  		}
   784  
   785  		results = append(results, &pb.TestVariantStabilityAnalysis_FlakeRate_VerdictExample{
   786  			Position:    v.SourcePosition,
   787  			Changelists: changelists,
   788  			Invocations: sortStrings(v.IngestedInvocationIds),
   789  		})
   790  	}
   791  	return results
   792  }
   793  
   794  // sourcePositionBucket represents a range of source positions for
   795  // a given test variant.
   796  type sourcePositionBucket struct {
   797  	BucketKey int64
   798  	// Starting source position. Inclusive.
   799  	StartSourcePosition int64
   800  	// Ending source position. Inclusive.
   801  	EndSourcePosition int64
   802  	// The earliest partition time of a test result in the bucket.
   803  	EarliestPartitionTime time.Time
   804  	// The total number of source verdicts in the bucket.
   805  	TotalVerdicts int64
   806  	// The total number of run-flaky source verdicts in the bucket.
   807  	RunFlakyVerdicts int64
   808  }
   809  
   810  var testStabilityQuery = `
   811  WITH test_variant_verdicts AS (
   812  	SELECT
   813  		Index,
   814  		tv.TestId,
   815  		tv.VariantHash,
   816  		tv.QuerySourcePosition,
   817  		ARRAY(
   818  			-- Filter verdicts to at most one per changelist under test.
   819  			-- Don't filter verdicts without an unsubmitted changelist
   820  			-- under test (i.e. postsubmit data).
   821  			SELECT
   822  				ANY_VALUE(
   823  					STRUCT(
   824  						SourcePosition,
   825  						ChangelistHost,
   826  						ChangelistChange,
   827  						ChangelistPatchset,
   828  						ChangelistOwnerKind,
   829  						IngestedInvocationIds,
   830  						MaxPartitionTime,
   831  						MinPartitionTime,
   832  						UnexpectedRuns,
   833  						ExpectedRuns
   834  					)
   835  					-- For any CL, prefer the verdict that is flaky.
   836  					-- Then prefer the verdict that is closest to the queried
   837  					-- source position.
   838  					HAVING MIN ABS(SourcePosition - tv.QuerySourcePosition) + IF(UnexpectedRuns > 0 AND ExpectedRuns > 0, -1000 * 1000 * 1000, 0)
   839  				) AS Verdict,
   840  			FROM (
   841  				-- Flatten test runs to source verdicts.
   842  				SELECT
   843  					SourcePosition,
   844  					ChangelistHost,
   845  					ChangelistChange,
   846  					ChangelistPatchset,
   847  					ANY_VALUE(ChangelistOwnerKind) AS ChangelistOwnerKind,
   848  					ANY_VALUE(HasDirtySources) AS HasDirtySources,
   849  					ANY_VALUE(IF(HasDirtySources, IngestedInvocationId, NULL)) AS DirtySourcesUniqifier,
   850  					ARRAY_AGG(DISTINCT IngestedInvocationId) as IngestedInvocationIds,
   851  					MAX(PartitionTime) as MaxPartitionTime,
   852  					MIN(PartitionTime) as MinPartitionTime,
   853  					COUNTIF(UnexpectedRun) as UnexpectedRuns,
   854  					COUNTIF(NOT UnexpectedRun) as ExpectedRuns,
   855  				FROM (
   856  					-- Flatten test results to test runs.
   857  					SELECT
   858  						PartitionTime,
   859  						IngestedInvocationId,
   860  						RunIndex,
   861  						LOGICAL_AND(COALESCE(IsUnexpected, FALSE)) AS UnexpectedRun,
   862  						ANY_VALUE(SourcePosition) AS SourcePosition,
   863  						ANY_VALUE(ChangelistHosts)[SAFE_OFFSET(0)] AS ChangelistHost,
   864  						ANY_VALUE(ChangelistChanges)[SAFE_OFFSET(0)] AS ChangelistChange,
   865  						ANY_VALUE(ChangelistPatchsets)[SAFE_OFFSET(0)] AS ChangelistPatchset,
   866  						ANY_VALUE(ChangelistOwnerKinds)[SAFE_OFFSET(0)] AS ChangelistOwnerKind,
   867  						ANY_VALUE(HasDirtySources) AS HasDirtySources
   868  					FROM TestResults
   869  					WHERE Project = @project
   870  						AND PartitionTime >= TIMESTAMP_SUB(@asAtTime, INTERVAL 14 DAY)
   871  						AND PartitionTime < @asAtTime
   872  						AND TestId = tv.TestId
   873  						AND VariantHash = tv.VariantHash
   874  						AND SourceRefHash = tv.SourceRefHash
   875  						AND SubRealm IN UNNEST(@subRealms)
   876  						-- Exclude skipped results.
   877  						AND Status <> @skip
   878  						AND (
   879  							(
   880  								-- Either there must be no CL tested by this result.
   881  								ChangelistHosts IS NULL OR ARRAY_LENGTH(ChangelistHosts) = 0
   882  							)
   883  							OR (
   884  								-- Or there must be exactly one CL tested.
   885  								ARRAY_LENGTH(ChangelistHosts) = 1
   886  
   887  								-- And that CL may not be authored by automation.
   888  								-- Automatic uprev automation will happily upload CL after CL
   889  								-- with essentially the same change that breaks the same test.
   890  								-- This adds more noise than signal.
   891  								AND ChangelistOwnerKinds[SAFE_OFFSET(0)] <> 'A'
   892  
   893  								-- And that CL must not be one of changelists which we
   894  								-- are considering exonerating as a result of this RPC.
   895  								AND STRUCT(ChangelistHosts[SAFE_OFFSET(0)] as Host, ChangelistChanges[SAFE_OFFSET(0)] AS Change)
   896  									NOT IN UNNEST(tv.ExcludedChangelists)
   897  							)
   898  						)
   899  					GROUP BY PartitionTime, IngestedInvocationId, RunIndex
   900  				)
   901  				GROUP BY
   902  					-- Base source position tested
   903  					SourcePosition,
   904  					-- Patchset applied on top of base sources (if any)
   905  					ChangelistHost, ChangelistChange, ChangelistPatchset,
   906  					-- If sources are marked dirty, then sources must be treated as unique
   907  					-- per invocation (i.e. source verdict == test verdict).
   908  					IF(HasDirtySources, IngestedInvocationId, NULL)
   909  			)
   910  			-- Deduplicate to at most one per CL. For source verdicts not related
   911  			-- to a CL, no deduplication shall occur.
   912  			GROUP BY
   913  				-- Changelist applied on top of base sources (if any), excluding patchset number.
   914  				ChangelistHost, ChangelistChange,
   915  				-- If there is no CL under test, then the original source verdicts
   916  				-- may be kept.
   917  				IF(ChangelistHost IS NULL, SourcePosition, NULL),
   918  				IF(ChangelistHost IS NULL, DirtySourcesUniqifier, NULL)
   919  			ORDER BY Verdict.SourcePosition DESC, Verdict.MaxPartitionTime DESC
   920  		) AS Verdicts,
   921  	FROM UNNEST(@testVariantPositions) tv WITH OFFSET Index
   922  )
   923  
   924  SELECT
   925  	TestId,
   926  	VariantHash,
   927  	ARRAY(
   928  		SELECT AS STRUCT
   929  			SourcePosition,
   930  			ChangelistHost,
   931  			ChangelistChange,
   932  			ChangelistPatchset,
   933  			ChangelistOwnerKind,
   934  			IngestedInvocationIds,
   935  			UnexpectedRuns,
   936  			ExpectedRuns
   937  		FROM UNNEST(Verdicts) v
   938  		WHERE v.SourcePosition < QuerySourcePosition
   939  		ORDER BY SourcePosition DESC, MaxPartitionTime DESC
   940  		-- The actual criterion is 10 runs, not 10 source verdicts,
   941  		-- but this is hard to implement in SQL, so we do post-filtering
   942  		-- in the app.
   943  		LIMIT 10
   944  	) as FailureRateVerdictsBefore,
   945  	ARRAY(
   946  		SELECT AS STRUCT
   947  			SourcePosition,
   948  			ChangelistHost,
   949  			ChangelistChange,
   950  			ChangelistPatchset,
   951  			ChangelistOwnerKind,
   952  			IngestedInvocationIds,
   953  			UnexpectedRuns,
   954  			ExpectedRuns
   955  		FROM UNNEST(Verdicts) v
   956  		WHERE v.SourcePosition >= QuerySourcePosition
   957  		ORDER BY SourcePosition ASC, MaxPartitionTime DESC
   958  		-- The actual criterion is 10 runs, not 10 source verdicts,
   959  		-- but this is hard to implement in SQL, so we do post-filtering
   960  		-- in the app.
   961  		LIMIT 10
   962  	) as FailureRateVerdictsOnOrAfter,
   963  	ARRAY(
   964  		-- The design calls for us to:
   965  		-- 1. convert the query position to a partition time,
   966  		-- 2. calculate a window +/- 7 days from that time,
   967  		-- 3. convert that time window back to source position range
   968  		-- 4. query that source position range and count the number of
   969  		--    (and proportion of) flaky verdicts in the range.
   970  		--
   971  		-- As Spanner does not have analytic functions, steps 1 and 3 are
   972  		-- hard to do in SQL.
   973  		--
   974  		-- Returning all verdicts to the backend to run the analysis there
   975  		-- is also not viable: each test variant may have up to ~10,000
   976  		-- source verdicts per two week period. At 100 bytes per verdict,
   977  		-- this would imply a transfer of around 1 MB per test variant
   978  		-- (or 100 MB in total for 100 test variants) to the backend.
   979  		-- This is too much.
   980  		--
   981  		-- Therefore, we use an approximate implementation.
   982  		-- We partition the source verdicts into 100 source position ranges,
   983  		-- maintaining the earliest partition time for each. This allows
   984  		-- steps 1-3 to be computed by AppEngine after the query returns.
   985  		--
   986  		-- Each bucket also maintains counts of flaky verdicts and total
   987  		-- source verdicts. Because of this, there is no need to perform
   988  		-- a follow-up query; once we determine the source position window
   989  		-- to query, we simply count the verdicts in the buckets we
   990  		-- determined to be part of that window.
   991  		SELECT AS STRUCT
   992  			CAST(FLOOR(Index * 100 / ARRAY_LENGTH(SourcePositions)) AS INT64) as BucketKey,
   993  			MIN(SourcePosition) as StartSourcePosition,
   994  			MAX(SourcePosition) as EndSourcePosition,
   995  			MIN(EarliestPartitionTime) as EarliestPartitionTime,
   996  			SUM(TotalVerdicts) as TotalVerdicts,
   997  			SUM(RunFlakyVerdicts) as RunFlakyVerdicts,
   998  		FROM (
   999  			SELECT
  1000  				ARRAY(
  1001  					-- Group source verdicts by source position first,
  1002  					-- so that buckets contain all of a source position
  1003  					-- or none of it.
  1004  					SELECT AS STRUCT
  1005  						SourcePosition,
  1006  						MIN(MinPartitionTime) as EarliestPartitionTime,
  1007  						COUNT(1) as TotalVerdicts,
  1008  						COUNTIF(UnexpectedRuns > 0 AND ExpectedRuns > 0) as RunFlakyVerdicts,
  1009  					FROM UNNEST(Verdicts) v
  1010  					GROUP BY SourcePosition
  1011  					ORDER BY SourcePosition
  1012  				) AS SourcePositions
  1013  		), UNNEST(SourcePositions) sp WITH OFFSET Index
  1014  		GROUP BY 1
  1015  		ORDER BY BucketKey
  1016  	) AS FlakeRateBuckets,
  1017  	-- We do not yet know exactly the range of source positions
  1018  	-- that we will end up using for the flake rate criteria.
  1019  	-- Get (up to) 10 examples of flakiness on each side of the
  1020  	-- query position, so that regardless of where the
  1021  	-- window falls, we will be able to get 10 examples.
  1022  	ARRAY(
  1023  		SELECT AS STRUCT
  1024  			SourcePosition,
  1025  			ChangelistHost,
  1026  			ChangelistChange,
  1027  			ChangelistPatchset,
  1028  			ChangelistOwnerKind,
  1029  			IngestedInvocationIds,
  1030  			UnexpectedRuns,
  1031  			ExpectedRuns
  1032  		FROM UNNEST(Verdicts) v WITH OFFSET Index
  1033  		WHERE UnexpectedRuns > 0 AND ExpectedRuns > 0
  1034  		  AND v.SourcePosition < QuerySourcePosition
  1035  		ORDER BY SourcePosition DESC, MaxPartitionTime DESC
  1036  		LIMIT 10
  1037  	) as FlakeExamplesBefore,
  1038  	ARRAY(
  1039  		SELECT AS STRUCT
  1040  			SourcePosition,
  1041  			ChangelistHost,
  1042  			ChangelistChange,
  1043  			ChangelistPatchset,
  1044  			ChangelistOwnerKind,
  1045  			IngestedInvocationIds,
  1046  			UnexpectedRuns,
  1047  			ExpectedRuns
  1048  		FROM UNNEST(Verdicts) v WITH OFFSET Index
  1049  		WHERE UnexpectedRuns > 0 AND ExpectedRuns > 0
  1050  		  AND v.SourcePosition >= QuerySourcePosition
  1051  		ORDER BY SourcePosition ASC, MaxPartitionTime DESC
  1052  		LIMIT 10
  1053  	) as FlakeExamplesOnOrAfter,
  1054  FROM test_variant_verdicts
  1055  ORDER BY Index
  1056  `