go.fuchsia.dev/infra@v0.0.0-20240507153436-9b593402251b/cmd/autogardener/blamelist.go

go.fuchsia.dev/infra@v0.0.0-20240507153436-9b593402251b/cmd/autogardener/blamelist.go (about)

     1  // Copyright 2022 The Fuchsia Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style license that can be
     3  // found in the LICENSE file.
     4  
     5  package main
     6  
     7  import (
     8  	"context"
     9  	_ "embed"
    10  	"time"
    11  
    12  	"cloud.google.com/go/bigquery"
    13  	"go.fuchsia.dev/infra/functools"
    14  )
    15  
// nearbyTestResultsQuery holds the text of queries/nearby_test_results.sql,
// which the compiler embeds into the binary via the go:embed directive below.
// getNearbyTestResults passes it to runQuery as the query to run.
//
//go:embed queries/nearby_test_results.sql
var nearbyTestResultsQuery string
    18  
// nearbyTestResult is the row type that getNearbyTestResults asks runQuery to
// produce. Each value describes one result of a test on one builder.
//
// calculateBlamelistDistances groups these values by Builder, orders each
// group by CommitPosition, treats Failed as "this result exhibited the failure
// mode being investigated", and compares CommitPosition against each suspect
// commit's position.
//
// NOTE(review): the exact meaning of Failed and CommitPosition is determined
// by the SQL query, which is not part of this file - confirm against
// queries/nearby_test_results.sql.
type nearbyTestResult struct {
	// Absolute builder name, in the form "project/bucket/builder".
	Builder        string
	Failed         bool
	CommitPosition int
}
    25  
    26  func getNearbyTestResults(
    27  	ctx context.Context,
    28  	bqClient *bigquery.Client,
    29  	sig failureSignature,
    30  	windowEnd time.Time,
    31  ) ([]nearbyTestResult, error) {
    32  	return runQuery[nearbyTestResult](ctx, bqClient, nearbyTestResultsQuery,
    33  		map[string]any{
    34  			"test_id": sig.FailedTest,
    35  			// Use a smaller window to decrease the likelihood of treating
    36  			// failures from separate old breakages as the first failure of the
    37  			// current breakage.
    38  			"earliest_time": windowEnd.Add(-20 * time.Hour),
    39  			"latest_time":   windowEnd,
    40  			// TODO(olivernewman): don't hardcode this.
    41  			"repo": "turquoise-internal.googlesource.com/integration",
    42  		},
    43  	)
    44  }
    45  
    46  // calculateBlamelistDistances computes, for each builder with a certain failure
    47  // mode, the number of builds between each suspect commit and the first build
    48  // (within the time window used by the query) that had that failure mode.
    49  //
    50  // This is analogous to the manual process of "lining up" CI builder blamelists
    51  // to find a culprit.
    52  func calculateBlamelistDistances(results []nearbyTestResult, suspects []suspectCommit) error {
    53  	byBuilder := make(map[string][]nearbyTestResult)
    54  	for _, tr := range results {
    55  		byBuilder[tr.Builder] = append(byBuilder[tr.Builder], tr)
    56  	}
    57  	// Sort results in chronological order (earliest first).
    58  	for _, results := range byBuilder {
    59  		functools.SortBy(results, func(tr nearbyTestResult) int {
    60  			return tr.CommitPosition
    61  		})
    62  	}
    63  
    64  	// TODO(olivernewman): handle the case where a test has broken on separate
    65  	// occasions within the time window. It's not easy to distinguish this from
    66  	// flakiness, but we can make a best effort at least for high-frequency
    67  	// failure modes.
    68  	for builder, results := range byBuilder {
    69  		firstFailureIdx := -1
    70  		for i, result := range results {
    71  			if result.Failed {
    72  				firstFailureIdx = i
    73  				break
    74  			}
    75  		}
    76  		// Skip builders where the failure mode didn't occur at all.
    77  		if firstFailureIdx == -1 {
    78  			continue
    79  		}
    80  		for i, suspect := range suspects {
    81  			for buildIdx, result := range results {
    82  				if result.CommitPosition >= suspect.CommitPosition {
    83  					// TODO(olivernewman): Also take blamelist size into
    84  					// account. If we are X% confident that a culprit falls
    85  					// within a given blamelist of length N, then we're only
    86  					// X/N% confident in each member of the blamelist. So that
    87  					// confidence will increase as the blamelist size decreases.
    88  					dist := firstFailureIdx - buildIdx
    89  					suspects[i].BlamelistDistances[builder] = dist
    90  					break
    91  				}
    92  			}
    93  		}
    94  	}
    95  	return nil
    96  }
    97  
    98  // scoreBlamelistDistances computes a 0-100 likelihood score for a potential
    99  // culprit commit based on a list of CI builder first-failure blamelist
   100  // distances. It takes the amount of data points into account by incorporating a
   101  // uncertainty level.
   102  func scoreBlamelistDistances(distances []int) int {
   103  	if len(distances) == 0 {
   104  		return 0
   105  	}
   106  	var weightedDistances []int
   107  	for _, dist := range distances {
   108  		if dist < 0 {
   109  			// If the commit landed *after* the first failure that's an
   110  			// especially good indicator that it's unlikely to be the culprit,
   111  			// so apply a large multiplier to that data point (and negate it, so
   112  			// the weighted distances are all positive). We simply downweight
   113  			// the suspect instead of completely discarding it (by returning
   114  			// zero) because it's possible the first failure is a different
   115  			// failure mode (e.g. latent flakiness) than the failure mode we're
   116  			// trying to diagnose.
   117  			dist *= -10
   118  		}
   119  		weightedDistances = append(weightedDistances, dist)
   120  	}
   121  	avg := average(weightedDistances)
   122  
   123  	// Calculate an uncertainty level based on the number of builders that are
   124  	// providing data points. If a commit was in the first failure blamelist of
   125  	// only builder it may be a coincidence, whereas we get a stronger signal if
   126  	// it's in the first failure blamelist of multiple builders.
   127  	uncertainty := 12 - 4*(len(distances)-1)
   128  	if uncertainty < 0 {
   129  		uncertainty = 0
   130  	}
   131  
   132  	score := 100/(avg/2+1) - float64(uncertainty)
   133  	if score < 0 {
   134  		return 0
   135  	}
   136  	return int(score)
   137  }