go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/analysis/internal/failureattributes/test_filtering.go

// Copyright 2023 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package failureattributes

import (
	"context"
	"time"

	"cloud.google.com/go/bigquery"

	"go.chromium.org/luci/common/errors"

	"go.chromium.org/luci/analysis/internal/bqutil"
	"go.chromium.org/luci/analysis/internal/config"
)

// NewFilteredRunsAttributionHandler initialises a new
// FilteredRunsAttributionHandler instance.
func NewFilteredRunsAttributionHandler(cloudProject string) *FilteredRunsAttributionHandler {
	return &FilteredRunsAttributionHandler{cloudProject: cloudProject}
}

// FilteredRunsAttributionHandler handles the attribute-filtered-test-runs cron job.
type FilteredRunsAttributionHandler struct {
	cloudProject string
}

// CronHandler handles the attribute-filtered-test-runs cron job.
func (h *FilteredRunsAttributionHandler) CronHandler(ctx context.Context) error {
	attributesClient, err := NewClient(ctx, h.cloudProject)
	if err != nil {
		return err
	}
	projectConfigs, err := config.Projects(ctx)
	if err != nil {
		return errors.Annotate(err, "obtain project configs").Err()
	}

	for _, project := range projectConfigs.Keys() {
		err := attributesClient.attributeFilteredRuns(ctx, project)
		if err != nil {
			return errors.Annotate(err, "attribute filtered test runs for %s", project).Err()
		}
	}

	return nil
}
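
// A minimal sketch of how this handler might be wired into a LUCI server's
// cron dispatcher via go.chromium.org/luci/server/cron. The cron job name
// and server setup are illustrative assumptions, not taken from this file:
//
//	h := failureattributes.NewFilteredRunsAttributionHandler("my-cloud-project")
//	cron.RegisterHandler("attribute-filtered-test-runs", h.CronHandler)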

// ingestionDelayThresholdDays is the maximum number of days within which a
// test filtering event (represented as a skipped test result) must be
// ingested after the event occurs (as determined by its partition date). If
// the test result is ingested more than `ingestionDelayThresholdDays` days
// later, it will not be attributed to older test failures.
//
// A reasonable threshold should achieve a balance between
//  1. tolerating ingestion delay (particularly due to ingestion failures and
//     retries), and
//  2. reducing the amount of data the attribution query needs to process.
//
// TODO: record the partition time of the earliest unprocessed test filtering
// event so we can widen the partition time window in the query as needed
// instead of relying on a hardcoded threshold.
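//
// For example, with chromeos's attributionWindowDays of 14 and this threshold
// of 3, the attribution query below scans test verdicts partitioned within
// the last 14 + 3 = 17 days.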
const ingestionDelayThresholdDays = 3

// attributionRule specifies how the filtered test runs are attributed to the
// test failures in a project.
type attributionRule struct {
	// attributionWindowDays is the maximum number of days after a test failure
	// occurs during which a filtered test run can be attributed to it, i.e. a
	// filtered test run can only be attributed to a test failure if
	// test_run_partition_time - test_failure_partition_time <=
	// attributionWindowDays days.
	attributionWindowDays int64
	// failureFilterSQL is a predicate on the results that defines which results
	// may have filtered test runs attributed to them.
	failureFilterSQL string
	// partitionSQL is the SQL snippet used to partition the test results.
	// Only results in the same partition as a test failure can be attributed to
	// the test failure.
	partitionSQL string
	// runFilterSQL is a predicate on the results that defines which results
	// represent a run that filtered out the test.
	runFilterSQL string
	// runCountSQL is an expression that defines the distinct runs to count.
	//
	// CAVEAT: the test_verdicts table may contain duplicated rows. To avoid
	// counting duplicates, do not count the number of rows directly.
	runCountSQL string
}

// projectAttributionRule specifies the attribution rule for each project.
// Attribution is only enabled when an attribution rule is defined for a
// project.
//
// TODO: move this to project configs with a more restrictive syntax and less
// dependency on the exact shape of the query.
var projectAttributionRule = map[string]attributionRule{
	// As defined in go/cros-test-filtering.
	"chromeos": {
		attributionWindowDays: 14,
		failureFilterSQL:      `NOT r.expected AND tv.change_verifier_run IS NULL`,
		// Ideally, the skipped test results should be attributed to the failures
		// that caused them, i.e. failures in the same milestone if there are
		// enough samples, or failures in the previous milestone otherwise. But
		// given that luci-analysis does not know which milestone's failures
		// activated the test filtering, it is hard to write a query that
		// attributes correctly.
		//
		// Instead, we attribute skipped test results to failures in any
		// milestone. This is only an issue when
		// 1. the test is failing in a previous milestone for a different root
		// cause, and
		// 2. the failures in the other milestone did not trigger test filtering
		// themselves (and are therefore not already marked as "triggered test
		// filtering").
		// Hopefully, this should be very rare, and we can adjust the attribution
		// logic further if this becomes an issue.
		partitionSQL: `tv.test_id, STRING(tv.variant.board)`,
		runFilterSQL: `tv.change_verifier_run IS NOT NULL AND r.expected AND r.status='SKIP' ` +
			`AND r.skip_reason='AUTOMATICALLY_DISABLED_FOR_FLAKINESS'`,
		runCountSQL: `r.name`,
	},
}

// attributeFilteredRuns runs a query to attribute the filtered test runs to
// the failures that caused the tests to be filtered out, then saves the
// results to the failure_attributes table.
func (s *Client) attributeFilteredRuns(ctx context.Context, project string) error {
	if err := s.ensureSchema(ctx); err != nil {
		return errors.Annotate(err, "ensure schema").Err()
	}

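	// Attribution is opt-in: projects without a configured rule are skipped.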
	rule, ok := projectAttributionRule[project]
	if !ok {
		return nil
	}

	q := s.bqClient.Query(`
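		-- For each eligible test failure, count the distinct later filtered
		-- runs that can be attributed to it, then upsert the counts into the
		-- failure_attributes table.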
		MERGE INTO internal.failure_attributes T
		USING (
			WITH post_finalization_attributes AS (
				SELECT
					tv.invocation.id AS verdict_invocation_id,
					r.name AS result_name,
					tv.partition_time AS partition_time,
					` + rule.failureFilterSQL + ` AS is_attribution_target,
					-- COUNT(DISTINCT IF(...)) cannot be used when the window declaration
					-- has ORDER BY. Gather the items in an array so we can count the
					-- distinct items in the array later.
					ARRAY_AGG(IF(
						` + rule.runFilterSQL + `,
						` + rule.runCountSQL + `,
						NULL
					)) OVER attribution_source AS attributed_filtered_runs,
				FROM internal.test_verdicts AS tv, UNNEST(results) AS r
				-- Postsubmit failures from more than @attributionWindowMs milliseconds
				-- ago will no longer have new skip results attributed to them. Add
				-- @delayThresholdMs milliseconds to account for test result ingestion
				-- delay (particularly due to ingestion failures and retries).
				WHERE
					tv.partition_time >= TIMESTAMP_SUB(
						CURRENT_TIMESTAMP(),
						INTERVAL @attributionWindowMs + @delayThresholdMs MILLISECOND
					)
					AND tv.project = @project
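				-- For each failure row, the attribution window spans rows in the
				-- same partition whose partition time is strictly later, by at
				-- most @attributionWindowMs milliseconds.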
				WINDOW attribution_source AS (
					PARTITION BY ` + rule.partitionSQL + `
					ORDER BY UNIX_MILLIS(tv.partition_time)
					RANGE BETWEEN 1 FOLLOWING AND @attributionWindowMs FOLLOWING
				)
			)
			SELECT
				@project AS project,
				"resultdb" AS test_result_system,
				verdict_invocation_id AS ingested_invocation_id,
				result_name AS test_result_id,
				ANY_VALUE(partition_time) AS partition_time,
				ANY_VALUE((SELECT COUNT(DISTINCT run) FROM UNNEST(attributed_filtered_runs) AS run)) AS attributed_filtered_run_count,
			FROM post_finalization_attributes
			WHERE is_attribution_target
			-- The test_verdicts table may contain duplicated rows. Use a GROUP BY
			-- to prevent the duplicated rows from causing the DML merge to fail.
			GROUP BY verdict_invocation_id, result_name
		) S
		ON S.partition_time = T.partition_time
			AND S.project = T.project
			AND S.test_result_system = T.test_result_system
			AND S.ingested_invocation_id = T.ingested_invocation_id
			AND S.test_result_id = T.test_result_id
		WHEN MATCHED
			AND S.attributed_filtered_run_count > T.attributed_filtered_run_count
		THEN
			UPDATE SET attributed_filtered_run_count = S.attributed_filtered_run_count
		WHEN NOT MATCHED THEN
			INSERT (project, test_result_system, ingested_invocation_id, test_result_id, partition_time, attributed_filtered_run_count)
			VALUES (
				S.project,
				S.test_result_system,
				S.ingested_invocation_id,
				S.test_result_id,
				S.partition_time,
				S.attributed_filtered_run_count
			);
	`)
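	// Convert the day-based settings to milliseconds to match the
	// UNIX_MILLIS-based window frame in the query.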
	q.Parameters = append(q.Parameters,
		bigquery.QueryParameter{Name: "project", Value: project},
		bigquery.QueryParameter{Name: "attributionWindowMs", Value: rule.attributionWindowDays * 24 * 60 * 60 * 1000},
		bigquery.QueryParameter{Name: "delayThresholdMs", Value: ingestionDelayThresholdDays * 24 * 60 * 60 * 1000},
	)

	job, err := q.Run(ctx)
	if err != nil {
		return err
	}

	waitCtx, cancel := context.WithTimeout(ctx, time.Minute*9)
	defer cancel()
	js, err := bqutil.WaitForJob(waitCtx, job)
	if err != nil {
		return errors.Annotate(err, "waiting for filtered test run attribution query to complete").Err()
	}
	if js.Err() != nil {
		return errors.Annotate(js.Err(), "filtered test run attribution query failed").Err()
	}
	return nil
}