go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/analysis/internal/failureattributes/test_filtering.go

// Copyright 2023 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package failureattributes

import (
	"context"
	"time"

	"cloud.google.com/go/bigquery"

	"go.chromium.org/luci/common/errors"

	"go.chromium.org/luci/analysis/internal/bqutil"
	"go.chromium.org/luci/analysis/internal/config"
)

// NewFilteredRunsAttributionHandler initialises a new
// FilteredRunsAttributionHandler instance.
func NewFilteredRunsAttributionHandler(cloudProject string) *FilteredRunsAttributionHandler {
	return &FilteredRunsAttributionHandler{cloudProject: cloudProject}
}

// FilteredRunsAttributionHandler handles the attribute-filtered-test-runs
// cron job.
type FilteredRunsAttributionHandler struct {
	cloudProject string
}

// CronHandler handles the attribute-filtered-test-runs cron job.
func (h *FilteredRunsAttributionHandler) CronHandler(ctx context.Context) error {
	attributesClient, err := NewClient(ctx, h.cloudProject)
	if err != nil {
		return err
	}
	projectConfigs, err := config.Projects(ctx)
	if err != nil {
		return errors.Annotate(err, "obtain project configs").Err()
	}

	for _, project := range projectConfigs.Keys() {
		err := attributesClient.attributeFilteredRuns(ctx, project)
		if err != nil {
			return errors.Annotate(err, "attribute filtered test runs for %s", project).Err()
		}
	}

	return nil
}

// ingestionDelayThresholdDays is the maximum number of days within which a
// test filtering event (represented as a skipped test result) must be
// ingested after it occurs (as determined by its partition date). If the
// test result is ingested more than `ingestionDelayThresholdDays` days
// later, it will not be attributed to older test failures.
//
// A reasonable threshold should strike a balance between
//  1. tolerating ingestion delay (particularly due to ingestion failures and
//     retries), and
//  2. reducing the amount of data the attribution query needs to process.
//
// TODO: record the partition time of the earliest unprocessed test filtering
// event so we can widen the partition time window in the query as needed
// instead of relying on a hardcoded threshold.
const ingestionDelayThresholdDays = 3
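// For example, with the 14-day chromeos attribution window defined below,
// the attribution query scans verdicts whose partition time falls within the
// last (14 + 3) * 24h = 17 days: 14 days of attributable failures plus 3
// days of slack for late-ingested skip results.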
// attributionRule specifies how the filtered test runs are attributed to the
// test failures in a project.
type attributionRule struct {
	// attributionWindowDays is the maximum number of days a filtered test run
	// can be attributed to a test failure after the failure occurs, i.e. a
	// filtered test run can only be attributed to a test failure if
	// test_run_partition_time - test_failure_partition_time <=
	// attributionWindowDays days.
	attributionWindowDays int64
	// failureFilterSQL is a predicate on the results that defines which
	// results may have filtered test runs attributed to them.
	failureFilterSQL string
	// partitionSQL is the SQL snippet used to partition the test results.
	// Only results in the same partition as a test failure can be attributed
	// to the test failure.
	partitionSQL string
	// runFilterSQL is a predicate on the results that defines which results
	// represent a run that filtered out the test.
	runFilterSQL string
	// runCountSQL is an expression that defines the distinct runs to count.
	//
	// CAVEAT: the test_verdicts table may contain duplicated rows. To avoid
	// counting duplicates, do not count the number of rows directly.
	runCountSQL string
}

// projectAttributionRule specifies the attribution rule of each project.
// Attribution is only enabled when an attribution rule is defined for a
// project.
//
// TODO: move this to project configs with a more restrictive syntax and less
// dependency on the exact shape of the query.
var projectAttributionRule = map[string]attributionRule{
	// As defined in go/cros-test-filtering.
	"chromeos": {
		attributionWindowDays: 14,
		failureFilterSQL:      `NOT r.expected AND tv.change_verifier_run IS NULL`,
		// Ideally, the skipped test results should be attributed to the
		// failures that caused them, i.e. failures in the same milestone if
		// there are enough samples, or failures in the previous milestone
		// otherwise. But given that luci-analysis does not know which
		// milestone's failures activated the test filtering, it's hard to
		// write a query that attributes correctly.
		//
		// Instead, we attribute skipped test results to failures in any
		// milestone. This is only an issue when
		//  1. the test is failing in a previous milestone for a different
		//     root cause, and
		//  2. the failures in that other milestone did not trigger test
		//     filtering themselves (otherwise they would already be marked
		//     as having "triggered test filtering").
		// Hopefully, this should be very rare, and we can adjust the
		// attribution logic further if this becomes an issue.
		partitionSQL: `tv.test_id, STRING(tv.variant.board)`,
		runFilterSQL: `tv.change_verifier_run IS NOT NULL AND r.expected AND r.status='SKIP' ` +
			`AND r.skip_reason='AUTOMATICALLY_DISABLED_FOR_FLAKINESS'`,
		runCountSQL: `r.name`,
	},
}
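// For illustration, substituting the chromeos rule above into the
// attribution query below yields a window declaration along the lines of:
//
//	WINDOW attribution_source AS (
//	  PARTITION BY tv.test_id, STRING(tv.variant.board)
//	  ORDER BY UNIX_MILLIS(tv.partition_time)
//	  -- @attributionWindowMs: 14 days in milliseconds.
//	  RANGE BETWEEN 1 FOLLOWING AND 1209600000 FOLLOWING
//	)
//
// i.e. each failing (test_id, board) row aggregates over rows with strictly
// later partition times (at least 1 ms ahead) up to 14 days ahead, so a
// failure collects the skipped runs of the same test on the same board that
// follow it within the window.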
// attributeFilteredRuns runs a query to attribute the filtered test runs to
// the failures that caused the test to be filtered out, then saves the
// results to the failure_attributes table.
func (s *Client) attributeFilteredRuns(ctx context.Context, project string) error {
	if err := s.ensureSchema(ctx); err != nil {
		return errors.Annotate(err, "ensure schema").Err()
	}

	rule, ok := projectAttributionRule[project]
	if !ok {
		return nil
	}

	q := s.bqClient.Query(`
		MERGE INTO internal.failure_attributes T
		USING (
			WITH post_finalization_attributes AS (
				SELECT
					tv.invocation.id AS verdict_invocation_id,
					r.name AS result_name,
					tv.partition_time AS partition_time,
					` + rule.failureFilterSQL + ` AS is_attribution_target,
					-- COUNT(DISTINCT IF(...)) cannot be used when the window
					-- declaration has ORDER BY. Gather the items in an array so
					-- we can count the distinct items in the array later.
					ARRAY_AGG(IF(
						` + rule.runFilterSQL + `,
						` + rule.runCountSQL + `,
						NULL
					)) OVER attribution_source AS attributed_filtered_runs,
				FROM internal.test_verdicts AS tv, UNNEST(results) AS r
				-- Postsubmit failures from more than @attributionWindowMs
				-- milliseconds ago will no longer have new skip results being
				-- attributed to them. Add @delayThresholdMs milliseconds to
				-- account for test result ingestion delay (particularly due to
				-- ingestion failures and retries).
				WHERE
					tv.partition_time >= TIMESTAMP_SUB(
						CURRENT_TIMESTAMP(),
						INTERVAL @attributionWindowMs + @delayThresholdMs MILLISECOND
					)
					AND tv.project = @project
				WINDOW attribution_source AS (
					PARTITION BY ` + rule.partitionSQL + `
					ORDER BY UNIX_MILLIS(tv.partition_time)
					RANGE BETWEEN 1 FOLLOWING AND @attributionWindowMs FOLLOWING
				)
			)
			SELECT
				@project AS project,
				"resultdb" AS test_result_system,
				verdict_invocation_id AS ingested_invocation_id,
				result_name AS test_result_id,
				ANY_VALUE(partition_time) AS partition_time,
				ANY_VALUE((SELECT COUNT(DISTINCT run) FROM UNNEST(attributed_filtered_runs) AS run)) AS attributed_filtered_run_count,
			FROM post_finalization_attributes
			WHERE is_attribution_target
			-- The test_verdicts table may contain duplicated rows. Use a GROUP BY
			-- to prevent the duplicated rows from causing the DML merge to fail.
			GROUP BY verdict_invocation_id, result_name
		) S
		ON S.partition_time = T.partition_time
			AND S.project = T.project
			AND S.test_result_system = T.test_result_system
			AND S.ingested_invocation_id = T.ingested_invocation_id
			AND S.test_result_id = T.test_result_id
		WHEN MATCHED
			AND S.attributed_filtered_run_count > T.attributed_filtered_run_count
		THEN
			UPDATE SET attributed_filtered_run_count = S.attributed_filtered_run_count
		WHEN NOT MATCHED THEN
			INSERT (project, test_result_system, ingested_invocation_id, test_result_id, partition_time, attributed_filtered_run_count)
			VALUES (
				S.project,
				S.test_result_system,
				S.ingested_invocation_id,
				S.test_result_id,
				S.partition_time,
				S.attributed_filtered_run_count
			);
	`)
	q.Parameters = append(q.Parameters,
		bigquery.QueryParameter{Name: "project", Value: project},
		bigquery.QueryParameter{Name: "attributionWindowMs", Value: rule.attributionWindowDays * 24 * 60 * 60 * 1000},
		bigquery.QueryParameter{Name: "delayThresholdMs", Value: ingestionDelayThresholdDays * 24 * 60 * 60 * 1000},
	)

	job, err := q.Run(ctx)
	if err != nil {
		return err
	}

	waitCtx, cancel := context.WithTimeout(ctx, time.Minute*9)
	defer cancel()
	js, err := bqutil.WaitForJob(waitCtx, job)
	if err != nil {
		return errors.Annotate(err, "waiting for filtered test run attribution query to complete").Err()
	}
	if js.Err() != nil {
		return errors.Annotate(js.Err(), "filtered test run attribution query failed").Err()
	}
	return nil
}
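// A minimal wiring sketch (an assumption, not taken from this package; the
// real registration lives in the service's main wiring) using the
// go.chromium.org/luci/server/cron module, where the cloud project is
// illustrative:
//
//	h := failureattributes.NewFilteredRunsAttributionHandler("my-cloud-project")
//	cron.RegisterHandler("attribute-filtered-test-runs", h.CronHandler)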