go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/bisection/testfailuredetection/test_failure_detection.go (about) 1 // Copyright 2023 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package testfailuredetection analyses recent test failures with 16 // the changepoint analysis from LUCI analysis, and select test failures to bisect. 17 package testfailuredetection 18 19 import ( 20 "context" 21 "fmt" 22 "math" 23 "strings" 24 25 "go.chromium.org/luci/bisection/internal/config" 26 "go.chromium.org/luci/bisection/internal/lucianalysis" 27 "go.chromium.org/luci/bisection/model" 28 configpb "go.chromium.org/luci/bisection/proto/config" 29 pb "go.chromium.org/luci/bisection/proto/v1" 30 "go.chromium.org/luci/bisection/rerun" 31 tpb "go.chromium.org/luci/bisection/task/proto" 32 "go.chromium.org/luci/bisection/testfailureanalysis/bisection" 33 "go.chromium.org/luci/bisection/util" 34 "go.chromium.org/luci/bisection/util/datastoreutil" 35 "go.chromium.org/luci/bisection/util/loggingutil" 36 "go.chromium.org/luci/common/clock" 37 "go.chromium.org/luci/common/errors" 38 "go.chromium.org/luci/common/logging" 39 "go.chromium.org/luci/common/retry/transient" 40 "go.chromium.org/luci/gae/service/datastore" 41 "go.chromium.org/luci/server" 42 "go.chromium.org/luci/server/tq" 43 "google.golang.org/protobuf/proto" 44 ) 45 46 const ( 47 taskClass = "test-failure-detection" 48 queue = "test-failure-detection" 49 ) 50 51 var taskClassRef = tq.RegisterTaskClass(tq.TaskClass{ 52 ID: taskClass, 53 Prototype: (*tpb.TestFailureDetectionTask)(nil), 54 Queue: queue, 55 Kind: tq.NonTransactional, 56 }) 57 58 // RegisterTaskClass registers the task class for tq dispatcher. 59 func RegisterTaskClass(srv *server.Server, luciAnalysisProjectFunc func(luciProject string) string) error { 60 ctx := srv.Context 61 ac, err := lucianalysis.NewClient(ctx, srv.Options.CloudProject, luciAnalysisProjectFunc) 62 if err != nil { 63 return err 64 } 65 srv.RegisterCleanup(func(context.Context) { 66 ac.Close() 67 }) 68 handler := func(c context.Context, payload proto.Message) error { 69 task := payload.(*tpb.TestFailureDetectionTask) 70 logging.Infof(c, "Processing test failure detection task %v", task) 71 err := Run(ctx, ac, task) 72 if err != nil { 73 err = errors.Annotate(err, "run detection").Err() 74 logging.Errorf(ctx, err.Error()) 75 // If the error is transient, return err to retry. 76 if transient.Tag.In(err) { 77 return err 78 } 79 return nil 80 } 81 return nil 82 } 83 taskClassRef.AttachHandler(handler) 84 return nil 85 } 86 87 // Schedule enqueues a task to find test failures to bisect. 88 func Schedule(ctx context.Context, task *tpb.TestFailureDetectionTask) error { 89 return tq.AddTask(ctx, &tq.Task{Payload: task}) 90 } 91 92 type analysisClient interface { 93 ReadTestFailures(ctx context.Context, task *tpb.TestFailureDetectionTask, filter *configpb.FailureIngestionFilter) ([]*lucianalysis.BuilderRegressionGroup, error) 94 ReadBuildInfo(ctx context.Context, tf *model.TestFailure) (lucianalysis.BuildInfo, error) 95 } 96 97 // Run finds and group test failures to send to bisector. 98 func Run(ctx context.Context, client analysisClient, task *tpb.TestFailureDetectionTask) error { 99 ctx = loggingutil.SetProject(ctx, task.Project) 100 logging.Infof(ctx, "Run test failure detection") 101 // Checks if test failure detection is enabled. 102 enabled, err := isEnabled(ctx, task.Project) 103 if err != nil { 104 return errors.Annotate(err, "is enabled").Err() 105 } 106 if !enabled { 107 logging.Infof(ctx, "Dectection is not enabled") 108 return nil 109 } 110 filter, err := getFailureIngestionFilter(ctx, task.Project) 111 if err != nil { 112 return errors.Annotate(err, "get excluded buckets").Err() 113 } 114 groups, err := client.ReadTestFailures(ctx, task, filter) 115 if err != nil { 116 return errors.Annotate(err, "read test failures").Err() 117 } 118 logging.Infof(ctx, "There are %d groups from LUCI Analysis query", len(groups)) 119 bundles := []*model.TestFailureBundle{} 120 skippedBundleLogLines := []string{} 121 for _, g := range groups { 122 bundle, err := newTestFailureBundle(task.Project, g) 123 if err != nil { 124 return errors.Annotate(err, "new test failure bundle").Err() 125 } 126 // Use the redundancy score of the primary test failure as 127 // the redundancy score of this test failure bundle. 128 rs, err := redundancyScore(ctx, bundle.Primary()) 129 if err != nil { 130 return errors.Annotate(err, "calculate redundancy score").Err() 131 } 132 if rs == 1 { 133 // Test failures in this bundle are completely redundant. 134 // This bundle should be skipped. 135 line := fmt.Sprintf("primary test %s(%s)", bundle.Primary().TestID, bundle.Primary().VariantHash) 136 skippedBundleLogLines = append(skippedBundleLogLines, line) 137 continue 138 } 139 bundle.Primary().RedundancyScore = rs 140 bundles = append(bundles, bundle) 141 } 142 logging.Infof(ctx, fmt.Sprintf("skip completely redundant bundles\n%s", strings.Join(skippedBundleLogLines, "\n"))) 143 logging.Infof(ctx, "There are %d bundles after redundancy filter", len(bundles)) 144 if len(bundles) == 0 { 145 logging.Infof(ctx, "Cannot find new test failures to bisect for project %s", task.Project) 146 return nil 147 } 148 bestBundle := First(ctx, bundles) 149 logging.Infof(ctx, "Selected test failure bundle with primary failure ID %s, variantHash %s, refHash %s", 150 bestBundle.Primary().TestID, bestBundle.Primary().VariantHash, bestBundle.Primary().RefHash) 151 testFailureAnalysis, err := prepareFailureAnalysis(ctx, client, bestBundle) 152 if err != nil { 153 // If there is a failure in preparing, in particular, in reading build info, 154 // we should store the analysis, so subsequent runs will not consider this 155 // test failure again. 156 testFailureAnalysis = &model.TestFailureAnalysis{ 157 Project: bestBundle.Primary().Project, 158 CreateTime: clock.Now(ctx), 159 Status: pb.AnalysisStatus_INSUFFICENTDATA, 160 RunStatus: pb.AnalysisRunStatus_ENDED, 161 EndTime: clock.Now(ctx), 162 SheriffRotations: bestBundle.Metadata.SheriffRotations, 163 } 164 e := saveTestFailuresAndAnalysis(ctx, bestBundle, testFailureAnalysis, false) 165 if e != nil { 166 // Just log. 167 logging.Errorf(ctx, "save test failure and analysis when insufficient data %v", e.Error()) 168 } 169 return errors.Annotate(err, "prepare failure analysis").Err() 170 } 171 if err := saveTestFailuresAndAnalysis(ctx, bestBundle, testFailureAnalysis, true); err != nil { 172 return errors.Annotate(err, "save test failure and analysis").Err() 173 } 174 return nil 175 } 176 177 func newTestFailureBundle(project string, group *lucianalysis.BuilderRegressionGroup) (*model.TestFailureBundle, error) { 178 testFailures := make([]*model.TestFailure, len(group.TestVariants)) 179 for i, tv := range group.TestVariants { 180 variant, err := util.VariantPB(tv.Variant.String()) 181 if err != nil { 182 return nil, err 183 } 184 testFailures[i] = &model.TestFailure{ 185 ID: 0, 186 Project: project, 187 TestID: tv.TestID.String(), 188 VariantHash: tv.VariantHash.String(), 189 Variant: variant, 190 RefHash: group.RefHash.String(), 191 Bucket: group.Bucket.String(), 192 Builder: group.Builder.String(), 193 Ref: &pb.SourceRef{System: &pb.SourceRef_Gitiles{ 194 Gitiles: &pb.GitilesRef{ 195 Host: group.Ref.Gitiles.Host.String(), 196 Project: group.Ref.Gitiles.Project.String(), 197 Ref: group.Ref.Gitiles.Ref.String(), 198 }, 199 }}, 200 RegressionStartPosition: group.RegressionStartPosition.Int64, 201 RegressionEndPosition: group.RegressionEndPosition.Int64, 202 StartPositionFailureRate: group.StartPositionFailureRate, 203 EndPositionFailureRate: group.EndPositionFailureRate, 204 IsPrimary: i == 0, 205 IsDiverged: false, 206 AnalysisKey: nil, 207 RedundancyScore: 0, 208 StartHour: group.StartHour.Timestamp.UTC(), 209 EndHour: group.EndHour.Timestamp.UTC(), 210 } 211 } 212 bundle := &model.TestFailureBundle{} 213 err := bundle.Add(testFailures) 214 if err != nil { 215 return nil, err 216 } 217 sheriffRotations := []string{} 218 for _, r := range group.SheriffRotations { 219 if r.String() != "" { 220 sheriffRotations = append(sheriffRotations, r.String()) 221 } 222 } 223 bundle.Metadata = &model.BundleMetaData{ 224 SheriffRotations: sheriffRotations, 225 } 226 return bundle, nil 227 } 228 229 // RedundancyScore returns a floating point number between 0 and 1 inclusive. 230 func redundancyScore(c context.Context, tf *model.TestFailure) (float64, error) { 231 sameTestVariant, err := datastoreutil.GetTestFailures(c, tf.Project, tf.TestID, tf.RefHash, tf.VariantHash) 232 if err != nil { 233 return 0, errors.Annotate(err, "get test failures of same test variant").Err() 234 } 235 for _, a := range sameTestVariant { 236 if numberOfOverlapCommit(tf.RegressionStartPosition, tf.RegressionEndPosition, 237 a.RegressionStartPosition, a.RegressionEndPosition) > 0 { 238 return 1, nil 239 } 240 } 241 maxOverlap := float64(0) 242 sameTest, err := datastoreutil.GetTestFailures(c, tf.Project, tf.TestID, tf.RefHash, "") 243 if err != nil { 244 return 0, errors.Annotate(err, "get test failures of same test").Err() 245 } 246 for _, t := range sameTest { 247 overlap := regressionRangeOverlap(tf.RegressionStartPosition, tf.RegressionEndPosition, 248 t.RegressionStartPosition, t.RegressionEndPosition) 249 maxOverlap = math.Max(maxOverlap, overlap) 250 } 251 if maxOverlap < 0 || maxOverlap > 1 { 252 return 0, errors.New("maxOverlap must between 0 to 1 inclusive. this suggests something wrong with the implementation") 253 } 254 return maxOverlap, nil 255 } 256 257 func numberOfOverlapCommit(rl1, ru1, rl2, ru2 int64) float64 { 258 return math.Min(float64(ru1), float64(ru2)) - math.Max(float64(rl1), float64(rl2)) + 1 259 } 260 261 func regressionRangeOverlap(rl1, ru1, rl2, ru2 int64) float64 { 262 return math.Max(0, numberOfOverlapCommit(rl1, ru1, rl2, ru2)) / float64(ru1-rl1+ru2-rl2+2) 263 } 264 265 func prepareFailureAnalysis(ctx context.Context, client analysisClient, bundle *model.TestFailureBundle) (*model.TestFailureAnalysis, error) { 266 tf := bundle.Primary() 267 buildInfo, err := client.ReadBuildInfo(ctx, tf) 268 if err != nil { 269 return nil, errors.Annotate(err, "read build info").Err() 270 } 271 testFailureAnalysis := &model.TestFailureAnalysis{ 272 Project: tf.Project, 273 Bucket: tf.Bucket, 274 Builder: tf.Builder, 275 CreateTime: clock.Now(ctx), 276 Status: pb.AnalysisStatus_CREATED, 277 Priority: rerun.PriorityTestFailure, 278 StartCommitHash: buildInfo.StartCommitHash, 279 EndCommitHash: buildInfo.EndCommitHash, 280 FailedBuildID: buildInfo.BuildID, 281 SheriffRotations: bundle.Metadata.SheriffRotations, 282 } 283 return testFailureAnalysis, nil 284 } 285 286 // saveTestFailuresAndAnalysis saves the test failures and a test failures analysis into datastore. 287 // It also transactionally enqueue a task to bisector, if shouldTriggerBisection is set to true. 288 func saveTestFailuresAndAnalysis(ctx context.Context, bundle *model.TestFailureBundle, testFailureAnalysis *model.TestFailureAnalysis, shouldTriggerBisection bool) error { 289 return datastore.RunInTransaction(ctx, func(ctx context.Context) error { 290 if err := datastore.AllocateIDs(ctx, testFailureAnalysis); err != nil { 291 return errors.Annotate(err, "allocate datastore ID for test failure analysis").Err() 292 } 293 for _, testFailure := range bundle.All() { 294 testFailure.AnalysisKey = datastore.KeyForObj(ctx, testFailureAnalysis) 295 } 296 // TODO(beining@): This will fail if the size of the bundle is greater than 499. 297 // If this becomes a problem, we need to save TestFailures in batches. 298 // https://cloud.google.com/datastore/docs/concepts/transactions#what_can_be_done_in_a_transaction 299 if err := datastore.Put(ctx, bundle.All()); err != nil { 300 return errors.Annotate(err, "save test failures").Err() 301 } 302 testFailureAnalysis.TestFailure = datastore.KeyForObj(ctx, bundle.Primary()) 303 if err := datastore.Put(ctx, testFailureAnalysis); err != nil { 304 return errors.Annotate(err, "save test failure analysis").Err() 305 } 306 // Send task to bisector transactionally. 307 if shouldTriggerBisection { 308 if err := bisection.Schedule(ctx, testFailureAnalysis.ID); err != nil { 309 return errors.Annotate(err, "send task to bisector").Err() 310 } 311 } 312 return nil 313 }, nil) 314 } 315 316 func isEnabled(ctx context.Context, project string) (bool, error) { 317 cfg, err := config.Project(ctx, project) 318 if err != nil { 319 return false, err 320 } 321 return cfg.TestAnalysisConfig.GetDetectorEnabled(), nil 322 } 323 324 func getFailureIngestionFilter(ctx context.Context, project string) (*configpb.FailureIngestionFilter, error) { 325 cfg, err := config.Project(ctx, project) 326 if err != nil { 327 return nil, err 328 } 329 return cfg.TestAnalysisConfig.GetFailureIngestionFilter(), nil 330 }