go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/bisection/server/updatetestrerun/update.go (about) 1 // Copyright 2023 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package updatetestrerun updates test failure analysis when we 16 // got test results from recipes. 17 package updatetestrerun 18 19 import ( 20 "context" 21 22 "go.chromium.org/luci/bisection/culpritaction/revertculprit" 23 "go.chromium.org/luci/bisection/model" 24 "go.chromium.org/luci/bisection/nthsectionsnapshot" 25 pb "go.chromium.org/luci/bisection/proto/v1" 26 "go.chromium.org/luci/bisection/testfailureanalysis" 27 "go.chromium.org/luci/bisection/testfailureanalysis/bisection" 28 "go.chromium.org/luci/bisection/testfailureanalysis/bisection/projectbisector" 29 "go.chromium.org/luci/bisection/util/datastoreutil" 30 "go.chromium.org/luci/bisection/util/loggingutil" 31 "go.chromium.org/luci/common/clock" 32 "go.chromium.org/luci/common/errors" 33 "go.chromium.org/luci/common/logging" 34 "go.chromium.org/luci/gae/service/datastore" 35 "google.golang.org/grpc/codes" 36 "google.golang.org/grpc/status" 37 ) 38 39 // Update is for updating test failure analysis given the request from recipe. 40 func Update(ctx context.Context, req *pb.UpdateTestAnalysisProgressRequest) (reterr error) { 41 defer func() { 42 if reterr != nil { 43 // We log here instead of UpdateTestAnalysisProgress to make sure the analysis ID 44 // and the rerun BBID correctly displayed in log. 45 logging.Errorf(ctx, "Update test analysis progress got error: %v", reterr.Error()) 46 } 47 }() 48 err := validateRequest(req) 49 if err != nil { 50 return status.Errorf(codes.InvalidArgument, errors.Annotate(err, "validate request").Err().Error()) 51 } 52 ctx = loggingutil.SetRerunBBID(ctx, req.Bbid) 53 54 // Fetch rerun. 55 rerun, err := datastoreutil.GetTestSingleRerun(ctx, req.Bbid) 56 if err != nil { 57 // We don't compare err == datastore.ErrNoSuchEntity because err may be annotated. 58 if errors.Is(err, datastore.ErrNoSuchEntity) { 59 return status.Errorf(codes.NotFound, errors.Annotate(err, "get test single rerun").Err().Error()) 60 } else { 61 return status.Errorf(codes.Internal, errors.Annotate(err, "get test single rerun").Err().Error()) 62 } 63 } 64 65 // Something is wrong here. We should not receive an update for ended rerun. 66 if rerun.HasEnded() { 67 return status.Errorf(codes.Internal, "rerun has ended") 68 } 69 70 // Safeguard, we don't really expect any other type. 71 if rerun.Type != model.RerunBuildType_CulpritVerification && rerun.Type != model.RerunBuildType_NthSection { 72 return status.Errorf(codes.Internal, "invalid rerun type %v", rerun.Type) 73 } 74 75 // Fetch analysis 76 tfa, err := datastoreutil.GetTestFailureAnalysis(ctx, rerun.AnalysisKey.IntID()) 77 if err != nil { 78 // Do not return a NOTFOUND here since the rerun was found. 79 // If the analysis is not found, there is likely something wrong. 80 return status.Errorf(codes.Internal, errors.Annotate(err, "get test failure analysis").Err().Error()) 81 } 82 ctx = loggingutil.SetAnalysisID(ctx, tfa.ID) 83 84 err = updateRerun(ctx, rerun, tfa, req) 85 if err != nil { 86 // If there is any error, consider the rerun having infra failure. 87 e := saveRerun(ctx, rerun, func(rerun *model.TestSingleRerun) { 88 rerun.Status = pb.RerunStatus_RERUN_STATUS_INFRA_FAILED 89 rerun.ReportTime = clock.Now(ctx) 90 }) 91 if e != nil { 92 // Nothing we can do now, just log the error. 93 logging.Errorf(ctx, "Error when saving rerun %s", e.Error()) 94 } 95 96 e = testfailureanalysis.UpdateAnalysisStatusWhenError(ctx, tfa) 97 if e != nil { 98 logging.Errorf(ctx, "UpdateAnalysisStatusWhenRerunError %s", e.Error()) 99 } 100 return status.Errorf(codes.Internal, errors.Annotate(err, "update rerun").Err().Error()) 101 } 102 103 if rerun.Type == model.RerunBuildType_CulpritVerification { 104 err := processCulpritVerificationUpdate(ctx, rerun, tfa) 105 if err != nil { 106 e := testfailureanalysis.UpdateAnalysisStatus(ctx, tfa, pb.AnalysisStatus_ERROR, pb.AnalysisRunStatus_ENDED) 107 if e != nil { 108 // Just log. 109 logging.Errorf(ctx, "Update analysis status %s", e.Error()) 110 } 111 return status.Errorf(codes.Internal, errors.Annotate(err, "process culprit verification update").Err().Error()) 112 } 113 } 114 if rerun.Type == model.RerunBuildType_NthSection { 115 err := processNthSectionUpdate(ctx, rerun, tfa, req) 116 if err != nil { 117 e := testfailureanalysis.UpdateAnalysisStatusWhenError(ctx, tfa) 118 if e != nil { 119 // Just log. 120 logging.Errorf(ctx, "UpdateAnalysisStatusWhenRerunError %s", e.Error()) 121 } 122 return status.Errorf(codes.Internal, errors.Annotate(err, "process nthsection update").Err().Error()) 123 } 124 } 125 return nil 126 } 127 128 func processCulpritVerificationUpdate(ctx context.Context, rerun *model.TestSingleRerun, tfa *model.TestFailureAnalysis) error { 129 // Retrieve suspect. 130 if rerun.CulpritKey == nil { 131 return errors.New("no suspect for rerun") 132 } 133 suspect, err := datastoreutil.GetSuspect(ctx, rerun.CulpritKey.IntID(), rerun.CulpritKey.Parent()) 134 if err != nil { 135 return errors.Annotate(err, "get suspect for rerun").Err() 136 } 137 suspectRerun, err := datastoreutil.GetTestSingleRerun(ctx, suspect.SuspectRerunBuild.IntID()) 138 if err != nil { 139 return errors.Annotate(err, "get suspect rerun %d", suspect.SuspectRerunBuild.IntID()).Err() 140 } 141 parentRerun, err := datastoreutil.GetTestSingleRerun(ctx, suspect.ParentRerunBuild.IntID()) 142 if err != nil { 143 return errors.Annotate(err, "get parent rerun %d", suspect.ParentRerunBuild.IntID()).Err() 144 } 145 // Update suspect based on rerun status. 146 suspectStatus := model.SuspectStatus(suspectRerun.Status, parentRerun.Status) 147 148 if err := datastore.RunInTransaction(ctx, func(ctx context.Context) error { 149 e := datastore.Get(ctx, suspect) 150 if e != nil { 151 return e 152 } 153 suspect.VerificationStatus = suspectStatus 154 return datastore.Put(ctx, suspect) 155 }, nil); err != nil { 156 return errors.Annotate(err, "update suspect status %d", suspect.Id).Err() 157 } 158 if suspect.VerificationStatus == model.SuspectVerificationStatus_UnderVerification { 159 return nil 160 } 161 // Update test failure analysis. 162 if suspect.VerificationStatus == model.SuspectVerificationStatus_ConfirmedCulprit { 163 err = datastore.RunInTransaction(ctx, func(ctx context.Context) error { 164 e := datastore.Get(ctx, tfa) 165 if e != nil { 166 return e 167 } 168 tfa.VerifiedCulpritKey = datastore.KeyForObj(ctx, suspect) 169 return datastore.Put(ctx, tfa) 170 }, nil) 171 if err != nil { 172 return errors.Annotate(err, "update VerifiedCulpritKey of analysis").Err() 173 } 174 // TODO(@beining): Schedule this task when suspect is VerificationError too. 175 // According to go/luci-bisection-integrating-gerrit, 176 // we want to also perform gerrit action when suspect is VerificationError. 177 if err := revertculprit.ScheduleTestFailureTask(ctx, tfa.ID); err != nil { 178 // Non-critical, just log the error 179 err := errors.Annotate(err, "schedule culprit action task %d", tfa.ID).Err() 180 logging.Errorf(ctx, err.Error()) 181 // No task scheduled, we should update suspect's HasTakenActions field. 182 suspect.HasTakenActions = true 183 err = datastore.Put(ctx, suspect) 184 if err != nil { 185 err = errors.Annotate(err, "saving suspect's HasActionTaken field").Err() 186 logging.Errorf(ctx, err.Error()) 187 } 188 } 189 return testfailureanalysis.UpdateAnalysisStatus(ctx, tfa, pb.AnalysisStatus_FOUND, pb.AnalysisRunStatus_ENDED) 190 } 191 return testfailureanalysis.UpdateAnalysisStatus(ctx, tfa, pb.AnalysisStatus_SUSPECTFOUND, pb.AnalysisRunStatus_ENDED) 192 } 193 194 func processNthSectionUpdate(ctx context.Context, rerun *model.TestSingleRerun, tfa *model.TestFailureAnalysis, req *pb.UpdateTestAnalysisProgressRequest) (reterr error) { 195 if rerun.NthSectionAnalysisKey == nil { 196 return errors.New("nthsection_analysis_key not found") 197 } 198 nsa, err := datastoreutil.GetTestNthSectionAnalysis(ctx, rerun.NthSectionAnalysisKey.IntID()) 199 if err != nil { 200 return errors.Annotate(err, "get test nthsection analysis").Err() 201 } 202 // This may happen during tri-section (or higher nth-section) analysis. 203 // Nthsection analysis may ended before a rerun result arrives (in such case, 204 // the rerun is considered redundant and should not affect the nthsection). 205 // For example, if the blamelist is [2,3,4] (1 is last pass, 4 is first fail), 206 // and we run tri-section reruns at position 2 and 3. If result for position 2 is 207 // "fail", then 2 should be the culprit, and we don't need to wait for 3. 208 // When the result of 3 comes in, it will be ignored. 209 if nsa.HasEnded() { 210 logging.Infof(ctx, "Nthsection analysis has ended. Rerun result will be ignored.") 211 return nil 212 } 213 snapshot, err := bisection.CreateSnapshot(ctx, nsa) 214 if err != nil { 215 return errors.Annotate(err, "create snapshot").Err() 216 } 217 218 // Check if we already found the culprit or not. 219 ok, cul := snapshot.GetCulprit() 220 221 // Found culprit -> Update the nthsection analysis 222 if ok { 223 err := bisection.SaveSuspectAndTriggerCulpritVerification(ctx, tfa, nsa, snapshot.BlameList.Commits[cul]) 224 if err != nil { 225 return errors.Annotate(err, "save suspect and trigger culprit verification").Err() 226 } 227 return nil 228 } 229 230 // Culprit not found yet. Still need to trigger more rerun. 231 enabled, err := bisection.IsEnabled(ctx, tfa.Project) 232 if err != nil { 233 return errors.Annotate(err, "is enabled").Err() 234 } 235 if !enabled { 236 logging.Infof(ctx, "Bisection not enabled") 237 return nil 238 } 239 240 // Find the next commit to run. 241 commit, err := snapshot.FindNextSingleCommitToRun() 242 var badRangeError *nthsectionsnapshot.BadRangeError 243 if err != nil { 244 if !errors.As(err, &badRangeError) { 245 return errors.Annotate(err, "find next single commit to run").Err() 246 } 247 // BadRangeError suggests the regression range is invalid. 248 // This is not really an error, but more of a indication of no suspect can be found 249 // in this regression range. 250 logging.Warningf(ctx, "find next single commit to run %s", err.Error()) 251 } 252 if commit == "" || errors.As(err, &badRangeError) { 253 // We don't have more run to wait -> we've failed to find the suspect. 254 if snapshot.NumInProgress == 0 { 255 err = bisection.SaveNthSectionAnalysis(ctx, nsa, func(nsa *model.TestNthSectionAnalysis) { 256 nsa.Status = pb.AnalysisStatus_NOTFOUND 257 nsa.RunStatus = pb.AnalysisRunStatus_ENDED 258 nsa.EndTime = clock.Now(ctx) 259 }) 260 if err != nil { 261 return errors.Annotate(err, "save nthsection analysis").Err() 262 } 263 err = testfailureanalysis.UpdateAnalysisStatus(ctx, tfa, pb.AnalysisStatus_NOTFOUND, pb.AnalysisRunStatus_ENDED) 264 if err != nil { 265 return errors.Annotate(err, "update analysis status").Err() 266 } 267 } 268 return nil 269 } 270 271 projectBisector, err := bisection.GetProjectBisector(ctx, tfa) 272 if err != nil { 273 return errors.Annotate(err, "get project bisector").Err() 274 } 275 option := projectbisector.RerunOption{ 276 BotID: req.BotId, 277 } 278 err = bisection.TriggerRerunBuildForCommits(ctx, tfa, nsa, projectBisector, []string{commit}, option) 279 if err != nil { 280 return errors.Annotate(err, "trigger rerun build for commits").Err() 281 } 282 return nil 283 } 284 285 // updateRerun updates TestSingleRerun and TestFailure with the results from recipe. 286 func updateRerun(ctx context.Context, rerun *model.TestSingleRerun, tfa *model.TestFailureAnalysis, req *pb.UpdateTestAnalysisProgressRequest) (reterr error) { 287 if !req.RunSucceeded { 288 err := saveRerun(ctx, rerun, func(rerun *model.TestSingleRerun) { 289 rerun.Status = pb.RerunStatus_RERUN_STATUS_INFRA_FAILED 290 rerun.ReportTime = clock.Now(ctx) 291 }) 292 if err != nil { 293 return errors.Annotate(err, "save rerun").Err() 294 } 295 // Return nil here because the request is valid and INFRA_FAILED is expected. 296 return nil 297 } 298 299 rerunTestResults := rerun.TestResults 300 rerunTestResults.IsFinalized = true 301 var rerunStatus pb.RerunStatus 302 303 // Handle primary test failure. 304 // The result of the primary test failure will determine the status of the rerun. 305 primary, err := datastoreutil.GetPrimaryTestFailure(ctx, tfa) 306 if err != nil { 307 return errors.Annotate(err, "get primary test failure").Err() 308 } 309 310 recipeResults := req.Results 311 // We expect primary failure to have result. 312 primaryResult := findTestResult(ctx, recipeResults, primary.TestID, primary.VariantHash) 313 if primaryResult == nil { 314 return errors.New("no result for primary failure") 315 } 316 317 // We are bisecting from expected -> unexpected, so we consider 318 // expected as "PASSED" and unexpected as "FAILED". 319 // Skipped should be treated separately. 320 if primaryResult.IsExpected { 321 rerunStatus = pb.RerunStatus_RERUN_STATUS_PASSED 322 } else { 323 rerunStatus = pb.RerunStatus_RERUN_STATUS_FAILED 324 } 325 if primaryResult.Status == pb.TestResultStatus_SKIP { 326 rerunStatus = pb.RerunStatus_RERUN_STATUS_TEST_SKIPPED 327 } 328 329 divergedTestFailures := []*model.TestFailure{} 330 // rerunTestResults.Results should be pre-populate with test failure keys. 331 for i := range rerunTestResults.Results { 332 tf, err := datastoreutil.GetTestFailure(ctx, rerunTestResults.Results[i].TestFailureKey.IntID()) 333 if err != nil { 334 return errors.Reason("could not find test failure %d", tf.ID).Err() 335 } 336 recipeTestResult := findTestResult(ctx, recipeResults, tf.TestID, tf.VariantHash) 337 if divergedFromPrimary(recipeTestResult, primaryResult) { 338 tf.IsDiverged = true 339 divergedTestFailures = append(divergedTestFailures, tf) 340 } 341 if recipeTestResult != nil && recipeTestResult.Status != pb.TestResultStatus_SKIP { 342 if recipeTestResult.IsExpected { 343 rerunTestResults.Results[i].ExpectedCount = 1 344 } else { 345 rerunTestResults.Results[i].UnexpectedCount = 1 346 } 347 } 348 } 349 350 return datastore.RunInTransaction(ctx, func(ctx context.Context) error { 351 // Get and save the rerun. 352 err := datastore.Get(ctx, rerun) 353 if err != nil { 354 return errors.Annotate(err, "get rerun").Err() 355 } 356 357 rerun.Status = rerunStatus 358 rerun.TestResults = rerunTestResults 359 rerun.ReportTime = clock.Now(ctx) 360 361 err = datastore.Put(ctx, rerun) 362 if err != nil { 363 return errors.Annotate(err, "save rerun").Err() 364 } 365 366 // It should be safe to just save the test failures here because we don't expect 367 // any update to other fields of test failures. 368 err = datastore.Put(ctx, divergedTestFailures) 369 if err != nil { 370 return errors.Annotate(err, "save test failures to update").Err() 371 } 372 return nil 373 }, nil) 374 } 375 376 // saveRerun updates reruns in a way that avoid race condition 377 // if another thread also update the rerun. 378 func saveRerun(ctx context.Context, rerun *model.TestSingleRerun, updateFunc func(*model.TestSingleRerun)) error { 379 return datastore.RunInTransaction(ctx, func(ctx context.Context) error { 380 // Get rerun to avoid race condition if something also update the rerun. 381 err := datastore.Get(ctx, rerun) 382 if err != nil { 383 return errors.Annotate(err, "get rerun").Err() 384 } 385 updateFunc(rerun) 386 // Save the rerun. 387 err = datastore.Put(ctx, rerun) 388 if err != nil { 389 return errors.Annotate(err, "save rerun").Err() 390 } 391 return nil 392 }, nil) 393 } 394 395 // divergedFromPrimary returns true if testResult diverged from primary result. 396 // Assuming primaryResult is not nil. 397 func divergedFromPrimary(testResult *pb.TestResult, primaryResult *pb.TestResult) bool { 398 // In case the test was not found or is not run, testResult is nil. 399 // In this case, we consider it to be diverged from primary result. 400 if testResult == nil { 401 return true 402 } 403 // If the primary test is skipped, we will not know if test result is diverged. 404 // But perhaps it should not matter, since we will not be able to continue anyway. 405 if primaryResult.Status == pb.TestResultStatus_SKIP { 406 return false 407 } 408 // Primary not skip and test skip -> diverge. 409 if testResult.Status == pb.TestResultStatus_SKIP { 410 return true 411 } 412 return testResult.IsExpected != primaryResult.IsExpected 413 } 414 415 // findTestResult returns TestResult given testID and variantHash. 416 func findTestResult(ctx context.Context, results []*pb.TestResult, testID string, variantHash string) *pb.TestResult { 417 for _, r := range results { 418 if r.TestId == testID && r.VariantHash == variantHash { 419 return r 420 } 421 } 422 return nil 423 } 424 425 func validateRequest(req *pb.UpdateTestAnalysisProgressRequest) error { 426 if req.Bbid == 0 { 427 return errors.New("no rerun bbid specified") 428 } 429 if req.BotId == "" { 430 return errors.New("no bot id specified") 431 } 432 return nil 433 }