go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/bisection/server/updatetestrerun/update.go (about)

     1  // Copyright 2023 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package updatetestrerun updates test failure analysis when we
    16  // got test results from recipes.
    17  package updatetestrerun
    18  
    19  import (
    20  	"context"
    21  
    22  	"go.chromium.org/luci/bisection/culpritaction/revertculprit"
    23  	"go.chromium.org/luci/bisection/model"
    24  	"go.chromium.org/luci/bisection/nthsectionsnapshot"
    25  	pb "go.chromium.org/luci/bisection/proto/v1"
    26  	"go.chromium.org/luci/bisection/testfailureanalysis"
    27  	"go.chromium.org/luci/bisection/testfailureanalysis/bisection"
    28  	"go.chromium.org/luci/bisection/testfailureanalysis/bisection/projectbisector"
    29  	"go.chromium.org/luci/bisection/util/datastoreutil"
    30  	"go.chromium.org/luci/bisection/util/loggingutil"
    31  	"go.chromium.org/luci/common/clock"
    32  	"go.chromium.org/luci/common/errors"
    33  	"go.chromium.org/luci/common/logging"
    34  	"go.chromium.org/luci/gae/service/datastore"
    35  	"google.golang.org/grpc/codes"
    36  	"google.golang.org/grpc/status"
    37  )
    38  
    39  // Update is for updating test failure analysis given the request from recipe.
    40  func Update(ctx context.Context, req *pb.UpdateTestAnalysisProgressRequest) (reterr error) {
    41  	defer func() {
    42  		if reterr != nil {
    43  			// We log here instead of UpdateTestAnalysisProgress to make sure the analysis ID
    44  			// and the rerun BBID correctly displayed in log.
    45  			logging.Errorf(ctx, "Update test analysis progress got error: %v", reterr.Error())
    46  		}
    47  	}()
    48  	err := validateRequest(req)
    49  	if err != nil {
    50  		return status.Errorf(codes.InvalidArgument, errors.Annotate(err, "validate request").Err().Error())
    51  	}
    52  	ctx = loggingutil.SetRerunBBID(ctx, req.Bbid)
    53  
    54  	// Fetch rerun.
    55  	rerun, err := datastoreutil.GetTestSingleRerun(ctx, req.Bbid)
    56  	if err != nil {
    57  		// We don't compare err == datastore.ErrNoSuchEntity because err may be annotated.
    58  		if errors.Is(err, datastore.ErrNoSuchEntity) {
    59  			return status.Errorf(codes.NotFound, errors.Annotate(err, "get test single rerun").Err().Error())
    60  		} else {
    61  			return status.Errorf(codes.Internal, errors.Annotate(err, "get test single rerun").Err().Error())
    62  		}
    63  	}
    64  
    65  	// Something is wrong here. We should not receive an update for ended rerun.
    66  	if rerun.HasEnded() {
    67  		return status.Errorf(codes.Internal, "rerun has ended")
    68  	}
    69  
    70  	// Safeguard, we don't really expect any other type.
    71  	if rerun.Type != model.RerunBuildType_CulpritVerification && rerun.Type != model.RerunBuildType_NthSection {
    72  		return status.Errorf(codes.Internal, "invalid rerun type %v", rerun.Type)
    73  	}
    74  
    75  	// Fetch analysis
    76  	tfa, err := datastoreutil.GetTestFailureAnalysis(ctx, rerun.AnalysisKey.IntID())
    77  	if err != nil {
    78  		// Do not return a NOTFOUND here since the rerun was found.
    79  		// If the analysis is not found, there is likely something wrong.
    80  		return status.Errorf(codes.Internal, errors.Annotate(err, "get test failure analysis").Err().Error())
    81  	}
    82  	ctx = loggingutil.SetAnalysisID(ctx, tfa.ID)
    83  
    84  	err = updateRerun(ctx, rerun, tfa, req)
    85  	if err != nil {
    86  		// If there is any error, consider the rerun having infra failure.
    87  		e := saveRerun(ctx, rerun, func(rerun *model.TestSingleRerun) {
    88  			rerun.Status = pb.RerunStatus_RERUN_STATUS_INFRA_FAILED
    89  			rerun.ReportTime = clock.Now(ctx)
    90  		})
    91  		if e != nil {
    92  			// Nothing we can do now, just log the error.
    93  			logging.Errorf(ctx, "Error when saving rerun %s", e.Error())
    94  		}
    95  
    96  		e = testfailureanalysis.UpdateAnalysisStatusWhenError(ctx, tfa)
    97  		if e != nil {
    98  			logging.Errorf(ctx, "UpdateAnalysisStatusWhenRerunError %s", e.Error())
    99  		}
   100  		return status.Errorf(codes.Internal, errors.Annotate(err, "update rerun").Err().Error())
   101  	}
   102  
   103  	if rerun.Type == model.RerunBuildType_CulpritVerification {
   104  		err := processCulpritVerificationUpdate(ctx, rerun, tfa)
   105  		if err != nil {
   106  			e := testfailureanalysis.UpdateAnalysisStatus(ctx, tfa, pb.AnalysisStatus_ERROR, pb.AnalysisRunStatus_ENDED)
   107  			if e != nil {
   108  				// Just log.
   109  				logging.Errorf(ctx, "Update analysis status %s", e.Error())
   110  			}
   111  			return status.Errorf(codes.Internal, errors.Annotate(err, "process culprit verification update").Err().Error())
   112  		}
   113  	}
   114  	if rerun.Type == model.RerunBuildType_NthSection {
   115  		err := processNthSectionUpdate(ctx, rerun, tfa, req)
   116  		if err != nil {
   117  			e := testfailureanalysis.UpdateAnalysisStatusWhenError(ctx, tfa)
   118  			if e != nil {
   119  				// Just log.
   120  				logging.Errorf(ctx, "UpdateAnalysisStatusWhenRerunError %s", e.Error())
   121  			}
   122  			return status.Errorf(codes.Internal, errors.Annotate(err, "process nthsection update").Err().Error())
   123  		}
   124  	}
   125  	return nil
   126  }
   127  
   128  func processCulpritVerificationUpdate(ctx context.Context, rerun *model.TestSingleRerun, tfa *model.TestFailureAnalysis) error {
   129  	// Retrieve suspect.
   130  	if rerun.CulpritKey == nil {
   131  		return errors.New("no suspect for rerun")
   132  	}
   133  	suspect, err := datastoreutil.GetSuspect(ctx, rerun.CulpritKey.IntID(), rerun.CulpritKey.Parent())
   134  	if err != nil {
   135  		return errors.Annotate(err, "get suspect for rerun").Err()
   136  	}
   137  	suspectRerun, err := datastoreutil.GetTestSingleRerun(ctx, suspect.SuspectRerunBuild.IntID())
   138  	if err != nil {
   139  		return errors.Annotate(err, "get suspect rerun %d", suspect.SuspectRerunBuild.IntID()).Err()
   140  	}
   141  	parentRerun, err := datastoreutil.GetTestSingleRerun(ctx, suspect.ParentRerunBuild.IntID())
   142  	if err != nil {
   143  		return errors.Annotate(err, "get parent rerun %d", suspect.ParentRerunBuild.IntID()).Err()
   144  	}
   145  	// Update suspect based on rerun status.
   146  	suspectStatus := model.SuspectStatus(suspectRerun.Status, parentRerun.Status)
   147  
   148  	if err := datastore.RunInTransaction(ctx, func(ctx context.Context) error {
   149  		e := datastore.Get(ctx, suspect)
   150  		if e != nil {
   151  			return e
   152  		}
   153  		suspect.VerificationStatus = suspectStatus
   154  		return datastore.Put(ctx, suspect)
   155  	}, nil); err != nil {
   156  		return errors.Annotate(err, "update suspect status %d", suspect.Id).Err()
   157  	}
   158  	if suspect.VerificationStatus == model.SuspectVerificationStatus_UnderVerification {
   159  		return nil
   160  	}
   161  	// Update test failure analysis.
   162  	if suspect.VerificationStatus == model.SuspectVerificationStatus_ConfirmedCulprit {
   163  		err = datastore.RunInTransaction(ctx, func(ctx context.Context) error {
   164  			e := datastore.Get(ctx, tfa)
   165  			if e != nil {
   166  				return e
   167  			}
   168  			tfa.VerifiedCulpritKey = datastore.KeyForObj(ctx, suspect)
   169  			return datastore.Put(ctx, tfa)
   170  		}, nil)
   171  		if err != nil {
   172  			return errors.Annotate(err, "update VerifiedCulpritKey of analysis").Err()
   173  		}
   174  		// TODO(@beining): Schedule this task when suspect is VerificationError too.
   175  		// According to go/luci-bisection-integrating-gerrit,
   176  		// we want to also perform gerrit action when suspect is VerificationError.
   177  		if err := revertculprit.ScheduleTestFailureTask(ctx, tfa.ID); err != nil {
   178  			// Non-critical, just log the error
   179  			err := errors.Annotate(err, "schedule culprit action task %d", tfa.ID).Err()
   180  			logging.Errorf(ctx, err.Error())
   181  			// No task scheduled, we should update suspect's HasTakenActions field.
   182  			suspect.HasTakenActions = true
   183  			err = datastore.Put(ctx, suspect)
   184  			if err != nil {
   185  				err = errors.Annotate(err, "saving suspect's HasActionTaken field").Err()
   186  				logging.Errorf(ctx, err.Error())
   187  			}
   188  		}
   189  		return testfailureanalysis.UpdateAnalysisStatus(ctx, tfa, pb.AnalysisStatus_FOUND, pb.AnalysisRunStatus_ENDED)
   190  	}
   191  	return testfailureanalysis.UpdateAnalysisStatus(ctx, tfa, pb.AnalysisStatus_SUSPECTFOUND, pb.AnalysisRunStatus_ENDED)
   192  }
   193  
   194  func processNthSectionUpdate(ctx context.Context, rerun *model.TestSingleRerun, tfa *model.TestFailureAnalysis, req *pb.UpdateTestAnalysisProgressRequest) (reterr error) {
   195  	if rerun.NthSectionAnalysisKey == nil {
   196  		return errors.New("nthsection_analysis_key not found")
   197  	}
   198  	nsa, err := datastoreutil.GetTestNthSectionAnalysis(ctx, rerun.NthSectionAnalysisKey.IntID())
   199  	if err != nil {
   200  		return errors.Annotate(err, "get test nthsection analysis").Err()
   201  	}
   202  	// This may happen during tri-section (or higher nth-section) analysis.
   203  	// Nthsection analysis may ended before a rerun result arrives (in such case,
   204  	// the rerun is considered redundant and should not affect the nthsection).
   205  	// For example, if the blamelist is [2,3,4] (1 is last pass, 4 is first fail),
   206  	// and we run tri-section reruns at position 2 and 3. If result for position 2 is
   207  	// "fail", then 2 should be the culprit, and we don't need to wait for 3.
   208  	// When the result of 3 comes in, it will be ignored.
   209  	if nsa.HasEnded() {
   210  		logging.Infof(ctx, "Nthsection analysis has ended. Rerun result will be ignored.")
   211  		return nil
   212  	}
   213  	snapshot, err := bisection.CreateSnapshot(ctx, nsa)
   214  	if err != nil {
   215  		return errors.Annotate(err, "create snapshot").Err()
   216  	}
   217  
   218  	// Check if we already found the culprit or not.
   219  	ok, cul := snapshot.GetCulprit()
   220  
   221  	// Found culprit -> Update the nthsection analysis
   222  	if ok {
   223  		err := bisection.SaveSuspectAndTriggerCulpritVerification(ctx, tfa, nsa, snapshot.BlameList.Commits[cul])
   224  		if err != nil {
   225  			return errors.Annotate(err, "save suspect and trigger culprit verification").Err()
   226  		}
   227  		return nil
   228  	}
   229  
   230  	// Culprit not found yet. Still need to trigger more rerun.
   231  	enabled, err := bisection.IsEnabled(ctx, tfa.Project)
   232  	if err != nil {
   233  		return errors.Annotate(err, "is enabled").Err()
   234  	}
   235  	if !enabled {
   236  		logging.Infof(ctx, "Bisection not enabled")
   237  		return nil
   238  	}
   239  
   240  	// Find the next commit to run.
   241  	commit, err := snapshot.FindNextSingleCommitToRun()
   242  	var badRangeError *nthsectionsnapshot.BadRangeError
   243  	if err != nil {
   244  		if !errors.As(err, &badRangeError) {
   245  			return errors.Annotate(err, "find next single commit to run").Err()
   246  		}
   247  		// BadRangeError suggests the regression range is invalid.
   248  		// This is not really an error, but more of a indication of no suspect can be found
   249  		// in this regression range.
   250  		logging.Warningf(ctx, "find next single commit to run %s", err.Error())
   251  	}
   252  	if commit == "" || errors.As(err, &badRangeError) {
   253  		// We don't have more run to wait -> we've failed to find the suspect.
   254  		if snapshot.NumInProgress == 0 {
   255  			err = bisection.SaveNthSectionAnalysis(ctx, nsa, func(nsa *model.TestNthSectionAnalysis) {
   256  				nsa.Status = pb.AnalysisStatus_NOTFOUND
   257  				nsa.RunStatus = pb.AnalysisRunStatus_ENDED
   258  				nsa.EndTime = clock.Now(ctx)
   259  			})
   260  			if err != nil {
   261  				return errors.Annotate(err, "save nthsection analysis").Err()
   262  			}
   263  			err = testfailureanalysis.UpdateAnalysisStatus(ctx, tfa, pb.AnalysisStatus_NOTFOUND, pb.AnalysisRunStatus_ENDED)
   264  			if err != nil {
   265  				return errors.Annotate(err, "update analysis status").Err()
   266  			}
   267  		}
   268  		return nil
   269  	}
   270  
   271  	projectBisector, err := bisection.GetProjectBisector(ctx, tfa)
   272  	if err != nil {
   273  		return errors.Annotate(err, "get project bisector").Err()
   274  	}
   275  	option := projectbisector.RerunOption{
   276  		BotID: req.BotId,
   277  	}
   278  	err = bisection.TriggerRerunBuildForCommits(ctx, tfa, nsa, projectBisector, []string{commit}, option)
   279  	if err != nil {
   280  		return errors.Annotate(err, "trigger rerun build for commits").Err()
   281  	}
   282  	return nil
   283  }
   284  
   285  // updateRerun updates TestSingleRerun and TestFailure with the results from recipe.
   286  func updateRerun(ctx context.Context, rerun *model.TestSingleRerun, tfa *model.TestFailureAnalysis, req *pb.UpdateTestAnalysisProgressRequest) (reterr error) {
   287  	if !req.RunSucceeded {
   288  		err := saveRerun(ctx, rerun, func(rerun *model.TestSingleRerun) {
   289  			rerun.Status = pb.RerunStatus_RERUN_STATUS_INFRA_FAILED
   290  			rerun.ReportTime = clock.Now(ctx)
   291  		})
   292  		if err != nil {
   293  			return errors.Annotate(err, "save rerun").Err()
   294  		}
   295  		// Return nil here because the request is valid and INFRA_FAILED is expected.
   296  		return nil
   297  	}
   298  
   299  	rerunTestResults := rerun.TestResults
   300  	rerunTestResults.IsFinalized = true
   301  	var rerunStatus pb.RerunStatus
   302  
   303  	// Handle primary test failure.
   304  	// The result of the primary test failure will determine the status of the rerun.
   305  	primary, err := datastoreutil.GetPrimaryTestFailure(ctx, tfa)
   306  	if err != nil {
   307  		return errors.Annotate(err, "get primary test failure").Err()
   308  	}
   309  
   310  	recipeResults := req.Results
   311  	// We expect primary failure to have result.
   312  	primaryResult := findTestResult(ctx, recipeResults, primary.TestID, primary.VariantHash)
   313  	if primaryResult == nil {
   314  		return errors.New("no result for primary failure")
   315  	}
   316  
   317  	// We are bisecting from expected -> unexpected, so we consider
   318  	// expected as "PASSED" and unexpected as "FAILED".
   319  	// Skipped should be treated separately.
   320  	if primaryResult.IsExpected {
   321  		rerunStatus = pb.RerunStatus_RERUN_STATUS_PASSED
   322  	} else {
   323  		rerunStatus = pb.RerunStatus_RERUN_STATUS_FAILED
   324  	}
   325  	if primaryResult.Status == pb.TestResultStatus_SKIP {
   326  		rerunStatus = pb.RerunStatus_RERUN_STATUS_TEST_SKIPPED
   327  	}
   328  
   329  	divergedTestFailures := []*model.TestFailure{}
   330  	// rerunTestResults.Results should be pre-populate with test failure keys.
   331  	for i := range rerunTestResults.Results {
   332  		tf, err := datastoreutil.GetTestFailure(ctx, rerunTestResults.Results[i].TestFailureKey.IntID())
   333  		if err != nil {
   334  			return errors.Reason("could not find test failure %d", tf.ID).Err()
   335  		}
   336  		recipeTestResult := findTestResult(ctx, recipeResults, tf.TestID, tf.VariantHash)
   337  		if divergedFromPrimary(recipeTestResult, primaryResult) {
   338  			tf.IsDiverged = true
   339  			divergedTestFailures = append(divergedTestFailures, tf)
   340  		}
   341  		if recipeTestResult != nil && recipeTestResult.Status != pb.TestResultStatus_SKIP {
   342  			if recipeTestResult.IsExpected {
   343  				rerunTestResults.Results[i].ExpectedCount = 1
   344  			} else {
   345  				rerunTestResults.Results[i].UnexpectedCount = 1
   346  			}
   347  		}
   348  	}
   349  
   350  	return datastore.RunInTransaction(ctx, func(ctx context.Context) error {
   351  		// Get and save the rerun.
   352  		err := datastore.Get(ctx, rerun)
   353  		if err != nil {
   354  			return errors.Annotate(err, "get rerun").Err()
   355  		}
   356  
   357  		rerun.Status = rerunStatus
   358  		rerun.TestResults = rerunTestResults
   359  		rerun.ReportTime = clock.Now(ctx)
   360  
   361  		err = datastore.Put(ctx, rerun)
   362  		if err != nil {
   363  			return errors.Annotate(err, "save rerun").Err()
   364  		}
   365  
   366  		// It should be safe to just save the test failures here because we don't expect
   367  		// any update to other fields of test failures.
   368  		err = datastore.Put(ctx, divergedTestFailures)
   369  		if err != nil {
   370  			return errors.Annotate(err, "save test failures to update").Err()
   371  		}
   372  		return nil
   373  	}, nil)
   374  }
   375  
   376  // saveRerun updates reruns in a way that avoid race condition
   377  // if another thread also update the rerun.
   378  func saveRerun(ctx context.Context, rerun *model.TestSingleRerun, updateFunc func(*model.TestSingleRerun)) error {
   379  	return datastore.RunInTransaction(ctx, func(ctx context.Context) error {
   380  		// Get rerun to avoid race condition if something also update the rerun.
   381  		err := datastore.Get(ctx, rerun)
   382  		if err != nil {
   383  			return errors.Annotate(err, "get rerun").Err()
   384  		}
   385  		updateFunc(rerun)
   386  		// Save the rerun.
   387  		err = datastore.Put(ctx, rerun)
   388  		if err != nil {
   389  			return errors.Annotate(err, "save rerun").Err()
   390  		}
   391  		return nil
   392  	}, nil)
   393  }
   394  
   395  // divergedFromPrimary returns true if testResult diverged from primary result.
   396  // Assuming primaryResult is not nil.
   397  func divergedFromPrimary(testResult *pb.TestResult, primaryResult *pb.TestResult) bool {
   398  	// In case the test was not found or is not run, testResult is nil.
   399  	// In this case, we consider it to be diverged from primary result.
   400  	if testResult == nil {
   401  		return true
   402  	}
   403  	// If the primary test is skipped, we will not know if test result is diverged.
   404  	// But perhaps it should not matter, since we will not be able to continue anyway.
   405  	if primaryResult.Status == pb.TestResultStatus_SKIP {
   406  		return false
   407  	}
   408  	// Primary not skip and test skip -> diverge.
   409  	if testResult.Status == pb.TestResultStatus_SKIP {
   410  		return true
   411  	}
   412  	return testResult.IsExpected != primaryResult.IsExpected
   413  }
   414  
   415  // findTestResult returns TestResult given testID and variantHash.
   416  func findTestResult(ctx context.Context, results []*pb.TestResult, testID string, variantHash string) *pb.TestResult {
   417  	for _, r := range results {
   418  		if r.TestId == testID && r.VariantHash == variantHash {
   419  			return r
   420  		}
   421  	}
   422  	return nil
   423  }
   424  
   425  func validateRequest(req *pb.UpdateTestAnalysisProgressRequest) error {
   426  	if req.Bbid == 0 {
   427  		return errors.New("no rerun bbid specified")
   428  	}
   429  	if req.BotId == "" {
   430  		return errors.New("no bot id specified")
   431  	}
   432  	return nil
   433  }