go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/bisection/server/bot_updates.go

go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/bisection/server/bot_updates.go (about)

     1  // Copyright 2023 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package server
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  
    21  	"go.chromium.org/luci/bisection/compilefailureanalysis/nthsection"
    22  	"go.chromium.org/luci/bisection/compilefailureanalysis/statusupdater"
    23  	"go.chromium.org/luci/bisection/model"
    24  	"go.chromium.org/luci/bisection/nthsectionsnapshot"
    25  	pb "go.chromium.org/luci/bisection/proto/v1"
    26  	"go.chromium.org/luci/bisection/server/updatetestrerun"
    27  	taskpb "go.chromium.org/luci/bisection/task/proto"
    28  	"go.chromium.org/luci/bisection/util/datastoreutil"
    29  	"go.chromium.org/luci/bisection/util/loggingutil"
    30  
    31  	bbpb "go.chromium.org/luci/buildbucket/proto"
    32  	"go.chromium.org/luci/common/clock"
    33  	"go.chromium.org/luci/common/errors"
    34  	"go.chromium.org/luci/common/logging"
    35  	"go.chromium.org/luci/gae/service/datastore"
    36  	"go.chromium.org/luci/server/tq"
    37  
    38  	"google.golang.org/grpc/codes"
    39  	"google.golang.org/grpc/status"
    40  )
    41  
// BotUpdatesServer implements the LUCI Bisection proto service for BotUpdates.
//
// The type is an empty struct: it holds no fields, and its RPC methods
// (UpdateAnalysisProgress and UpdateTestAnalysisProgress) take everything they
// need from the context and request passed to them.
type BotUpdatesServer struct{}
    44  
    45  // UpdateAnalysisProgress is an RPC endpoints used by the recipes to update
    46  // analysis progress.
    47  func (server *BotUpdatesServer) UpdateAnalysisProgress(c context.Context, req *pb.UpdateAnalysisProgressRequest) (*pb.UpdateAnalysisProgressResponse, error) {
    48  	err := verifyUpdateAnalysisProgressRequest(c, req)
    49  	if err != nil {
    50  		return nil, status.Errorf(codes.InvalidArgument, "Invalid request: %s", err)
    51  	}
    52  	c = loggingutil.SetAnalysisID(c, req.AnalysisId)
    53  	c = loggingutil.SetRerunBBID(c, req.Bbid)
    54  
    55  	logging.Infof(c, "Update analysis with rerun_build_id = %d analysis_id = %d gitiles_commit=%v ", req.Bbid, req.AnalysisId, req.GitilesCommit)
    56  
    57  	cfa, err := datastoreutil.GetCompileFailureAnalysis(c, req.AnalysisId)
    58  	if err != nil {
    59  		err = errors.Annotate(err, "failed GetCompileFailureAnalysis ID: %d", req.AnalysisId).Err()
    60  		errors.Log(c, err)
    61  		return nil, status.Errorf(codes.Internal, "error GetCompileFailureAnalysis")
    62  	}
    63  	if cfa.CompileFailure != nil && cfa.CompileFailure.Parent() != nil {
    64  		c = loggingutil.SetAnalyzedBBID(c, cfa.CompileFailure.Parent().IntID())
    65  	}
    66  
    67  	// Get rerun model
    68  	rerunModel := &model.CompileRerunBuild{
    69  		Id: req.Bbid,
    70  	}
    71  	switch err := datastore.Get(c, rerunModel); {
    72  	case err == datastore.ErrNoSuchEntity:
    73  		return nil, status.Errorf(codes.NotFound, "could not find rerun build with id %d", req.Bbid)
    74  	case err != nil:
    75  		return nil, status.Errorf(codes.Internal, "error finding rerun build")
    76  	default:
    77  		//continue
    78  	}
    79  
    80  	lastRerun, err := datastoreutil.GetLastRerunForRerunBuild(c, rerunModel)
    81  	if err != nil {
    82  		err = errors.Annotate(err, "failed getting last rerun for build %d. Analysis ID: %d", rerunModel.Id, req.AnalysisId).Err()
    83  		errors.Log(c, err)
    84  		return nil, status.Errorf(codes.Internal, "error getting last rerun build")
    85  	}
    86  
    87  	// Update rerun model
    88  	err = updateRerun(c, req, lastRerun)
    89  	if err != nil {
    90  		err = errors.Annotate(err, "failed updating rerun for build %d. Analysis ID: %d", rerunModel.Id, req.AnalysisId).Err()
    91  		errors.Log(c, err)
    92  		return nil, status.Errorf(codes.Internal, "error updating rerun build")
    93  	}
    94  
    95  	// Safeguard, we really don't expect any other type
    96  	if lastRerun.Type != model.RerunBuildType_CulpritVerification && lastRerun.Type != model.RerunBuildType_NthSection {
    97  		logging.Errorf(c, "Invalid type %v for analysis %d", lastRerun.Type, req.AnalysisId)
    98  		return nil, status.Errorf(codes.Internal, "Invalid type %v", lastRerun.Type)
    99  	}
   100  
   101  	// Culprit verification
   102  	if lastRerun.Type == model.RerunBuildType_CulpritVerification {
   103  		err := updateSuspectWithRerunData(c, lastRerun)
   104  		if err != nil {
   105  			err = errors.Annotate(err, "updateSuspectWithRerunData for build id %d. Analysis ID: %d", rerunModel.Id, req.AnalysisId).Err()
   106  			errors.Log(c, err)
   107  			return nil, status.Errorf(codes.Internal, "error updating suspect")
   108  		}
   109  
   110  		// Update analysis status
   111  		err = statusupdater.UpdateAnalysisStatus(c, cfa)
   112  		if err != nil {
   113  			err = errors.Annotate(err, "statusupdater.UpdateAnalysisStatus. Analysis ID: %d", req.AnalysisId).Err()
   114  			errors.Log(c, err)
   115  			return nil, status.Errorf(codes.Internal, "error UpdateAnalysisStatus")
   116  		}
   117  
   118  		// TODO (nqmtuan): It is possible that we schedule an nth-section run right after
   119  		// a culprit verification run within the same build. We will do this later, for
   120  		// safety, after we verify nth-section analysis is running fine.
   121  		return &pb.UpdateAnalysisProgressResponse{}, nil
   122  	}
   123  
   124  	// Nth section
   125  	if lastRerun.Type == model.RerunBuildType_NthSection {
   126  		nsa, err := processNthSectionUpdate(c, req)
   127  		if err != nil {
   128  			err = errors.Annotate(err, "processNthSectionUpdate. Analysis ID: %d", req.AnalysisId).Err()
   129  			logging.Errorf(c, err.Error())
   130  
   131  			// If there is an error, then nthsection analysis may ended
   132  			// if there is no unfinised nthsection runs
   133  			e := setNthSectionError(c, nsa)
   134  			if e != nil {
   135  				e = errors.Annotate(e, "setNthSectionError. Analysis ID: %d", req.AnalysisId).Err()
   136  				logging.Errorf(c, e.Error())
   137  			}
   138  
   139  			// Also the main analysis status may need to change as well
   140  			e = statusupdater.UpdateAnalysisStatus(c, cfa)
   141  			if e != nil {
   142  				e = errors.Annotate(e, "UpdateAnalysisStatus. Analysis ID: %d", req.AnalysisId).Err()
   143  				logging.Errorf(c, e.Error())
   144  			}
   145  			return nil, status.Errorf(codes.Internal, err.Error())
   146  		}
   147  
   148  		// Update analysis status
   149  		err = statusupdater.UpdateAnalysisStatus(c, cfa)
   150  		if err != nil {
   151  			err = errors.Annotate(err, "statusupdater.UpdateAnalysisStatus. Analysis ID: %d", req.AnalysisId).Err()
   152  			errors.Log(c, err)
   153  			return nil, status.Errorf(codes.Internal, "error UpdateAnalysisStatus")
   154  		}
   155  
   156  		return &pb.UpdateAnalysisProgressResponse{}, nil
   157  	}
   158  
   159  	return nil, status.Errorf(codes.Internal, "unknown error")
   160  }
   161  
   162  func (server *BotUpdatesServer) UpdateTestAnalysisProgress(ctx context.Context, req *pb.UpdateTestAnalysisProgressRequest) (*pb.UpdateTestAnalysisProgressResponse, error) {
   163  	err := updatetestrerun.Update(ctx, req)
   164  	if err != nil {
   165  		return nil, err
   166  	}
   167  	return &pb.UpdateTestAnalysisProgressResponse{}, nil
   168  }
   169  
   170  func setNthSectionError(c context.Context, nsa *model.CompileNthSectionAnalysis) error {
   171  	if nsa == nil {
   172  		return nil
   173  	}
   174  	reruns, err := datastoreutil.GetRerunsForNthSectionAnalysis(c, nsa)
   175  	if err != nil {
   176  		return errors.Annotate(err, "GetRerunsForNthSectionAnalysis").Err()
   177  	}
   178  
   179  	for _, rerun := range reruns {
   180  		// There are some rerun running, so do not mark this as error yet
   181  		if rerun.Status == pb.RerunStatus_RERUN_STATUS_IN_PROGRESS {
   182  			return nil
   183  		}
   184  	}
   185  
   186  	return datastore.RunInTransaction(c, func(c context.Context) error {
   187  		e := datastore.Get(c, nsa)
   188  		if e != nil {
   189  			return e
   190  		}
   191  		nsa.Status = pb.AnalysisStatus_ERROR
   192  		nsa.RunStatus = pb.AnalysisRunStatus_ENDED
   193  		nsa.EndTime = clock.Now(c)
   194  		return datastore.Put(c, nsa)
   195  	}, nil)
   196  }
   197  
   198  // processNthSectionUpdate processes the bot update for nthsection analysis run
   199  // It will schedule the next run for nthsection analysis targeting the same bot
   200  func processNthSectionUpdate(c context.Context, req *pb.UpdateAnalysisProgressRequest) (*model.CompileNthSectionAnalysis, error) {
   201  	cfa, err := datastoreutil.GetCompileFailureAnalysis(c, req.AnalysisId)
   202  	if err != nil {
   203  		return nil, err
   204  	}
   205  
   206  	// We should not schedule any more run for this analysis
   207  	if cfa.ShouldCancel {
   208  		return nil, nil
   209  	}
   210  
   211  	nsa, err := datastoreutil.GetNthSectionAnalysis(c, cfa)
   212  	if err != nil {
   213  		return nil, err
   214  	}
   215  
   216  	// There is no nthsection analysis for this analysis
   217  	if nsa == nil {
   218  		return nil, nil
   219  	}
   220  
   221  	snapshot, err := nthsection.CreateSnapshot(c, nsa)
   222  	if err != nil {
   223  		return nsa, errors.Annotate(err, "couldn't create snapshot").Err()
   224  	}
   225  
   226  	// Check if we already found the culprit or not
   227  	ok, cul := snapshot.GetCulprit()
   228  
   229  	// Found culprit -> Update the nthsection analysis
   230  	if ok {
   231  		err := nthsection.SaveSuspectAndTriggerCulpritVerification(c, nsa, cfa, snapshot.BlameList.Commits[cul])
   232  		if err != nil {
   233  			return nsa, errors.Annotate(err, "save suspect and trigger culprit verification").Err()
   234  		}
   235  		return nsa, nil
   236  	}
   237  
   238  	shouldRunNthSection, err := nthsection.ShouldRunNthSectionAnalysis(c, cfa)
   239  	if err != nil {
   240  		return nsa, errors.Annotate(err, "couldn't fetch config for nthsection").Err()
   241  	}
   242  	if !shouldRunNthSection {
   243  		return nsa, nil
   244  	}
   245  
   246  	commit, err := snapshot.FindNextSingleCommitToRun()
   247  	var badRangeError *nthsectionsnapshot.BadRangeError
   248  	if err != nil {
   249  		if !errors.As(err, &badRangeError) {
   250  			return nsa, errors.Annotate(err, "find next single commit to run").Err()
   251  		}
   252  		// BadRangeError suggests the regression range is invalid.
   253  		// This is not really an error, but more of a indication of no suspect can be found
   254  		// in this regression range.
   255  		logging.Warningf(c, "find next single commit to run %s", err.Error())
   256  	}
   257  	if commit == "" || errors.As(err, &badRangeError) {
   258  		// We don't have more run to wait -> we've failed to find the suspect
   259  		if snapshot.NumInProgress == 0 {
   260  			return nsa, updateNthSectionModelNotFound(c, nsa)
   261  		}
   262  		return nsa, nil
   263  	}
   264  
   265  	// We got the next commit to run. We will schedule a rerun targetting the same bot
   266  	gitilesCommit := &bbpb.GitilesCommit{
   267  		Host:    req.GitilesCommit.Host,
   268  		Project: req.GitilesCommit.Project,
   269  		Ref:     req.GitilesCommit.Ref,
   270  		Id:      commit,
   271  	}
   272  	dims := map[string]string{
   273  		"id": req.BotId,
   274  	}
   275  	err = nthsection.RerunCommit(c, nsa, gitilesCommit, cfa.FirstFailedBuildId, dims)
   276  	if err != nil {
   277  		return nsa, errors.Annotate(err, "rerun commit for %s", commit).Err()
   278  	}
   279  	return nsa, nil
   280  }
   281  
   282  func updateNthSectionModelNotFound(c context.Context, nsa *model.CompileNthSectionAnalysis) error {
   283  	err := datastore.RunInTransaction(c, func(c context.Context) error {
   284  		e := datastore.Get(c, nsa)
   285  		if e != nil {
   286  			return e
   287  		}
   288  		nsa.EndTime = clock.Now(c)
   289  		nsa.Status = pb.AnalysisStatus_NOTFOUND
   290  		nsa.RunStatus = pb.AnalysisRunStatus_ENDED
   291  		return datastore.Put(c, nsa)
   292  	}, nil)
   293  	if err != nil {
   294  		return errors.Annotate(err, "failed updating nthsectionModel").Err()
   295  	}
   296  	return nil
   297  }
   298  
   299  func updateSuspectWithRerunData(c context.Context, rerun *model.SingleRerun) error {
   300  	// Get the suspect for the rerun build
   301  	if rerun.Suspect == nil {
   302  		return fmt.Errorf("no suspect for rerun %d", rerun.Id)
   303  	}
   304  
   305  	suspect := &model.Suspect{
   306  		Id:             rerun.Suspect.IntID(),
   307  		ParentAnalysis: rerun.Suspect.Parent(),
   308  	}
   309  	err := datastore.Get(c, suspect)
   310  	if err != nil {
   311  		return errors.Annotate(err, "couldn't find suspect for rerun %d", rerun.Id).Err()
   312  	}
   313  
   314  	err = updateSuspect(c, suspect)
   315  	if err != nil {
   316  		return errors.Annotate(err, "error updating suspect for rerun %d", rerun.Id).Err()
   317  	}
   318  
   319  	if suspect.VerificationStatus == model.SuspectVerificationStatus_ConfirmedCulprit {
   320  		err = updateSuspectAsConfirmedCulprit(c, suspect)
   321  		if err != nil {
   322  			return errors.Annotate(err, "error updateSuspectAsConfirmedCulprit for rerun %d", rerun.Id).Err()
   323  		}
   324  
   325  		// Cancel all remaining runs
   326  		analysisID := suspect.ParentAnalysis.Parent().IntID()
   327  		err = tq.AddTask(c, &tq.Task{
   328  			Title: fmt.Sprintf("cancel_analysis_%d", analysisID),
   329  			Payload: &taskpb.CancelAnalysisTask{
   330  				AnalysisId: analysisID,
   331  			},
   332  		})
   333  		if err != nil {
   334  			// Non-critical, just log the error
   335  			err := errors.Annotate(err, "schedule canceling analysis %d", analysisID).Err()
   336  			logging.Errorf(c, err.Error())
   337  		}
   338  
   339  		// Add task to revert the heuristic confirmed culprit
   340  		// TODO(@beining): Schedule this task when suspect is VerificationError too.
   341  		// According to go/luci-bisection-integrating-gerrit,
   342  		// we want to also perform gerrit action when suspect is VerificationError.
   343  		err = tq.AddTask(c, &tq.Task{
   344  			Title: fmt.Sprintf("revert_culprit_%d_%d", suspect.Id, analysisID),
   345  			Payload: &taskpb.RevertCulpritTask{
   346  				AnalysisId: analysisID,
   347  				CulpritId:  suspect.Id,
   348  			},
   349  		})
   350  		if err != nil {
   351  			return errors.Annotate(err,
   352  				"error creating task in task queue to revert culprit (analysis ID=%d, suspect ID=%d)",
   353  				analysisID, suspect.Id).Err()
   354  		}
   355  	}
   356  	return nil
   357  }
   358  
   359  func verifyUpdateAnalysisProgressRequest(c context.Context, req *pb.UpdateAnalysisProgressRequest) error {
   360  	if req.AnalysisId == 0 {
   361  		return fmt.Errorf("analysis_id is required")
   362  	}
   363  	if req.Bbid == 0 {
   364  		return fmt.Errorf("build bucket id is required")
   365  	}
   366  	if req.GitilesCommit == nil {
   367  		return fmt.Errorf("gitiles commit is required")
   368  	}
   369  	if req.RerunResult == nil {
   370  		return fmt.Errorf("rerun result is required")
   371  	}
   372  	if req.BotId == "" {
   373  		return fmt.Errorf("bot_id is required")
   374  	}
   375  	return nil
   376  }
   377  
   378  // updateSuspect looks at rerun and set the suspect status
   379  func updateSuspect(c context.Context, suspect *model.Suspect) error {
   380  	rerunStatus, err := getSingleRerunStatus(c, suspect.SuspectRerunBuild.IntID())
   381  	if err != nil {
   382  		return err
   383  	}
   384  	parentRerunStatus, err := getSingleRerunStatus(c, suspect.ParentRerunBuild.IntID())
   385  	if err != nil {
   386  		return err
   387  	}
   388  
   389  	// Update suspect based on rerunStatus and parentRerunStatus
   390  	suspectStatus := model.SuspectStatus(rerunStatus, parentRerunStatus)
   391  
   392  	return datastore.RunInTransaction(c, func(ctx context.Context) error {
   393  		e := datastore.Get(c, suspect)
   394  		if e != nil {
   395  			return e
   396  		}
   397  		suspect.VerificationStatus = suspectStatus
   398  		return datastore.Put(c, suspect)
   399  	}, nil)
   400  }
   401  
   402  // updateSuspectAsConfirmedCulprit update the suspect as the confirmed culprit of analysis
   403  func updateSuspectAsConfirmedCulprit(c context.Context, suspect *model.Suspect) error {
   404  	analysisKey := suspect.ParentAnalysis.Parent()
   405  	analysis := &model.CompileFailureAnalysis{
   406  		Id: analysisKey.IntID(),
   407  	}
   408  	err := datastore.Get(c, analysis)
   409  	if err != nil {
   410  		return err
   411  	}
   412  	verifiedCulprits := analysis.VerifiedCulprits
   413  	verifiedCulprits = append(verifiedCulprits, datastore.KeyForObj(c, suspect))
   414  	if len(verifiedCulprits) > 1 {
   415  		// Just log the warning here, as it is a rare case
   416  		logging.Warningf(c, "found more than 2 suspects for analysis %d", analysis.Id)
   417  	}
   418  
   419  	err = datastore.RunInTransaction(c, func(ctx context.Context) error {
   420  		e := datastore.Get(c, analysis)
   421  		if e != nil {
   422  			return e
   423  		}
   424  		analysis.VerifiedCulprits = verifiedCulprits
   425  		return datastore.Put(c, analysis)
   426  	}, nil)
   427  	if err != nil {
   428  		return err
   429  	}
   430  	return statusupdater.UpdateAnalysisStatus(c, analysis)
   431  }
   432  
   433  // updateRerun updates the last SingleRerun for rerunModel with the information from req.
   434  // Returns the last SingleRerun and error (if it occur).
   435  func updateRerun(c context.Context, req *pb.UpdateAnalysisProgressRequest, rerun *model.SingleRerun) error {
   436  	// Verify the gitiles commit, making sure it was the right rerun we are updating
   437  	if !sameGitilesCommit(req.GitilesCommit, &rerun.GitilesCommit) {
   438  		logging.Errorf(c, "Got different Gitles commit for rerun build %d", req.Bbid)
   439  		return fmt.Errorf("different gitiles commit for rerun")
   440  	}
   441  
   442  	err := datastore.RunInTransaction(c, func(ctx context.Context) error {
   443  		e := datastore.Get(c, rerun)
   444  		if e != nil {
   445  			return e
   446  		}
   447  		rerun.EndTime = clock.Now(c)
   448  		rerun.Status = req.RerunResult.RerunStatus
   449  		return datastore.Put(c, rerun)
   450  	}, nil)
   451  
   452  	if err != nil {
   453  		logging.Errorf(c, "Error updating SingleRerun for build %d: %s", req.Bbid, rerun)
   454  		return errors.Annotate(err, "saving SingleRerun").Err()
   455  	}
   456  	return nil
   457  }
   458  
   459  func getSingleRerunStatus(c context.Context, rerunId int64) (pb.RerunStatus, error) {
   460  	rerunBuild := &model.CompileRerunBuild{
   461  		Id: rerunId,
   462  	}
   463  	err := datastore.Get(c, rerunBuild)
   464  	if err != nil {
   465  		return pb.RerunStatus_RERUN_STATUS_UNSPECIFIED, err
   466  	}
   467  
   468  	// Get SingleRerun
   469  	singleRerun, err := datastoreutil.GetLastRerunForRerunBuild(c, rerunBuild)
   470  	if err != nil {
   471  		return pb.RerunStatus_RERUN_STATUS_UNSPECIFIED, err
   472  	}
   473  
   474  	return singleRerun.Status, nil
   475  }
   476  
   477  func sameGitilesCommit(g1 *bbpb.GitilesCommit, g2 *bbpb.GitilesCommit) bool {
   478  	return g1.Host == g2.Host && g1.Project == g2.Project && g1.Id == g2.Id && g1.Ref == g2.Ref
   479  }