go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/bisection/server/bot_updates.go (about) 1 // Copyright 2023 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package server 16 17 import ( 18 "context" 19 "fmt" 20 21 "go.chromium.org/luci/bisection/compilefailureanalysis/nthsection" 22 "go.chromium.org/luci/bisection/compilefailureanalysis/statusupdater" 23 "go.chromium.org/luci/bisection/model" 24 "go.chromium.org/luci/bisection/nthsectionsnapshot" 25 pb "go.chromium.org/luci/bisection/proto/v1" 26 "go.chromium.org/luci/bisection/server/updatetestrerun" 27 taskpb "go.chromium.org/luci/bisection/task/proto" 28 "go.chromium.org/luci/bisection/util/datastoreutil" 29 "go.chromium.org/luci/bisection/util/loggingutil" 30 31 bbpb "go.chromium.org/luci/buildbucket/proto" 32 "go.chromium.org/luci/common/clock" 33 "go.chromium.org/luci/common/errors" 34 "go.chromium.org/luci/common/logging" 35 "go.chromium.org/luci/gae/service/datastore" 36 "go.chromium.org/luci/server/tq" 37 38 "google.golang.org/grpc/codes" 39 "google.golang.org/grpc/status" 40 ) 41 42 // BotUpdatesServer implements the LUCI Bisection proto service for BotUpdates. 43 type BotUpdatesServer struct{} 44 45 // UpdateAnalysisProgress is an RPC endpoint used by the recipes to update 46 // analysis progress. 
47 func (server *BotUpdatesServer) UpdateAnalysisProgress(c context.Context, req *pb.UpdateAnalysisProgressRequest) (*pb.UpdateAnalysisProgressResponse, error) { 48 err := verifyUpdateAnalysisProgressRequest(c, req) 49 if err != nil { 50 return nil, status.Errorf(codes.InvalidArgument, "Invalid request: %s", err) 51 } 52 c = loggingutil.SetAnalysisID(c, req.AnalysisId) 53 c = loggingutil.SetRerunBBID(c, req.Bbid) 54 55 logging.Infof(c, "Update analysis with rerun_build_id = %d analysis_id = %d gitiles_commit=%v ", req.Bbid, req.AnalysisId, req.GitilesCommit) 56 57 cfa, err := datastoreutil.GetCompileFailureAnalysis(c, req.AnalysisId) 58 if err != nil { 59 err = errors.Annotate(err, "failed GetCompileFailureAnalysis ID: %d", req.AnalysisId).Err() 60 errors.Log(c, err) 61 return nil, status.Errorf(codes.Internal, "error GetCompileFailureAnalysis") 62 } 63 if cfa.CompileFailure != nil && cfa.CompileFailure.Parent() != nil { 64 c = loggingutil.SetAnalyzedBBID(c, cfa.CompileFailure.Parent().IntID()) 65 } 66 67 // Get rerun model 68 rerunModel := &model.CompileRerunBuild{ 69 Id: req.Bbid, 70 } 71 switch err := datastore.Get(c, rerunModel); { 72 case err == datastore.ErrNoSuchEntity: 73 return nil, status.Errorf(codes.NotFound, "could not find rerun build with id %d", req.Bbid) 74 case err != nil: 75 return nil, status.Errorf(codes.Internal, "error finding rerun build") 76 default: 77 //continue 78 } 79 80 lastRerun, err := datastoreutil.GetLastRerunForRerunBuild(c, rerunModel) 81 if err != nil { 82 err = errors.Annotate(err, "failed getting last rerun for build %d. Analysis ID: %d", rerunModel.Id, req.AnalysisId).Err() 83 errors.Log(c, err) 84 return nil, status.Errorf(codes.Internal, "error getting last rerun build") 85 } 86 87 // Update rerun model 88 err = updateRerun(c, req, lastRerun) 89 if err != nil { 90 err = errors.Annotate(err, "failed updating rerun for build %d. 
Analysis ID: %d", rerunModel.Id, req.AnalysisId).Err() 91 errors.Log(c, err) 92 return nil, status.Errorf(codes.Internal, "error updating rerun build") 93 } 94 95 // Safeguard, we really don't expect any other type 96 if lastRerun.Type != model.RerunBuildType_CulpritVerification && lastRerun.Type != model.RerunBuildType_NthSection { 97 logging.Errorf(c, "Invalid type %v for analysis %d", lastRerun.Type, req.AnalysisId) 98 return nil, status.Errorf(codes.Internal, "Invalid type %v", lastRerun.Type) 99 } 100 101 // Culprit verification 102 if lastRerun.Type == model.RerunBuildType_CulpritVerification { 103 err := updateSuspectWithRerunData(c, lastRerun) 104 if err != nil { 105 err = errors.Annotate(err, "updateSuspectWithRerunData for build id %d. Analysis ID: %d", rerunModel.Id, req.AnalysisId).Err() 106 errors.Log(c, err) 107 return nil, status.Errorf(codes.Internal, "error updating suspect") 108 } 109 110 // Update analysis status 111 err = statusupdater.UpdateAnalysisStatus(c, cfa) 112 if err != nil { 113 err = errors.Annotate(err, "statusupdater.UpdateAnalysisStatus. Analysis ID: %d", req.AnalysisId).Err() 114 errors.Log(c, err) 115 return nil, status.Errorf(codes.Internal, "error UpdateAnalysisStatus") 116 } 117 118 // TODO (nqmtuan): It is possible that we schedule an nth-section run right after 119 // a culprit verification run within the same build. We will do this later, for 120 // safety, after we verify nth-section analysis is running fine. 121 return &pb.UpdateAnalysisProgressResponse{}, nil 122 } 123 124 // Nth section 125 if lastRerun.Type == model.RerunBuildType_NthSection { 126 nsa, err := processNthSectionUpdate(c, req) 127 if err != nil { 128 err = errors.Annotate(err, "processNthSectionUpdate. 
Analysis ID: %d", req.AnalysisId).Err() 129 logging.Errorf(c, err.Error()) 130 131 // If there is an error, then nthsection analysis may ended 132 // if there is no unfinised nthsection runs 133 e := setNthSectionError(c, nsa) 134 if e != nil { 135 e = errors.Annotate(e, "setNthSectionError. Analysis ID: %d", req.AnalysisId).Err() 136 logging.Errorf(c, e.Error()) 137 } 138 139 // Also the main analysis status may need to change as well 140 e = statusupdater.UpdateAnalysisStatus(c, cfa) 141 if e != nil { 142 e = errors.Annotate(e, "UpdateAnalysisStatus. Analysis ID: %d", req.AnalysisId).Err() 143 logging.Errorf(c, e.Error()) 144 } 145 return nil, status.Errorf(codes.Internal, err.Error()) 146 } 147 148 // Update analysis status 149 err = statusupdater.UpdateAnalysisStatus(c, cfa) 150 if err != nil { 151 err = errors.Annotate(err, "statusupdater.UpdateAnalysisStatus. Analysis ID: %d", req.AnalysisId).Err() 152 errors.Log(c, err) 153 return nil, status.Errorf(codes.Internal, "error UpdateAnalysisStatus") 154 } 155 156 return &pb.UpdateAnalysisProgressResponse{}, nil 157 } 158 159 return nil, status.Errorf(codes.Internal, "unknown error") 160 } 161 162 func (server *BotUpdatesServer) UpdateTestAnalysisProgress(ctx context.Context, req *pb.UpdateTestAnalysisProgressRequest) (*pb.UpdateTestAnalysisProgressResponse, error) { 163 err := updatetestrerun.Update(ctx, req) 164 if err != nil { 165 return nil, err 166 } 167 return &pb.UpdateTestAnalysisProgressResponse{}, nil 168 } 169 170 func setNthSectionError(c context.Context, nsa *model.CompileNthSectionAnalysis) error { 171 if nsa == nil { 172 return nil 173 } 174 reruns, err := datastoreutil.GetRerunsForNthSectionAnalysis(c, nsa) 175 if err != nil { 176 return errors.Annotate(err, "GetRerunsForNthSectionAnalysis").Err() 177 } 178 179 for _, rerun := range reruns { 180 // There are some rerun running, so do not mark this as error yet 181 if rerun.Status == pb.RerunStatus_RERUN_STATUS_IN_PROGRESS { 182 return nil 183 } 184 
} 185 186 return datastore.RunInTransaction(c, func(c context.Context) error { 187 e := datastore.Get(c, nsa) 188 if e != nil { 189 return e 190 } 191 nsa.Status = pb.AnalysisStatus_ERROR 192 nsa.RunStatus = pb.AnalysisRunStatus_ENDED 193 nsa.EndTime = clock.Now(c) 194 return datastore.Put(c, nsa) 195 }, nil) 196 } 197 198 // processNthSectionUpdate processes the bot update for nthsection analysis run 199 // It will schedule the next run for nthsection analysis targeting the same bot 200 func processNthSectionUpdate(c context.Context, req *pb.UpdateAnalysisProgressRequest) (*model.CompileNthSectionAnalysis, error) { 201 cfa, err := datastoreutil.GetCompileFailureAnalysis(c, req.AnalysisId) 202 if err != nil { 203 return nil, err 204 } 205 206 // We should not schedule any more run for this analysis 207 if cfa.ShouldCancel { 208 return nil, nil 209 } 210 211 nsa, err := datastoreutil.GetNthSectionAnalysis(c, cfa) 212 if err != nil { 213 return nil, err 214 } 215 216 // There is no nthsection analysis for this analysis 217 if nsa == nil { 218 return nil, nil 219 } 220 221 snapshot, err := nthsection.CreateSnapshot(c, nsa) 222 if err != nil { 223 return nsa, errors.Annotate(err, "couldn't create snapshot").Err() 224 } 225 226 // Check if we already found the culprit or not 227 ok, cul := snapshot.GetCulprit() 228 229 // Found culprit -> Update the nthsection analysis 230 if ok { 231 err := nthsection.SaveSuspectAndTriggerCulpritVerification(c, nsa, cfa, snapshot.BlameList.Commits[cul]) 232 if err != nil { 233 return nsa, errors.Annotate(err, "save suspect and trigger culprit verification").Err() 234 } 235 return nsa, nil 236 } 237 238 shouldRunNthSection, err := nthsection.ShouldRunNthSectionAnalysis(c, cfa) 239 if err != nil { 240 return nsa, errors.Annotate(err, "couldn't fetch config for nthsection").Err() 241 } 242 if !shouldRunNthSection { 243 return nsa, nil 244 } 245 246 commit, err := snapshot.FindNextSingleCommitToRun() 247 var badRangeError 
*nthsectionsnapshot.BadRangeError 248 if err != nil { 249 if !errors.As(err, &badRangeError) { 250 return nsa, errors.Annotate(err, "find next single commit to run").Err() 251 } 252 // BadRangeError suggests the regression range is invalid. 253 // This is not really an error, but more of a indication of no suspect can be found 254 // in this regression range. 255 logging.Warningf(c, "find next single commit to run %s", err.Error()) 256 } 257 if commit == "" || errors.As(err, &badRangeError) { 258 // We don't have more run to wait -> we've failed to find the suspect 259 if snapshot.NumInProgress == 0 { 260 return nsa, updateNthSectionModelNotFound(c, nsa) 261 } 262 return nsa, nil 263 } 264 265 // We got the next commit to run. We will schedule a rerun targetting the same bot 266 gitilesCommit := &bbpb.GitilesCommit{ 267 Host: req.GitilesCommit.Host, 268 Project: req.GitilesCommit.Project, 269 Ref: req.GitilesCommit.Ref, 270 Id: commit, 271 } 272 dims := map[string]string{ 273 "id": req.BotId, 274 } 275 err = nthsection.RerunCommit(c, nsa, gitilesCommit, cfa.FirstFailedBuildId, dims) 276 if err != nil { 277 return nsa, errors.Annotate(err, "rerun commit for %s", commit).Err() 278 } 279 return nsa, nil 280 } 281 282 func updateNthSectionModelNotFound(c context.Context, nsa *model.CompileNthSectionAnalysis) error { 283 err := datastore.RunInTransaction(c, func(c context.Context) error { 284 e := datastore.Get(c, nsa) 285 if e != nil { 286 return e 287 } 288 nsa.EndTime = clock.Now(c) 289 nsa.Status = pb.AnalysisStatus_NOTFOUND 290 nsa.RunStatus = pb.AnalysisRunStatus_ENDED 291 return datastore.Put(c, nsa) 292 }, nil) 293 if err != nil { 294 return errors.Annotate(err, "failed updating nthsectionModel").Err() 295 } 296 return nil 297 } 298 299 func updateSuspectWithRerunData(c context.Context, rerun *model.SingleRerun) error { 300 // Get the suspect for the rerun build 301 if rerun.Suspect == nil { 302 return fmt.Errorf("no suspect for rerun %d", rerun.Id) 303 } 304 
305 suspect := &model.Suspect{ 306 Id: rerun.Suspect.IntID(), 307 ParentAnalysis: rerun.Suspect.Parent(), 308 } 309 err := datastore.Get(c, suspect) 310 if err != nil { 311 return errors.Annotate(err, "couldn't find suspect for rerun %d", rerun.Id).Err() 312 } 313 314 err = updateSuspect(c, suspect) 315 if err != nil { 316 return errors.Annotate(err, "error updating suspect for rerun %d", rerun.Id).Err() 317 } 318 319 if suspect.VerificationStatus == model.SuspectVerificationStatus_ConfirmedCulprit { 320 err = updateSuspectAsConfirmedCulprit(c, suspect) 321 if err != nil { 322 return errors.Annotate(err, "error updateSuspectAsConfirmedCulprit for rerun %d", rerun.Id).Err() 323 } 324 325 // Cancel all remaining runs 326 analysisID := suspect.ParentAnalysis.Parent().IntID() 327 err = tq.AddTask(c, &tq.Task{ 328 Title: fmt.Sprintf("cancel_analysis_%d", analysisID), 329 Payload: &taskpb.CancelAnalysisTask{ 330 AnalysisId: analysisID, 331 }, 332 }) 333 if err != nil { 334 // Non-critical, just log the error 335 err := errors.Annotate(err, "schedule canceling analysis %d", analysisID).Err() 336 logging.Errorf(c, err.Error()) 337 } 338 339 // Add task to revert the heuristic confirmed culprit 340 // TODO(@beining): Schedule this task when suspect is VerificationError too. 341 // According to go/luci-bisection-integrating-gerrit, 342 // we want to also perform gerrit action when suspect is VerificationError. 
343 err = tq.AddTask(c, &tq.Task{ 344 Title: fmt.Sprintf("revert_culprit_%d_%d", suspect.Id, analysisID), 345 Payload: &taskpb.RevertCulpritTask{ 346 AnalysisId: analysisID, 347 CulpritId: suspect.Id, 348 }, 349 }) 350 if err != nil { 351 return errors.Annotate(err, 352 "error creating task in task queue to revert culprit (analysis ID=%d, suspect ID=%d)", 353 analysisID, suspect.Id).Err() 354 } 355 } 356 return nil 357 } 358 359 func verifyUpdateAnalysisProgressRequest(c context.Context, req *pb.UpdateAnalysisProgressRequest) error { 360 if req.AnalysisId == 0 { 361 return fmt.Errorf("analysis_id is required") 362 } 363 if req.Bbid == 0 { 364 return fmt.Errorf("build bucket id is required") 365 } 366 if req.GitilesCommit == nil { 367 return fmt.Errorf("gitiles commit is required") 368 } 369 if req.RerunResult == nil { 370 return fmt.Errorf("rerun result is required") 371 } 372 if req.BotId == "" { 373 return fmt.Errorf("bot_id is required") 374 } 375 return nil 376 } 377 378 // updateSuspect looks at rerun and set the suspect status 379 func updateSuspect(c context.Context, suspect *model.Suspect) error { 380 rerunStatus, err := getSingleRerunStatus(c, suspect.SuspectRerunBuild.IntID()) 381 if err != nil { 382 return err 383 } 384 parentRerunStatus, err := getSingleRerunStatus(c, suspect.ParentRerunBuild.IntID()) 385 if err != nil { 386 return err 387 } 388 389 // Update suspect based on rerunStatus and parentRerunStatus 390 suspectStatus := model.SuspectStatus(rerunStatus, parentRerunStatus) 391 392 return datastore.RunInTransaction(c, func(ctx context.Context) error { 393 e := datastore.Get(c, suspect) 394 if e != nil { 395 return e 396 } 397 suspect.VerificationStatus = suspectStatus 398 return datastore.Put(c, suspect) 399 }, nil) 400 } 401 402 // updateSuspectAsConfirmedCulprit update the suspect as the confirmed culprit of analysis 403 func updateSuspectAsConfirmedCulprit(c context.Context, suspect *model.Suspect) error { 404 analysisKey := 
suspect.ParentAnalysis.Parent() 405 analysis := &model.CompileFailureAnalysis{ 406 Id: analysisKey.IntID(), 407 } 408 err := datastore.Get(c, analysis) 409 if err != nil { 410 return err 411 } 412 verifiedCulprits := analysis.VerifiedCulprits 413 verifiedCulprits = append(verifiedCulprits, datastore.KeyForObj(c, suspect)) 414 if len(verifiedCulprits) > 1 { 415 // Just log the warning here, as it is a rare case 416 logging.Warningf(c, "found more than 2 suspects for analysis %d", analysis.Id) 417 } 418 419 err = datastore.RunInTransaction(c, func(ctx context.Context) error { 420 e := datastore.Get(c, analysis) 421 if e != nil { 422 return e 423 } 424 analysis.VerifiedCulprits = verifiedCulprits 425 return datastore.Put(c, analysis) 426 }, nil) 427 if err != nil { 428 return err 429 } 430 return statusupdater.UpdateAnalysisStatus(c, analysis) 431 } 432 433 // updateRerun updates the last SingleRerun for rerunModel with the information from req. 434 // Returns the last SingleRerun and error (if it occur). 
435 func updateRerun(c context.Context, req *pb.UpdateAnalysisProgressRequest, rerun *model.SingleRerun) error { 436 // Verify the gitiles commit, making sure it was the right rerun we are updating 437 if !sameGitilesCommit(req.GitilesCommit, &rerun.GitilesCommit) { 438 logging.Errorf(c, "Got different Gitles commit for rerun build %d", req.Bbid) 439 return fmt.Errorf("different gitiles commit for rerun") 440 } 441 442 err := datastore.RunInTransaction(c, func(ctx context.Context) error { 443 e := datastore.Get(c, rerun) 444 if e != nil { 445 return e 446 } 447 rerun.EndTime = clock.Now(c) 448 rerun.Status = req.RerunResult.RerunStatus 449 return datastore.Put(c, rerun) 450 }, nil) 451 452 if err != nil { 453 logging.Errorf(c, "Error updating SingleRerun for build %d: %s", req.Bbid, rerun) 454 return errors.Annotate(err, "saving SingleRerun").Err() 455 } 456 return nil 457 } 458 459 func getSingleRerunStatus(c context.Context, rerunId int64) (pb.RerunStatus, error) { 460 rerunBuild := &model.CompileRerunBuild{ 461 Id: rerunId, 462 } 463 err := datastore.Get(c, rerunBuild) 464 if err != nil { 465 return pb.RerunStatus_RERUN_STATUS_UNSPECIFIED, err 466 } 467 468 // Get SingleRerun 469 singleRerun, err := datastoreutil.GetLastRerunForRerunBuild(c, rerunBuild) 470 if err != nil { 471 return pb.RerunStatus_RERUN_STATUS_UNSPECIFIED, err 472 } 473 474 return singleRerun.Status, nil 475 } 476 477 func sameGitilesCommit(g1 *bbpb.GitilesCommit, g2 *bbpb.GitilesCommit) bool { 478 return g1.Host == g2.Host && g1.Project == g2.Project && g1.Id == g2.Id && g1.Ref == g2.Ref 479 }