go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/bisection/compilefailuredetection/failure_detection.go (about) 1 // Copyright 2022 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package compilefailuredetection analyses a failed build and determines if it 16 // needs to trigger a new analysis for it. 17 package compilefailuredetection 18 19 import ( 20 "context" 21 "fmt" 22 23 "go.chromium.org/luci/bisection/compilefailureanalysis" 24 "go.chromium.org/luci/bisection/internal/buildbucket" 25 "go.chromium.org/luci/bisection/model" 26 pb "go.chromium.org/luci/bisection/proto/v1" 27 tpb "go.chromium.org/luci/bisection/task/proto" 28 "go.chromium.org/luci/bisection/util" 29 "go.chromium.org/luci/bisection/util/datastoreutil" 30 "go.chromium.org/luci/bisection/util/loggingutil" 31 32 "go.chromium.org/luci/gae/service/datastore" 33 34 buildbucketpb "go.chromium.org/luci/buildbucket/proto" 35 "go.chromium.org/luci/common/errors" 36 "go.chromium.org/luci/common/logging" 37 "go.chromium.org/luci/common/retry/transient" 38 "go.chromium.org/luci/common/tsmon/field" 39 "go.chromium.org/luci/common/tsmon/metric" 40 "go.chromium.org/luci/server/tq" 41 "google.golang.org/protobuf/proto" 42 "google.golang.org/protobuf/types/known/fieldmaskpb" 43 ) 44 45 const ( 46 taskClass = "build-failure-ingestion" 47 queue = "build-failure-ingestion" 48 ) 49 50 var ( 51 analysisCounter = metric.NewCounter( 52 "bisection/compile/analysis/trigger", 53 "The number of Compile Failure Analysis triggered by LUCI Bisection.", 54 nil, 55 // The LUCI Project. 56 field.String("project"), 57 ) 58 ) 59 60 // RegisterTaskClass registers the task class for tq dispatcher. 61 func RegisterTaskClass() { 62 tq.RegisterTaskClass(tq.TaskClass{ 63 ID: taskClass, 64 Prototype: (*tpb.FailedBuildIngestionTask)(nil), 65 Queue: queue, 66 Kind: tq.NonTransactional, 67 Handler: func(c context.Context, payload proto.Message) error { 68 task := payload.(*tpb.FailedBuildIngestionTask) 69 logging.Infof(c, "Processing failed build task with id = %d", task.GetBbid()) 70 _, err := AnalyzeBuild(c, task.GetBbid()) 71 if err != nil { 72 logging.Errorf(c, "Error processing failed build task with id = %d: %s", task.GetBbid(), err) 73 // If the error is transient, return err to retry 74 if transient.Tag.In(err) { 75 return err 76 } 77 return nil 78 } 79 return nil 80 }, 81 }) 82 } 83 84 // AnalyzeBuild analyzes a build and trigger an analysis if necessary. 85 // Returns true if a new analysis is triggered, returns false otherwise. 86 func AnalyzeBuild(c context.Context, bbid int64) (bool, error) { 87 c = loggingutil.SetAnalyzedBBID(c, bbid) 88 logging.Infof(c, "AnalyzeBuild %d", bbid) 89 build, err := buildbucket.GetBuild(c, bbid, &buildbucketpb.BuildMask{ 90 Fields: &fieldmaskpb.FieldMask{ 91 Paths: []string{"id", "builder", "input", "status", "steps", "number", "start_time", "end_time", "create_time", "infra.swarming.task_dimensions", "infra.backend.task_dimensions", "output.gitiles_commit"}, 92 }, 93 }) 94 if err != nil { 95 return false, err 96 } 97 98 if !shouldAnalyzeBuild(c, build) { 99 return false, nil 100 } 101 102 lastPassedBuild, firstFailedBuild, err := getLastPassedFirstFailedBuilds(c, build) 103 104 // Could not find last passed build, skip the analysis. 105 if err != nil { 106 logging.Infof(c, "Could not find last passed/first failed builds for failure of build %d. Exiting...", bbid) 107 return false, nil 108 } 109 110 // Check if we need to trigger a new analysis. 111 yes, cf, err := analysisExists(c, build, firstFailedBuild) 112 if err != nil { 113 return false, err 114 } 115 // We don't need to trigger a new analysis. 116 if !yes { 117 logging.Infof(c, "There is already an analysis for first failed build %d. No new analysis will be triggered for build %d", firstFailedBuild.Id, bbid) 118 return false, nil 119 } 120 121 // No analysis for the regression range. Trigger one. 122 _, err = compilefailureanalysis.AnalyzeFailure(c, cf, firstFailedBuild.Id, lastPassedBuild.Id) 123 if err != nil { 124 return false, err 125 } 126 analysisCounter.Add(c, 1, build.Builder.Project) 127 return true, nil 128 } 129 130 // UpdateSucceededBuild will be called when we got notification for a succeeded build 131 // It will set the ShouldCancel flag of the analysis for the corresponding build. 132 // Is should only do so if the commit for succeeded build is later than the commit 133 // for the analysis 134 func UpdateSucceededBuild(c context.Context, bbid int64) error { 135 logging.Infof(c, "Received succeeded build %d", bbid) 136 build, err := buildbucket.GetBuild(c, bbid, &buildbucketpb.BuildMask{ 137 Fields: &fieldmaskpb.FieldMask{ 138 Paths: []string{"id", "builder", "input.gitiles_commit", "output.gitiles_commit", "number"}, 139 }, 140 }) 141 142 if err != nil { 143 return errors.Annotate(err, "couldn't get build %d", bbid).Err() 144 } 145 146 analysis, err := datastoreutil.GetLatestAnalysisForBuilder(c, build.Builder.Project, build.Builder.Bucket, build.Builder.Builder) 147 if err != nil { 148 return errors.Annotate(err, "couldn't GetLatestAnalysisForBuilder").Err() 149 } 150 151 if analysis == nil { 152 return nil 153 } 154 155 shouldCancel, err := shouldCancelAnalysis(c, analysis, build) 156 if err != nil { 157 return errors.Annotate(err, "shouldCancelAnalysis %d", analysis.Id).Err() 158 } 159 if !shouldCancel { 160 logging.Infof(c, "The build under analysis is more recent than the succeeded build") 161 return nil 162 } 163 164 // Update analysis ShouldCancelFlag 165 err = datastore.RunInTransaction(c, func(c context.Context) error { 166 e := datastore.Get(c, analysis) 167 if e != nil { 168 return e 169 } 170 analysis.ShouldCancel = true 171 return datastore.Put(c, analysis) 172 }, nil) 173 174 // Create a task to cancel all remaining runs 175 err = tq.AddTask(c, &tq.Task{ 176 Title: fmt.Sprintf("cancel_analysis_%d", analysis.Id), 177 Payload: &tpb.CancelAnalysisTask{ 178 AnalysisId: analysis.Id, 179 }, 180 }) 181 182 if err != nil { 183 return errors.Annotate(err, "couldn't set ShouldCancel flag").Err() 184 } 185 186 return nil 187 } 188 189 // shouldCancelAnalysis returns true if the succeeded build is more recent than 190 // the build being analyzed. 191 func shouldCancelAnalysis(c context.Context, cfa *model.CompileFailureAnalysis, succededBuild *buildbucketpb.Build) (bool, error) { 192 build, err := datastoreutil.GetFailedBuildForAnalysis(c, cfa) 193 if err != nil { 194 return false, errors.Annotate(err, "getFailedBuildForAnalysis %d", cfa.Id).Err() 195 } 196 if succededBuild.GetOutput() != nil && succededBuild.GetOutput().GetGitilesCommit() != nil && succededBuild.GetOutput().GetGitilesCommit().Position > 0 && build.Position > 0 { 197 return succededBuild.GetOutput().GetGitilesCommit().Position > build.Position, nil 198 } 199 // Else, fallback to build number 200 return succededBuild.GetNumber() > int32(build.BuildNumber), nil 201 } 202 203 func shouldAnalyzeBuild(c context.Context, build *buildbucketpb.Build) bool { 204 // We only care about failed build 205 // Note: We already check for status = bbv1.ResultFailure during pubsub ingestion. 206 // But bbv1.ResultFailure is true for both failure and infra failure 207 // So we need to check it here. 208 if build.Status != buildbucketpb.Status_FAILURE { 209 logging.Infof(c, "Build %d does not have FAILURE status", build.Id) 210 return false 211 } 212 213 // We only care about builds with compile failure 214 if !hasCompileStepStatus(c, build, buildbucketpb.Status_FAILURE) { 215 logging.Infof(c, "No compile step for build %d", build.Id) 216 return false 217 } 218 return true 219 } 220 221 // Search builds older than refBuild to find the last passed and first failed builds 222 func getLastPassedFirstFailedBuilds(c context.Context, refBuild *buildbucketpb.Build) (*buildbucketpb.Build, *buildbucketpb.Build, error) { 223 firstFailedBuild := refBuild 224 225 // Query buildbucket for the first build with compile failure 226 // We only consider maximum of 100 builds before the failed build. 227 // If we cannot find the regression range within 100 builds, the failure is 228 // too old for the analysis to be useful. 229 var buildsToSearch int32 = 100 230 var batchSize int32 = 20 231 var pageToken string = "" 232 233 buildMask := &buildbucketpb.BuildMask{ 234 Fields: &fieldmaskpb.FieldMask{ 235 Paths: []string{"id", "builder", "input", "status", "steps"}, 236 }, 237 } 238 239 for buildsToSearch > 0 { 240 // Tweak the batch size if necessary to respect the search limit 241 if buildsToSearch < batchSize { 242 batchSize = buildsToSearch 243 } 244 245 // Get the next batch of older builds 246 olderBuilds, nextPageToken, err := buildbucket.SearchOlderBuilds(c, refBuild, buildMask, batchSize, pageToken) 247 if err != nil { 248 logging.Errorf(c, "Could not search for older builds: %s", err) 249 return nil, nil, err 250 } 251 252 // Search this batch of older builds for the last passed and first failed build 253 for _, oldBuild := range olderBuilds { 254 // We found the last passed build 255 if oldBuild.Status == buildbucketpb.Status_SUCCESS && hasCompileStepStatus(c, oldBuild, buildbucketpb.Status_SUCCESS) { 256 return oldBuild, firstFailedBuild, nil 257 } 258 if oldBuild.Status == buildbucketpb.Status_FAILURE && hasCompileStepStatus(c, oldBuild, buildbucketpb.Status_FAILURE) { 259 firstFailedBuild = oldBuild 260 } 261 } 262 263 // Stop searching if there are no more older builds available 264 if nextPageToken == "" { 265 break 266 } 267 268 // Update the remaining number of builds to search and the page token 269 buildsToSearch -= int32(len(olderBuilds)) 270 pageToken = nextPageToken 271 } 272 273 // If we have reached here, the last passed build could not be found within the search limit 274 return nil, nil, fmt.Errorf("could not find last passed build") 275 } 276 277 // analysisExists checks if we need to trigger a new analysis. 278 // The function checks if there has been an analysis associated with the firstFailedBuild. 279 // Returns true if a new analysis should be triggered, returns false otherwise. 280 // Also return the compileFailure model associated with the failure for convenience. 281 // Note that this function also create/update the associated CompileFailureModel 282 func analysisExists(c context.Context, refFailedBuild *buildbucketpb.Build, firstFailedBuild *buildbucketpb.Build) (bool, *model.CompileFailure, error) { 283 logging.Infof(c, "check analysisExists for firstFailedBuild %d", firstFailedBuild.Id) 284 285 // Create a CompileFailure record in datastore if necessary 286 compileFailure, err := createCompileFailureModel(c, refFailedBuild) 287 288 // Search in datastore if there is already an analysis with the first failed build. 289 // If not, trigger an analysis 290 analysis, err := searchAnalysis(c, firstFailedBuild.Id) 291 292 if err != nil { 293 return false, nil, err 294 } 295 296 // There is an existing analysis. 297 // We should not trigger another analysis, but instead we will "merge" the 298 // compile failure with the existing one. 299 if analysis != nil { 300 compileFailureId := analysis.CompileFailure.IntID() 301 logging.Infof(c, "An analysis already existed for compile failure with ID %d", compileFailureId) 302 cf := &model.CompileFailure{ 303 Id: compileFailureId, 304 Build: analysis.CompileFailure.Parent(), 305 } 306 // Find the compile failure that the analysis runs on 307 err := datastore.Get(c, cf) 308 if err != nil { 309 logging.Errorf(c, "Cannot find compile failure ID %d", compileFailureId) 310 return false, nil, err 311 } 312 313 // If they are the same compileFailure, don't do anything. 314 // This may happen when we receive duplicated/retried message from pubsub. 315 if cf.Id == compileFailure.Id { 316 return false, compileFailure, nil 317 } 318 319 // "Merge" the compile failures, so they use the same analysis 320 err = datastore.RunInTransaction(c, func(c context.Context) error { 321 e := datastore.Get(c, compileFailure) 322 if e != nil { 323 return e 324 } 325 compileFailure.MergedFailureKey = analysis.CompileFailure 326 return datastore.Put(c, compileFailure) 327 }, nil) 328 329 if err != nil { 330 return false, nil, err 331 } 332 333 return false, compileFailure, nil 334 } 335 336 return true, compileFailure, nil 337 } 338 339 func createCompileFailureModel(c context.Context, failedBuild *buildbucketpb.Build) (*model.CompileFailure, error) { 340 // As we are using build ID as ID here, the entities will be created if not exist. 341 // If it exists, we just update the entities. 342 var compileFailure *model.CompileFailure 343 err := datastore.RunInTransaction(c, func(c context.Context) error { 344 gitilesCommit := util.GetGitilesCommitForBuild(failedBuild) 345 buildModel := &model.LuciFailedBuild{ 346 Id: failedBuild.Id, 347 LuciBuild: model.LuciBuild{ 348 BuildId: failedBuild.Id, 349 Project: failedBuild.GetBuilder().Project, 350 Bucket: failedBuild.GetBuilder().Bucket, 351 Builder: failedBuild.GetBuilder().Builder, 352 BuildNumber: int(failedBuild.Number), 353 Status: failedBuild.Status, 354 StartTime: failedBuild.StartTime.AsTime(), 355 EndTime: failedBuild.EndTime.AsTime(), 356 CreateTime: failedBuild.CreateTime.AsTime(), 357 }, 358 BuildFailureType: pb.BuildFailureType_COMPILE, 359 Platform: platformForBuild(c, failedBuild), 360 SheriffRotations: util.GetSheriffRotationsForBuild(failedBuild), 361 } 362 proto.Merge(&buildModel.GitilesCommit, gitilesCommit) 363 e := datastore.Put(c, buildModel) 364 if e != nil { 365 return e 366 } 367 compileFailure = &model.CompileFailure{ 368 Id: failedBuild.Id, 369 Build: datastore.KeyForObj(c, buildModel), 370 } 371 return datastore.Put(c, compileFailure) 372 }, nil) 373 374 if err != nil { 375 return nil, err 376 } 377 378 return compileFailure, nil 379 } 380 381 func searchAnalysis(c context.Context, firstFailedBuildId int64) (*model.CompileFailureAnalysis, error) { 382 q := datastore.NewQuery("CompileFailureAnalysis").Eq("first_failed_build_id", firstFailedBuildId) 383 analyses := []*model.CompileFailureAnalysis{} 384 err := datastore.GetAll(c, q, &analyses) 385 if err != nil { 386 logging.Errorf(c, "Error querying datastore for analysis for first_failed_build_id %d: %s", firstFailedBuildId, err) 387 return nil, err 388 } 389 if len(analyses) == 0 { 390 return nil, nil 391 } 392 // There should only be at most one analysis firstFailedBuildId. 393 if len(analyses) > 1 { 394 logging.Warningf(c, "Found more than one analysis for first_failed_build_id %d", firstFailedBuildId) 395 } 396 return analyses[0], nil 397 } 398 399 // hasCompileStepStatus checks if the compile step for a build has the specified status. 400 func hasCompileStepStatus(c context.Context, build *buildbucketpb.Build, status buildbucketpb.Status) bool { 401 for _, step := range build.Steps { 402 if util.IsCompileStep(step) && step.Status == status { 403 return true 404 } 405 } 406 return false 407 } 408 409 func platformForBuild(c context.Context, build *buildbucketpb.Build) model.Platform { 410 dimens := util.GetTaskDimensions(build) 411 if dimens == nil { 412 return model.PlatformUnspecified 413 } 414 for _, d := range dimens { 415 if d.Key == "os" { 416 return model.PlatformFromOS(c, d.Value) 417 } 418 } 419 return model.PlatformUnspecified 420 }