go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/bisection/compilefailuredetection/failure_detection.go

go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/bisection/compilefailuredetection/failure_detection.go (about)

     1  // Copyright 2022 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package compilefailuredetection analyses a failed build and determines if it
    16  // needs to trigger a new analysis for it.
    17  package compilefailuredetection
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  
    23  	"go.chromium.org/luci/bisection/compilefailureanalysis"
    24  	"go.chromium.org/luci/bisection/internal/buildbucket"
    25  	"go.chromium.org/luci/bisection/model"
    26  	pb "go.chromium.org/luci/bisection/proto/v1"
    27  	tpb "go.chromium.org/luci/bisection/task/proto"
    28  	"go.chromium.org/luci/bisection/util"
    29  	"go.chromium.org/luci/bisection/util/datastoreutil"
    30  	"go.chromium.org/luci/bisection/util/loggingutil"
    31  
    32  	"go.chromium.org/luci/gae/service/datastore"
    33  
    34  	buildbucketpb "go.chromium.org/luci/buildbucket/proto"
    35  	"go.chromium.org/luci/common/errors"
    36  	"go.chromium.org/luci/common/logging"
    37  	"go.chromium.org/luci/common/retry/transient"
    38  	"go.chromium.org/luci/common/tsmon/field"
    39  	"go.chromium.org/luci/common/tsmon/metric"
    40  	"go.chromium.org/luci/server/tq"
    41  	"google.golang.org/protobuf/proto"
    42  	"google.golang.org/protobuf/types/known/fieldmaskpb"
    43  )
    44  
    45  const (
    46  	taskClass = "build-failure-ingestion"
    47  	queue     = "build-failure-ingestion"
    48  )
    49  
    50  var (
    51  	analysisCounter = metric.NewCounter(
    52  		"bisection/compile/analysis/trigger",
    53  		"The number of Compile Failure Analysis triggered by LUCI Bisection.",
    54  		nil,
    55  		// The LUCI Project.
    56  		field.String("project"),
    57  	)
    58  )
    59  
    60  // RegisterTaskClass registers the task class for tq dispatcher.
    61  func RegisterTaskClass() {
    62  	tq.RegisterTaskClass(tq.TaskClass{
    63  		ID:        taskClass,
    64  		Prototype: (*tpb.FailedBuildIngestionTask)(nil),
    65  		Queue:     queue,
    66  		Kind:      tq.NonTransactional,
    67  		Handler: func(c context.Context, payload proto.Message) error {
    68  			task := payload.(*tpb.FailedBuildIngestionTask)
    69  			logging.Infof(c, "Processing failed build task with id = %d", task.GetBbid())
    70  			_, err := AnalyzeBuild(c, task.GetBbid())
    71  			if err != nil {
    72  				logging.Errorf(c, "Error processing failed build task with id = %d: %s", task.GetBbid(), err)
    73  				// If the error is transient, return err to retry
    74  				if transient.Tag.In(err) {
    75  					return err
    76  				}
    77  				return nil
    78  			}
    79  			return nil
    80  		},
    81  	})
    82  }
    83  
    84  // AnalyzeBuild analyzes a build and trigger an analysis if necessary.
    85  // Returns true if a new analysis is triggered, returns false otherwise.
    86  func AnalyzeBuild(c context.Context, bbid int64) (bool, error) {
    87  	c = loggingutil.SetAnalyzedBBID(c, bbid)
    88  	logging.Infof(c, "AnalyzeBuild %d", bbid)
    89  	build, err := buildbucket.GetBuild(c, bbid, &buildbucketpb.BuildMask{
    90  		Fields: &fieldmaskpb.FieldMask{
    91  			Paths: []string{"id", "builder", "input", "status", "steps", "number", "start_time", "end_time", "create_time", "infra.swarming.task_dimensions", "infra.backend.task_dimensions", "output.gitiles_commit"},
    92  		},
    93  	})
    94  	if err != nil {
    95  		return false, err
    96  	}
    97  
    98  	if !shouldAnalyzeBuild(c, build) {
    99  		return false, nil
   100  	}
   101  
   102  	lastPassedBuild, firstFailedBuild, err := getLastPassedFirstFailedBuilds(c, build)
   103  
   104  	// Could not find last passed build, skip the analysis.
   105  	if err != nil {
   106  		logging.Infof(c, "Could not find last passed/first failed builds for failure of build %d. Exiting...", bbid)
   107  		return false, nil
   108  	}
   109  
   110  	// Check if we need to trigger a new analysis.
   111  	yes, cf, err := analysisExists(c, build, firstFailedBuild)
   112  	if err != nil {
   113  		return false, err
   114  	}
   115  	// We don't need to trigger a new analysis.
   116  	if !yes {
   117  		logging.Infof(c, "There is already an analysis for first failed build %d. No new analysis will be triggered for build %d", firstFailedBuild.Id, bbid)
   118  		return false, nil
   119  	}
   120  
   121  	// No analysis for the regression range. Trigger one.
   122  	_, err = compilefailureanalysis.AnalyzeFailure(c, cf, firstFailedBuild.Id, lastPassedBuild.Id)
   123  	if err != nil {
   124  		return false, err
   125  	}
   126  	analysisCounter.Add(c, 1, build.Builder.Project)
   127  	return true, nil
   128  }
   129  
   130  // UpdateSucceededBuild will be called when we got notification for a succeeded build
   131  // It will set the ShouldCancel flag of the analysis for the corresponding build.
   132  // Is should only do so if the commit for succeeded build is later than the commit
   133  // for the analysis
   134  func UpdateSucceededBuild(c context.Context, bbid int64) error {
   135  	logging.Infof(c, "Received succeeded build %d", bbid)
   136  	build, err := buildbucket.GetBuild(c, bbid, &buildbucketpb.BuildMask{
   137  		Fields: &fieldmaskpb.FieldMask{
   138  			Paths: []string{"id", "builder", "input.gitiles_commit", "output.gitiles_commit", "number"},
   139  		},
   140  	})
   141  
   142  	if err != nil {
   143  		return errors.Annotate(err, "couldn't get build %d", bbid).Err()
   144  	}
   145  
   146  	analysis, err := datastoreutil.GetLatestAnalysisForBuilder(c, build.Builder.Project, build.Builder.Bucket, build.Builder.Builder)
   147  	if err != nil {
   148  		return errors.Annotate(err, "couldn't GetLatestAnalysisForBuilder").Err()
   149  	}
   150  
   151  	if analysis == nil {
   152  		return nil
   153  	}
   154  
   155  	shouldCancel, err := shouldCancelAnalysis(c, analysis, build)
   156  	if err != nil {
   157  		return errors.Annotate(err, "shouldCancelAnalysis %d", analysis.Id).Err()
   158  	}
   159  	if !shouldCancel {
   160  		logging.Infof(c, "The build under analysis is more recent than the succeeded build")
   161  		return nil
   162  	}
   163  
   164  	// Update analysis ShouldCancelFlag
   165  	err = datastore.RunInTransaction(c, func(c context.Context) error {
   166  		e := datastore.Get(c, analysis)
   167  		if e != nil {
   168  			return e
   169  		}
   170  		analysis.ShouldCancel = true
   171  		return datastore.Put(c, analysis)
   172  	}, nil)
   173  
   174  	// Create a task to cancel all remaining runs
   175  	err = tq.AddTask(c, &tq.Task{
   176  		Title: fmt.Sprintf("cancel_analysis_%d", analysis.Id),
   177  		Payload: &tpb.CancelAnalysisTask{
   178  			AnalysisId: analysis.Id,
   179  		},
   180  	})
   181  
   182  	if err != nil {
   183  		return errors.Annotate(err, "couldn't set ShouldCancel flag").Err()
   184  	}
   185  
   186  	return nil
   187  }
   188  
   189  // shouldCancelAnalysis returns true if the succeeded build is more recent than
   190  // the build being analyzed.
   191  func shouldCancelAnalysis(c context.Context, cfa *model.CompileFailureAnalysis, succededBuild *buildbucketpb.Build) (bool, error) {
   192  	build, err := datastoreutil.GetFailedBuildForAnalysis(c, cfa)
   193  	if err != nil {
   194  		return false, errors.Annotate(err, "getFailedBuildForAnalysis %d", cfa.Id).Err()
   195  	}
   196  	if succededBuild.GetOutput() != nil && succededBuild.GetOutput().GetGitilesCommit() != nil && succededBuild.GetOutput().GetGitilesCommit().Position > 0 && build.Position > 0 {
   197  		return succededBuild.GetOutput().GetGitilesCommit().Position > build.Position, nil
   198  	}
   199  	// Else, fallback to build number
   200  	return succededBuild.GetNumber() > int32(build.BuildNumber), nil
   201  }
   202  
   203  func shouldAnalyzeBuild(c context.Context, build *buildbucketpb.Build) bool {
   204  	// We only care about failed build
   205  	// Note: We already check for status = bbv1.ResultFailure during pubsub ingestion.
   206  	// But bbv1.ResultFailure is true for both failure and infra failure
   207  	// So we need to check it here.
   208  	if build.Status != buildbucketpb.Status_FAILURE {
   209  		logging.Infof(c, "Build %d does not have FAILURE status", build.Id)
   210  		return false
   211  	}
   212  
   213  	// We only care about builds with compile failure
   214  	if !hasCompileStepStatus(c, build, buildbucketpb.Status_FAILURE) {
   215  		logging.Infof(c, "No compile step for build %d", build.Id)
   216  		return false
   217  	}
   218  	return true
   219  }
   220  
   221  // Search builds older than refBuild to find the last passed and first failed builds
   222  func getLastPassedFirstFailedBuilds(c context.Context, refBuild *buildbucketpb.Build) (*buildbucketpb.Build, *buildbucketpb.Build, error) {
   223  	firstFailedBuild := refBuild
   224  
   225  	// Query buildbucket for the first build with compile failure
   226  	// We only consider maximum of 100 builds before the failed build.
   227  	// If we cannot find the regression range within 100 builds, the failure is
   228  	// too old for the analysis to be useful.
   229  	var buildsToSearch int32 = 100
   230  	var batchSize int32 = 20
   231  	var pageToken string = ""
   232  
   233  	buildMask := &buildbucketpb.BuildMask{
   234  		Fields: &fieldmaskpb.FieldMask{
   235  			Paths: []string{"id", "builder", "input", "status", "steps"},
   236  		},
   237  	}
   238  
   239  	for buildsToSearch > 0 {
   240  		// Tweak the batch size if necessary to respect the search limit
   241  		if buildsToSearch < batchSize {
   242  			batchSize = buildsToSearch
   243  		}
   244  
   245  		// Get the next batch of older builds
   246  		olderBuilds, nextPageToken, err := buildbucket.SearchOlderBuilds(c, refBuild, buildMask, batchSize, pageToken)
   247  		if err != nil {
   248  			logging.Errorf(c, "Could not search for older builds: %s", err)
   249  			return nil, nil, err
   250  		}
   251  
   252  		// Search this batch of older builds for the last passed and first failed build
   253  		for _, oldBuild := range olderBuilds {
   254  			// We found the last passed build
   255  			if oldBuild.Status == buildbucketpb.Status_SUCCESS && hasCompileStepStatus(c, oldBuild, buildbucketpb.Status_SUCCESS) {
   256  				return oldBuild, firstFailedBuild, nil
   257  			}
   258  			if oldBuild.Status == buildbucketpb.Status_FAILURE && hasCompileStepStatus(c, oldBuild, buildbucketpb.Status_FAILURE) {
   259  				firstFailedBuild = oldBuild
   260  			}
   261  		}
   262  
   263  		// Stop searching if there are no more older builds available
   264  		if nextPageToken == "" {
   265  			break
   266  		}
   267  
   268  		// Update the remaining number of builds to search and the page token
   269  		buildsToSearch -= int32(len(olderBuilds))
   270  		pageToken = nextPageToken
   271  	}
   272  
   273  	// If we have reached here, the last passed build could not be found within the search limit
   274  	return nil, nil, fmt.Errorf("could not find last passed build")
   275  }
   276  
   277  // analysisExists checks if we need to trigger a new analysis.
   278  // The function checks if there has been an analysis associated with the firstFailedBuild.
   279  // Returns true if a new analysis should be triggered, returns false otherwise.
   280  // Also return the compileFailure model associated with the failure for convenience.
   281  // Note that this function also create/update the associated CompileFailureModel
   282  func analysisExists(c context.Context, refFailedBuild *buildbucketpb.Build, firstFailedBuild *buildbucketpb.Build) (bool, *model.CompileFailure, error) {
   283  	logging.Infof(c, "check analysisExists for firstFailedBuild %d", firstFailedBuild.Id)
   284  
   285  	// Create a CompileFailure record in datastore if necessary
   286  	compileFailure, err := createCompileFailureModel(c, refFailedBuild)
   287  
   288  	// Search in datastore if there is already an analysis with the first failed build.
   289  	// If not, trigger an analysis
   290  	analysis, err := searchAnalysis(c, firstFailedBuild.Id)
   291  
   292  	if err != nil {
   293  		return false, nil, err
   294  	}
   295  
   296  	// There is an existing analysis.
   297  	// We should not trigger another analysis, but instead we will "merge" the
   298  	// compile failure with the existing one.
   299  	if analysis != nil {
   300  		compileFailureId := analysis.CompileFailure.IntID()
   301  		logging.Infof(c, "An analysis already existed for compile failure with ID %d", compileFailureId)
   302  		cf := &model.CompileFailure{
   303  			Id:    compileFailureId,
   304  			Build: analysis.CompileFailure.Parent(),
   305  		}
   306  		// Find the compile failure that the analysis runs on
   307  		err := datastore.Get(c, cf)
   308  		if err != nil {
   309  			logging.Errorf(c, "Cannot find compile failure ID %d", compileFailureId)
   310  			return false, nil, err
   311  		}
   312  
   313  		// If they are the same compileFailure, don't do anything.
   314  		// This may happen when we receive duplicated/retried message from pubsub.
   315  		if cf.Id == compileFailure.Id {
   316  			return false, compileFailure, nil
   317  		}
   318  
   319  		// "Merge" the compile failures, so they use the same analysis
   320  		err = datastore.RunInTransaction(c, func(c context.Context) error {
   321  			e := datastore.Get(c, compileFailure)
   322  			if e != nil {
   323  				return e
   324  			}
   325  			compileFailure.MergedFailureKey = analysis.CompileFailure
   326  			return datastore.Put(c, compileFailure)
   327  		}, nil)
   328  
   329  		if err != nil {
   330  			return false, nil, err
   331  		}
   332  
   333  		return false, compileFailure, nil
   334  	}
   335  
   336  	return true, compileFailure, nil
   337  }
   338  
   339  func createCompileFailureModel(c context.Context, failedBuild *buildbucketpb.Build) (*model.CompileFailure, error) {
   340  	// As we are using build ID as ID here, the entities will be created if not exist.
   341  	// If it exists, we just update the entities.
   342  	var compileFailure *model.CompileFailure
   343  	err := datastore.RunInTransaction(c, func(c context.Context) error {
   344  		gitilesCommit := util.GetGitilesCommitForBuild(failedBuild)
   345  		buildModel := &model.LuciFailedBuild{
   346  			Id: failedBuild.Id,
   347  			LuciBuild: model.LuciBuild{
   348  				BuildId:     failedBuild.Id,
   349  				Project:     failedBuild.GetBuilder().Project,
   350  				Bucket:      failedBuild.GetBuilder().Bucket,
   351  				Builder:     failedBuild.GetBuilder().Builder,
   352  				BuildNumber: int(failedBuild.Number),
   353  				Status:      failedBuild.Status,
   354  				StartTime:   failedBuild.StartTime.AsTime(),
   355  				EndTime:     failedBuild.EndTime.AsTime(),
   356  				CreateTime:  failedBuild.CreateTime.AsTime(),
   357  			},
   358  			BuildFailureType: pb.BuildFailureType_COMPILE,
   359  			Platform:         platformForBuild(c, failedBuild),
   360  			SheriffRotations: util.GetSheriffRotationsForBuild(failedBuild),
   361  		}
   362  		proto.Merge(&buildModel.GitilesCommit, gitilesCommit)
   363  		e := datastore.Put(c, buildModel)
   364  		if e != nil {
   365  			return e
   366  		}
   367  		compileFailure = &model.CompileFailure{
   368  			Id:    failedBuild.Id,
   369  			Build: datastore.KeyForObj(c, buildModel),
   370  		}
   371  		return datastore.Put(c, compileFailure)
   372  	}, nil)
   373  
   374  	if err != nil {
   375  		return nil, err
   376  	}
   377  
   378  	return compileFailure, nil
   379  }
   380  
   381  func searchAnalysis(c context.Context, firstFailedBuildId int64) (*model.CompileFailureAnalysis, error) {
   382  	q := datastore.NewQuery("CompileFailureAnalysis").Eq("first_failed_build_id", firstFailedBuildId)
   383  	analyses := []*model.CompileFailureAnalysis{}
   384  	err := datastore.GetAll(c, q, &analyses)
   385  	if err != nil {
   386  		logging.Errorf(c, "Error querying datastore for analysis for first_failed_build_id %d: %s", firstFailedBuildId, err)
   387  		return nil, err
   388  	}
   389  	if len(analyses) == 0 {
   390  		return nil, nil
   391  	}
   392  	// There should only be at most one analysis firstFailedBuildId.
   393  	if len(analyses) > 1 {
   394  		logging.Warningf(c, "Found more than one analysis for first_failed_build_id %d", firstFailedBuildId)
   395  	}
   396  	return analyses[0], nil
   397  }
   398  
   399  // hasCompileStepStatus checks if the compile step for a build has the specified status.
   400  func hasCompileStepStatus(c context.Context, build *buildbucketpb.Build, status buildbucketpb.Status) bool {
   401  	for _, step := range build.Steps {
   402  		if util.IsCompileStep(step) && step.Status == status {
   403  			return true
   404  		}
   405  	}
   406  	return false
   407  }
   408  
   409  func platformForBuild(c context.Context, build *buildbucketpb.Build) model.Platform {
   410  	dimens := util.GetTaskDimensions(build)
   411  	if dimens == nil {
   412  		return model.PlatformUnspecified
   413  	}
   414  	for _, d := range dimens {
   415  		if d.Key == "os" {
   416  			return model.PlatformFromOS(c, d.Value)
   417  		}
   418  	}
   419  	return model.PlatformUnspecified
   420  }