go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/bisection/testfailuredetection/test_failure_detection.go

go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/bisection/testfailuredetection/test_failure_detection.go (about)

     1  // Copyright 2023 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package testfailuredetection analyses recent test failures using the
    16  // changepoint analysis from LUCI Analysis, and selects test failures to bisect.
    17  package testfailuredetection
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"math"
    23  	"strings"
    24  
    25  	"go.chromium.org/luci/bisection/internal/config"
    26  	"go.chromium.org/luci/bisection/internal/lucianalysis"
    27  	"go.chromium.org/luci/bisection/model"
    28  	configpb "go.chromium.org/luci/bisection/proto/config"
    29  	pb "go.chromium.org/luci/bisection/proto/v1"
    30  	"go.chromium.org/luci/bisection/rerun"
    31  	tpb "go.chromium.org/luci/bisection/task/proto"
    32  	"go.chromium.org/luci/bisection/testfailureanalysis/bisection"
    33  	"go.chromium.org/luci/bisection/util"
    34  	"go.chromium.org/luci/bisection/util/datastoreutil"
    35  	"go.chromium.org/luci/bisection/util/loggingutil"
    36  	"go.chromium.org/luci/common/clock"
    37  	"go.chromium.org/luci/common/errors"
    38  	"go.chromium.org/luci/common/logging"
    39  	"go.chromium.org/luci/common/retry/transient"
    40  	"go.chromium.org/luci/gae/service/datastore"
    41  	"go.chromium.org/luci/server"
    42  	"go.chromium.org/luci/server/tq"
    43  	"google.golang.org/protobuf/proto"
    44  )
    45  
    46  const (
    47  	taskClass = "test-failure-detection"
    48  	queue     = "test-failure-detection"
    49  )
    50  
    51  var taskClassRef = tq.RegisterTaskClass(tq.TaskClass{
    52  	ID:        taskClass,
    53  	Prototype: (*tpb.TestFailureDetectionTask)(nil),
    54  	Queue:     queue,
    55  	Kind:      tq.NonTransactional,
    56  })
    57  
    58  // RegisterTaskClass registers the task class for tq dispatcher.
    59  func RegisterTaskClass(srv *server.Server, luciAnalysisProjectFunc func(luciProject string) string) error {
    60  	ctx := srv.Context
    61  	ac, err := lucianalysis.NewClient(ctx, srv.Options.CloudProject, luciAnalysisProjectFunc)
    62  	if err != nil {
    63  		return err
    64  	}
    65  	srv.RegisterCleanup(func(context.Context) {
    66  		ac.Close()
    67  	})
    68  	handler := func(c context.Context, payload proto.Message) error {
    69  		task := payload.(*tpb.TestFailureDetectionTask)
    70  		logging.Infof(c, "Processing test failure detection task %v", task)
    71  		err := Run(ctx, ac, task)
    72  		if err != nil {
    73  			err = errors.Annotate(err, "run detection").Err()
    74  			logging.Errorf(ctx, err.Error())
    75  			// If the error is transient, return err to retry.
    76  			if transient.Tag.In(err) {
    77  				return err
    78  			}
    79  			return nil
    80  		}
    81  		return nil
    82  	}
    83  	taskClassRef.AttachHandler(handler)
    84  	return nil
    85  }
    86  
    87  // Schedule enqueues a task to find test failures to bisect.
    88  func Schedule(ctx context.Context, task *tpb.TestFailureDetectionTask) error {
    89  	return tq.AddTask(ctx, &tq.Task{Payload: task})
    90  }
    91  
    92  type analysisClient interface {
    93  	ReadTestFailures(ctx context.Context, task *tpb.TestFailureDetectionTask, filter *configpb.FailureIngestionFilter) ([]*lucianalysis.BuilderRegressionGroup, error)
    94  	ReadBuildInfo(ctx context.Context, tf *model.TestFailure) (lucianalysis.BuildInfo, error)
    95  }
    96  
    97  // Run finds and group test failures to send to bisector.
    98  func Run(ctx context.Context, client analysisClient, task *tpb.TestFailureDetectionTask) error {
    99  	ctx = loggingutil.SetProject(ctx, task.Project)
   100  	logging.Infof(ctx, "Run test failure detection")
   101  	// Checks if test failure detection is enabled.
   102  	enabled, err := isEnabled(ctx, task.Project)
   103  	if err != nil {
   104  		return errors.Annotate(err, "is enabled").Err()
   105  	}
   106  	if !enabled {
   107  		logging.Infof(ctx, "Dectection is not enabled")
   108  		return nil
   109  	}
   110  	filter, err := getFailureIngestionFilter(ctx, task.Project)
   111  	if err != nil {
   112  		return errors.Annotate(err, "get excluded buckets").Err()
   113  	}
   114  	groups, err := client.ReadTestFailures(ctx, task, filter)
   115  	if err != nil {
   116  		return errors.Annotate(err, "read test failures").Err()
   117  	}
   118  	logging.Infof(ctx, "There are %d groups from LUCI Analysis query", len(groups))
   119  	bundles := []*model.TestFailureBundle{}
   120  	skippedBundleLogLines := []string{}
   121  	for _, g := range groups {
   122  		bundle, err := newTestFailureBundle(task.Project, g)
   123  		if err != nil {
   124  			return errors.Annotate(err, "new test failure bundle").Err()
   125  		}
   126  		// Use the redundancy score of the primary test failure as
   127  		// the redundancy score of this test failure bundle.
   128  		rs, err := redundancyScore(ctx, bundle.Primary())
   129  		if err != nil {
   130  			return errors.Annotate(err, "calculate redundancy score").Err()
   131  		}
   132  		if rs == 1 {
   133  			// Test failures in this bundle are completely redundant.
   134  			// This bundle should be skipped.
   135  			line := fmt.Sprintf("primary test %s(%s)", bundle.Primary().TestID, bundle.Primary().VariantHash)
   136  			skippedBundleLogLines = append(skippedBundleLogLines, line)
   137  			continue
   138  		}
   139  		bundle.Primary().RedundancyScore = rs
   140  		bundles = append(bundles, bundle)
   141  	}
   142  	logging.Infof(ctx, fmt.Sprintf("skip completely redundant bundles\n%s", strings.Join(skippedBundleLogLines, "\n")))
   143  	logging.Infof(ctx, "There are %d bundles after redundancy filter", len(bundles))
   144  	if len(bundles) == 0 {
   145  		logging.Infof(ctx, "Cannot find new test failures to bisect for project %s", task.Project)
   146  		return nil
   147  	}
   148  	bestBundle := First(ctx, bundles)
   149  	logging.Infof(ctx, "Selected test failure bundle with primary failure ID %s, variantHash %s, refHash %s",
   150  		bestBundle.Primary().TestID, bestBundle.Primary().VariantHash, bestBundle.Primary().RefHash)
   151  	testFailureAnalysis, err := prepareFailureAnalysis(ctx, client, bestBundle)
   152  	if err != nil {
   153  		// If there is a failure in preparing, in particular, in reading build info,
   154  		// we should store the analysis, so subsequent runs will not consider this
   155  		// test failure again.
   156  		testFailureAnalysis = &model.TestFailureAnalysis{
   157  			Project:          bestBundle.Primary().Project,
   158  			CreateTime:       clock.Now(ctx),
   159  			Status:           pb.AnalysisStatus_INSUFFICENTDATA,
   160  			RunStatus:        pb.AnalysisRunStatus_ENDED,
   161  			EndTime:          clock.Now(ctx),
   162  			SheriffRotations: bestBundle.Metadata.SheriffRotations,
   163  		}
   164  		e := saveTestFailuresAndAnalysis(ctx, bestBundle, testFailureAnalysis, false)
   165  		if e != nil {
   166  			// Just log.
   167  			logging.Errorf(ctx, "save test failure and analysis when insufficient data %v", e.Error())
   168  		}
   169  		return errors.Annotate(err, "prepare failure analysis").Err()
   170  	}
   171  	if err := saveTestFailuresAndAnalysis(ctx, bestBundle, testFailureAnalysis, true); err != nil {
   172  		return errors.Annotate(err, "save test failure and analysis").Err()
   173  	}
   174  	return nil
   175  }
   176  
   177  func newTestFailureBundle(project string, group *lucianalysis.BuilderRegressionGroup) (*model.TestFailureBundle, error) {
   178  	testFailures := make([]*model.TestFailure, len(group.TestVariants))
   179  	for i, tv := range group.TestVariants {
   180  		variant, err := util.VariantPB(tv.Variant.String())
   181  		if err != nil {
   182  			return nil, err
   183  		}
   184  		testFailures[i] = &model.TestFailure{
   185  			ID:          0,
   186  			Project:     project,
   187  			TestID:      tv.TestID.String(),
   188  			VariantHash: tv.VariantHash.String(),
   189  			Variant:     variant,
   190  			RefHash:     group.RefHash.String(),
   191  			Bucket:      group.Bucket.String(),
   192  			Builder:     group.Builder.String(),
   193  			Ref: &pb.SourceRef{System: &pb.SourceRef_Gitiles{
   194  				Gitiles: &pb.GitilesRef{
   195  					Host:    group.Ref.Gitiles.Host.String(),
   196  					Project: group.Ref.Gitiles.Project.String(),
   197  					Ref:     group.Ref.Gitiles.Ref.String(),
   198  				},
   199  			}},
   200  			RegressionStartPosition:  group.RegressionStartPosition.Int64,
   201  			RegressionEndPosition:    group.RegressionEndPosition.Int64,
   202  			StartPositionFailureRate: group.StartPositionFailureRate,
   203  			EndPositionFailureRate:   group.EndPositionFailureRate,
   204  			IsPrimary:                i == 0,
   205  			IsDiverged:               false,
   206  			AnalysisKey:              nil,
   207  			RedundancyScore:          0,
   208  			StartHour:                group.StartHour.Timestamp.UTC(),
   209  			EndHour:                  group.EndHour.Timestamp.UTC(),
   210  		}
   211  	}
   212  	bundle := &model.TestFailureBundle{}
   213  	err := bundle.Add(testFailures)
   214  	if err != nil {
   215  		return nil, err
   216  	}
   217  	sheriffRotations := []string{}
   218  	for _, r := range group.SheriffRotations {
   219  		if r.String() != "" {
   220  			sheriffRotations = append(sheriffRotations, r.String())
   221  		}
   222  	}
   223  	bundle.Metadata = &model.BundleMetaData{
   224  		SheriffRotations: sheriffRotations,
   225  	}
   226  	return bundle, nil
   227  }
   228  
   229  // RedundancyScore returns a floating point number between 0 and 1 inclusive.
   230  func redundancyScore(c context.Context, tf *model.TestFailure) (float64, error) {
   231  	sameTestVariant, err := datastoreutil.GetTestFailures(c, tf.Project, tf.TestID, tf.RefHash, tf.VariantHash)
   232  	if err != nil {
   233  		return 0, errors.Annotate(err, "get test failures of same test variant").Err()
   234  	}
   235  	for _, a := range sameTestVariant {
   236  		if numberOfOverlapCommit(tf.RegressionStartPosition, tf.RegressionEndPosition,
   237  			a.RegressionStartPosition, a.RegressionEndPosition) > 0 {
   238  			return 1, nil
   239  		}
   240  	}
   241  	maxOverlap := float64(0)
   242  	sameTest, err := datastoreutil.GetTestFailures(c, tf.Project, tf.TestID, tf.RefHash, "")
   243  	if err != nil {
   244  		return 0, errors.Annotate(err, "get test failures of same test").Err()
   245  	}
   246  	for _, t := range sameTest {
   247  		overlap := regressionRangeOverlap(tf.RegressionStartPosition, tf.RegressionEndPosition,
   248  			t.RegressionStartPosition, t.RegressionEndPosition)
   249  		maxOverlap = math.Max(maxOverlap, overlap)
   250  	}
   251  	if maxOverlap < 0 || maxOverlap > 1 {
   252  		return 0, errors.New("maxOverlap must between 0 to 1 inclusive. this suggests something wrong with the implementation")
   253  	}
   254  	return maxOverlap, nil
   255  }
   256  
   257  func numberOfOverlapCommit(rl1, ru1, rl2, ru2 int64) float64 {
   258  	return math.Min(float64(ru1), float64(ru2)) - math.Max(float64(rl1), float64(rl2)) + 1
   259  }
   260  
   261  func regressionRangeOverlap(rl1, ru1, rl2, ru2 int64) float64 {
   262  	return math.Max(0, numberOfOverlapCommit(rl1, ru1, rl2, ru2)) / float64(ru1-rl1+ru2-rl2+2)
   263  }
   264  
   265  func prepareFailureAnalysis(ctx context.Context, client analysisClient, bundle *model.TestFailureBundle) (*model.TestFailureAnalysis, error) {
   266  	tf := bundle.Primary()
   267  	buildInfo, err := client.ReadBuildInfo(ctx, tf)
   268  	if err != nil {
   269  		return nil, errors.Annotate(err, "read build info").Err()
   270  	}
   271  	testFailureAnalysis := &model.TestFailureAnalysis{
   272  		Project:          tf.Project,
   273  		Bucket:           tf.Bucket,
   274  		Builder:          tf.Builder,
   275  		CreateTime:       clock.Now(ctx),
   276  		Status:           pb.AnalysisStatus_CREATED,
   277  		Priority:         rerun.PriorityTestFailure,
   278  		StartCommitHash:  buildInfo.StartCommitHash,
   279  		EndCommitHash:    buildInfo.EndCommitHash,
   280  		FailedBuildID:    buildInfo.BuildID,
   281  		SheriffRotations: bundle.Metadata.SheriffRotations,
   282  	}
   283  	return testFailureAnalysis, nil
   284  }
   285  
   286  // saveTestFailuresAndAnalysis saves the test failures and a test failures analysis into datastore.
   287  // It also transactionally enqueue a task to bisector, if shouldTriggerBisection is set to true.
   288  func saveTestFailuresAndAnalysis(ctx context.Context, bundle *model.TestFailureBundle, testFailureAnalysis *model.TestFailureAnalysis, shouldTriggerBisection bool) error {
   289  	return datastore.RunInTransaction(ctx, func(ctx context.Context) error {
   290  		if err := datastore.AllocateIDs(ctx, testFailureAnalysis); err != nil {
   291  			return errors.Annotate(err, "allocate datastore ID for test failure analysis").Err()
   292  		}
   293  		for _, testFailure := range bundle.All() {
   294  			testFailure.AnalysisKey = datastore.KeyForObj(ctx, testFailureAnalysis)
   295  		}
   296  		// TODO(beining@): This will fail if the size of the bundle is greater than 499.
   297  		// If this becomes a problem, we need to save TestFailures in batches.
   298  		// https://cloud.google.com/datastore/docs/concepts/transactions#what_can_be_done_in_a_transaction
   299  		if err := datastore.Put(ctx, bundle.All()); err != nil {
   300  			return errors.Annotate(err, "save test failures").Err()
   301  		}
   302  		testFailureAnalysis.TestFailure = datastore.KeyForObj(ctx, bundle.Primary())
   303  		if err := datastore.Put(ctx, testFailureAnalysis); err != nil {
   304  			return errors.Annotate(err, "save test failure analysis").Err()
   305  		}
   306  		// Send task to bisector transactionally.
   307  		if shouldTriggerBisection {
   308  			if err := bisection.Schedule(ctx, testFailureAnalysis.ID); err != nil {
   309  				return errors.Annotate(err, "send task to bisector").Err()
   310  			}
   311  		}
   312  		return nil
   313  	}, nil)
   314  }
   315  
   316  func isEnabled(ctx context.Context, project string) (bool, error) {
   317  	cfg, err := config.Project(ctx, project)
   318  	if err != nil {
   319  		return false, err
   320  	}
   321  	return cfg.TestAnalysisConfig.GetDetectorEnabled(), nil
   322  }
   323  
   324  func getFailureIngestionFilter(ctx context.Context, project string) (*configpb.FailureIngestionFilter, error) {
   325  	cfg, err := config.Project(ctx, project)
   326  	if err != nil {
   327  		return nil, err
   328  	}
   329  	return cfg.TestAnalysisConfig.GetFailureIngestionFilter(), nil
   330  }