go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/bisection/throttle/throttle.go (about)

     1  // Copyright 2023 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package throttle analyzes currently running reruns and sends tasks to the test failure detector.
    16  package throttle
    17  
    18  import (
    19  	"context"
    20  	"time"
    21  
    22  	"go.chromium.org/luci/bisection/internal/config"
    23  	"go.chromium.org/luci/bisection/model"
    24  	pb "go.chromium.org/luci/bisection/proto/v1"
    25  	tpb "go.chromium.org/luci/bisection/task/proto"
    26  	"go.chromium.org/luci/bisection/testfailuredetection"
    27  	"go.chromium.org/luci/bisection/util"
    28  	buildbucketpb "go.chromium.org/luci/buildbucket/proto"
    29  	"go.chromium.org/luci/common/clock"
    30  	"go.chromium.org/luci/common/errors"
    31  	"go.chromium.org/luci/common/logging"
    32  	"go.chromium.org/luci/gae/service/datastore"
    33  )
    34  
    35  const (
    36  	// Rerun that is pending for more than 5 minutes should be
    37  	// considered as congested.
    38  	congestedPendingThreshold = -time.Minute * 5
    39  	// Rerun that is older than 7 days should be excluded.
    40  	// Because there maybe cases that for some reasons
    41  	// (e.g. crashes) that status may not be updated.
    42  	// Any reruns more than 7 days are surely canceled by buildbucket, so it is
    43  	// safe to exclude them.
    44  	cutoffThreshold = -time.Hour * 7 * 24
    45  )
    46  
    47  func CronHandler(ctx context.Context) error {
    48  	projectsToProcess, err := config.SupportedProjects(ctx)
    49  	if err != nil {
    50  		return errors.Annotate(err, "supported projects").Err()
    51  	}
    52  	// TODO(beining@): We should continue to next iteration when there is an error.
    53  	// Because error in one project should not block other projects.
    54  	for _, project := range projectsToProcess {
    55  		count, err := dailyAnalysisCount(ctx, project)
    56  		if err != nil {
    57  			return errors.Annotate(err, "daily analysis count").Err()
    58  		}
    59  		dailyLimit, err := dailyLimit(ctx, project)
    60  		if err != nil {
    61  			return errors.Annotate(err, "daily limit").Err()
    62  		}
    63  		if count >= dailyLimit {
    64  			logging.Warningf(ctx, "%d reached daily limit %d for project %s", count, dailyLimit, project)
    65  			continue
    66  		}
    67  		rerunBuilds, err := congestedCompileReruns(ctx, project)
    68  		if err != nil {
    69  			return errors.Annotate(err, "obtain congested compile reruns").Err()
    70  		}
    71  		testReruns, err := congestedTestReruns(ctx, project)
    72  		if err != nil {
    73  			return errors.Annotate(err, "obtain congested test reruns").Err()
    74  		}
    75  		dimensionExcludes := []*pb.Dimension{}
    76  		for _, d := range allRerunDimensions(rerunBuilds, testReruns) {
    77  			if dim := util.GetDimensionWithKey(d, "os"); dim != nil {
    78  				dimensionExcludes = append(dimensionExcludes, dim)
    79  			}
    80  		}
    81  		util.SortDimension(dimensionExcludes)
    82  		task := &tpb.TestFailureDetectionTask{
    83  			Project:           project,
    84  			DimensionExcludes: dimensionExcludes,
    85  		}
    86  		if err := testfailuredetection.Schedule(ctx, task); err != nil {
    87  			return errors.Annotate(err, "schedule test failure detection task").Err()
    88  		}
    89  		logging.Infof(ctx, "Test failure detection task scheduled %v", task)
    90  	}
    91  	return nil
    92  }
    93  
    94  func dailyAnalysisCount(ctx context.Context, project string) (int, error) {
    95  	cutoffTime := clock.Now(ctx).Add(-time.Hour * 24)
    96  	q := datastore.NewQuery("TestFailureAnalysis").Eq("project", project).Gt("create_time", cutoffTime)
    97  	analyses := []*model.TestFailureAnalysis{}
    98  	err := datastore.GetAll(ctx, q, &analyses)
    99  	if err != nil {
   100  		return 0, errors.Annotate(err, "get analyses").Err()
   101  	}
   102  	count := 0
   103  	for _, tfa := range analyses {
   104  		if tfa.Status != pb.AnalysisStatus_DISABLED && tfa.Status != pb.AnalysisStatus_UNSUPPORTED {
   105  			count++
   106  		}
   107  	}
   108  	return count, nil
   109  }
   110  
   111  func congestedCompileReruns(ctx context.Context, project string) ([]*model.SingleRerun, error) {
   112  	cutoffTime := clock.Now(ctx).Add(cutoffThreshold)
   113  	pendingCutoffTime := clock.Now(ctx).Add(congestedPendingThreshold)
   114  	q := datastore.NewQuery("CompileRerunBuild").
   115  		Eq("status", buildbucketpb.Status_SCHEDULED).
   116  		Eq("project", project).
   117  		Gt("create_time", cutoffTime).
   118  		Lt("create_time", pendingCutoffTime)
   119  	rerunBuilds := []*model.CompileRerunBuild{}
   120  	err := datastore.GetAll(ctx, q, &rerunBuilds)
   121  	if err != nil {
   122  		return nil, errors.Annotate(err, "get scheduled CompileRerunBuilds").Err()
   123  	}
   124  	reruns := []*model.SingleRerun{}
   125  	for _, r := range rerunBuilds {
   126  		rerun := []*model.SingleRerun{}
   127  		q := datastore.NewQuery("SingleRerun").Eq("rerun_build", datastore.KeyForObj(ctx, r))
   128  		err := datastore.GetAll(ctx, q, &rerun)
   129  		if err != nil {
   130  			return nil, errors.Annotate(err, "get rerun with CompileRerunBuilds ID %d", r.Id).Err()
   131  		}
   132  		reruns = append(reruns, rerun...)
   133  	}
   134  	return reruns, nil
   135  }
   136  
   137  func congestedTestReruns(ctx context.Context, project string) ([]*model.TestSingleRerun, error) {
   138  	cutoffTime := clock.Now(ctx).Add(cutoffThreshold)
   139  	pendingCutoffTime := clock.Now(ctx).Add(congestedPendingThreshold)
   140  	q := datastore.NewQuery("TestSingleRerun").
   141  		Eq("luci_build.status", buildbucketpb.Status_SCHEDULED).
   142  		Eq("luci_build.project", project).
   143  		Gt("luci_build.create_time", cutoffTime).
   144  		Lt("luci_build.create_time", pendingCutoffTime)
   145  	reruns := []*model.TestSingleRerun{}
   146  	err := datastore.GetAll(ctx, q, &reruns)
   147  	if err != nil {
   148  		return nil, errors.Annotate(err, "get scheduled TestSingleRerun").Err()
   149  	}
   150  	return reruns, nil
   151  }
   152  
   153  func allRerunDimensions(rerunBuilds []*model.SingleRerun, testReruns []*model.TestSingleRerun) []*pb.Dimensions {
   154  	dims := []*pb.Dimensions{}
   155  	for _, r := range rerunBuilds {
   156  		dims = append(dims, r.Dimensions)
   157  	}
   158  	for _, r := range testReruns {
   159  		dims = append(dims, r.Dimensions)
   160  	}
   161  	return dims
   162  }
   163  
   164  func dailyLimit(ctx context.Context, project string) (int, error) {
   165  	cfg, err := config.Project(ctx, project)
   166  	if err != nil {
   167  		return 0, err
   168  	}
   169  	return (int)(cfg.TestAnalysisConfig.GetDailyLimit()), nil
   170  }