go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/bisection/metrics/metrics.go (about)

     1  // Copyright 2022 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package metrics handles sending metrics to tsmon.
    16  package metrics
    17  
    18  import (
    19  	"context"
    20  	"fmt"
    21  	"time"
    22  
    23  	"go.chromium.org/luci/bisection/model"
    24  	"go.chromium.org/luci/bisection/util"
    25  	"go.chromium.org/luci/bisection/util/datastoreutil"
    26  	buildbucketpb "go.chromium.org/luci/buildbucket/proto"
    27  	"go.chromium.org/luci/common/clock"
    28  	"go.chromium.org/luci/common/errors"
    29  	"go.chromium.org/luci/common/logging"
    30  	"go.chromium.org/luci/common/tsmon"
    31  	"go.chromium.org/luci/common/tsmon/distribution"
    32  	"go.chromium.org/luci/common/tsmon/field"
    33  	"go.chromium.org/luci/common/tsmon/metric"
    34  	"go.chromium.org/luci/common/tsmon/types"
    35  	"go.chromium.org/luci/gae/service/datastore"
    36  
    37  	pb "go.chromium.org/luci/bisection/proto/v1"
    38  )
    39  
var (
	// runningAnalysesGauge reports how many analyses are currently running,
	// broken down by LUCI project and analysis type.
	//
	// NOTE(review): the description string below only mentions "compile"
	// analyses, although the "type" field is also set to "test" by
	// collectMetricsForRunningAnalyses. Consider rewording it if changing the
	// description of an already-registered metric is acceptable.
	runningAnalysesGauge = metric.NewInt(
		"bisection/analysis/running_count",
		"The total number running compile analysis, by LUCI project.",
		&types.MetricMetadata{Units: "analyses"},
		// The LUCI Project.
		field.String("project"),
		// The type of the analysis.
		// The possible values are "compile", "test".
		field.String("type"),
	)
	// runningRerunGauge reports how many rerun builds are currently pending or
	// running, broken down by LUCI project, status, platform and analysis type.
	runningRerunGauge = metric.NewInt(
		"bisection/rerun/running_count",
		"The number of running rerun builds, by LUCI project.",
		&types.MetricMetadata{Units: "reruns"},
		// The LUCI Project.
		field.String("project"),
		// "running", "pending"
		field.String("status"),
		// "mac", "windows", "linux"
		field.String("platform"),
		// The type of the analysis that rerun belongs to.
		// The possible values are "compile", "test".
		field.String("type"),
	)
	// rerunAgeMetric reports the distribution of the "age" (time elapsed since
	// creation, in seconds) of pending or running rerun builds. It uses the
	// same fields as runningRerunGauge.
	rerunAgeMetric = metric.NewNonCumulativeDistribution(
		"bisection/rerun/age",
		"The age of running reruns, by LUCI project.",
		&types.MetricMetadata{Units: "seconds"},
		distribution.DefaultBucketer,
		// The LUCI Project.
		field.String("project"),
		// "running", "pending"
		field.String("status"),
		// "mac", "windows", "linux"
		field.String("platform"),
		// The type of the analysis that rerun belongs to.
		// The possible values are "compile", "test".
		field.String("type"),
	)
)
    84  
// AnalysisType identifies the kind of failure analysis a metric value belongs
// to. Its string value is sent to tsmon as the "type" field of the metrics
// declared in this package.
type AnalysisType string

const (
	// AnalysisTypeCompile marks metrics of compile failure analyses.
	AnalysisTypeCompile AnalysisType = "compile"
	// AnalysisTypeTest marks metrics of test failure analyses.
	AnalysisTypeTest AnalysisType = "test"
)
    92  
// rerunKey is the map key used to aggregate values for runningRerunGauge and
// rerunAgeMetric. Each distinct key is one project-status-platform
// combination. The analysis type is not part of the key because compile and
// test reruns are aggregated by separate functions.
type rerunKey struct {
	// Project is the LUCI project of the rerun.
	Project string
	// Status is "pending" or "running".
	Status string
	// Platform is the platform the rerun is for.
	Platform string
}
    99  
   100  func init() {
   101  	// Register metrics as global metrics, which has the effort of
   102  	// resetting them after every flush.
   103  	tsmon.RegisterGlobalCallback(func(ctx context.Context) {
   104  		// Do nothing -- the metrics will be populated by the cron
   105  		// job itself and does not need to be triggered externally.
   106  	}, runningAnalysesGauge, runningRerunGauge, rerunAgeMetric)
   107  }
   108  
   109  // CollectGlobalMetrics is called in a cron job.
   110  // It collects global metrics and send to tsmon.
   111  func CollectGlobalMetrics(c context.Context) error {
   112  	var errs []error
   113  	err := collectMetricsForRunningAnalyses(c)
   114  	if err != nil {
   115  		err = errors.Annotate(err, "collectMetricsForRunningAnalyses").Err()
   116  		errs = append(errs, err)
   117  		logging.Errorf(c, err.Error())
   118  	}
   119  	err = collectMetricsForRunningReruns(c)
   120  	if err != nil {
   121  		err = errors.Annotate(err, "collectMetricsForRunningReruns").Err()
   122  		errs = append(errs, err)
   123  		logging.Errorf(c, err.Error())
   124  	}
   125  	err = collectMetricsForRunningTestReruns(c)
   126  	if err != nil {
   127  		err = errors.Annotate(err, "collectMetricsForRunningTestReruns").Err()
   128  		errs = append(errs, err)
   129  		logging.Errorf(c, err.Error())
   130  	}
   131  	if len(errs) > 0 {
   132  		return errors.NewMultiError(errs...)
   133  	}
   134  	return nil
   135  }
   136  
   137  func collectMetricsForRunningAnalyses(c context.Context) error {
   138  	// Compile failure analysis running count.
   139  	compileRunningCount, err := retrieveRunningAnalyses(c)
   140  	if err != nil {
   141  		return err
   142  	}
   143  	// Test failure analysis running count.
   144  	testRunningCount, err := retrieveRunningTestAnalyses(c)
   145  	if err != nil {
   146  		return err
   147  	}
   148  	// Set the metric
   149  	for proj, count := range compileRunningCount {
   150  		runningAnalysesGauge.Set(c, int64(count), proj, string(AnalysisTypeCompile))
   151  	}
   152  	for proj, count := range testRunningCount {
   153  		runningAnalysesGauge.Set(c, int64(count), proj, string(AnalysisTypeTest))
   154  	}
   155  	return nil
   156  }
   157  
   158  func retrieveRunningTestAnalyses(c context.Context) (map[string]int, error) {
   159  	q := datastore.NewQuery("TestFailureAnalysis").Eq("run_status", pb.AnalysisRunStatus_STARTED)
   160  	analyses := []*model.TestFailureAnalysis{}
   161  	err := datastore.GetAll(c, q, &analyses)
   162  	if err != nil {
   163  		return nil, errors.Annotate(err, "get running test failure analyses").Err()
   164  	}
   165  
   166  	// To store the running analyses for each project
   167  	runningCount := map[string]int{}
   168  	for _, tfa := range analyses {
   169  		runningCount[tfa.Project] = runningCount[tfa.Project] + 1
   170  	}
   171  	return runningCount, nil
   172  }
   173  
   174  func retrieveRunningAnalyses(c context.Context) (map[string]int, error) {
   175  	q := datastore.NewQuery("CompileFailureAnalysis").Eq("run_status", pb.AnalysisRunStatus_STARTED)
   176  	analyses := []*model.CompileFailureAnalysis{}
   177  	err := datastore.GetAll(c, q, &analyses)
   178  	if err != nil {
   179  		return nil, errors.Annotate(err, "couldn't get running analyses").Err()
   180  	}
   181  
   182  	// To store the running analyses for each project
   183  	runningCount := map[string]int{}
   184  	for _, cfa := range analyses {
   185  		build, err := datastoreutil.GetBuild(c, cfa.CompileFailure.Parent().IntID())
   186  		if err != nil {
   187  			return nil, errors.Annotate(err, "getting build for analysis %d", cfa.Id).Err()
   188  		}
   189  		if build == nil {
   190  			return nil, fmt.Errorf("getting build for analysis %d", cfa.Id)
   191  		}
   192  
   193  		runningCount[build.Project] = runningCount[build.Project] + 1
   194  	}
   195  	return runningCount, nil
   196  }
   197  
   198  func collectMetricsForRunningReruns(c context.Context) error {
   199  	// Query all in-progress single reruns in the last 7 days.
   200  	// We set the limit to 7 days because there maybe cases that for some reasons
   201  	// (e.g. crashes) that a rerun status may not be updated.
   202  	// Any reruns more than 7 days are surely canceled by buildbucket, so it is
   203  	// safe to exclude them.
   204  	cutoffTime := clock.Now(c).Add(-time.Hour * 7 * 24)
   205  	q := datastore.NewQuery("SingleRerun").Eq("Status", pb.RerunStatus_RERUN_STATUS_IN_PROGRESS).Gt("create_time", cutoffTime)
   206  	reruns := []*model.SingleRerun{}
   207  	err := datastore.GetAll(c, q, &reruns)
   208  	if err != nil {
   209  		return errors.Annotate(err, "couldn't get running reruns").Err()
   210  	}
   211  
   212  	// Get the metrics for rerun count and rerun age
   213  	// Maps where each key is one project-status-platform combination
   214  	rerunCountMap := map[rerunKey]int64{}
   215  	rerunAgeMap := map[rerunKey]*distribution.Distribution{}
   216  	for _, rerun := range reruns {
   217  		proj, platform, err := projectAndPlatformForRerun(c, rerun)
   218  		if err != nil {
   219  			return errors.Annotate(err, "projectForRerun %d", rerun.Id).Err()
   220  		}
   221  
   222  		rerunBuild := &model.CompileRerunBuild{
   223  			Id: rerun.RerunBuild.IntID(),
   224  		}
   225  		err = datastore.Get(c, rerunBuild)
   226  		if err != nil {
   227  			return errors.Annotate(err, "couldn't get rerun build %d", rerun.RerunBuild.IntID()).Err()
   228  		}
   229  
   230  		var key = rerunKey{
   231  			Project:  proj,
   232  			Platform: platform,
   233  		}
   234  		if rerunBuild.Status == buildbucketpb.Status_STATUS_UNSPECIFIED || rerunBuild.Status == buildbucketpb.Status_SCHEDULED {
   235  			key.Status = "pending"
   236  		}
   237  		if rerunBuild.Status == buildbucketpb.Status_STARTED {
   238  			key.Status = "running"
   239  		}
   240  		if key.Status != "" {
   241  			rerunCountMap[key] = rerunCountMap[key] + 1
   242  			if _, ok := rerunAgeMap[key]; !ok {
   243  				rerunAgeMap[key] = distribution.New(rerunAgeMetric.Bucketer())
   244  			}
   245  			rerunAgeMap[key].Add(rerunAgeInSeconds(c, rerun))
   246  		}
   247  	}
   248  
   249  	// Send metrics to tsmon
   250  	for k, count := range rerunCountMap {
   251  		runningRerunGauge.Set(c, count, k.Project, k.Status, k.Platform, string(AnalysisTypeCompile))
   252  	}
   253  
   254  	for k, dist := range rerunAgeMap {
   255  		rerunAgeMetric.Set(c, dist, k.Project, k.Status, k.Platform, string(AnalysisTypeCompile))
   256  	}
   257  
   258  	return nil
   259  }
   260  
   261  func projectAndPlatformForRerun(c context.Context, rerun *model.SingleRerun) (string, string, error) {
   262  	cfa, err := datastoreutil.GetCompileFailureAnalysis(c, rerun.Analysis.IntID())
   263  	if err != nil {
   264  		return "", "", err
   265  	}
   266  	build, err := datastoreutil.GetBuild(c, cfa.CompileFailure.Parent().IntID())
   267  	if err != nil {
   268  		return "", "", errors.Annotate(err, "getting build for analysis %d", cfa.Id).Err()
   269  	}
   270  	if build == nil {
   271  		return "", "", fmt.Errorf("build for analysis %d does not exist", cfa.Id)
   272  	}
   273  	return build.Project, string(build.Platform), nil
   274  }
   275  
   276  func collectMetricsForRunningTestReruns(c context.Context) error {
   277  	// Query all in-progress single reruns in the last 7 days.
   278  	// We set the limit to 7 days because there maybe cases that for some reasons
   279  	// (e.g. crashes) that a rerun status may not be updated.
   280  	// Any reruns more than 7 days are surely canceled by buildbucket, so it is
   281  	// safe to exclude them.
   282  	cutoffTime := clock.Now(c).Add(-time.Hour * 7 * 24)
   283  	q := datastore.NewQuery("TestSingleRerun").Eq("status", pb.RerunStatus_RERUN_STATUS_IN_PROGRESS).Gt("luci_build.create_time", cutoffTime)
   284  	reruns := []*model.TestSingleRerun{}
   285  	err := datastore.GetAll(c, q, &reruns)
   286  	if err != nil {
   287  		return errors.Annotate(err, "get running test reruns").Err()
   288  	}
   289  
   290  	// Get the metrics for rerun count and rerun age
   291  	// Maps where each key is one project-status-platform combination
   292  	rerunCountMap := map[rerunKey]int64{}
   293  	rerunAgeMap := map[rerunKey]*distribution.Distribution{}
   294  	for _, rerun := range reruns {
   295  		os := util.GetDimensionWithKey(rerun.Dimensions, "os")
   296  		if os == nil {
   297  			logging.Warningf(c, "rerun dimension has no OS %d", rerun.ID)
   298  			continue
   299  		}
   300  		var key = rerunKey{
   301  			Project:  rerun.Project,
   302  			Platform: string(model.PlatformFromOS(c, os.Value)),
   303  		}
   304  		if rerun.LUCIBuild.Status == buildbucketpb.Status_STATUS_UNSPECIFIED || rerun.LUCIBuild.Status == buildbucketpb.Status_SCHEDULED {
   305  			key.Status = "pending"
   306  		}
   307  		if rerun.LUCIBuild.Status == buildbucketpb.Status_STARTED {
   308  			key.Status = "running"
   309  		}
   310  		if key.Status != "" {
   311  			rerunCountMap[key] = rerunCountMap[key] + 1
   312  			if _, ok := rerunAgeMap[key]; !ok {
   313  				rerunAgeMap[key] = distribution.New(rerunAgeMetric.Bucketer())
   314  			}
   315  			dur := clock.Now(c).Sub(rerun.CreateTime)
   316  			rerunAgeMap[key].Add(dur.Seconds())
   317  		}
   318  	}
   319  
   320  	// Send metrics to tsmon
   321  	for k, count := range rerunCountMap {
   322  		runningRerunGauge.Set(c, count, k.Project, k.Status, k.Platform, string(AnalysisTypeTest))
   323  	}
   324  
   325  	for k, dist := range rerunAgeMap {
   326  		rerunAgeMetric.Set(c, dist, k.Project, k.Status, k.Platform, string(AnalysisTypeTest))
   327  	}
   328  
   329  	return nil
   330  }
   331  
   332  func rerunAgeInSeconds(c context.Context, rerun *model.SingleRerun) float64 {
   333  	dur := clock.Now(c).Sub(rerun.CreateTime)
   334  	return dur.Seconds()
   335  }