go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/bisection/metrics/metrics.go (about) 1 // Copyright 2022 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package metrics handles sending metrics to tsmon. 16 package metrics 17 18 import ( 19 "context" 20 "fmt" 21 "time" 22 23 "go.chromium.org/luci/bisection/model" 24 "go.chromium.org/luci/bisection/util" 25 "go.chromium.org/luci/bisection/util/datastoreutil" 26 buildbucketpb "go.chromium.org/luci/buildbucket/proto" 27 "go.chromium.org/luci/common/clock" 28 "go.chromium.org/luci/common/errors" 29 "go.chromium.org/luci/common/logging" 30 "go.chromium.org/luci/common/tsmon" 31 "go.chromium.org/luci/common/tsmon/distribution" 32 "go.chromium.org/luci/common/tsmon/field" 33 "go.chromium.org/luci/common/tsmon/metric" 34 "go.chromium.org/luci/common/tsmon/types" 35 "go.chromium.org/luci/gae/service/datastore" 36 37 pb "go.chromium.org/luci/bisection/proto/v1" 38 ) 39 40 var ( 41 // Measure how many analyses are currently running 42 runningAnalysesGauge = metric.NewInt( 43 "bisection/analysis/running_count", 44 "The total number running compile analysis, by LUCI project.", 45 &types.MetricMetadata{Units: "analyses"}, 46 // The LUCI Project. 47 field.String("project"), 48 // The type of the analysis. 49 // The possible values are "compile", "test". 50 field.String("type"), 51 ) 52 // Measure how many rerun builds are currently running 53 runningRerunGauge = metric.NewInt( 54 "bisection/rerun/running_count", 55 "The number of running rerun builds, by LUCI project.", 56 &types.MetricMetadata{Units: "reruns"}, 57 // The LUCI Project. 58 field.String("project"), 59 // "running", "pending" 60 field.String("status"), 61 // "mac", "windows", "linux" 62 field.String("platform"), 63 // The type of the analysis that rerun belongs to. 64 // The possible values are "compile", "test". 65 field.String("type"), 66 ) 67 // Measure the "age" of running rerun builds 68 rerunAgeMetric = metric.NewNonCumulativeDistribution( 69 "bisection/rerun/age", 70 "The age of running reruns, by LUCI project.", 71 &types.MetricMetadata{Units: "seconds"}, 72 distribution.DefaultBucketer, 73 // The LUCI Project. 74 field.String("project"), 75 // "running", "pending" 76 field.String("status"), 77 // "mac", "windows", "linux" 78 field.String("platform"), 79 // The type of the analysis that rerun belongs to. 80 // The possible values are "compile", "test". 81 field.String("type"), 82 ) 83 ) 84 85 // AnalysisType is used for sending metrics to tsmon 86 type AnalysisType string 87 88 const ( 89 AnalysisTypeCompile AnalysisType = "compile" 90 AnalysisTypeTest AnalysisType = "test" 91 ) 92 93 // rerunKey is keys for maps for runningRerunGauge and rerunAgeMetric 94 type rerunKey struct { 95 Project string 96 Status string 97 Platform string 98 } 99 100 func init() { 101 // Register metrics as global metrics, which has the effort of 102 // resetting them after every flush. 103 tsmon.RegisterGlobalCallback(func(ctx context.Context) { 104 // Do nothing -- the metrics will be populated by the cron 105 // job itself and does not need to be triggered externally. 106 }, runningAnalysesGauge, runningRerunGauge, rerunAgeMetric) 107 } 108 109 // CollectGlobalMetrics is called in a cron job. 110 // It collects global metrics and send to tsmon. 111 func CollectGlobalMetrics(c context.Context) error { 112 var errs []error 113 err := collectMetricsForRunningAnalyses(c) 114 if err != nil { 115 err = errors.Annotate(err, "collectMetricsForRunningAnalyses").Err() 116 errs = append(errs, err) 117 logging.Errorf(c, err.Error()) 118 } 119 err = collectMetricsForRunningReruns(c) 120 if err != nil { 121 err = errors.Annotate(err, "collectMetricsForRunningReruns").Err() 122 errs = append(errs, err) 123 logging.Errorf(c, err.Error()) 124 } 125 err = collectMetricsForRunningTestReruns(c) 126 if err != nil { 127 err = errors.Annotate(err, "collectMetricsForRunningTestReruns").Err() 128 errs = append(errs, err) 129 logging.Errorf(c, err.Error()) 130 } 131 if len(errs) > 0 { 132 return errors.NewMultiError(errs...) 133 } 134 return nil 135 } 136 137 func collectMetricsForRunningAnalyses(c context.Context) error { 138 // Compile failure analysis running count. 139 compileRunningCount, err := retrieveRunningAnalyses(c) 140 if err != nil { 141 return err 142 } 143 // Test failure analysis running count. 144 testRunningCount, err := retrieveRunningTestAnalyses(c) 145 if err != nil { 146 return err 147 } 148 // Set the metric 149 for proj, count := range compileRunningCount { 150 runningAnalysesGauge.Set(c, int64(count), proj, string(AnalysisTypeCompile)) 151 } 152 for proj, count := range testRunningCount { 153 runningAnalysesGauge.Set(c, int64(count), proj, string(AnalysisTypeTest)) 154 } 155 return nil 156 } 157 158 func retrieveRunningTestAnalyses(c context.Context) (map[string]int, error) { 159 q := datastore.NewQuery("TestFailureAnalysis").Eq("run_status", pb.AnalysisRunStatus_STARTED) 160 analyses := []*model.TestFailureAnalysis{} 161 err := datastore.GetAll(c, q, &analyses) 162 if err != nil { 163 return nil, errors.Annotate(err, "get running test failure analyses").Err() 164 } 165 166 // To store the running analyses for each project 167 runningCount := map[string]int{} 168 for _, tfa := range analyses { 169 runningCount[tfa.Project] = runningCount[tfa.Project] + 1 170 } 171 return runningCount, nil 172 } 173 174 func retrieveRunningAnalyses(c context.Context) (map[string]int, error) { 175 q := datastore.NewQuery("CompileFailureAnalysis").Eq("run_status", pb.AnalysisRunStatus_STARTED) 176 analyses := []*model.CompileFailureAnalysis{} 177 err := datastore.GetAll(c, q, &analyses) 178 if err != nil { 179 return nil, errors.Annotate(err, "couldn't get running analyses").Err() 180 } 181 182 // To store the running analyses for each project 183 runningCount := map[string]int{} 184 for _, cfa := range analyses { 185 build, err := datastoreutil.GetBuild(c, cfa.CompileFailure.Parent().IntID()) 186 if err != nil { 187 return nil, errors.Annotate(err, "getting build for analysis %d", cfa.Id).Err() 188 } 189 if build == nil { 190 return nil, fmt.Errorf("getting build for analysis %d", cfa.Id) 191 } 192 193 runningCount[build.Project] = runningCount[build.Project] + 1 194 } 195 return runningCount, nil 196 } 197 198 func collectMetricsForRunningReruns(c context.Context) error { 199 // Query all in-progress single reruns in the last 7 days. 200 // We set the limit to 7 days because there maybe cases that for some reasons 201 // (e.g. crashes) that a rerun status may not be updated. 202 // Any reruns more than 7 days are surely canceled by buildbucket, so it is 203 // safe to exclude them. 204 cutoffTime := clock.Now(c).Add(-time.Hour * 7 * 24) 205 q := datastore.NewQuery("SingleRerun").Eq("Status", pb.RerunStatus_RERUN_STATUS_IN_PROGRESS).Gt("create_time", cutoffTime) 206 reruns := []*model.SingleRerun{} 207 err := datastore.GetAll(c, q, &reruns) 208 if err != nil { 209 return errors.Annotate(err, "couldn't get running reruns").Err() 210 } 211 212 // Get the metrics for rerun count and rerun age 213 // Maps where each key is one project-status-platform combination 214 rerunCountMap := map[rerunKey]int64{} 215 rerunAgeMap := map[rerunKey]*distribution.Distribution{} 216 for _, rerun := range reruns { 217 proj, platform, err := projectAndPlatformForRerun(c, rerun) 218 if err != nil { 219 return errors.Annotate(err, "projectForRerun %d", rerun.Id).Err() 220 } 221 222 rerunBuild := &model.CompileRerunBuild{ 223 Id: rerun.RerunBuild.IntID(), 224 } 225 err = datastore.Get(c, rerunBuild) 226 if err != nil { 227 return errors.Annotate(err, "couldn't get rerun build %d", rerun.RerunBuild.IntID()).Err() 228 } 229 230 var key = rerunKey{ 231 Project: proj, 232 Platform: platform, 233 } 234 if rerunBuild.Status == buildbucketpb.Status_STATUS_UNSPECIFIED || rerunBuild.Status == buildbucketpb.Status_SCHEDULED { 235 key.Status = "pending" 236 } 237 if rerunBuild.Status == buildbucketpb.Status_STARTED { 238 key.Status = "running" 239 } 240 if key.Status != "" { 241 rerunCountMap[key] = rerunCountMap[key] + 1 242 if _, ok := rerunAgeMap[key]; !ok { 243 rerunAgeMap[key] = distribution.New(rerunAgeMetric.Bucketer()) 244 } 245 rerunAgeMap[key].Add(rerunAgeInSeconds(c, rerun)) 246 } 247 } 248 249 // Send metrics to tsmon 250 for k, count := range rerunCountMap { 251 runningRerunGauge.Set(c, count, k.Project, k.Status, k.Platform, string(AnalysisTypeCompile)) 252 } 253 254 for k, dist := range rerunAgeMap { 255 rerunAgeMetric.Set(c, dist, k.Project, k.Status, k.Platform, string(AnalysisTypeCompile)) 256 } 257 258 return nil 259 } 260 261 func projectAndPlatformForRerun(c context.Context, rerun *model.SingleRerun) (string, string, error) { 262 cfa, err := datastoreutil.GetCompileFailureAnalysis(c, rerun.Analysis.IntID()) 263 if err != nil { 264 return "", "", err 265 } 266 build, err := datastoreutil.GetBuild(c, cfa.CompileFailure.Parent().IntID()) 267 if err != nil { 268 return "", "", errors.Annotate(err, "getting build for analysis %d", cfa.Id).Err() 269 } 270 if build == nil { 271 return "", "", fmt.Errorf("build for analysis %d does not exist", cfa.Id) 272 } 273 return build.Project, string(build.Platform), nil 274 } 275 276 func collectMetricsForRunningTestReruns(c context.Context) error { 277 // Query all in-progress single reruns in the last 7 days. 278 // We set the limit to 7 days because there maybe cases that for some reasons 279 // (e.g. crashes) that a rerun status may not be updated. 280 // Any reruns more than 7 days are surely canceled by buildbucket, so it is 281 // safe to exclude them. 282 cutoffTime := clock.Now(c).Add(-time.Hour * 7 * 24) 283 q := datastore.NewQuery("TestSingleRerun").Eq("status", pb.RerunStatus_RERUN_STATUS_IN_PROGRESS).Gt("luci_build.create_time", cutoffTime) 284 reruns := []*model.TestSingleRerun{} 285 err := datastore.GetAll(c, q, &reruns) 286 if err != nil { 287 return errors.Annotate(err, "get running test reruns").Err() 288 } 289 290 // Get the metrics for rerun count and rerun age 291 // Maps where each key is one project-status-platform combination 292 rerunCountMap := map[rerunKey]int64{} 293 rerunAgeMap := map[rerunKey]*distribution.Distribution{} 294 for _, rerun := range reruns { 295 os := util.GetDimensionWithKey(rerun.Dimensions, "os") 296 if os == nil { 297 logging.Warningf(c, "rerun dimension has no OS %d", rerun.ID) 298 continue 299 } 300 var key = rerunKey{ 301 Project: rerun.Project, 302 Platform: string(model.PlatformFromOS(c, os.Value)), 303 } 304 if rerun.LUCIBuild.Status == buildbucketpb.Status_STATUS_UNSPECIFIED || rerun.LUCIBuild.Status == buildbucketpb.Status_SCHEDULED { 305 key.Status = "pending" 306 } 307 if rerun.LUCIBuild.Status == buildbucketpb.Status_STARTED { 308 key.Status = "running" 309 } 310 if key.Status != "" { 311 rerunCountMap[key] = rerunCountMap[key] + 1 312 if _, ok := rerunAgeMap[key]; !ok { 313 rerunAgeMap[key] = distribution.New(rerunAgeMetric.Bucketer()) 314 } 315 dur := clock.Now(c).Sub(rerun.CreateTime) 316 rerunAgeMap[key].Add(dur.Seconds()) 317 } 318 } 319 320 // Send metrics to tsmon 321 for k, count := range rerunCountMap { 322 runningRerunGauge.Set(c, count, k.Project, k.Status, k.Platform, string(AnalysisTypeTest)) 323 } 324 325 for k, dist := range rerunAgeMap { 326 rerunAgeMetric.Set(c, dist, k.Project, k.Status, k.Platform, string(AnalysisTypeTest)) 327 } 328 329 return nil 330 } 331 332 func rerunAgeInSeconds(c context.Context, rerun *model.SingleRerun) float64 { 333 dur := clock.Now(c).Sub(rerun.CreateTime) 334 return dur.Seconds() 335 }