go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/analysis/internal/bugs/updater/updater.go (about) 1 // Copyright 2022 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package updater contains methods to orchestrate automatic bug management, 16 // including automatic bug filing and automatic priority updates/auto-closure. 17 package updater 18 19 import ( 20 "context" 21 "encoding/hex" 22 "fmt" 23 "sort" 24 "strconv" 25 "time" 26 27 "google.golang.org/grpc/codes" 28 "google.golang.org/grpc/status" 29 30 "go.chromium.org/luci/common/errors" 31 "go.chromium.org/luci/common/logging" 32 "go.chromium.org/luci/server/span" 33 34 "go.chromium.org/luci/analysis/internal/analysis" 35 "go.chromium.org/luci/analysis/internal/analysis/metrics" 36 "go.chromium.org/luci/analysis/internal/bugs" 37 bugspb "go.chromium.org/luci/analysis/internal/bugs/proto" 38 "go.chromium.org/luci/analysis/internal/clustering" 39 "go.chromium.org/luci/analysis/internal/clustering/algorithms" 40 "go.chromium.org/luci/analysis/internal/clustering/algorithms/rulesalgorithm" 41 "go.chromium.org/luci/analysis/internal/clustering/rules" 42 "go.chromium.org/luci/analysis/internal/clustering/rules/lang" 43 "go.chromium.org/luci/analysis/internal/clustering/runs" 44 "go.chromium.org/luci/analysis/internal/config/compiledcfg" 45 configpb "go.chromium.org/luci/analysis/proto/config" 46 pb "go.chromium.org/luci/analysis/proto/v1" 47 ) 48 49 // 
// testnameThresholdInflationPercent is the percentage factor by which
// the bug filing threshold is inflated when applied to test-name clusters.
// This is to bias bug-filing towards failure reason clusters, which are
// seen as generally better scoped and more actionable (because they
// focus on one reason for the test failing.)
//
// The value of 34% was selected as it is sufficient to inflate any threshold
// values which are a '3' (e.g. CV runs rejected) to a '4'. Otherwise integer
// discretization of the statistics would cancel out any intended bias.
//
// If changing this value, please also update the comment in
// project_config.proto.
const testnameThresholdInflationPercent = 34

// mergeIntoCycleErr is the error returned if a cycle is detected in a bug's
// merged-into graph when handling a bug marked as duplicate.
var mergeIntoCycleErr = errors.New("a cycle was detected in the bug merged-into graph")

// mergeIntoPermissionErr is the error returned if we get a permission error
// while traversing and/or updating duplicate bugs.
var mergeIntoPermissionErr = errors.New("permission error occurred while merging duplicate bugs")

// ruleDefinitionTooLongErr is the error returned if merging two failure
// association rules results in a rule that is too long.
var ruleDefinitionTooLongErr = errors.New("the merged rule definition is too long")

// mergeIntoCycleMessage is the message posted on bugs when LUCI Analysis
// cannot deal with a bug marked as the duplicate of another because of
// a cycle in the bug merged-into graph.
const mergeIntoCycleMessage = "LUCI Analysis cannot merge the failure" +
	" association rule for this bug into the rule for the merged-into bug," +
	" because a cycle was detected in the bug merged-into graph. Please" +
	" manually resolve the cycle, or update rules manually and archive the" +
	" rule for this bug."
83 84 const mergeIntoPermissionMessage = "LUCI Analysis cannot merge the association rule" + 85 " for this bug into the rule for the merged-into bug because" + 86 " it doesn't have permission to access the merged-into bug." + 87 " Please make sure that LUCI Analysis has access to all the" + 88 " bugs in the bug duplicate chain, " + 89 " or update rules manually and archive the rule for this bug." 90 91 // ruleDefinitionTooLongMessage is the message posted on bugs when 92 // LUCI Analysis cannot deal with a bug marked as the duplicate of another 93 // because the merged rule would be too long. 94 const ruleDefinitionTooLongMessage = "LUCI Analysis cannot merge the failure" + 95 " association rule for this bug into the rule for the merged-into bug," + 96 " because the merged failure association rule would be too long. Please" + 97 " manually update the rule for the merged-into bug and archive the" + 98 " rule for this bug." 99 100 // BugManager implements bug creation and bug updates for a bug-tracking 101 // system. The BugManager determines bug content and priority given a 102 // cluster. 103 type BugManager interface { 104 // Create creates a new bug for the given request, returning its ID 105 // (if a bug was created) and any encountered error. 106 Create(ctx context.Context, request bugs.BugCreateRequest) bugs.BugCreateResponse 107 // Update updates the specified list of bugs. 108 // 109 // Exactly one response item is returned for each request item. 110 // If an error is encountered on a specific bug, the error is recorded 111 // on the bug's response item and processing continues. 112 // 113 // If a catastrophic error occurs, the error is returned 114 // at the top-level and the responses slice should be ignored. 115 Update(ctx context.Context, bugs []bugs.BugUpdateRequest) ([]bugs.BugUpdateResponse, error) 116 // GetMergedInto reads the bug the given bug is merged into (if any). 
117 // This is to allow step-wise discovery of the canonical bug a bug 118 // is merged into (if it exists and there is no cycle in the bug 119 // merged-into graph). 120 GetMergedInto(ctx context.Context, bug bugs.BugID) (*bugs.BugID, error) 121 // UpdateDuplicateSource updates the source bug of a duplicate 122 // bug relationship. 123 // It normally posts a message advising the user LUCI Analysis 124 // has merged the rule for the source bug to the destination 125 // (merged-into) bug, and provides a new link to the failure 126 // association rule. 127 // If a cycle was detected, it instead posts a message that the 128 // duplicate bug could not be handled and marks the bug no 129 // longer a duplicate to break the cycle. 130 UpdateDuplicateSource(ctx context.Context, request bugs.UpdateDuplicateSourceRequest) error 131 } 132 133 // BugUpdater performs updates to bugs and failure association 134 // rules to keep them in sync with clusters generated by analysis. 135 type BugUpdater struct { 136 // project is the LUCI project to act on behalf of. 137 project string 138 // analysisClient provides access to cluster analysis. 139 analysisClient AnalysisClient 140 // managers stores the manager responsible for updating bugs for each 141 // bug tracking system (monorail, buganizer, etc.). 142 managers map[string]BugManager 143 // projectCfg is the snapshot of project configuration to use for 144 // the auto-bug filing run. 145 projectCfg *compiledcfg.ProjectConfig 146 // MaxBugsFiledPerRun is the maximum number of bugs to file each time 147 // BugUpdater runs. This throttles the rate of changes to the bug system. 148 MaxBugsFiledPerRun int 149 // UpdateRuleBatchSize is the maximum number of rules to update in one 150 // transaction, when updating rule bug management state. 151 UpdateRuleBatchSize int 152 // Timestamp of the cron job. Used to timestamp policy activations/deactivations 153 // that happen as a result of this run. 
154 RunTimestamp time.Time 155 } 156 157 // NewBugUpdater initialises a new BugUpdater. 158 func NewBugUpdater(project string, mgrs map[string]BugManager, ac AnalysisClient, projectCfg *compiledcfg.ProjectConfig, runTimestamp time.Time) *BugUpdater { 159 return &BugUpdater{ 160 project: project, 161 managers: mgrs, 162 analysisClient: ac, 163 projectCfg: projectCfg, 164 MaxBugsFiledPerRun: 1, // Default value. 165 UpdateRuleBatchSize: 1000, // Default value. 166 RunTimestamp: runTimestamp, 167 } 168 } 169 170 // Run files/updates bugs to match high-impact clusters as 171 // identified by analysis. Each bug has a corresponding failure association 172 // rule. 173 // The passed progress should reflect the progress of re-clustering as captured 174 // in the latest analysis. 175 func (b *BugUpdater) Run(ctx context.Context, reclusteringProgress *runs.ReclusteringProgress) error { 176 // Verify we are not currently reclustering to a new version of 177 // algorithms or project configuration. If we are, we should 178 // suspend bug creation, priority updates and auto-closure 179 // as cluster impact is unreliable. 180 metricsValid := b.verifyClusterImpactValid(ctx, reclusteringProgress) 181 182 activeRules, err := rules.ReadActive(span.Single(ctx), b.project) 183 if err != nil { 184 return errors.Annotate(err, "read active failure association rules").Err() 185 } 186 187 metricsByRuleID := make(map[string]bugs.ClusterMetrics) 188 if metricsValid { 189 var thresholds []*configpb.ImpactMetricThreshold 190 for _, p := range b.projectCfg.Config.BugManagement.GetPolicies() { 191 thresholds = append(thresholds, bugs.ActivationThresholds(p)...) 192 } 193 194 // We want to read analysis for two categories of clusters: 195 // - Bug Clusters: to update the priority of filed bugs. 196 // - Impactful Suggested Clusters: if any suggested clusters may be 197 // near the threshold to file a new bug for, we want to 198 // read them, so we can file a bug. 
(Note: the thresholding applied 199 // here is weaker than the actual bug filing criteria which is 200 // implemented in this package, it exists mainly to avoid pulling 201 // back all suggested clusters). 202 clusters, err := b.analysisClient.ReadImpactfulClusters(ctx, analysis.ImpactfulClusterReadOptions{ 203 Project: b.project, 204 Thresholds: thresholds, 205 AlwaysIncludeBugClusters: true, 206 }) 207 if err != nil { 208 return errors.Annotate(err, "read impactful clusters").Err() 209 } 210 211 // blockedSourceClusterIDs is the set of source cluster IDs for which 212 // filing new bugs should be suspended. 213 blockedSourceClusterIDs := make(map[clustering.ClusterID]struct{}) 214 for _, r := range activeRules { 215 if !reclusteringProgress.IncorporatesRulesVersion(r.CreateTime) { 216 // If a bug cluster was recently filed for a source cluster, and 217 // re-clustering and analysis is not yet complete (to move the 218 // impact from the source cluster to the bug cluster), do not file 219 // another bug for the source cluster. 220 // (Of course, if a bug cluster was filed for a source cluster, 221 // but the bug cluster's failure association rule was subsequently 222 // modified (e.g. narrowed), it is allowed to file another bug 223 // if the residual impact justifies it.) 224 blockedSourceClusterIDs[r.SourceCluster] = struct{}{} 225 } 226 } 227 228 if err := b.fileNewBugs(ctx, clusters, blockedSourceClusterIDs); err != nil { 229 return err 230 } 231 232 for _, cluster := range clusters { 233 if cluster.ClusterID.Algorithm == rulesalgorithm.AlgorithmName { 234 // Use only impact from latest algorithm version. 235 ruleID := cluster.ClusterID.ID 236 metricsByRuleID[ruleID] = ExtractResidualMetrics(cluster) 237 } 238 } 239 } 240 241 var rms []ruleWithMetrics 242 for _, rule := range activeRules { 243 var metrics bugs.ClusterMetrics 244 245 // Metrics are valid if re-clustering and analysis ran on the latest 246 // version of this failure association rule. 
This avoids bugs getting 247 // erroneous priority changes while metrics information is incomplete. 248 ruleMetricsValid := metricsValid && 249 reclusteringProgress.IncorporatesRulesVersion(rule.PredicateLastUpdateTime) 250 251 if ruleMetricsValid { 252 var ok bool 253 metrics, ok = metricsByRuleID[rule.RuleID] 254 if !ok { 255 // If there is no analysis, this means the cluster is 256 // empty. Use empty impact. 257 metrics = bugs.ClusterMetrics{} 258 } 259 } 260 // Else leave metrics as nil. Bug-updating code takes this as an 261 // indication valid metrics are not available and will not attempt 262 // priority updates/auto-closure. 263 264 rms = append(rms, ruleWithMetrics{ 265 RuleID: rule.RuleID, 266 Metrics: metrics, 267 }) 268 } 269 270 // Update bug management state (i.e. policy activations) for existing 271 // rules based on current cluster metrics. Prepare the bug update requests 272 // based on this state. 273 bugsToUpdate, err := b.updateBugManagementState(ctx, rms) 274 if err != nil { 275 return errors.Annotate(err, "update bug management state").Err() 276 } 277 278 // Break bug updates down by bug system. 279 bugUpdatesBySystem := make(map[string][]bugs.BugUpdateRequest) 280 for _, bug := range bugsToUpdate { 281 bugUpdates := bugUpdatesBySystem[bug.Bug.System] 282 bugUpdates = append(bugUpdates, bug) 283 bugUpdatesBySystem[bug.Bug.System] = bugUpdates 284 } 285 286 // Perform bug updates. 287 var errs []error 288 for system, systemBugsToUpdate := range bugUpdatesBySystem { 289 err := b.updateBugsForSystem(ctx, system, systemBugsToUpdate) 290 if err != nil { 291 errs = append(errs, errors.Annotate(err, "updating bugs in %s", system).Err()) 292 } 293 } 294 // Returns nil if len(errs) == 0. 295 return errors.Append(errs...) 296 } 297 298 type ruleWithMetrics struct { 299 // Rule identifier. 300 RuleID string 301 // The bug cluster metrics. May be nil if no reliable metrics 302 // are available because reclustering is in progress. 
303 Metrics bugs.ClusterMetrics 304 } 305 306 // updateBugManagementState updates policy activations for the 307 // specified rules using the given current metric values. 308 // 309 // BugUpdateRequests then are created based on the read rules 310 // and updated bug management state. The returned BugUpdateRequests 311 // will be in 1:1 correspondance to the specified rules. 312 func (b *BugUpdater) updateBugManagementState(ctx context.Context, rs []ruleWithMetrics) ([]bugs.BugUpdateRequest, error) { 313 // Read and update bug management state in batches. 314 // Batching is required as Spanner limits the number of mutations 315 // per transaction to 40,000 (as at August 2023): 316 // https://cloud.google.com/spanner/quotas#limits-for 317 batches := batch(rs, b.UpdateRuleBatchSize) 318 319 result := make([]bugs.BugUpdateRequest, 0, len(rs)) 320 for _, ruleBatch := range batches { 321 var batchResult []bugs.BugUpdateRequest 322 batchResult, err := b.updateBugManagementStateBatch(ctx, ruleBatch) 323 if err != nil { 324 return nil, err 325 } 326 327 result = append(result, batchResult...) 328 } 329 return result, nil 330 } 331 332 func batch[K any](items []K, batchSize int) [][]K { 333 if batchSize < 1 { 334 panic("batch size must be greater than 0") 335 } 336 337 batchCount := (len(items) + batchSize - 1) / batchSize 338 result := make([][]K, 0, batchCount) 339 for i := 0; i < batchCount; i++ { 340 batchStartIndex := i * batchSize // inclusive 341 batchEndIndex := batchStartIndex + batchSize // exclusive 342 if batchEndIndex > len(items) { 343 batchEndIndex = len(items) 344 } 345 result = append(result, items[batchStartIndex:batchEndIndex]) 346 } 347 return result 348 } 349 350 // updateBugManagementStateBatch updates policy activations for the 351 // specified rules using the given current metric values. 352 // 353 // BugUpdateRequests then are created based on the read rules 354 // and updated bug management state. 
The returned BugUpdateRequests 355 // will be in 1:1 correspondance to the specified rules. 356 func (b *BugUpdater) updateBugManagementStateBatch(ctx context.Context, rulesAndMetrics []ruleWithMetrics) ([]bugs.BugUpdateRequest, error) { 357 ruleIDs := make([]string, 0, len(rulesAndMetrics)) 358 for _, rule := range rulesAndMetrics { 359 ruleIDs = append(ruleIDs, rule.RuleID) 360 } 361 362 var result []bugs.BugUpdateRequest 363 f := func(ctx context.Context) error { 364 // This transaction may be retried. Reset the result each time 365 // the transaction runs to avoid data from previous aborted 366 // attempts leaking into subsequent attempts. 367 result = make([]bugs.BugUpdateRequest, 0, len(rulesAndMetrics)) 368 369 // Read the rules in the transaction again to implement an 370 // atomic Read-Update transaction, which protects against 371 // update races. Subsequent bug-filing action will be based 372 // only on this second read. 373 // N.B.: ReadMany returns items in 1:1 correspondence to the request. 374 rs, err := rules.ReadMany(ctx, b.project, ruleIDs) 375 if err != nil { 376 return errors.Annotate(err, "read rules").Err() 377 } 378 379 for i, r := range rs { 380 // Fetches the corresponding metrics for a rule. 381 clusterMetrics := rulesAndMetrics[i].Metrics 382 383 // If metrics data is valid (e.g. no reclustering in progress). 384 if clusterMetrics != nil { 385 // Update which policies are active. 386 updatedBugManagementState, changed := bugs.UpdatePolicyActivations(r.BugManagementState, b.projectCfg.Config.BugManagement.GetPolicies(), clusterMetrics, b.RunTimestamp) 387 if changed { 388 // Only update the rule if a policy has activated or 389 // deactivated, to avoid unnecessary writes and rule 390 // cache invalidations. 
391 r.BugManagementState = updatedBugManagementState 392 393 opts := rules.UpdateOptions{} 394 ms, err := rules.Update(r, opts, rules.LUCIAnalysisSystem) 395 if err != nil { 396 return errors.Annotate(err, "update rule").Err() 397 } 398 span.BufferWrite(ctx, ms) 399 } 400 } 401 402 updateRequest := bugs.BugUpdateRequest{ 403 Bug: r.BugID, 404 IsManagingBug: r.IsManagingBug, 405 IsManagingBugPriority: r.IsManagingBugPriority, 406 IsManagingBugPriorityLastUpdated: r.IsManagingBugPriorityLastUpdateTime, 407 RuleID: r.RuleID, 408 } 409 updateRequest.BugManagementState = r.BugManagementState 410 result = append(result, updateRequest) 411 } 412 return nil 413 } 414 if _, err := span.ReadWriteTransaction(ctx, f); err != nil { 415 return nil, err 416 } 417 return result, nil 418 } 419 420 func (b *BugUpdater) updateBugsForSystem(ctx context.Context, system string, bugsToUpdate []bugs.BugUpdateRequest) error { 421 manager, ok := b.managers[system] 422 if !ok { 423 logging.Warningf(ctx, "Encountered bug(s) with an unrecognised manager: %q", system) 424 return nil 425 } 426 427 // Keep a minute of time in reserve to update rules. 428 // It is important that we still update the rules for bugs we did 429 // successfully update as some bug behaviours rely on this as 430 // part of their control loop (we will keep posting the same 431 // comment on the bug until the rule is updated). 432 mgrCtx, cancel := bugs.Shorten(ctx, time.Minute) 433 defer cancel() 434 435 logging.Debugf(ctx, "Considering update of %v %s bugs in project %s", len(bugsToUpdate), system, b.project) 436 437 responses, err := manager.Update(mgrCtx, bugsToUpdate) 438 if err != nil { 439 // Catastrophic error, exit immediately. 440 return errors.Annotate(err, "update bugs").Err() 441 } 442 443 // The set of non-catastrophic errors encountered so far. 444 var errs []error 445 // The set of bugs marked as duplicate encountered. 
446 var duplicateBugs []bugs.DuplicateBugDetails 447 // The updates to failure association rules required. 448 var updateRuleRequests []updateRuleRequest 449 450 for i, rsp := range responses { 451 if rsp.Error != nil { 452 // Capture the error, but continue processing this bug 453 // and other bugs, as partial success is possible 454 // and pending rule updates must be applied. 455 err := errors.Annotate(rsp.Error, "updating bug (%s)", bugsToUpdate[i].Bug.String()).Err() 456 errs = append(errs, err) 457 logging.Errorf(ctx, "%s", err) 458 } 459 460 if rsp.IsDuplicate { 461 duplicateBugs = append(duplicateBugs, bugs.DuplicateBugDetails{ 462 RuleID: bugsToUpdate[i].RuleID, 463 Bug: bugsToUpdate[i].Bug, 464 IsAssigned: rsp.IsDuplicateAndAssigned, 465 }) 466 // Inhibit archiving if rules are duplicates. 467 rsp.ShouldArchive = false 468 } 469 if rsp.ShouldArchive || rsp.DisableRulePriorityUpdates || rsp.RuleAssociationNotified || len(rsp.PolicyActivationsNotified) > 0 { 470 logging.Fields{ 471 "RuleID": bugsToUpdate[i].RuleID, 472 "BugID": bugsToUpdate[i].Bug.String(), 473 "Archive": rsp.ShouldArchive, 474 "DisableRulePriorityUpdates": rsp.DisableRulePriorityUpdates, 475 "RuleAssociationNotified": rsp.RuleAssociationNotified, 476 "PolicyActivationsNotified": rsp.PolicyActivationsNotified, 477 }.Debugf(ctx, "Preparing rule update for bug %s", bugsToUpdate[i].Bug.String()) 478 479 updateRuleRequests = append(updateRuleRequests, updateRuleRequest{ 480 RuleID: bugsToUpdate[i].RuleID, 481 BugID: bugsToUpdate[i].Bug, 482 Archive: rsp.ShouldArchive, 483 DisableRulePriorityUpdates: rsp.DisableRulePriorityUpdates, 484 RuleAssociationNotified: rsp.RuleAssociationNotified, 485 PolicyActivationsNotified: rsp.PolicyActivationsNotified, 486 }) 487 } 488 } 489 490 if err := b.updateRules(ctx, updateRuleRequests); err != nil { 491 err = errors.Annotate(err, "updating rules after updating bugs").Err() 492 errs = append(errs, err) 493 logging.Errorf(ctx, "%s", err) 494 } 495 496 // 
Handle bugs marked as duplicate. 497 for _, duplicateDetails := range duplicateBugs { 498 if err := b.handleDuplicateBug(ctx, duplicateDetails); err != nil { 499 err = errors.Annotate(err, "handling duplicate bug (%s)", duplicateDetails.Bug.String()).Err() 500 errs = append(errs, err) 501 logging.Errorf(ctx, "%s", err) 502 } 503 } 504 // Returns nil if len(errs) == 0. 505 return errors.Append(errs...) 506 } 507 508 func (b *BugUpdater) verifyClusterImpactValid(ctx context.Context, progress *runs.ReclusteringProgress) bool { 509 if progress.IsReclusteringToNewAlgorithms() { 510 logging.Warningf(ctx, "Auto-bug filing paused for project %s as re-clustering to new algorithms is in progress.", b.project) 511 return false 512 } 513 if progress.IsReclusteringToNewConfig() { 514 logging.Warningf(ctx, "Auto-bug filing paused for project %s as re-clustering to new configuration is in progress.", b.project) 515 return false 516 } 517 if algorithms.AlgorithmsVersion != progress.Next.AlgorithmsVersion { 518 logging.Warningf(ctx, "Auto-bug filing paused for project %s as bug-filing is running mismatched algorithms version %v (want %v).", 519 b.project, algorithms.AlgorithmsVersion, progress.Next.AlgorithmsVersion) 520 return false 521 } 522 if !b.projectCfg.LastUpdated.Equal(progress.Next.ConfigVersion) { 523 logging.Warningf(ctx, "Auto-bug filing paused for project %s as bug-filing is running mismatched config version %v (want %v).", 524 b.project, b.projectCfg.LastUpdated, progress.Next.ConfigVersion) 525 return false 526 } 527 return true 528 } 529 530 func (b *BugUpdater) fileNewBugs(ctx context.Context, clusters []*analysis.Cluster, blockedClusterIDs map[clustering.ClusterID]struct{}) error { 531 // The set of clusters IDs to file bugs for. Used for deduplicating creation 532 // requests accross policies. 533 clusterIDsToCreateBugsFor := make(map[clustering.ClusterID]struct{}) 534 535 // The list of clusters to file bugs for. 
Uses a list instead of a set to ensure 536 // the order that bugs are created is deterministic and matches the order that 537 // policies are configured, which simplifies testing. 538 var clustersToCreateBugsFor []*analysis.Cluster 539 540 for _, p := range b.projectCfg.Config.BugManagement.GetPolicies() { 541 sortByPolicyBugFilingPreference(clusters, p) 542 543 for _, cluster := range clusters { 544 if cluster.ClusterID.IsBugCluster() { 545 // Never file another bug for a bug cluster. 546 continue 547 } 548 549 // Was a bug recently filed for this suggested cluster? 550 // We want to avoid race conditions whereby we file multiple bug 551 // clusters for the same suggested cluster, because re-clustering and 552 // re-analysis has not yet run and moved residual impact from the 553 // suggested cluster to the bug cluster. 554 _, ok := blockedClusterIDs[cluster.ClusterID] 555 if ok { 556 // Do not file a bug. 557 continue 558 } 559 560 // Were the failures are confined to only automation CLs 561 // and/or 1-2 user CLs? In other words, are the failures in this 562 // clusters unlikely to be present in the tree? 563 if cluster.DistinctUserCLsWithFailures7d.Residual < 3 && 564 cluster.PostsubmitBuildsWithFailures7d.Residual == 0 { 565 // Do not file a bug. 566 continue 567 } 568 569 // Only file a bug if the residual impact exceeds the threshold. 570 impact := ExtractResidualMetrics(cluster) 571 bugFilingThresholds := bugs.ActivationThresholds(p) 572 if cluster.ClusterID.IsTestNameCluster() { 573 // Use an inflated threshold for test name clusters to bias 574 // bug creation towards failure reason clusters. 575 bugFilingThresholds = 576 bugs.InflateThreshold(bugFilingThresholds, 577 testnameThresholdInflationPercent) 578 } 579 if !impact.MeetsAnyOfThresholds(bugFilingThresholds) { 580 continue 581 } 582 583 // Create a bug for this cluster, deduplicating creation 584 // requests across policies. 
585 if _, ok := clusterIDsToCreateBugsFor[cluster.ClusterID]; !ok { 586 clustersToCreateBugsFor = append(clustersToCreateBugsFor, cluster) 587 clusterIDsToCreateBugsFor[cluster.ClusterID] = struct{}{} 588 } 589 590 // The policy has picked the one cluster it wants to file a bug for. 591 // If this cluster is the same as another policy, the one bug is filed 592 // for both policies. 593 // 594 // This ensures if a top failure cluster clusters well by both reason 595 // and test name, we do not file bugs for both. 596 break 597 } 598 } 599 600 // File new bugs. 601 bugsFiled := 0 602 for _, cluster := range clustersToCreateBugsFor { 603 if bugsFiled >= b.MaxBugsFiledPerRun { 604 break 605 } 606 created, err := b.createBug(ctx, cluster) 607 if err != nil { 608 return err 609 } 610 if created { 611 bugsFiled++ 612 } 613 } 614 return nil 615 } 616 617 type updateRuleRequest struct { 618 // The identity of the rule. 619 RuleID string 620 // The bug that was updated and/or from which the updates were sourced. 621 // If the bug on the rule has changed from this value, rule updates will 622 // not be applied. 623 BugID bugs.BugID 624 // Whether the rule should be archived. 625 Archive bool 626 // Whether rule priority updates should be disabled. 627 DisableRulePriorityUpdates bool 628 // Whether BugManagementState.RuleAssociationNotified should be set. 629 RuleAssociationNotified bool 630 // A map containing the IDs of policies for which 631 // BugManagementState.Policies[<policyID>].ActivationNotified should 632 // be set. 633 PolicyActivationsNotified map[bugs.PolicyID]struct{} 634 } 635 636 // updateRules applies updates to failure association rules 637 // following a round of bug updates. This includes: 638 // - archiving rules if the bug was detected in an archived state 639 // - disabling automatic priority updates if it was detected that 640 // the user manually set the bug priority. 641 // 642 // requests and response slices should have 1:1 correspondance, i.e. 
643 // requests[i] corresponds to responses[i]. 644 func (b *BugUpdater) updateRules(ctx context.Context, requests []updateRuleRequest) error { 645 // Perform updates in batches to stay within mutation Spanner limits. 646 requestBatches := batch(requests, b.UpdateRuleBatchSize) 647 for _, batch := range requestBatches { 648 err := b.updateRulesBatch(ctx, batch) 649 if err != nil { 650 return err 651 } 652 } 653 return nil 654 } 655 656 func (b *BugUpdater) updateRulesBatch(ctx context.Context, requests []updateRuleRequest) error { 657 ruleIDs := make([]string, 0, len(requests)) 658 for _, req := range requests { 659 ruleIDs = append(ruleIDs, req.RuleID) 660 } 661 f := func(ctx context.Context) error { 662 // Perform transactional read-update of rule to protect 663 // against update races. 664 rs, err := rules.ReadMany(ctx, b.project, ruleIDs) 665 if err != nil { 666 return errors.Annotate(err, "read rules").Err() 667 } 668 for i, rule := range rs { 669 updateRequest := requests[i] 670 if rule.RuleID != updateRequest.RuleID { 671 // ReadMany's response should be in 1:1 correspondance 672 // to the request. 673 panic("logic error") 674 } 675 if rule.BugID != updateRequest.BugID { 676 // A data race has occured: the rule has been modified while 677 // we were updating bugs, and now the update to the rule no 678 // longer makes sense. This should only occur rarely. 
679 logging.Warningf(ctx, "Bug associated with rule %v changed during bug-filing run, skipping updates to rule.") 680 continue 681 } 682 updateOptions := rules.UpdateOptions{} 683 if updateRequest.Archive { 684 rule.IsActive = false 685 updateOptions.IsAuditableUpdate = true 686 updateOptions.PredicateUpdated = true 687 } 688 if updateRequest.DisableRulePriorityUpdates { 689 rule.IsManagingBugPriority = false 690 updateOptions.IsAuditableUpdate = true 691 updateOptions.IsManagingBugPriorityUpdated = true 692 } 693 if updateRequest.RuleAssociationNotified { 694 rule.BugManagementState.RuleAssociationNotified = true 695 } 696 for policyID := range updateRequest.PolicyActivationsNotified { 697 policyState, ok := rule.BugManagementState.PolicyState[string(policyID)] 698 if !ok { 699 // The policy has been deleted during the bug-filing run. 700 logging.Warningf(ctx, "Policy activation notified for policy %v, which is now deleted.", policyID) 701 continue 702 } 703 policyState.ActivationNotified = true 704 } 705 ms, err := rules.Update(rule, updateOptions, rules.LUCIAnalysisSystem) 706 if err != nil { 707 // Validation error; this should never happen here. 708 return errors.Annotate(err, "prepare rule update").Err() 709 } 710 span.BufferWrite(ctx, ms) 711 } 712 return nil 713 } 714 _, err := span.ReadWriteTransaction(ctx, f) 715 if err != nil { 716 return errors.Annotate(err, "update rules").Err() 717 } 718 return nil 719 } 720 721 // handleDuplicateBug handles a duplicate bug, merging its failure association 722 // rule with the bug it is ultimately merged into (creating the rule if it does 723 // not exist). In case of unhandleable errors, the source bug is kicked out of the 724 // duplicate state and an error message is posted on the bug. 
725 func (b *BugUpdater) handleDuplicateBug(ctx context.Context, duplicateDetails bugs.DuplicateBugDetails) error { 726 err := b.handleDuplicateBugHappyPath(ctx, duplicateDetails) 727 if errors.Is(err, mergeIntoCycleErr) { 728 request := bugs.UpdateDuplicateSourceRequest{ 729 BugDetails: duplicateDetails, 730 ErrorMessage: mergeIntoCycleMessage, 731 } 732 if err := b.updateDuplicateSource(ctx, request); err != nil { 733 return errors.Annotate(err, "update source bug after a cycle was found").Err() 734 } 735 } else if errors.Is(err, ruleDefinitionTooLongErr) { 736 request := bugs.UpdateDuplicateSourceRequest{ 737 BugDetails: duplicateDetails, 738 ErrorMessage: ruleDefinitionTooLongMessage, 739 } 740 if err := b.updateDuplicateSource(ctx, request); err != nil { 741 return errors.Annotate(err, "update source bug after merging rule definition was found too long").Err() 742 } 743 } else if errors.Is(err, mergeIntoPermissionErr) { 744 request := bugs.UpdateDuplicateSourceRequest{ 745 BugDetails: duplicateDetails, 746 ErrorMessage: mergeIntoPermissionMessage, 747 } 748 if err := b.updateDuplicateSource(ctx, request); err != nil { 749 return errors.Annotate(err, "update source bug after merging rule definition encountered a permission error").Err() 750 } 751 } else if err != nil { 752 return err 753 } 754 return nil 755 } 756 757 // handleDuplicateBugHappyPath handles a duplicate bug, merging its failure association 758 // rule with the bug it is ultimately merged into (creating the rule if it does 759 // not exist). The original rule is archived. 760 func (b *BugUpdater) handleDuplicateBugHappyPath(ctx context.Context, duplicateDetails bugs.DuplicateBugDetails) error { 761 // Chase the bug merged-into graph until we find the sink of the graph. 762 // (The canonical bug of the chain of duplicate bugs.) 763 destBug, err := b.resolveMergedIntoBug(ctx, duplicateDetails.Bug) 764 if err != nil { 765 // May return mergeIntoCycleErr. 
766 return err 767 } 768 769 var destinationBugRuleID string 770 771 f := func(ctx context.Context) error { 772 sourceRule, _, err := readRuleForBugAndProject(ctx, duplicateDetails.Bug, b.project) 773 if err != nil { 774 return errors.Annotate(err, "reading rule for source bug").Err() 775 } 776 if !sourceRule.IsActive { 777 // The source rule is no longer active. This is a race condition 778 // as we only do bug updates for rules that exist at the time 779 // we start bug updates. 780 // An inactive rule does not match any failures so merging the 781 // it into another rule should have no effect anyway. 782 return nil 783 } 784 // Try and read the rule for the bug we are merging into. 785 destinationRule, _, err := 786 readRuleForBugAndProject(ctx, destBug, b.project) 787 if err != nil { 788 return errors.Annotate(err, "reading rule for destination bug").Err() 789 } 790 if destinationRule == nil { 791 // The destination bug does not have a rule in this project. 792 // Simply update the source rule to point to the new bug. 793 sourceRule.BugID = destBug 794 795 // As the bug has changed, flags tracking notification of policy 796 // activation must be reset. 797 if sourceRule.BugManagementState.PolicyState != nil { 798 for _, policyState := range sourceRule.BugManagementState.PolicyState { 799 policyState.ActivationNotified = false 800 } 801 } 802 803 // The destination bug is not a LUCI Analysis bug. 804 // Do not automatically verify/auto-close it as we do not 805 // know what problems it was for. 806 sourceRule.IsManagingBug = false 807 808 sourceRule.BugManagementState.RuleAssociationNotified = false 809 810 ms, err := rules.Update(sourceRule, rules.UpdateOptions{ 811 IsAuditableUpdate: true, 812 }, rules.LUCIAnalysisSystem) 813 if err != nil { 814 // Indicates validation error. Should never happen. 
815 return err 816 } 817 span.BufferWrite(ctx, ms) 818 819 destinationBugRuleID = sourceRule.RuleID 820 return nil 821 } else { 822 // The bug we are a duplicate of already has a rule. 823 if destinationRule.IsActive { 824 // Merge the source and destination rules with an "OR". 825 mergedRule, err := lang.Merge(destinationRule.RuleDefinition, sourceRule.RuleDefinition) 826 if err != nil { 827 return errors.Annotate(err, "merging rules").Err() 828 } 829 if len(mergedRule) > rules.MaxRuleDefinitionLength { 830 // The merged rule is too long to store. 831 return ruleDefinitionTooLongErr 832 } 833 destinationRule.RuleDefinition = mergedRule 834 } else { 835 // Else: an inactive rule does not match any failures, so we should 836 // use only the rule from the source bug. 837 destinationRule.RuleDefinition = sourceRule.RuleDefinition 838 } 839 840 // Disable the source rule. 841 sourceRule.IsActive = false 842 ms, err := rules.Update(sourceRule, rules.UpdateOptions{ 843 IsAuditableUpdate: true, 844 PredicateUpdated: true, 845 }, rules.LUCIAnalysisSystem) 846 if err != nil { 847 // Indicates validation error. Should never happen. 848 return err 849 } 850 span.BufferWrite(ctx, ms) 851 852 // Update the rule on the destination rule. 853 destinationRule.IsActive = true 854 ms, err = rules.Update(destinationRule, rules.UpdateOptions{ 855 IsAuditableUpdate: true, 856 PredicateUpdated: true, 857 }, rules.LUCIAnalysisSystem) 858 if err != nil { 859 return err 860 } 861 span.BufferWrite(ctx, ms) 862 863 destinationBugRuleID = destinationRule.RuleID 864 return nil 865 } 866 } 867 // Update source and destination rules in one transaction, to ensure 868 // consistency. 869 _, err = span.ReadWriteTransaction(ctx, f) 870 if err != nil { 871 return err 872 } 873 874 if !b.projectCfg.Config.BugManagement.GetDisableDuplicateBugComments() { 875 // Notify that the bugs were successfully merged. 
876 request := bugs.UpdateDuplicateSourceRequest{ 877 BugDetails: duplicateDetails, 878 DestinationRuleID: destinationBugRuleID, 879 } 880 if err := b.updateDuplicateSource(ctx, request); err != nil { 881 return errors.Annotate(err, "updating source bug").Err() 882 } 883 } 884 885 return err 886 } 887 888 // resolveMergedIntoBug resolves the bug the given bug is ultimately merged 889 // into. 890 func (b *BugUpdater) resolveMergedIntoBug(ctx context.Context, bug bugs.BugID) (bugs.BugID, error) { 891 isResolved := false 892 mergedIntoBug := bug 893 const maxResolutionSteps = 5 894 for i := 0; i < maxResolutionSteps; i++ { 895 system := mergedIntoBug.System 896 manager, ok := b.managers[system] 897 if !ok { 898 if mergedIntoBug.System == "buganizer" { 899 // Do not attempt to resolve the canoncial bug within 900 // buganizer if buganizer is not registered. We hit this 901 // path with buganizer not registered if a monorail bug marks 902 // itself as a duplicate of a buganizer bug. 903 isResolved = true 904 break 905 } 906 return bugs.BugID{}, fmt.Errorf("encountered unknown bug system: %q", system) 907 } 908 mergedInto, err := manager.GetMergedInto(ctx, mergedIntoBug) 909 if status.Code(err) == codes.PermissionDenied { 910 // We don't have permission to view the issue 911 return bugs.BugID{}, mergeIntoPermissionErr 912 } else if err != nil { 913 return bugs.BugID{}, err 914 } 915 if mergedInto == nil { 916 // We have found the canoncial merged-into bug. 917 isResolved = true 918 break 919 } else { 920 mergedIntoBug = *mergedInto 921 } 922 } 923 if !isResolved { 924 // We found a cycle in the graph. 925 return bugs.BugID{}, mergeIntoCycleErr 926 } 927 if mergedIntoBug == bug { 928 // This would normally never occur, but is possible in some 929 // exceptional scenarios like race conditions where a cycle 930 // is broken during the graph traversal, or a bug which 931 // was marked as duplicate but is no longer marked as duplicate 932 // now. 
933 return bugs.BugID{}, fmt.Errorf("cannot deduplicate a bug into itself") 934 } 935 return mergedIntoBug, nil 936 } 937 938 // updateDuplicateSource updates the source bug of a duplicate 939 // bug pair (source bug, destination bug). 940 // It either posts a message notifying the user the rule was successfully 941 // merged to the destination, or notifies the user of the error and 942 // marks the bug no longer a duplicate (to avoid repeated attempts to 943 // handle the problematic duplicate bug). 944 func (b *BugUpdater) updateDuplicateSource(ctx context.Context, request bugs.UpdateDuplicateSourceRequest) error { 945 manager, ok := b.managers[request.BugDetails.Bug.System] 946 if !ok { 947 return fmt.Errorf("encountered unknown bug system: %q", request.BugDetails.Bug.System) 948 } 949 err := manager.UpdateDuplicateSource(ctx, request) 950 if err != nil { 951 return err 952 } 953 return nil 954 } 955 956 // readRuleForBugAndProject reads the failure association rule for the given 957 // bug in the given project, if it exists. It additionally returns whether 958 // there is any rule in the system that manages the given bug, even if in 959 // a different project. 960 // If the rule cannot be read, it returns nil. 961 func readRuleForBugAndProject(ctx context.Context, bug bugs.BugID, project string) (rule *rules.Entry, anyRuleManaging bool, err error) { 962 rules, err := rules.ReadByBug(ctx, bug) 963 if err != nil { 964 return nil, false, err 965 } 966 rule = nil 967 anyRuleManaging = false 968 for _, r := range rules { 969 if r.IsManagingBug { 970 anyRuleManaging = true 971 } 972 if r.Project == project { 973 rule = r 974 } 975 } 976 return rule, anyRuleManaging, nil 977 } 978 979 // sortByPolicyBugFilingPreference sorts clusters based on our preference 980 // to file bugs for these clusters. 
981 func sortByPolicyBugFilingPreference(cs []*analysis.Cluster, policy *configpb.BugManagementPolicy) { 982 // The current ranking approach prefers filing bugs for clusters 983 // which more strongly meet the bug-filing threshold, with a bias 984 // towards reason clusters. 985 // 986 // The order of this ranking is only important where there are 987 // multiple competing clusters which meet the bug-filing threshold. 988 // As bug filing runs relatively often, except in cases of contention, 989 // the first bug to meet the threshold will be filed. 990 sort.Slice(cs, func(i, j int) bool { 991 // N.B. This does not rank clusters perfectly where the policy has 992 // multiple metrics, as the first metric may only slightly the 993 // threshold, but the second metric strongly exceeds. 994 // Most policies have only one metric, however, so this should 995 // be pretty rare. 996 for _, metric := range policy.Metrics { 997 if equal, less := rankByMetric(cs[i], cs[j], metrics.ID(metric.MetricId)); !equal { 998 return less 999 } 1000 } 1001 // If all else fails, sort by cluster ID. This is mostly to ensure 1002 // the code behaves deterministically when under unit testing. 1003 if cs[i].ClusterID.Algorithm != cs[j].ClusterID.Algorithm { 1004 return cs[i].ClusterID.Algorithm < cs[j].ClusterID.Algorithm 1005 } 1006 return cs[i].ClusterID.ID < cs[j].ClusterID.ID 1007 }) 1008 } 1009 1010 func rankByMetric(a, b *analysis.Cluster, metric metrics.ID) (equal bool, less bool) { 1011 valueA := a.MetricValues[metric].SevenDay.Residual 1012 valueB := b.MetricValues[metric].SevenDay.Residual 1013 // If one cluster we are comparing with is a test name cluster, 1014 // give the other cluster an impact boost in the comparison, so 1015 // that we bias towards filing it (instead of the test name cluster). 
1016 if b.ClusterID.IsTestNameCluster() { 1017 valueA = (valueA * (100 + testnameThresholdInflationPercent)) / 100 1018 } 1019 if a.ClusterID.IsTestNameCluster() { 1020 valueB = (valueB * (100 + testnameThresholdInflationPercent)) / 100 1021 } 1022 equal = (valueA == valueB) 1023 // a less than b in the sort order is defined as a having more impact 1024 // than b, so that clusters are sorted in descending impact order. 1025 less = (valueA > valueB) 1026 return equal, less 1027 } 1028 1029 // createBug files a new bug for the given suggested cluster, 1030 // and stores the association from bug to failures through a new 1031 // failure association rule. 1032 func (b *BugUpdater) createBug(ctx context.Context, cs *analysis.Cluster) (created bool, err error) { 1033 alg, err := algorithms.SuggestingAlgorithm(cs.ClusterID.Algorithm) 1034 if err == algorithms.ErrAlgorithmNotExist { 1035 // The cluster is for an old algorithm that no longer exists, or 1036 // for a new algorithm that is not known by us yet. 1037 // Do not file a bug. This is not an error, it is expected during 1038 // algorithm version changes. 1039 return false, nil 1040 } 1041 1042 summary := clusterSummaryFromAnalysis(cs) 1043 1044 // Double-check the failure matches the cluster. Generating a 1045 // failure association rule that does not match the suggested cluster 1046 // could result in indefinite creation of new bugs, as the system 1047 // will repeatedly create new failure association rules for the 1048 // same suggested cluster. 1049 // Mismatches should usually be transient as re-clustering will fix 1050 // up any incorrect clustering. 
1051 if hex.EncodeToString(alg.Cluster(b.projectCfg, &summary.Example)) != cs.ClusterID.ID { 1052 return false, errors.New("example failure did not match cluster ID") 1053 } 1054 rule, err := b.generateFailureAssociationRule(alg, &summary.Example) 1055 if err != nil { 1056 return false, errors.Annotate(err, "obtain failure association rule").Err() 1057 } 1058 1059 ruleID, err := rules.GenerateID() 1060 if err != nil { 1061 return false, errors.Annotate(err, "generating rule ID").Err() 1062 } 1063 1064 description, err := alg.ClusterDescription(b.projectCfg, summary) 1065 if err != nil { 1066 return false, errors.Annotate(err, "prepare bug description").Err() 1067 } 1068 1069 // Set policy activations starting from a state where no policies 1070 // are active. 1071 impact := ExtractResidualMetrics(cs) 1072 bugManagementState, _ := bugs.UpdatePolicyActivations(&bugspb.BugManagementState{}, b.projectCfg.Config.BugManagement.GetPolicies(), impact, b.RunTimestamp) 1073 1074 request := bugs.BugCreateRequest{ 1075 RuleID: ruleID, 1076 Description: description, 1077 } 1078 1079 activePolicyIDs := make(map[bugs.PolicyID]struct{}) 1080 for policyID, state := range bugManagementState.PolicyState { 1081 if state.IsActive { 1082 activePolicyIDs[bugs.PolicyID(policyID)] = struct{}{} 1083 } 1084 } 1085 request.ActivePolicyIDs = activePolicyIDs 1086 1087 system, err := b.routeToBugSystem(cs) 1088 if err != nil { 1089 return false, errors.Annotate(err, "extracting bug system").Err() 1090 } 1091 1092 if system == bugs.BuganizerSystem { 1093 var err error 1094 request.BuganizerComponent, err = extractBuganizerComponent(cs) 1095 if err != nil { 1096 return false, errors.Annotate(err, "extracting buganizer component").Err() 1097 } 1098 } else { 1099 request.MonorailComponents = extractMonorailComponents(cs) 1100 } 1101 1102 manager := b.managers[system] 1103 response := manager.Create(ctx, request) 1104 1105 if !response.Simulated && response.ID != "" { 1106 // We filed a bug. 
1107 // Create a failure association rule associating the failures with a bug. 1108 1109 // In filing a bug, we notified the rule association. 1110 bugManagementState.RuleAssociationNotified = true 1111 1112 // Record which policies we notified as activating. 1113 for policyID := range response.PolicyActivationsNotified { 1114 bugManagementState.PolicyState[string(policyID)].ActivationNotified = true 1115 } 1116 1117 newRule := &rules.Entry{ 1118 Project: b.project, 1119 RuleID: ruleID, 1120 RuleDefinition: rule, 1121 BugID: bugs.BugID{System: system, ID: response.ID}, 1122 IsActive: true, 1123 IsManagingBug: true, 1124 IsManagingBugPriority: true, 1125 SourceCluster: cs.ClusterID, 1126 BugManagementState: bugManagementState, 1127 } 1128 create := func(ctx context.Context) error { 1129 user := rules.LUCIAnalysisSystem 1130 ms, err := rules.Create(newRule, user) 1131 if err != nil { 1132 return err 1133 } 1134 span.BufferWrite(ctx, ms) 1135 return nil 1136 } 1137 if _, err := span.ReadWriteTransaction(ctx, create); err != nil { 1138 return false, errors.Annotate(err, "create rule").Err() 1139 } 1140 } 1141 1142 if response.Error != nil { 1143 // We encountered an error creating the bug. Note that this 1144 // is not mutually exclusive with having filed a bug, as 1145 // steps after creating the bug may have failed, and in 1146 // this case a failure association rule should still be created. 
1147 return false, errors.Annotate(response.Error, "create issue in %v (created ID: %q)", system, response.ID).Err() 1148 } 1149 1150 return true, nil 1151 } 1152 1153 func (b *BugUpdater) routeToBugSystem(cs *analysis.Cluster) (string, error) { 1154 hasMonorail := b.projectCfg.Config.BugManagement.GetMonorail() != nil 1155 hasBuganizer := b.projectCfg.Config.BugManagement.GetBuganizer() != nil 1156 defaultSystem := b.projectCfg.Config.BugManagement.GetDefaultBugSystem() 1157 1158 if !hasMonorail && !hasBuganizer { 1159 return "", errors.New("at least one bug filing system need to be configured") 1160 } 1161 // If only one bug system configured, pick that system. 1162 if !hasMonorail { 1163 return bugs.BuganizerSystem, nil 1164 } 1165 if !hasBuganizer { 1166 return bugs.MonorailSystem, nil 1167 } 1168 // When both bug systems are configured, pick the most suitable one. 1169 1170 // The most impactful monorail component. 1171 var topMonorailComponent analysis.TopCount 1172 for _, tc := range cs.TopMonorailComponents { 1173 if tc.Value == "" { 1174 continue 1175 } 1176 // Any monorail component is associated for more than 30% of the 1177 // failures in the cluster should be checked for top impact. 1178 if tc.Count > ((cs.MetricValues[metrics.Failures.ID].SevenDay.Nominal * 3) / 10) { 1179 if tc.Count > topMonorailComponent.Count || topMonorailComponent.Value == "" { 1180 topMonorailComponent = tc 1181 } 1182 } 1183 } 1184 1185 // The most impactful buganizer component. 1186 var topBuganizerComponent analysis.TopCount 1187 for _, tc := range cs.TopBuganizerComponents { 1188 if tc.Value == "" { 1189 continue 1190 } 1191 // Any buganizer component is associated for more than 30% of the 1192 // failures in the cluster should be checked for top impact. 
1193 if tc.Count > ((cs.MetricValues[metrics.Failures.ID].SevenDay.Nominal * 3) / 10) { 1194 if tc.Count > topBuganizerComponent.Count || topBuganizerComponent.Value == "" { 1195 topBuganizerComponent = tc 1196 } 1197 } 1198 } 1199 1200 if topMonorailComponent.Value == "" && topBuganizerComponent.Value == "" { 1201 return defaultBugSystemName(defaultSystem), nil 1202 } else if topMonorailComponent.Value != "" && topBuganizerComponent.Value == "" { 1203 return bugs.MonorailSystem, nil 1204 } else if topMonorailComponent.Value == "" && topBuganizerComponent.Value != "" { 1205 return bugs.BuganizerSystem, nil 1206 } else { 1207 // Return the system corresponding with the highest impact. 1208 if topMonorailComponent.Count > topBuganizerComponent.Count { 1209 return bugs.MonorailSystem, nil 1210 } else if topMonorailComponent.Count == topBuganizerComponent.Count { 1211 // If top components have equal impact, use the configured default system. 1212 return defaultBugSystemName(defaultSystem), nil 1213 } else { 1214 return bugs.BuganizerSystem, nil 1215 } 1216 } 1217 } 1218 1219 func extractBuganizerComponent(cs *analysis.Cluster) (int64, error) { 1220 for _, tc := range cs.TopBuganizerComponents { 1221 // The top buganizer component that is associated for more than 30% of the 1222 // failures in the cluster should be on the filed bug. 1223 if tc.Value != "" && tc.Count > ((cs.MetricValues[metrics.Failures.ID].SevenDay.Nominal*3)/10) { 1224 componentID, err := strconv.ParseInt(tc.Value, 10, 64) 1225 if err != nil { 1226 return 0, errors.Annotate(err, "parse buganizer component id").Err() 1227 } 1228 return componentID, nil 1229 } 1230 } 1231 return 0, nil 1232 } 1233 1234 func extractMonorailComponents(cs *analysis.Cluster) []string { 1235 var monorailComponents []string 1236 for _, tc := range cs.TopMonorailComponents { 1237 // Any monorail component is associated for more than 30% of the 1238 // failures in the cluster should be on the filed bug. 
1239 if tc.Count > ((cs.MetricValues[metrics.Failures.ID].SevenDay.Nominal * 3) / 10) { 1240 monorailComponents = append(monorailComponents, tc.Value) 1241 } 1242 } 1243 return monorailComponents 1244 } 1245 1246 func defaultBugSystemName(defaultSystem configpb.BugSystem) string { 1247 if defaultSystem == configpb.BugSystem_BUGANIZER { 1248 return bugs.BuganizerSystem 1249 } else { 1250 return bugs.MonorailSystem 1251 } 1252 } 1253 1254 func clusterSummaryFromAnalysis(c *analysis.Cluster) *clustering.ClusterSummary { 1255 example := clustering.Failure{ 1256 TestID: c.ExampleTestID(), 1257 } 1258 if c.ExampleFailureReason.Valid { 1259 example.Reason = &pb.FailureReason{PrimaryErrorMessage: c.ExampleFailureReason.StringVal} 1260 } 1261 // A list of 5 commonly occuring tests are included in bugs created 1262 // for failure reason clusters, to improve searchability by test name. 1263 var topTests []string 1264 for _, tt := range c.TopTestIDs { 1265 topTests = append(topTests, tt.Value) 1266 } 1267 return &clustering.ClusterSummary{ 1268 Example: example, 1269 TopTests: topTests, 1270 } 1271 } 1272 1273 func (b *BugUpdater) generateFailureAssociationRule(alg algorithms.Algorithm, failure *clustering.Failure) (string, error) { 1274 rule := alg.FailureAssociationRule(b.projectCfg, failure) 1275 1276 // Check the generated rule is valid and matches the failure. 1277 // An improperly generated failure association rule could result 1278 // in uncontrolled creation of new bugs. 1279 expr, err := lang.Parse(rule) 1280 if err != nil { 1281 return "", errors.Annotate(err, "rule generated by %s did not parse", alg.Name()).Err() 1282 } 1283 match := expr.Evaluate(failure) 1284 if !match { 1285 reason := "" 1286 if failure.Reason != nil { 1287 reason = failure.Reason.PrimaryErrorMessage 1288 } 1289 return "", fmt.Errorf("rule generated by %s did not match example failure (testID: %q, failureReason: %q)", 1290 alg.Name(), failure.TestID, reason) 1291 } 1292 return rule, nil 1293 }