go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/analysis/internal/bugs/updater/updater.go (about)

     1  // Copyright 2022 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package updater contains methods to orchestrate automatic bug management,
    16  // including automatic bug filing and automatic priority updates/auto-closure.
    17  package updater
    18  
    19  import (
    20  	"context"
    21  	"encoding/hex"
    22  	"fmt"
    23  	"sort"
    24  	"strconv"
    25  	"time"
    26  
    27  	"google.golang.org/grpc/codes"
    28  	"google.golang.org/grpc/status"
    29  
    30  	"go.chromium.org/luci/common/errors"
    31  	"go.chromium.org/luci/common/logging"
    32  	"go.chromium.org/luci/server/span"
    33  
    34  	"go.chromium.org/luci/analysis/internal/analysis"
    35  	"go.chromium.org/luci/analysis/internal/analysis/metrics"
    36  	"go.chromium.org/luci/analysis/internal/bugs"
    37  	bugspb "go.chromium.org/luci/analysis/internal/bugs/proto"
    38  	"go.chromium.org/luci/analysis/internal/clustering"
    39  	"go.chromium.org/luci/analysis/internal/clustering/algorithms"
    40  	"go.chromium.org/luci/analysis/internal/clustering/algorithms/rulesalgorithm"
    41  	"go.chromium.org/luci/analysis/internal/clustering/rules"
    42  	"go.chromium.org/luci/analysis/internal/clustering/rules/lang"
    43  	"go.chromium.org/luci/analysis/internal/clustering/runs"
    44  	"go.chromium.org/luci/analysis/internal/config/compiledcfg"
    45  	configpb "go.chromium.org/luci/analysis/proto/config"
    46  	pb "go.chromium.org/luci/analysis/proto/v1"
    47  )
    48  
// testnameThresholdInflationPercent is the percentage factor by which
// the bug filing threshold is inflated when applied to test-name clusters.
// This is to bias bug-filing towards failure reason clusters, which are
// seen as generally better scoped and more actionable (because they
// focus on one reason for the test failing.)
//
// The value of 34% was selected as it is sufficient to inflate any threshold
// values which are a '3' (e.g. CV runs rejected) to a '4'. Otherwise integer
// discretization of the statistics would cancel out any intended bias.
//
// If changing this value, please also update the comment in
// project_config.proto.
const testnameThresholdInflationPercent = 34

// mergeIntoCycleErr is the error returned if a cycle is detected in a bug's
// merged-into graph when handling a bug marked as duplicate.
var mergeIntoCycleErr = errors.New("a cycle was detected in the bug merged-into graph")

// mergeIntoPermissionErr is the error returned if we get a permission error
// while traversing and/or updating duplicate bugs.
var mergeIntoPermissionErr = errors.New("permission error occured while merging duplicate bugs")

// ruleDefinitionTooLongErr is the error returned if merging two failure
// association rules results in a rule that is too long.
var ruleDefinitionTooLongErr = errors.New("the merged rule definition is too long")

// mergeIntoCycleMessage is the message posted on bugs when LUCI Analysis
// cannot deal with a bug marked as the duplicate of another because of
// a cycle in the bug merged-into graph.
const mergeIntoCycleMessage = "LUCI Analysis cannot merge the failure" +
	" association rule for this bug into the rule for the merged-into bug," +
	" because a cycle was detected in the bug merged-into graph. Please" +
	" manually resolve the cycle, or update rules manually and archive the" +
	" rule for this bug."

// mergeIntoPermissionMessage is the user-facing message explaining that
// LUCI Analysis cannot merge the rule for a bug marked as the duplicate of
// another because it lacks permission to access the merged-into bug.
// NOTE(review): presumably posted on the bug like the other merge failure
// messages in this file; confirm against the duplicate handling code.
const mergeIntoPermissionMessage = "LUCI Analysis cannot merge the association rule" +
	" for this bug into the rule for the merged-into bug because" +
	" it doesn't have permission to access the merged-into bug." +
	" Please make sure that LUCI Analysis has access to all the" +
	" bugs in the bug duplicate chain, " +
	" or update rules manually and archive the rule for this bug."

// ruleDefinitionTooLongMessage is the message posted on bugs when
// LUCI Analysis cannot deal with a bug marked as the duplicate of another
// because the merged rule would be too long.
const ruleDefinitionTooLongMessage = "LUCI Analysis cannot merge the failure" +
	" association rule for this bug into the rule for the merged-into bug," +
	" because the merged failure association rule would be too long. Please" +
	" manually update the rule for the merged-into bug and archive the" +
	" rule for this bug."
    99  
// BugManager implements bug creation and bug updates for a bug-tracking
// system. The BugManager determines bug content and priority given a
// cluster.
//
// BugUpdater holds one BugManager per bug-tracking system, keyed by the
// system name.
type BugManager interface {
	// Create creates a new bug for the given request, returning its ID
	// (if a bug was created) and any encountered error.
	// NOTE(review): both are presumably carried inside the returned
	// BugCreateResponse, as there is no separate error return.
	Create(ctx context.Context, request bugs.BugCreateRequest) bugs.BugCreateResponse
	// Update updates the specified list of bugs.
	//
	// Exactly one response item is returned for each request item.
	// If an error is encountered on a specific bug, the error is recorded
	// on the bug's response item and processing continues.
	//
	// If a catastrophic error occurs, the error is returned
	// at the top-level and the responses slice should be ignored.
	Update(ctx context.Context, bugs []bugs.BugUpdateRequest) ([]bugs.BugUpdateResponse, error)
	// GetMergedInto reads the bug the given bug is merged into (if any).
	// This is to allow step-wise discovery of the canonical bug a bug
	// is merged into (if it exists and there is no cycle in the bug
	// merged-into graph).
	GetMergedInto(ctx context.Context, bug bugs.BugID) (*bugs.BugID, error)
	// UpdateDuplicateSource updates the source bug of a duplicate
	// bug relationship.
	// It normally posts a message advising the user LUCI Analysis
	// has merged the rule for the source bug to the destination
	// (merged-into) bug, and provides a new link to the failure
	// association rule.
	// If a cycle was detected, it instead posts a message that the
	// duplicate bug could not be handled and marks the bug no
	// longer a duplicate to break the cycle.
	UpdateDuplicateSource(ctx context.Context, request bugs.UpdateDuplicateSourceRequest) error
}
   132  
// BugUpdater performs updates to bugs and failure association
// rules to keep them in sync with clusters generated by analysis.
//
// Construct instances with NewBugUpdater, which sets the defaults for
// the exported tuning fields.
type BugUpdater struct {
	// project is the LUCI project to act on behalf of.
	project string
	// analysisClient provides access to cluster analysis.
	analysisClient AnalysisClient
	// managers stores the manager responsible for updating bugs for each
	// bug tracking system (monorail, buganizer, etc.).
	// Bugs whose system has no entry here are logged and left untouched.
	managers map[string]BugManager
	// projectCfg is the snapshot of project configuration to use for
	// the auto-bug filing run.
	projectCfg *compiledcfg.ProjectConfig
	// MaxBugsFiledPerRun is the maximum number of bugs to file each time
	// BugUpdater runs. This throttles the rate of changes to the bug system.
	// NewBugUpdater defaults this to 1.
	MaxBugsFiledPerRun int
	// UpdateRuleBatchSize is the maximum number of rules to update in one
	// transaction, when updating rule bug management state.
	// NewBugUpdater defaults this to 1000. It must be at least 1, as
	// batching panics on smaller values.
	UpdateRuleBatchSize int
	// RunTimestamp is the timestamp of the cron job. Used to timestamp
	// policy activations/deactivations that happen as a result of this run.
	RunTimestamp time.Time
}
   156  
   157  // NewBugUpdater initialises a new BugUpdater.
   158  func NewBugUpdater(project string, mgrs map[string]BugManager, ac AnalysisClient, projectCfg *compiledcfg.ProjectConfig, runTimestamp time.Time) *BugUpdater {
   159  	return &BugUpdater{
   160  		project:             project,
   161  		managers:            mgrs,
   162  		analysisClient:      ac,
   163  		projectCfg:          projectCfg,
   164  		MaxBugsFiledPerRun:  1,    // Default value.
   165  		UpdateRuleBatchSize: 1000, // Default value.
   166  		RunTimestamp:        runTimestamp,
   167  	}
   168  }
   169  
   170  // Run files/updates bugs to match high-impact clusters as
   171  // identified by analysis. Each bug has a corresponding failure association
   172  // rule.
   173  // The passed progress should reflect the progress of re-clustering as captured
   174  // in the latest analysis.
   175  func (b *BugUpdater) Run(ctx context.Context, reclusteringProgress *runs.ReclusteringProgress) error {
   176  	// Verify we are not currently reclustering to a new version of
   177  	// algorithms or project configuration. If we are, we should
   178  	// suspend bug creation, priority updates and auto-closure
   179  	// as cluster impact is unreliable.
   180  	metricsValid := b.verifyClusterImpactValid(ctx, reclusteringProgress)
   181  
   182  	activeRules, err := rules.ReadActive(span.Single(ctx), b.project)
   183  	if err != nil {
   184  		return errors.Annotate(err, "read active failure association rules").Err()
   185  	}
   186  
   187  	metricsByRuleID := make(map[string]bugs.ClusterMetrics)
   188  	if metricsValid {
   189  		var thresholds []*configpb.ImpactMetricThreshold
   190  		for _, p := range b.projectCfg.Config.BugManagement.GetPolicies() {
   191  			thresholds = append(thresholds, bugs.ActivationThresholds(p)...)
   192  		}
   193  
   194  		// We want to read analysis for two categories of clusters:
   195  		// - Bug Clusters: to update the priority of filed bugs.
   196  		// - Impactful Suggested Clusters: if any suggested clusters may be
   197  		//    near the threshold to file a new bug for, we want to
   198  		//    read them, so we can file a bug. (Note: the thresholding applied
   199  		//    here is weaker than the actual bug filing criteria which is
   200  		//    implemented in this package, it exists mainly to avoid pulling
   201  		//    back all suggested clusters).
   202  		clusters, err := b.analysisClient.ReadImpactfulClusters(ctx, analysis.ImpactfulClusterReadOptions{
   203  			Project:                  b.project,
   204  			Thresholds:               thresholds,
   205  			AlwaysIncludeBugClusters: true,
   206  		})
   207  		if err != nil {
   208  			return errors.Annotate(err, "read impactful clusters").Err()
   209  		}
   210  
   211  		// blockedSourceClusterIDs is the set of source cluster IDs for which
   212  		// filing new bugs should be suspended.
   213  		blockedSourceClusterIDs := make(map[clustering.ClusterID]struct{})
   214  		for _, r := range activeRules {
   215  			if !reclusteringProgress.IncorporatesRulesVersion(r.CreateTime) {
   216  				// If a bug cluster was recently filed for a source cluster, and
   217  				// re-clustering and analysis is not yet complete (to move the
   218  				// impact from the source cluster to the bug cluster), do not file
   219  				// another bug for the source cluster.
   220  				// (Of course, if a bug cluster was filed for a source cluster,
   221  				// but the bug cluster's failure association rule was subsequently
   222  				// modified (e.g. narrowed), it is allowed to file another bug
   223  				// if the residual impact justifies it.)
   224  				blockedSourceClusterIDs[r.SourceCluster] = struct{}{}
   225  			}
   226  		}
   227  
   228  		if err := b.fileNewBugs(ctx, clusters, blockedSourceClusterIDs); err != nil {
   229  			return err
   230  		}
   231  
   232  		for _, cluster := range clusters {
   233  			if cluster.ClusterID.Algorithm == rulesalgorithm.AlgorithmName {
   234  				// Use only impact from latest algorithm version.
   235  				ruleID := cluster.ClusterID.ID
   236  				metricsByRuleID[ruleID] = ExtractResidualMetrics(cluster)
   237  			}
   238  		}
   239  	}
   240  
   241  	var rms []ruleWithMetrics
   242  	for _, rule := range activeRules {
   243  		var metrics bugs.ClusterMetrics
   244  
   245  		// Metrics are valid if re-clustering and analysis ran on the latest
   246  		// version of this failure association rule. This avoids bugs getting
   247  		// erroneous priority changes while metrics information is incomplete.
   248  		ruleMetricsValid := metricsValid &&
   249  			reclusteringProgress.IncorporatesRulesVersion(rule.PredicateLastUpdateTime)
   250  
   251  		if ruleMetricsValid {
   252  			var ok bool
   253  			metrics, ok = metricsByRuleID[rule.RuleID]
   254  			if !ok {
   255  				// If there is no analysis, this means the cluster is
   256  				// empty. Use empty impact.
   257  				metrics = bugs.ClusterMetrics{}
   258  			}
   259  		}
   260  		// Else leave metrics as nil. Bug-updating code takes this as an
   261  		// indication valid metrics are not available and will not attempt
   262  		// priority updates/auto-closure.
   263  
   264  		rms = append(rms, ruleWithMetrics{
   265  			RuleID:  rule.RuleID,
   266  			Metrics: metrics,
   267  		})
   268  	}
   269  
   270  	// Update bug management state (i.e. policy activations) for existing
   271  	// rules based on current cluster metrics. Prepare the bug update requests
   272  	// based on this state.
   273  	bugsToUpdate, err := b.updateBugManagementState(ctx, rms)
   274  	if err != nil {
   275  		return errors.Annotate(err, "update bug management state").Err()
   276  	}
   277  
   278  	// Break bug updates down by bug system.
   279  	bugUpdatesBySystem := make(map[string][]bugs.BugUpdateRequest)
   280  	for _, bug := range bugsToUpdate {
   281  		bugUpdates := bugUpdatesBySystem[bug.Bug.System]
   282  		bugUpdates = append(bugUpdates, bug)
   283  		bugUpdatesBySystem[bug.Bug.System] = bugUpdates
   284  	}
   285  
   286  	// Perform bug updates.
   287  	var errs []error
   288  	for system, systemBugsToUpdate := range bugUpdatesBySystem {
   289  		err := b.updateBugsForSystem(ctx, system, systemBugsToUpdate)
   290  		if err != nil {
   291  			errs = append(errs, errors.Annotate(err, "updating bugs in %s", system).Err())
   292  		}
   293  	}
   294  	// Returns nil if len(errs) == 0.
   295  	return errors.Append(errs...)
   296  }
   297  
// ruleWithMetrics pairs a failure association rule (identified by its ID)
// with the cluster metrics to use when updating the rule's bug management
// state.
type ruleWithMetrics struct {
	// Rule identifier.
	RuleID string
	// The bug cluster metrics. May be nil if no reliable metrics
	// are available because reclustering is in progress. When nil,
	// policy activations for the rule are left unchanged.
	Metrics bugs.ClusterMetrics
}
   305  
   306  // updateBugManagementState updates policy activations for the
   307  // specified rules using the given current metric values.
   308  //
   309  // BugUpdateRequests then are created based on the read rules
   310  // and updated bug management state. The returned BugUpdateRequests
   311  // will be in 1:1 correspondance to the specified rules.
   312  func (b *BugUpdater) updateBugManagementState(ctx context.Context, rs []ruleWithMetrics) ([]bugs.BugUpdateRequest, error) {
   313  	// Read and update bug management state in batches.
   314  	// Batching is required as Spanner limits the number of mutations
   315  	// per transaction to 40,000 (as at August 2023):
   316  	// https://cloud.google.com/spanner/quotas#limits-for
   317  	batches := batch(rs, b.UpdateRuleBatchSize)
   318  
   319  	result := make([]bugs.BugUpdateRequest, 0, len(rs))
   320  	for _, ruleBatch := range batches {
   321  		var batchResult []bugs.BugUpdateRequest
   322  		batchResult, err := b.updateBugManagementStateBatch(ctx, ruleBatch)
   323  		if err != nil {
   324  			return nil, err
   325  		}
   326  
   327  		result = append(result, batchResult...)
   328  	}
   329  	return result, nil
   330  }
   331  
   332  func batch[K any](items []K, batchSize int) [][]K {
   333  	if batchSize < 1 {
   334  		panic("batch size must be greater than 0")
   335  	}
   336  
   337  	batchCount := (len(items) + batchSize - 1) / batchSize
   338  	result := make([][]K, 0, batchCount)
   339  	for i := 0; i < batchCount; i++ {
   340  		batchStartIndex := i * batchSize             // inclusive
   341  		batchEndIndex := batchStartIndex + batchSize // exclusive
   342  		if batchEndIndex > len(items) {
   343  			batchEndIndex = len(items)
   344  		}
   345  		result = append(result, items[batchStartIndex:batchEndIndex])
   346  	}
   347  	return result
   348  }
   349  
   350  // updateBugManagementStateBatch updates policy activations for the
   351  // specified rules using the given current metric values.
   352  //
   353  // BugUpdateRequests then are created based on the read rules
   354  // and updated bug management state. The returned BugUpdateRequests
   355  // will be in 1:1 correspondance to the specified rules.
   356  func (b *BugUpdater) updateBugManagementStateBatch(ctx context.Context, rulesAndMetrics []ruleWithMetrics) ([]bugs.BugUpdateRequest, error) {
   357  	ruleIDs := make([]string, 0, len(rulesAndMetrics))
   358  	for _, rule := range rulesAndMetrics {
   359  		ruleIDs = append(ruleIDs, rule.RuleID)
   360  	}
   361  
   362  	var result []bugs.BugUpdateRequest
   363  	f := func(ctx context.Context) error {
   364  		// This transaction may be retried. Reset the result each time
   365  		// the transaction runs to avoid data from previous aborted
   366  		// attempts leaking into subsequent attempts.
   367  		result = make([]bugs.BugUpdateRequest, 0, len(rulesAndMetrics))
   368  
   369  		// Read the rules in the transaction again to implement an
   370  		// atomic Read-Update transaction, which protects against
   371  		// update races. Subsequent bug-filing action will be based
   372  		// only on this second read.
   373  		// N.B.: ReadMany returns items in 1:1 correspondence to the request.
   374  		rs, err := rules.ReadMany(ctx, b.project, ruleIDs)
   375  		if err != nil {
   376  			return errors.Annotate(err, "read rules").Err()
   377  		}
   378  
   379  		for i, r := range rs {
   380  			// Fetches the corresponding metrics for a rule.
   381  			clusterMetrics := rulesAndMetrics[i].Metrics
   382  
   383  			// If metrics data is valid (e.g. no reclustering in progress).
   384  			if clusterMetrics != nil {
   385  				// Update which policies are active.
   386  				updatedBugManagementState, changed := bugs.UpdatePolicyActivations(r.BugManagementState, b.projectCfg.Config.BugManagement.GetPolicies(), clusterMetrics, b.RunTimestamp)
   387  				if changed {
   388  					// Only update the rule if a policy has activated or
   389  					// deactivated, to avoid unnecessary writes and rule
   390  					// cache invalidations.
   391  					r.BugManagementState = updatedBugManagementState
   392  
   393  					opts := rules.UpdateOptions{}
   394  					ms, err := rules.Update(r, opts, rules.LUCIAnalysisSystem)
   395  					if err != nil {
   396  						return errors.Annotate(err, "update rule").Err()
   397  					}
   398  					span.BufferWrite(ctx, ms)
   399  				}
   400  			}
   401  
   402  			updateRequest := bugs.BugUpdateRequest{
   403  				Bug:                              r.BugID,
   404  				IsManagingBug:                    r.IsManagingBug,
   405  				IsManagingBugPriority:            r.IsManagingBugPriority,
   406  				IsManagingBugPriorityLastUpdated: r.IsManagingBugPriorityLastUpdateTime,
   407  				RuleID:                           r.RuleID,
   408  			}
   409  			updateRequest.BugManagementState = r.BugManagementState
   410  			result = append(result, updateRequest)
   411  		}
   412  		return nil
   413  	}
   414  	if _, err := span.ReadWriteTransaction(ctx, f); err != nil {
   415  		return nil, err
   416  	}
   417  	return result, nil
   418  }
   419  
   420  func (b *BugUpdater) updateBugsForSystem(ctx context.Context, system string, bugsToUpdate []bugs.BugUpdateRequest) error {
   421  	manager, ok := b.managers[system]
   422  	if !ok {
   423  		logging.Warningf(ctx, "Encountered bug(s) with an unrecognised manager: %q", system)
   424  		return nil
   425  	}
   426  
   427  	// Keep a minute of time in reserve to update rules.
   428  	// It is important that we still update the rules for bugs we did
   429  	// successfully update as some bug behaviours rely on this as
   430  	// part of their control loop (we will keep posting the same
   431  	// comment on the bug until the rule is updated).
   432  	mgrCtx, cancel := bugs.Shorten(ctx, time.Minute)
   433  	defer cancel()
   434  
   435  	logging.Debugf(ctx, "Considering update of %v %s bugs in project %s", len(bugsToUpdate), system, b.project)
   436  
   437  	responses, err := manager.Update(mgrCtx, bugsToUpdate)
   438  	if err != nil {
   439  		// Catastrophic error, exit immediately.
   440  		return errors.Annotate(err, "update bugs").Err()
   441  	}
   442  
   443  	// The set of non-catastrophic errors encountered so far.
   444  	var errs []error
   445  	// The set of bugs marked as duplicate encountered.
   446  	var duplicateBugs []bugs.DuplicateBugDetails
   447  	// The updates to failure association rules required.
   448  	var updateRuleRequests []updateRuleRequest
   449  
   450  	for i, rsp := range responses {
   451  		if rsp.Error != nil {
   452  			// Capture the error, but continue processing this bug
   453  			// and other bugs, as partial success is possible
   454  			// and pending rule updates must be applied.
   455  			err := errors.Annotate(rsp.Error, "updating bug (%s)", bugsToUpdate[i].Bug.String()).Err()
   456  			errs = append(errs, err)
   457  			logging.Errorf(ctx, "%s", err)
   458  		}
   459  
   460  		if rsp.IsDuplicate {
   461  			duplicateBugs = append(duplicateBugs, bugs.DuplicateBugDetails{
   462  				RuleID:     bugsToUpdate[i].RuleID,
   463  				Bug:        bugsToUpdate[i].Bug,
   464  				IsAssigned: rsp.IsDuplicateAndAssigned,
   465  			})
   466  			// Inhibit archiving if rules are duplicates.
   467  			rsp.ShouldArchive = false
   468  		}
   469  		if rsp.ShouldArchive || rsp.DisableRulePriorityUpdates || rsp.RuleAssociationNotified || len(rsp.PolicyActivationsNotified) > 0 {
   470  			logging.Fields{
   471  				"RuleID":                     bugsToUpdate[i].RuleID,
   472  				"BugID":                      bugsToUpdate[i].Bug.String(),
   473  				"Archive":                    rsp.ShouldArchive,
   474  				"DisableRulePriorityUpdates": rsp.DisableRulePriorityUpdates,
   475  				"RuleAssociationNotified":    rsp.RuleAssociationNotified,
   476  				"PolicyActivationsNotified":  rsp.PolicyActivationsNotified,
   477  			}.Debugf(ctx, "Preparing rule update for bug %s", bugsToUpdate[i].Bug.String())
   478  
   479  			updateRuleRequests = append(updateRuleRequests, updateRuleRequest{
   480  				RuleID:                     bugsToUpdate[i].RuleID,
   481  				BugID:                      bugsToUpdate[i].Bug,
   482  				Archive:                    rsp.ShouldArchive,
   483  				DisableRulePriorityUpdates: rsp.DisableRulePriorityUpdates,
   484  				RuleAssociationNotified:    rsp.RuleAssociationNotified,
   485  				PolicyActivationsNotified:  rsp.PolicyActivationsNotified,
   486  			})
   487  		}
   488  	}
   489  
   490  	if err := b.updateRules(ctx, updateRuleRequests); err != nil {
   491  		err = errors.Annotate(err, "updating rules after updating bugs").Err()
   492  		errs = append(errs, err)
   493  		logging.Errorf(ctx, "%s", err)
   494  	}
   495  
   496  	// Handle bugs marked as duplicate.
   497  	for _, duplicateDetails := range duplicateBugs {
   498  		if err := b.handleDuplicateBug(ctx, duplicateDetails); err != nil {
   499  			err = errors.Annotate(err, "handling duplicate bug (%s)", duplicateDetails.Bug.String()).Err()
   500  			errs = append(errs, err)
   501  			logging.Errorf(ctx, "%s", err)
   502  		}
   503  	}
   504  	// Returns nil if len(errs) == 0.
   505  	return errors.Append(errs...)
   506  }
   507  
   508  func (b *BugUpdater) verifyClusterImpactValid(ctx context.Context, progress *runs.ReclusteringProgress) bool {
   509  	if progress.IsReclusteringToNewAlgorithms() {
   510  		logging.Warningf(ctx, "Auto-bug filing paused for project %s as re-clustering to new algorithms is in progress.", b.project)
   511  		return false
   512  	}
   513  	if progress.IsReclusteringToNewConfig() {
   514  		logging.Warningf(ctx, "Auto-bug filing paused for project %s as re-clustering to new configuration is in progress.", b.project)
   515  		return false
   516  	}
   517  	if algorithms.AlgorithmsVersion != progress.Next.AlgorithmsVersion {
   518  		logging.Warningf(ctx, "Auto-bug filing paused for project %s as bug-filing is running mismatched algorithms version %v (want %v).",
   519  			b.project, algorithms.AlgorithmsVersion, progress.Next.AlgorithmsVersion)
   520  		return false
   521  	}
   522  	if !b.projectCfg.LastUpdated.Equal(progress.Next.ConfigVersion) {
   523  		logging.Warningf(ctx, "Auto-bug filing paused for project %s as bug-filing is running mismatched config version %v (want %v).",
   524  			b.project, b.projectCfg.LastUpdated, progress.Next.ConfigVersion)
   525  		return false
   526  	}
   527  	return true
   528  }
   529  
   530  func (b *BugUpdater) fileNewBugs(ctx context.Context, clusters []*analysis.Cluster, blockedClusterIDs map[clustering.ClusterID]struct{}) error {
   531  	// The set of clusters IDs to file bugs for. Used for deduplicating creation
   532  	// requests accross policies.
   533  	clusterIDsToCreateBugsFor := make(map[clustering.ClusterID]struct{})
   534  
   535  	// The list of clusters to file bugs for. Uses a list instead of a set to ensure
   536  	// the order that bugs are created is deterministic and matches the order that
   537  	// policies are configured, which simplifies testing.
   538  	var clustersToCreateBugsFor []*analysis.Cluster
   539  
   540  	for _, p := range b.projectCfg.Config.BugManagement.GetPolicies() {
   541  		sortByPolicyBugFilingPreference(clusters, p)
   542  
   543  		for _, cluster := range clusters {
   544  			if cluster.ClusterID.IsBugCluster() {
   545  				// Never file another bug for a bug cluster.
   546  				continue
   547  			}
   548  
   549  			// Was a bug recently filed for this suggested cluster?
   550  			// We want to avoid race conditions whereby we file multiple bug
   551  			// clusters for the same suggested cluster, because re-clustering and
   552  			// re-analysis has not yet run and moved residual impact from the
   553  			// suggested cluster to the bug cluster.
   554  			_, ok := blockedClusterIDs[cluster.ClusterID]
   555  			if ok {
   556  				// Do not file a bug.
   557  				continue
   558  			}
   559  
   560  			// Were the failures are confined to only automation CLs
   561  			// and/or 1-2 user CLs? In other words, are the failures in this
   562  			// clusters unlikely to be present in the tree?
   563  			if cluster.DistinctUserCLsWithFailures7d.Residual < 3 &&
   564  				cluster.PostsubmitBuildsWithFailures7d.Residual == 0 {
   565  				// Do not file a bug.
   566  				continue
   567  			}
   568  
   569  			// Only file a bug if the residual impact exceeds the threshold.
   570  			impact := ExtractResidualMetrics(cluster)
   571  			bugFilingThresholds := bugs.ActivationThresholds(p)
   572  			if cluster.ClusterID.IsTestNameCluster() {
   573  				// Use an inflated threshold for test name clusters to bias
   574  				// bug creation towards failure reason clusters.
   575  				bugFilingThresholds =
   576  					bugs.InflateThreshold(bugFilingThresholds,
   577  						testnameThresholdInflationPercent)
   578  			}
   579  			if !impact.MeetsAnyOfThresholds(bugFilingThresholds) {
   580  				continue
   581  			}
   582  
   583  			// Create a bug for this cluster, deduplicating creation
   584  			// requests across policies.
   585  			if _, ok := clusterIDsToCreateBugsFor[cluster.ClusterID]; !ok {
   586  				clustersToCreateBugsFor = append(clustersToCreateBugsFor, cluster)
   587  				clusterIDsToCreateBugsFor[cluster.ClusterID] = struct{}{}
   588  			}
   589  
   590  			// The policy has picked the one cluster it wants to file a bug for.
   591  			// If this cluster is the same as another policy, the one bug is filed
   592  			// for both policies.
   593  			//
   594  			// This ensures if a top failure cluster clusters well by both reason
   595  			// and test name, we do not file bugs for both.
   596  			break
   597  		}
   598  	}
   599  
   600  	// File new bugs.
   601  	bugsFiled := 0
   602  	for _, cluster := range clustersToCreateBugsFor {
   603  		if bugsFiled >= b.MaxBugsFiledPerRun {
   604  			break
   605  		}
   606  		created, err := b.createBug(ctx, cluster)
   607  		if err != nil {
   608  			return err
   609  		}
   610  		if created {
   611  			bugsFiled++
   612  		}
   613  	}
   614  	return nil
   615  }
   616  
// updateRuleRequest describes the changes to apply to one failure
// association rule following a round of bug updates.
type updateRuleRequest struct {
	// The identity of the rule.
	RuleID string
	// The bug that was updated and/or from which the updates were sourced.
	// If the bug on the rule has changed from this value, rule updates will
	// not be applied.
	BugID bugs.BugID
	// Whether the rule should be archived.
	Archive bool
	// Whether rule priority updates should be disabled.
	DisableRulePriorityUpdates bool
	// Whether BugManagementState.RuleAssociationNotified should be set.
	RuleAssociationNotified bool
	// A map containing the IDs of policies for which
	// BugManagementState.Policies[<policyID>].ActivationNotified should
	// be set.
	PolicyActivationsNotified map[bugs.PolicyID]struct{}
}
   635  
   636  // updateRules applies updates to failure association rules
   637  // following a round of bug updates. This includes:
   638  //   - archiving rules if the bug was detected in an archived state
   639  //   - disabling automatic priority updates if it was detected that
   640  //     the user manually set the bug priority.
   641  //
   642  // requests and response slices should have 1:1 correspondance, i.e.
   643  // requests[i] corresponds to responses[i].
   644  func (b *BugUpdater) updateRules(ctx context.Context, requests []updateRuleRequest) error {
   645  	// Perform updates in batches to stay within mutation Spanner limits.
   646  	requestBatches := batch(requests, b.UpdateRuleBatchSize)
   647  	for _, batch := range requestBatches {
   648  		err := b.updateRulesBatch(ctx, batch)
   649  		if err != nil {
   650  			return err
   651  		}
   652  	}
   653  	return nil
   654  }
   655  
   656  func (b *BugUpdater) updateRulesBatch(ctx context.Context, requests []updateRuleRequest) error {
   657  	ruleIDs := make([]string, 0, len(requests))
   658  	for _, req := range requests {
   659  		ruleIDs = append(ruleIDs, req.RuleID)
   660  	}
   661  	f := func(ctx context.Context) error {
   662  		// Perform transactional read-update of rule to protect
   663  		// against update races.
   664  		rs, err := rules.ReadMany(ctx, b.project, ruleIDs)
   665  		if err != nil {
   666  			return errors.Annotate(err, "read rules").Err()
   667  		}
   668  		for i, rule := range rs {
   669  			updateRequest := requests[i]
   670  			if rule.RuleID != updateRequest.RuleID {
   671  				// ReadMany's response should be in 1:1 correspondance
   672  				// to the request.
   673  				panic("logic error")
   674  			}
   675  			if rule.BugID != updateRequest.BugID {
   676  				// A data race has occured: the rule has been modified while
   677  				// we were updating bugs, and now the update to the rule no
   678  				// longer makes sense. This should only occur rarely.
   679  				logging.Warningf(ctx, "Bug associated with rule %v changed during bug-filing run, skipping updates to rule.")
   680  				continue
   681  			}
   682  			updateOptions := rules.UpdateOptions{}
   683  			if updateRequest.Archive {
   684  				rule.IsActive = false
   685  				updateOptions.IsAuditableUpdate = true
   686  				updateOptions.PredicateUpdated = true
   687  			}
   688  			if updateRequest.DisableRulePriorityUpdates {
   689  				rule.IsManagingBugPriority = false
   690  				updateOptions.IsAuditableUpdate = true
   691  				updateOptions.IsManagingBugPriorityUpdated = true
   692  			}
   693  			if updateRequest.RuleAssociationNotified {
   694  				rule.BugManagementState.RuleAssociationNotified = true
   695  			}
   696  			for policyID := range updateRequest.PolicyActivationsNotified {
   697  				policyState, ok := rule.BugManagementState.PolicyState[string(policyID)]
   698  				if !ok {
   699  					// The policy has been deleted during the bug-filing run.
   700  					logging.Warningf(ctx, "Policy activation notified for policy %v, which is now deleted.", policyID)
   701  					continue
   702  				}
   703  				policyState.ActivationNotified = true
   704  			}
   705  			ms, err := rules.Update(rule, updateOptions, rules.LUCIAnalysisSystem)
   706  			if err != nil {
   707  				// Validation error; this should never happen here.
   708  				return errors.Annotate(err, "prepare rule update").Err()
   709  			}
   710  			span.BufferWrite(ctx, ms)
   711  		}
   712  		return nil
   713  	}
   714  	_, err := span.ReadWriteTransaction(ctx, f)
   715  	if err != nil {
   716  		return errors.Annotate(err, "update rules").Err()
   717  	}
   718  	return nil
   719  }
   720  
   721  // handleDuplicateBug handles a duplicate bug, merging its failure association
   722  // rule with the bug it is ultimately merged into (creating the rule if it does
   723  // not exist). In case of unhandleable errors, the source bug is kicked out of the
   724  // duplicate state and an error message is posted on the bug.
   725  func (b *BugUpdater) handleDuplicateBug(ctx context.Context, duplicateDetails bugs.DuplicateBugDetails) error {
   726  	err := b.handleDuplicateBugHappyPath(ctx, duplicateDetails)
   727  	if errors.Is(err, mergeIntoCycleErr) {
   728  		request := bugs.UpdateDuplicateSourceRequest{
   729  			BugDetails:   duplicateDetails,
   730  			ErrorMessage: mergeIntoCycleMessage,
   731  		}
   732  		if err := b.updateDuplicateSource(ctx, request); err != nil {
   733  			return errors.Annotate(err, "update source bug after a cycle was found").Err()
   734  		}
   735  	} else if errors.Is(err, ruleDefinitionTooLongErr) {
   736  		request := bugs.UpdateDuplicateSourceRequest{
   737  			BugDetails:   duplicateDetails,
   738  			ErrorMessage: ruleDefinitionTooLongMessage,
   739  		}
   740  		if err := b.updateDuplicateSource(ctx, request); err != nil {
   741  			return errors.Annotate(err, "update source bug after merging rule definition was found too long").Err()
   742  		}
   743  	} else if errors.Is(err, mergeIntoPermissionErr) {
   744  		request := bugs.UpdateDuplicateSourceRequest{
   745  			BugDetails:   duplicateDetails,
   746  			ErrorMessage: mergeIntoPermissionMessage,
   747  		}
   748  		if err := b.updateDuplicateSource(ctx, request); err != nil {
   749  			return errors.Annotate(err, "update source bug after merging rule definition encountered a permission error").Err()
   750  		}
   751  	} else if err != nil {
   752  		return err
   753  	}
   754  	return nil
   755  }
   756  
   757  // handleDuplicateBugHappyPath handles a duplicate bug, merging its failure association
   758  // rule with the bug it is ultimately merged into (creating the rule if it does
   759  // not exist). The original rule is archived.
   760  func (b *BugUpdater) handleDuplicateBugHappyPath(ctx context.Context, duplicateDetails bugs.DuplicateBugDetails) error {
   761  	// Chase the bug merged-into graph until we find the sink of the graph.
   762  	// (The canonical bug of the chain of duplicate bugs.)
   763  	destBug, err := b.resolveMergedIntoBug(ctx, duplicateDetails.Bug)
   764  	if err != nil {
   765  		// May return mergeIntoCycleErr.
   766  		return err
   767  	}
   768  
   769  	var destinationBugRuleID string
   770  
   771  	f := func(ctx context.Context) error {
   772  		sourceRule, _, err := readRuleForBugAndProject(ctx, duplicateDetails.Bug, b.project)
   773  		if err != nil {
   774  			return errors.Annotate(err, "reading rule for source bug").Err()
   775  		}
   776  		if !sourceRule.IsActive {
   777  			// The source rule is no longer active. This is a race condition
   778  			// as we only do bug updates for rules that exist at the time
   779  			// we start bug updates.
   780  			// An inactive rule does not match any failures so merging the
   781  			// it into another rule should have no effect anyway.
   782  			return nil
   783  		}
   784  		// Try and read the rule for the bug we are merging into.
   785  		destinationRule, _, err :=
   786  			readRuleForBugAndProject(ctx, destBug, b.project)
   787  		if err != nil {
   788  			return errors.Annotate(err, "reading rule for destination bug").Err()
   789  		}
   790  		if destinationRule == nil {
   791  			// The destination bug does not have a rule in this project.
   792  			// Simply update the source rule to point to the new bug.
   793  			sourceRule.BugID = destBug
   794  
   795  			// As the bug has changed, flags tracking notification of policy
   796  			// activation must be reset.
   797  			if sourceRule.BugManagementState.PolicyState != nil {
   798  				for _, policyState := range sourceRule.BugManagementState.PolicyState {
   799  					policyState.ActivationNotified = false
   800  				}
   801  			}
   802  
   803  			// The destination bug is not a LUCI Analysis bug.
   804  			// Do not automatically verify/auto-close it as we do not
   805  			// know what problems it was for.
   806  			sourceRule.IsManagingBug = false
   807  
   808  			sourceRule.BugManagementState.RuleAssociationNotified = false
   809  
   810  			ms, err := rules.Update(sourceRule, rules.UpdateOptions{
   811  				IsAuditableUpdate: true,
   812  			}, rules.LUCIAnalysisSystem)
   813  			if err != nil {
   814  				// Indicates validation error. Should never happen.
   815  				return err
   816  			}
   817  			span.BufferWrite(ctx, ms)
   818  
   819  			destinationBugRuleID = sourceRule.RuleID
   820  			return nil
   821  		} else {
   822  			// The bug we are a duplicate of already has a rule.
   823  			if destinationRule.IsActive {
   824  				// Merge the source and destination rules with an "OR".
   825  				mergedRule, err := lang.Merge(destinationRule.RuleDefinition, sourceRule.RuleDefinition)
   826  				if err != nil {
   827  					return errors.Annotate(err, "merging rules").Err()
   828  				}
   829  				if len(mergedRule) > rules.MaxRuleDefinitionLength {
   830  					// The merged rule is too long to store.
   831  					return ruleDefinitionTooLongErr
   832  				}
   833  				destinationRule.RuleDefinition = mergedRule
   834  			} else {
   835  				// Else: an inactive rule does not match any failures, so we should
   836  				// use only the rule from the source bug.
   837  				destinationRule.RuleDefinition = sourceRule.RuleDefinition
   838  			}
   839  
   840  			// Disable the source rule.
   841  			sourceRule.IsActive = false
   842  			ms, err := rules.Update(sourceRule, rules.UpdateOptions{
   843  				IsAuditableUpdate: true,
   844  				PredicateUpdated:  true,
   845  			}, rules.LUCIAnalysisSystem)
   846  			if err != nil {
   847  				// Indicates validation error. Should never happen.
   848  				return err
   849  			}
   850  			span.BufferWrite(ctx, ms)
   851  
   852  			// Update the rule on the destination rule.
   853  			destinationRule.IsActive = true
   854  			ms, err = rules.Update(destinationRule, rules.UpdateOptions{
   855  				IsAuditableUpdate: true,
   856  				PredicateUpdated:  true,
   857  			}, rules.LUCIAnalysisSystem)
   858  			if err != nil {
   859  				return err
   860  			}
   861  			span.BufferWrite(ctx, ms)
   862  
   863  			destinationBugRuleID = destinationRule.RuleID
   864  			return nil
   865  		}
   866  	}
   867  	// Update source and destination rules in one transaction, to ensure
   868  	// consistency.
   869  	_, err = span.ReadWriteTransaction(ctx, f)
   870  	if err != nil {
   871  		return err
   872  	}
   873  
   874  	if !b.projectCfg.Config.BugManagement.GetDisableDuplicateBugComments() {
   875  		// Notify that the bugs were successfully merged.
   876  		request := bugs.UpdateDuplicateSourceRequest{
   877  			BugDetails:        duplicateDetails,
   878  			DestinationRuleID: destinationBugRuleID,
   879  		}
   880  		if err := b.updateDuplicateSource(ctx, request); err != nil {
   881  			return errors.Annotate(err, "updating source bug").Err()
   882  		}
   883  	}
   884  
   885  	return err
   886  }
   887  
   888  // resolveMergedIntoBug resolves the bug the given bug is ultimately merged
   889  // into.
   890  func (b *BugUpdater) resolveMergedIntoBug(ctx context.Context, bug bugs.BugID) (bugs.BugID, error) {
   891  	isResolved := false
   892  	mergedIntoBug := bug
   893  	const maxResolutionSteps = 5
   894  	for i := 0; i < maxResolutionSteps; i++ {
   895  		system := mergedIntoBug.System
   896  		manager, ok := b.managers[system]
   897  		if !ok {
   898  			if mergedIntoBug.System == "buganizer" {
   899  				// Do not attempt to resolve the canoncial bug within
   900  				// buganizer if buganizer is not registered. We hit this
   901  				// path with buganizer not registered if a monorail bug marks
   902  				// itself as a duplicate of a buganizer bug.
   903  				isResolved = true
   904  				break
   905  			}
   906  			return bugs.BugID{}, fmt.Errorf("encountered unknown bug system: %q", system)
   907  		}
   908  		mergedInto, err := manager.GetMergedInto(ctx, mergedIntoBug)
   909  		if status.Code(err) == codes.PermissionDenied {
   910  			// We don't have permission to view the issue
   911  			return bugs.BugID{}, mergeIntoPermissionErr
   912  		} else if err != nil {
   913  			return bugs.BugID{}, err
   914  		}
   915  		if mergedInto == nil {
   916  			// We have found the canoncial merged-into bug.
   917  			isResolved = true
   918  			break
   919  		} else {
   920  			mergedIntoBug = *mergedInto
   921  		}
   922  	}
   923  	if !isResolved {
   924  		// We found a cycle in the graph.
   925  		return bugs.BugID{}, mergeIntoCycleErr
   926  	}
   927  	if mergedIntoBug == bug {
   928  		// This would normally never occur, but is possible in some
   929  		// exceptional scenarios like race conditions where a cycle
   930  		// is broken during the graph traversal, or a bug which
   931  		// was marked as duplicate but is no longer marked as duplicate
   932  		// now.
   933  		return bugs.BugID{}, fmt.Errorf("cannot deduplicate a bug into itself")
   934  	}
   935  	return mergedIntoBug, nil
   936  }
   937  
   938  // updateDuplicateSource updates the source bug of a duplicate
   939  // bug pair (source bug, destination bug).
   940  // It either posts a message notifying the user the rule was successfully
   941  // merged to the destination, or notifies the user of the error and
   942  // marks the bug no longer a duplicate (to avoid repeated attempts to
   943  // handle the problematic duplicate bug).
   944  func (b *BugUpdater) updateDuplicateSource(ctx context.Context, request bugs.UpdateDuplicateSourceRequest) error {
   945  	manager, ok := b.managers[request.BugDetails.Bug.System]
   946  	if !ok {
   947  		return fmt.Errorf("encountered unknown bug system: %q", request.BugDetails.Bug.System)
   948  	}
   949  	err := manager.UpdateDuplicateSource(ctx, request)
   950  	if err != nil {
   951  		return err
   952  	}
   953  	return nil
   954  }
   955  
   956  // readRuleForBugAndProject reads the failure association rule for the given
   957  // bug in the given project, if it exists. It additionally returns whether
   958  // there is any rule in the system that manages the given bug, even if in
   959  // a different project.
   960  // If the rule cannot be read, it returns nil.
   961  func readRuleForBugAndProject(ctx context.Context, bug bugs.BugID, project string) (rule *rules.Entry, anyRuleManaging bool, err error) {
   962  	rules, err := rules.ReadByBug(ctx, bug)
   963  	if err != nil {
   964  		return nil, false, err
   965  	}
   966  	rule = nil
   967  	anyRuleManaging = false
   968  	for _, r := range rules {
   969  		if r.IsManagingBug {
   970  			anyRuleManaging = true
   971  		}
   972  		if r.Project == project {
   973  			rule = r
   974  		}
   975  	}
   976  	return rule, anyRuleManaging, nil
   977  }
   978  
   979  // sortByPolicyBugFilingPreference sorts clusters based on our preference
   980  // to file bugs for these clusters.
   981  func sortByPolicyBugFilingPreference(cs []*analysis.Cluster, policy *configpb.BugManagementPolicy) {
   982  	// The current ranking approach prefers filing bugs for clusters
   983  	// which more strongly meet the bug-filing threshold, with a bias
   984  	// towards reason clusters.
   985  	//
   986  	// The order of this ranking is only important where there are
   987  	// multiple competing clusters which meet the bug-filing threshold.
   988  	// As bug filing runs relatively often, except in cases of contention,
   989  	// the first bug to meet the threshold will be filed.
   990  	sort.Slice(cs, func(i, j int) bool {
   991  		// N.B. This does not rank clusters perfectly where the policy has
   992  		// multiple metrics, as the first metric may only slightly the
   993  		// threshold, but the second metric strongly exceeds.
   994  		// Most policies have only one metric, however, so this should
   995  		// be pretty rare.
   996  		for _, metric := range policy.Metrics {
   997  			if equal, less := rankByMetric(cs[i], cs[j], metrics.ID(metric.MetricId)); !equal {
   998  				return less
   999  			}
  1000  		}
  1001  		// If all else fails, sort by cluster ID. This is mostly to ensure
  1002  		// the code behaves deterministically when under unit testing.
  1003  		if cs[i].ClusterID.Algorithm != cs[j].ClusterID.Algorithm {
  1004  			return cs[i].ClusterID.Algorithm < cs[j].ClusterID.Algorithm
  1005  		}
  1006  		return cs[i].ClusterID.ID < cs[j].ClusterID.ID
  1007  	})
  1008  }
  1009  
  1010  func rankByMetric(a, b *analysis.Cluster, metric metrics.ID) (equal bool, less bool) {
  1011  	valueA := a.MetricValues[metric].SevenDay.Residual
  1012  	valueB := b.MetricValues[metric].SevenDay.Residual
  1013  	// If one cluster we are comparing with is a test name cluster,
  1014  	// give the other cluster an impact boost in the comparison, so
  1015  	// that we bias towards filing it (instead of the test name cluster).
  1016  	if b.ClusterID.IsTestNameCluster() {
  1017  		valueA = (valueA * (100 + testnameThresholdInflationPercent)) / 100
  1018  	}
  1019  	if a.ClusterID.IsTestNameCluster() {
  1020  		valueB = (valueB * (100 + testnameThresholdInflationPercent)) / 100
  1021  	}
  1022  	equal = (valueA == valueB)
  1023  	// a less than b in the sort order is defined as a having more impact
  1024  	// than b, so that clusters are sorted in descending impact order.
  1025  	less = (valueA > valueB)
  1026  	return equal, less
  1027  }
  1028  
  1029  // createBug files a new bug for the given suggested cluster,
  1030  // and stores the association from bug to failures through a new
  1031  // failure association rule.
  1032  func (b *BugUpdater) createBug(ctx context.Context, cs *analysis.Cluster) (created bool, err error) {
  1033  	alg, err := algorithms.SuggestingAlgorithm(cs.ClusterID.Algorithm)
  1034  	if err == algorithms.ErrAlgorithmNotExist {
  1035  		// The cluster is for an old algorithm that no longer exists, or
  1036  		// for a new algorithm that is not known by us yet.
  1037  		// Do not file a bug. This is not an error, it is expected during
  1038  		// algorithm version changes.
  1039  		return false, nil
  1040  	}
  1041  
  1042  	summary := clusterSummaryFromAnalysis(cs)
  1043  
  1044  	// Double-check the failure matches the cluster. Generating a
  1045  	// failure association rule that does not match the suggested cluster
  1046  	// could result in indefinite creation of new bugs, as the system
  1047  	// will repeatedly create new failure association rules for the
  1048  	// same suggested cluster.
  1049  	// Mismatches should usually be transient as re-clustering will fix
  1050  	// up any incorrect clustering.
  1051  	if hex.EncodeToString(alg.Cluster(b.projectCfg, &summary.Example)) != cs.ClusterID.ID {
  1052  		return false, errors.New("example failure did not match cluster ID")
  1053  	}
  1054  	rule, err := b.generateFailureAssociationRule(alg, &summary.Example)
  1055  	if err != nil {
  1056  		return false, errors.Annotate(err, "obtain failure association rule").Err()
  1057  	}
  1058  
  1059  	ruleID, err := rules.GenerateID()
  1060  	if err != nil {
  1061  		return false, errors.Annotate(err, "generating rule ID").Err()
  1062  	}
  1063  
  1064  	description, err := alg.ClusterDescription(b.projectCfg, summary)
  1065  	if err != nil {
  1066  		return false, errors.Annotate(err, "prepare bug description").Err()
  1067  	}
  1068  
  1069  	// Set policy activations starting from a state where no policies
  1070  	// are active.
  1071  	impact := ExtractResidualMetrics(cs)
  1072  	bugManagementState, _ := bugs.UpdatePolicyActivations(&bugspb.BugManagementState{}, b.projectCfg.Config.BugManagement.GetPolicies(), impact, b.RunTimestamp)
  1073  
  1074  	request := bugs.BugCreateRequest{
  1075  		RuleID:      ruleID,
  1076  		Description: description,
  1077  	}
  1078  
  1079  	activePolicyIDs := make(map[bugs.PolicyID]struct{})
  1080  	for policyID, state := range bugManagementState.PolicyState {
  1081  		if state.IsActive {
  1082  			activePolicyIDs[bugs.PolicyID(policyID)] = struct{}{}
  1083  		}
  1084  	}
  1085  	request.ActivePolicyIDs = activePolicyIDs
  1086  
  1087  	system, err := b.routeToBugSystem(cs)
  1088  	if err != nil {
  1089  		return false, errors.Annotate(err, "extracting bug system").Err()
  1090  	}
  1091  
  1092  	if system == bugs.BuganizerSystem {
  1093  		var err error
  1094  		request.BuganizerComponent, err = extractBuganizerComponent(cs)
  1095  		if err != nil {
  1096  			return false, errors.Annotate(err, "extracting buganizer component").Err()
  1097  		}
  1098  	} else {
  1099  		request.MonorailComponents = extractMonorailComponents(cs)
  1100  	}
  1101  
  1102  	manager := b.managers[system]
  1103  	response := manager.Create(ctx, request)
  1104  
  1105  	if !response.Simulated && response.ID != "" {
  1106  		// We filed a bug.
  1107  		// Create a failure association rule associating the failures with a bug.
  1108  
  1109  		// In filing a bug, we notified the rule association.
  1110  		bugManagementState.RuleAssociationNotified = true
  1111  
  1112  		// Record which policies we notified as activating.
  1113  		for policyID := range response.PolicyActivationsNotified {
  1114  			bugManagementState.PolicyState[string(policyID)].ActivationNotified = true
  1115  		}
  1116  
  1117  		newRule := &rules.Entry{
  1118  			Project:               b.project,
  1119  			RuleID:                ruleID,
  1120  			RuleDefinition:        rule,
  1121  			BugID:                 bugs.BugID{System: system, ID: response.ID},
  1122  			IsActive:              true,
  1123  			IsManagingBug:         true,
  1124  			IsManagingBugPriority: true,
  1125  			SourceCluster:         cs.ClusterID,
  1126  			BugManagementState:    bugManagementState,
  1127  		}
  1128  		create := func(ctx context.Context) error {
  1129  			user := rules.LUCIAnalysisSystem
  1130  			ms, err := rules.Create(newRule, user)
  1131  			if err != nil {
  1132  				return err
  1133  			}
  1134  			span.BufferWrite(ctx, ms)
  1135  			return nil
  1136  		}
  1137  		if _, err := span.ReadWriteTransaction(ctx, create); err != nil {
  1138  			return false, errors.Annotate(err, "create rule").Err()
  1139  		}
  1140  	}
  1141  
  1142  	if response.Error != nil {
  1143  		// We encountered an error creating the bug. Note that this
  1144  		// is not mutually exclusive with having filed a bug, as
  1145  		// steps after creating the bug may have failed, and in
  1146  		// this case a failure association rule should still be created.
  1147  		return false, errors.Annotate(response.Error, "create issue in %v (created ID: %q)", system, response.ID).Err()
  1148  	}
  1149  
  1150  	return true, nil
  1151  }
  1152  
  1153  func (b *BugUpdater) routeToBugSystem(cs *analysis.Cluster) (string, error) {
  1154  	hasMonorail := b.projectCfg.Config.BugManagement.GetMonorail() != nil
  1155  	hasBuganizer := b.projectCfg.Config.BugManagement.GetBuganizer() != nil
  1156  	defaultSystem := b.projectCfg.Config.BugManagement.GetDefaultBugSystem()
  1157  
  1158  	if !hasMonorail && !hasBuganizer {
  1159  		return "", errors.New("at least one bug filing system need to be configured")
  1160  	}
  1161  	// If only one bug system configured, pick that system.
  1162  	if !hasMonorail {
  1163  		return bugs.BuganizerSystem, nil
  1164  	}
  1165  	if !hasBuganizer {
  1166  		return bugs.MonorailSystem, nil
  1167  	}
  1168  	// When both bug systems are configured, pick the most suitable one.
  1169  
  1170  	// The most impactful monorail component.
  1171  	var topMonorailComponent analysis.TopCount
  1172  	for _, tc := range cs.TopMonorailComponents {
  1173  		if tc.Value == "" {
  1174  			continue
  1175  		}
  1176  		// Any monorail component is associated for more than 30% of the
  1177  		// failures in the cluster should be checked for top impact.
  1178  		if tc.Count > ((cs.MetricValues[metrics.Failures.ID].SevenDay.Nominal * 3) / 10) {
  1179  			if tc.Count > topMonorailComponent.Count || topMonorailComponent.Value == "" {
  1180  				topMonorailComponent = tc
  1181  			}
  1182  		}
  1183  	}
  1184  
  1185  	// The most impactful buganizer component.
  1186  	var topBuganizerComponent analysis.TopCount
  1187  	for _, tc := range cs.TopBuganizerComponents {
  1188  		if tc.Value == "" {
  1189  			continue
  1190  		}
  1191  		// Any buganizer component is associated for more than 30% of the
  1192  		// failures in the cluster should be checked for top impact.
  1193  		if tc.Count > ((cs.MetricValues[metrics.Failures.ID].SevenDay.Nominal * 3) / 10) {
  1194  			if tc.Count > topBuganizerComponent.Count || topBuganizerComponent.Value == "" {
  1195  				topBuganizerComponent = tc
  1196  			}
  1197  		}
  1198  	}
  1199  
  1200  	if topMonorailComponent.Value == "" && topBuganizerComponent.Value == "" {
  1201  		return defaultBugSystemName(defaultSystem), nil
  1202  	} else if topMonorailComponent.Value != "" && topBuganizerComponent.Value == "" {
  1203  		return bugs.MonorailSystem, nil
  1204  	} else if topMonorailComponent.Value == "" && topBuganizerComponent.Value != "" {
  1205  		return bugs.BuganizerSystem, nil
  1206  	} else {
  1207  		// Return the system corresponding with the highest impact.
  1208  		if topMonorailComponent.Count > topBuganizerComponent.Count {
  1209  			return bugs.MonorailSystem, nil
  1210  		} else if topMonorailComponent.Count == topBuganizerComponent.Count {
  1211  			// If top components have equal impact, use the configured default system.
  1212  			return defaultBugSystemName(defaultSystem), nil
  1213  		} else {
  1214  			return bugs.BuganizerSystem, nil
  1215  		}
  1216  	}
  1217  }
  1218  
  1219  func extractBuganizerComponent(cs *analysis.Cluster) (int64, error) {
  1220  	for _, tc := range cs.TopBuganizerComponents {
  1221  		// The top buganizer component that is associated for more than 30% of the
  1222  		// failures in the cluster should be on the filed bug.
  1223  		if tc.Value != "" && tc.Count > ((cs.MetricValues[metrics.Failures.ID].SevenDay.Nominal*3)/10) {
  1224  			componentID, err := strconv.ParseInt(tc.Value, 10, 64)
  1225  			if err != nil {
  1226  				return 0, errors.Annotate(err, "parse buganizer component id").Err()
  1227  			}
  1228  			return componentID, nil
  1229  		}
  1230  	}
  1231  	return 0, nil
  1232  }
  1233  
  1234  func extractMonorailComponents(cs *analysis.Cluster) []string {
  1235  	var monorailComponents []string
  1236  	for _, tc := range cs.TopMonorailComponents {
  1237  		// Any monorail component is associated for more than 30% of the
  1238  		// failures in the cluster should be on the filed bug.
  1239  		if tc.Count > ((cs.MetricValues[metrics.Failures.ID].SevenDay.Nominal * 3) / 10) {
  1240  			monorailComponents = append(monorailComponents, tc.Value)
  1241  		}
  1242  	}
  1243  	return monorailComponents
  1244  }
  1245  
  1246  func defaultBugSystemName(defaultSystem configpb.BugSystem) string {
  1247  	if defaultSystem == configpb.BugSystem_BUGANIZER {
  1248  		return bugs.BuganizerSystem
  1249  	} else {
  1250  		return bugs.MonorailSystem
  1251  	}
  1252  }
  1253  
  1254  func clusterSummaryFromAnalysis(c *analysis.Cluster) *clustering.ClusterSummary {
  1255  	example := clustering.Failure{
  1256  		TestID: c.ExampleTestID(),
  1257  	}
  1258  	if c.ExampleFailureReason.Valid {
  1259  		example.Reason = &pb.FailureReason{PrimaryErrorMessage: c.ExampleFailureReason.StringVal}
  1260  	}
  1261  	// A list of 5 commonly occuring tests are included in bugs created
  1262  	// for failure reason clusters, to improve searchability by test name.
  1263  	var topTests []string
  1264  	for _, tt := range c.TopTestIDs {
  1265  		topTests = append(topTests, tt.Value)
  1266  	}
  1267  	return &clustering.ClusterSummary{
  1268  		Example:  example,
  1269  		TopTests: topTests,
  1270  	}
  1271  }
  1272  
  1273  func (b *BugUpdater) generateFailureAssociationRule(alg algorithms.Algorithm, failure *clustering.Failure) (string, error) {
  1274  	rule := alg.FailureAssociationRule(b.projectCfg, failure)
  1275  
  1276  	// Check the generated rule is valid and matches the failure.
  1277  	// An improperly generated failure association rule could result
  1278  	// in uncontrolled creation of new bugs.
  1279  	expr, err := lang.Parse(rule)
  1280  	if err != nil {
  1281  		return "", errors.Annotate(err, "rule generated by %s did not parse", alg.Name()).Err()
  1282  	}
  1283  	match := expr.Evaluate(failure)
  1284  	if !match {
  1285  		reason := ""
  1286  		if failure.Reason != nil {
  1287  			reason = failure.Reason.PrimaryErrorMessage
  1288  		}
  1289  		return "", fmt.Errorf("rule generated by %s did not match example failure (testID: %q, failureReason: %q)",
  1290  			alg.Name(), failure.TestID, reason)
  1291  	}
  1292  	return rule, nil
  1293  }