go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/analysis/rpc/clusters.go (about)

     1  // Copyright 2022 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package rpc
    16  
    17  import (
    18  	"context"
    19  	"encoding/hex"
    20  	"fmt"
    21  	"time"
    22  
    23  	"google.golang.org/protobuf/types/known/timestamppb"
    24  
    25  	"go.chromium.org/luci/common/data/stringset"
    26  	"go.chromium.org/luci/common/errors"
    27  	"go.chromium.org/luci/common/logging"
    28  	"go.chromium.org/luci/common/sync/parallel"
    29  	"go.chromium.org/luci/resultdb/rdbperms"
    30  
    31  	"go.chromium.org/luci/analysis/internal/aip"
    32  	"go.chromium.org/luci/analysis/internal/analysis"
    33  	"go.chromium.org/luci/analysis/internal/analysis/metrics"
    34  	"go.chromium.org/luci/analysis/internal/clustering"
    35  	"go.chromium.org/luci/analysis/internal/clustering/algorithms"
    36  	"go.chromium.org/luci/analysis/internal/clustering/reclustering"
    37  	"go.chromium.org/luci/analysis/internal/clustering/rules/cache"
    38  	"go.chromium.org/luci/analysis/internal/clustering/runs"
    39  	"go.chromium.org/luci/analysis/internal/config/compiledcfg"
    40  	"go.chromium.org/luci/analysis/internal/perms"
    41  	"go.chromium.org/luci/analysis/pbutil"
    42  	pb "go.chromium.org/luci/analysis/proto/v1"
    43  )
    44  
    45  // MaxClusterRequestSize is the maximum number of test results to cluster in
    46  // one call to Cluster(...).
    47  const MaxClusterRequestSize = 1000
    48  
    49  // MaxBatchGetClustersRequestSize is the maximum number of clusters to obtain
    50  // impact for in one call to BatchGetClusters().
    51  const MaxBatchGetClustersRequestSize = 1000
    52  
// AnalysisClient is the set of cluster analysis read operations the
// clusters RPC server depends on. It is injected via NewClustersServer.
// NOTE(review): the production implementation is presumably backed by
// BigQuery (other comments in this file refer to BigQuery) — confirm.
type AnalysisClient interface {
	// ReadCluster reads the analysis for a single cluster in the given
	// LUCI project.
	ReadCluster(ctx context.Context, luciProject string, clusterID clustering.ClusterID) (*analysis.Cluster, error)
	// ReadClusterFailures reads the failures selected by options.
	ReadClusterFailures(ctx context.Context, options analysis.ReadClusterFailuresOptions) (cfs []*analysis.ClusterFailure, err error)
	// ReadClusterExoneratedTestVariants reads the exonerated test variants
	// selected by options.
	ReadClusterExoneratedTestVariants(ctx context.Context, options analysis.ReadClusterExoneratedTestVariantsOptions) (tvs []*analysis.ExoneratedTestVariant, err error)
	// ReadClusterExoneratedTestVariantBranches reads the exonerated test
	// variant branches selected by options.
	ReadClusterExoneratedTestVariantBranches(ctx context.Context, options analysis.ReadClusterExoneratedTestVariantBranchesOptions) (tvbs []*analysis.ExoneratedTestVariantBranch, err error)
	// ReadClusterHistory reads the cluster history selected by options.
	ReadClusterHistory(ctx context.Context, options analysis.ReadClusterHistoryOptions) (ret []*analysis.ReadClusterHistoryDay, err error)
	// QueryClusterSummaries queries summaries of the clusters in the given
	// LUCI project, as selected by options.
	QueryClusterSummaries(ctx context.Context, luciProject string, options *analysis.QueryClusterSummariesOptions) ([]*analysis.ClusterSummary, error)
}
    61  
// clustersServer holds the dependencies of the Clusters RPC handlers
// defined in this file.
type clustersServer struct {
	// analysisClient is the client used by handlers to read cluster
	// analysis data.
	analysisClient AnalysisClient
}
    65  
    66  func NewClustersServer(analysisClient AnalysisClient) *pb.DecoratedClusters {
    67  	return &pb.DecoratedClusters{
    68  		Prelude:  checkAllowedPrelude,
    69  		Service:  &clustersServer{analysisClient: analysisClient},
    70  		Postlude: gRPCifyAndLogPostlude,
    71  	}
    72  }
    73  
    74  // Cluster clusters a list of test failures. See proto definition for more.
    75  func (*clustersServer) Cluster(ctx context.Context, req *pb.ClusterRequest) (*pb.ClusterResponse, error) {
    76  	if err := pbutil.ValidateProject(req.Project); err != nil {
    77  		return nil, invalidArgumentError(errors.Annotate(err, "project").Err())
    78  	}
    79  	// We could make an implementation that gracefully degrades if
    80  	// perms.PermGetRule is not available (i.e. by not returning the
    81  	// bug associated with a rule cluster), but there is currently no point.
    82  	// All LUCI Analysis roles currently always grants both permissions
    83  	// together.
    84  	if err := perms.VerifyProjectPermissions(ctx, req.Project, perms.PermGetClustersByFailure, perms.PermGetRule); err != nil {
    85  		return nil, err
    86  	}
    87  
    88  	if len(req.TestResults) > MaxClusterRequestSize {
    89  		return nil, invalidArgumentError(fmt.Errorf(
    90  			"too many test results: at most %v test results can be clustered in one request", MaxClusterRequestSize))
    91  	}
    92  
    93  	failures := make([]*clustering.Failure, 0, len(req.TestResults))
    94  	for i, tr := range req.TestResults {
    95  		if err := validateTestResult(i, tr); err != nil {
    96  			return nil, err
    97  		}
    98  		failures = append(failures, &clustering.Failure{
    99  			TestID: tr.TestId,
   100  			Reason: tr.FailureReason,
   101  		})
   102  	}
   103  
   104  	// Fetch a recent project configuration.
   105  	// (May be a recent value that was cached.)
   106  	cfg, err := readProjectConfig(ctx, req.Project)
   107  	if err != nil {
   108  		return nil, err
   109  	}
   110  
   111  	// Fetch a recent ruleset.
   112  	ruleset, err := reclustering.Ruleset(ctx, req.Project, cache.StrongRead)
   113  	if err != nil {
   114  		return nil, err
   115  	}
   116  
   117  	// Perform clustering from scratch. (Incremental clustering does not make
   118  	// sense for this RPC.)
   119  	existing := algorithms.NewEmptyClusterResults(len(req.TestResults))
   120  
   121  	results := algorithms.Cluster(cfg, ruleset, existing, failures)
   122  
   123  	// Construct the response proto.
   124  	clusteredTRs := make([]*pb.ClusterResponse_ClusteredTestResult, 0, len(results.Clusters))
   125  	for i, r := range results.Clusters {
   126  		request := req.TestResults[i]
   127  
   128  		entries := make([]*pb.ClusterResponse_ClusteredTestResult_ClusterEntry, 0, len(r))
   129  		for _, clusterID := range r {
   130  			entry := &pb.ClusterResponse_ClusteredTestResult_ClusterEntry{
   131  				ClusterId: createClusterIdPB(clusterID),
   132  			}
   133  			if clusterID.IsBugCluster() {
   134  				// For bug clusters, the ID of the cluster is also the ID of
   135  				// the rule that defines it. Use this property to lookup the
   136  				// associated rule.
   137  				ruleID := clusterID.ID
   138  				rule := ruleset.ActiveRulesByID[ruleID]
   139  				entry.Bug = createAssociatedBugPB(rule.Rule.BugID, cfg.Config)
   140  			}
   141  			entries = append(entries, entry)
   142  		}
   143  		clusteredTR := &pb.ClusterResponse_ClusteredTestResult{
   144  			RequestTag: request.RequestTag,
   145  			Clusters:   entries,
   146  		}
   147  		clusteredTRs = append(clusteredTRs, clusteredTR)
   148  	}
   149  
   150  	version := &pb.ClusteringVersion{
   151  		AlgorithmsVersion: int32(results.AlgorithmsVersion),
   152  		RulesVersion:      timestamppb.New(results.RulesVersion),
   153  		ConfigVersion:     timestamppb.New(results.ConfigVersion),
   154  	}
   155  
   156  	return &pb.ClusterResponse{
   157  		ClusteredTestResults: clusteredTRs,
   158  		ClusteringVersion:    version,
   159  	}, nil
   160  }
   161  
   162  func validateTestResult(i int, tr *pb.ClusterRequest_TestResult) error {
   163  	if tr.TestId == "" {
   164  		return invalidArgumentError(fmt.Errorf("test result %v: test ID must not be empty", i))
   165  	}
   166  	return nil
   167  }
   168  
   169  func (c *clustersServer) Get(ctx context.Context, req *pb.GetClusterRequest) (*pb.Cluster, error) {
   170  	project, clusterID, err := parseClusterName(req.Name)
   171  	if err != nil {
   172  		return nil, invalidArgumentError(errors.Annotate(err, "name").Err())
   173  	}
   174  
   175  	if err := perms.VerifyProjectPermissions(ctx, project, perms.PermGetCluster); err != nil {
   176  		return nil, err
   177  	}
   178  
   179  	cfg, err := readProjectConfig(ctx, project)
   180  	if err != nil {
   181  		return nil, err
   182  	}
   183  
   184  	cluster, err := c.analysisClient.ReadCluster(ctx, project, clusterID)
   185  	if err != nil {
   186  		return nil, err
   187  	}
   188  
   189  	readableRealms, err := perms.QueryRealms(ctx, project, nil, rdbperms.PermListTestResults)
   190  	if err != nil {
   191  		return nil, err
   192  	}
   193  	readableRealmsSet := stringset.NewFromSlice(readableRealms...)
   194  
   195  	exists := len(cluster.Realms) > 0
   196  	result := &pb.Cluster{
   197  		Name:       req.Name,
   198  		HasExample: exists,
   199  		Metrics:    make(map[string]*pb.Cluster_TimewiseCounts),
   200  	}
   201  	for metricID, metricValue := range cluster.MetricValues {
   202  		result.Metrics[string(metricID)] = createTimewiseCountsPB(metricValue)
   203  	}
   204  
   205  	if !clusterID.IsBugCluster() && exists {
   206  		example := &clustering.Failure{
   207  			TestID: cluster.ExampleTestID(),
   208  			Reason: &pb.FailureReason{
   209  				PrimaryErrorMessage: cluster.ExampleFailureReason.StringVal,
   210  			},
   211  		}
   212  
   213  		// Whether the user has access to at least one test result in the cluster.
   214  		canSeeAtLeastOneExample := false
   215  		for _, r := range cluster.Realms {
   216  			if readableRealmsSet.Has(r) {
   217  				canSeeAtLeastOneExample = true
   218  				break
   219  			}
   220  		}
   221  		if canSeeAtLeastOneExample {
   222  			// While the user has access to at least one test result in the cluster,
   223  			// they may not have access to the randomly selected example we retrieved
   224  			// from the cluster_summaries table. Therefore, we must be careful not
   225  			// to disclose any aspect of this example other than the
   226  			// clustering key it has in common with all other examples
   227  			// in the cluster.
   228  			hasAccessToGivenExample := false
   229  			result.Title = suggestedClusterTitle(cluster.ClusterID, example, hasAccessToGivenExample, cfg)
   230  			result.EquivalentFailureAssociationRule = failureAssociationRule(cluster.ClusterID, example, cfg)
   231  		}
   232  	}
   233  
   234  	return result, nil
   235  }
   236  
   237  func createTimewiseCountsPB(counts metrics.TimewiseCounts) *pb.Cluster_TimewiseCounts {
   238  	return &pb.Cluster_TimewiseCounts{
   239  		OneDay:   createCountsPB(counts.OneDay),
   240  		ThreeDay: createCountsPB(counts.ThreeDay),
   241  		SevenDay: createCountsPB(counts.SevenDay),
   242  	}
   243  }
   244  
   245  func createCountsPB(counts metrics.Counts) *pb.Cluster_Counts {
   246  	return &pb.Cluster_Counts{Nominal: counts.Nominal}
   247  }
   248  
   249  // failureAssociationRule returns the failure association rule for the
   250  // given cluster ID, assuming the provided example is still a current
   251  // example of the cluster.
   252  // It is assumed the user does not have access to the specific test
   253  // result represented by exampleFailure, but does have access to at
   254  // least one other test result in the cluster. As such, this method
   255  // must only return aspects of the test result which are common
   256  // to all test results in this cluster.
   257  func failureAssociationRule(clusterID clustering.ClusterID, exampleFailure *clustering.Failure, cfg *compiledcfg.ProjectConfig) string {
   258  	// Ignore error, it is only returned if algorithm cannot be found.
   259  	alg, _ := algorithms.SuggestingAlgorithm(clusterID.Algorithm)
   260  	if alg != nil {
   261  		// Check the example is still in the cluster. Sometimes cluster
   262  		// examples are stale (e.g. because cluster configuration has
   263  		// changed and re-clustering is yet to be fully complete and
   264  		// reflected in the cluster_summaries table).
   265  		//
   266  		// If the example is stale, it cannot be used as the basis for
   267  		// deriving the failure association rule to show to the user.
   268  		// This is for two reasons:
   269  		// 1) Functionality. The rule derived from the example
   270  		//    would not be the correct rule for this cluster.
   271  		// 2) Security. The example failure provided may not be from a realm
   272  		//    the user has access to. As a result of a configuration change,
   273  		//    it may now be in a new cluster.
   274  		//    There is no guarantee the user has access to any test results
   275  		//    in this new cluster, even if it contains some of the test results
   276  		//    of the old cluster, which the user could see some examples of.
   277  		//    The failure association rule for the new cluster is one that the
   278  		//    user may not be allowed to see.
   279  		exampleClusterID := hex.EncodeToString(alg.Cluster(cfg, exampleFailure))
   280  		if exampleClusterID == clusterID.ID {
   281  			return alg.FailureAssociationRule(cfg, exampleFailure)
   282  		}
   283  	}
   284  	return ""
   285  }
   286  
   287  // suggestedClusterTitle returns a human-readable description of the cluster,
   288  // using an example failure to help recover the unhashed clustering key.
   289  // hasAccessToGivenExample indicates if the user has permission to see the specific
   290  // example of the cluster (exampleFailure), or (if false) whether they can
   291  // only see one example (but not necessarily exampleFailure).
   292  // If it is false, the result of this method will not contain any aspects
   293  // of the test result other than the aspects which are common to all other
   294  // test results in the cluster (i.e. the clustering key).
   295  func suggestedClusterTitle(clusterID clustering.ClusterID, exampleFailure *clustering.Failure, hasAccessToGivenExample bool, cfg *compiledcfg.ProjectConfig) string {
   296  	// Ignore error, it is only returned if algorithm cannot be found.
   297  	alg, _ := algorithms.SuggestingAlgorithm(clusterID.Algorithm)
   298  	if alg != nil {
   299  		// Check the example is still in the cluster. Sometimes cluster
   300  		// examples are stale (e.g. because cluster configuration has
   301  		// changed and re-clustering is yet to be fully complete and
   302  		// reflected in the cluster_summaries table).
   303  		//
   304  		// If the example is stale, it cannot be used as the basis for
   305  		// deriving the clustering key (cluster definition) to show to
   306  		// the user. This is for two reasons:
   307  		// 1) Functionality. The clustering key derived from the example
   308  		//    would not be the correct clustering key for this cluster.
   309  		// 2) Security. The example failure provided may not be from a realm
   310  		//    the user has access to. As a result of a configuration change,
   311  		//    it may now be in a new cluster.
   312  		//    There is no guarantee the user has access to any test results
   313  		//    in this new cluster, even if it contains some of the test results
   314  		//    of the current cluster, which the user could see some examples of.
   315  		//    The failure association rule for the new cluster is one that the
   316  		//    user may not be allowed to see.
   317  		exampleClusterID := hex.EncodeToString(alg.Cluster(cfg, exampleFailure))
   318  		if exampleClusterID == clusterID.ID {
   319  			return alg.ClusterTitle(cfg, exampleFailure)
   320  		}
   321  	}
   322  	// Fallback.
   323  	if hasAccessToGivenExample {
   324  		// The user has access to the specific test result used as an example.
   325  		// We are fine to disclose it; we do not have to rely on sanitising it
   326  		// down to the common clustering key.
   327  		if clusterID.IsTestNameCluster() {
   328  			// Fallback for old test name clusters.
   329  			return exampleFailure.TestID
   330  		}
   331  		if clusterID.IsFailureReasonCluster() {
   332  			// Fallback for old reason-based clusters.
   333  			return exampleFailure.Reason.PrimaryErrorMessage
   334  		}
   335  	}
   336  	// Fallback for all other cases.
   337  	return "(definition unavailable due to ongoing reclustering)"
   338  }
   339  
   340  func (c *clustersServer) GetReclusteringProgress(ctx context.Context, req *pb.GetReclusteringProgressRequest) (*pb.ReclusteringProgress, error) {
   341  	project, err := parseReclusteringProgressName(req.Name)
   342  	if err != nil {
   343  		return nil, invalidArgumentError(errors.Annotate(err, "name").Err())
   344  	}
   345  	// Getting reclustering progress is considered part of getting a cluster:
   346  	// whenever you retrieve a cluster, you should be able to tell if the
   347  	// information you are reading is up to date.
   348  	if err := perms.VerifyProjectPermissions(ctx, project, perms.PermGetCluster); err != nil {
   349  		return nil, err
   350  	}
   351  
   352  	progress, err := runs.ReadReclusteringProgress(ctx, project)
   353  	if err != nil {
   354  		return nil, err
   355  	}
   356  
   357  	return &pb.ReclusteringProgress{
   358  		Name:             req.Name,
   359  		ProgressPerMille: int32(progress.ProgressPerMille),
   360  		Last: &pb.ClusteringVersion{
   361  			AlgorithmsVersion: int32(progress.Last.AlgorithmsVersion),
   362  			RulesVersion:      timestamppb.New(progress.Last.RulesVersion),
   363  			ConfigVersion:     timestamppb.New(progress.Last.ConfigVersion),
   364  		},
   365  		Next: &pb.ClusteringVersion{
   366  			AlgorithmsVersion: int32(progress.Next.AlgorithmsVersion),
   367  			RulesVersion:      timestamppb.New(progress.Next.RulesVersion),
   368  			ConfigVersion:     timestamppb.New(progress.Next.ConfigVersion),
   369  		},
   370  	}, nil
   371  }
   372  
// QueryClusterSummaries returns summaries of the clusters in req.Project
// over req.TimeRange, with a title (and, for bug clusters with an active
// rule, the associated bug) for each.
//
// The project and time range are validated up front; the failure filter,
// order by clause and metric names are validated later, inside the
// goroutine that performs the cluster summaries query. The caller must
// hold perms.PermListClusters and perms.PermGetRule on the project.
//
// Reading the ruleset and querying the cluster summaries happen
// concurrently. If both fail, the ruleset error is the one returned, so
// that the reported error does not depend on goroutine scheduling.
func (c *clustersServer) QueryClusterSummaries(ctx context.Context, req *pb.QueryClusterSummariesRequest) (*pb.QueryClusterSummariesResponse, error) {
	if err := pbutil.ValidateProject(req.Project); err != nil {
		return nil, invalidArgumentError(errors.Annotate(err, "project").Err())
	}

	if err := pbutil.ValidateTimeRange(ctx, req.TimeRange); err != nil {
		err = errors.Annotate(err, "time_range").Err()
		return nil, invalidArgumentError(err)
	}

	// TODO(b/239768873): Provide some sort of fallback for users who do not
	// have permission to run expensive queries if no filters are applied.

	// We could make an implementation that gracefully deals with the situation
	// where the user does not have perms.PermGetRule, but there is currently
	// no point as the LUCI Analysis reader role currently always grants
	// PermGetRule with PermListClusters.
	if err := perms.VerifyProjectPermissions(ctx, req.Project, perms.PermListClusters, perms.PermGetRule); err != nil {
		return nil, err
	}
	// Whether the rule definition may be used as the title of bug clusters.
	canSeeRuleDefinition, err := perms.HasProjectPermission(ctx, req.Project, perms.PermGetRuleDefinition)
	if err != nil {
		return nil, err
	}

	// Fetch a recent project configuration.
	// (May be a recent value that was cached.)
	cfg, err := readProjectConfig(ctx, req.Project)
	if err != nil {
		return nil, err
	}

	// An unspecified view is treated as the basic view. Only the full view
	// includes the metric breakdown.
	view := req.View
	if view == pb.ClusterSummaryView_CLUSTER_SUMMARY_VIEW_UNSPECIFIED {
		view = pb.ClusterSummaryView_BASIC
	}
	var includeMetricBreakdown = view == pb.ClusterSummaryView_FULL

	// Results of the two concurrent work items below. Each variable is
	// written by exactly one of them and only read after FanOutIn returns.
	var ruleset *cache.Ruleset
	var clusters []*analysis.ClusterSummary
	var bqErr error
	// Parallelise call to BigQuery (slow call)
	// with the datastore/spanner calls to reduce the critical path.
	err = parallel.FanOutIn(func(ch chan<- func() error) {
		ch <- func() error {
			start := time.Now()
			var err error

			// Fetch a recent ruleset.
			ruleset, err = reclustering.Ruleset(ctx, req.Project, cache.StrongRead)
			if err != nil {
				return err
			}
			logging.Infof(ctx, "QueryClusterSummaries: Ruleset part took %v", time.Since(start))
			return nil
		}
		ch <- func() error {
			start := time.Now()
			// To avoid the error returned from the service being non-deterministic
			// if both goroutines error, populate any error encountered here
			// into bqErr and return no error.
			opts := &analysis.QueryClusterSummariesOptions{
				TimeRange:              req.TimeRange,
				IncludeMetricBreakdown: includeMetricBreakdown,
			}
			var err error

			opts.FailureFilter, err = aip.ParseFilter(req.FailureFilter)
			if err != nil {
				bqErr = invalidArgumentError(errors.Annotate(err, "failure_filter").Err())
				return nil
			}
			opts.OrderBy, err = aip.ParseOrderBy(req.OrderBy)
			if err != nil {
				bqErr = invalidArgumentError(errors.Annotate(err, "order_by").Err())
				return nil
			}
			opts.Metrics, err = metricsByName(req.Project, cfg, req.Metrics)
			if err != nil {
				bqErr = invalidArgumentError(errors.Annotate(err, "metrics").Err())
				return nil
			}
			// Restrict the query to the realms in which the user may list
			// test results and exonerations.
			opts.Realms, err = perms.QueryRealmsNonEmpty(ctx, req.Project, nil, perms.ListTestResultsAndExonerations...)
			if err != nil {
				bqErr = err
				return nil
			}

			clusters, err = c.analysisClient.QueryClusterSummaries(ctx, req.Project, opts)
			if err != nil {
				// Errors tagged as invalid argument by the analysis client
				// are surfaced to the caller as invalid argument errors.
				if analysis.InvalidArgumentTag.In(err) {
					bqErr = invalidArgumentError(err)
					return nil
				}
				bqErr = errors.Annotate(err, "query clusters for failures").Err()
				return nil
			}
			logging.Infof(ctx, "QueryClusterSummaries: BigQuery part took %v", time.Since(start))
			return nil
		}
	})
	if err != nil {
		return nil, err
	}
	// To avoid the error returned from the service being non-deterministic
	// if both goroutines error, return error from BigQuery part after any other errors.
	if bqErr != nil {
		return nil, bqErr
	}

	result := []*pb.ClusterSummary{}
	// Note: the loop variable c shadows the method receiver within the loop.
	for _, c := range clusters {
		cs := &pb.ClusterSummary{
			ClusterId: createClusterIdPB(c.ClusterID),
			Metrics:   make(map[string]*pb.ClusterSummary_MetricValue),
		}
		for id, metricValue := range c.MetricValues {
			cs.Metrics[string(id)] = &pb.ClusterSummary_MetricValue{
				Value:          metricValue.Value,
				DailyBreakdown: metricValue.DailyBreakdown,
			}
		}

		if c.ClusterID.IsBugCluster() {
			// The ID of a bug cluster is used as the ID of its rule.
			ruleID := c.ClusterID.ID
			rule := ruleset.ActiveRulesByID[ruleID]
			if rule != nil {
				cs.Bug = createAssociatedBugPB(rule.Rule.BugID, cfg.Config)
				if canSeeRuleDefinition {
					cs.Title = rule.Rule.RuleDefinition
				} else {
					// Because the query is limited to running over the test
					// failures the user has access to, they have permission
					// to see the example Test ID for the cluster.

					// Attempt to provide a description of the failures matched
					// by the rule from the data the user can see, without
					// revealing the content of the rule itself.
					cs.Title = fmt.Sprintf("Selected failures in %s", c.ExampleTestID)
					if c.UniqueTestIDs > 1 {
						cs.Title += fmt.Sprintf(" (and %v more)", c.UniqueTestIDs-1)
					}
				}
			} else {
				// Rule is inactive / in process of being archived.
				cs.Title = "(rule archived)"
			}
		} else {
			example := &clustering.Failure{
				TestID: c.ExampleTestID,
				Reason: &pb.FailureReason{
					PrimaryErrorMessage: c.ExampleFailureReason.StringVal,
				},
			}
			// Because QueryClusterSummaries only reads failures the user has
			// access to, the example is one the user has access to, and
			// so we can use it for the title.
			hasAccessToGivenExample := true
			cs.Title = suggestedClusterTitle(c.ClusterID, example, hasAccessToGivenExample, cfg)
		}

		result = append(result, cs)
	}
	return &pb.QueryClusterSummariesResponse{ClusterSummaries: result}, nil
}
   538  
   539  // metricsByName retrieves the metrics with the given name from a
   540  // given LUCI Project and configuration. If the metric is not
   541  // from the given LUCI Project, an error will be returned.
   542  func metricByName(project string, cfg *compiledcfg.ProjectConfig, name string) (metrics.Definition, error) {
   543  	metricProject, id, err := parseProjectMetricName(name)
   544  	if err != nil {
   545  		return metrics.Definition{}, err
   546  	}
   547  	if metricProject != project {
   548  		return metrics.Definition{}, errors.Reason("metric %s cannot be used as it is from a different LUCI Project", name).Err()
   549  	}
   550  	metric, err := metrics.ByID(id)
   551  	if err != nil {
   552  		return metrics.Definition{}, err
   553  	}
   554  	return metric.AdaptToProject(project, cfg.Config.Metrics), nil
   555  }
   556  
   557  // metricsByName retrieves the metrics with the given names from a
   558  // given LUCI Project and configuration. If the metrics are not
   559  // from the given LUCI Project, an error will be returned.
   560  func metricsByName(project string, cfg *compiledcfg.ProjectConfig, names []string) ([]metrics.Definition, error) {
   561  	results := make([]metrics.Definition, 0, len(names))
   562  	for _, name := range names {
   563  		metric, err := metricByName(project, cfg, name)
   564  		if err != nil {
   565  			return nil, err
   566  		}
   567  		results = append(results, metric)
   568  	}
   569  	return results, nil
   570  }
   571  
   572  func (c *clustersServer) QueryClusterFailures(ctx context.Context, req *pb.QueryClusterFailuresRequest) (*pb.QueryClusterFailuresResponse, error) {
   573  	project, clusterID, err := parseClusterFailuresName(req.Parent)
   574  	if err != nil {
   575  		return nil, invalidArgumentError(errors.Annotate(err, "parent").Err())
   576  	}
   577  
   578  	if err := perms.VerifyProjectPermissions(ctx, project, perms.PermGetCluster); err != nil {
   579  		return nil, err
   580  	}
   581  
   582  	// Fetch a recent project configuration.
   583  	// (May be a recent value that was cached.)
   584  	cfg, err := readProjectConfig(ctx, project)
   585  	if err != nil {
   586  		return nil, err
   587  	}
   588  
   589  	opts := analysis.ReadClusterFailuresOptions{
   590  		Project:   project,
   591  		ClusterID: clusterID,
   592  	}
   593  	opts.Realms, err = perms.QueryRealmsNonEmpty(ctx, project, nil, perms.ListTestResultsAndExonerations...)
   594  	if err != nil {
   595  		// If the user has permission in no realms, QueryRealmsNonEmpty
   596  		// will return an appstatus error PERMISSION_DENIED.
   597  		// Otherwise, e.g. in case AuthDB was unavailable, the error will
   598  		// not be an appstatus error and the client will get an internal
   599  		// server error.
   600  		return nil, err
   601  	}
   602  	if req.MetricFilter != "" {
   603  		metric, err := metricByName(project, cfg, req.MetricFilter)
   604  		if err != nil {
   605  			return nil, invalidArgumentError(errors.Annotate(err, "filter_metric").Err())
   606  		}
   607  		opts.MetricFilter = &metric
   608  	}
   609  
   610  	failures, err := c.analysisClient.ReadClusterFailures(ctx, opts)
   611  	if err != nil {
   612  		return nil, errors.Annotate(err, "query cluster failures").Err()
   613  	}
   614  	response := &pb.QueryClusterFailuresResponse{}
   615  	for _, f := range failures {
   616  		response.Failures = append(response.Failures, createDistinctClusterFailurePB(f))
   617  	}
   618  
   619  	return response, nil
   620  }
   621  
   622  func createDistinctClusterFailurePB(f *analysis.ClusterFailure) *pb.DistinctClusterFailure {
   623  	var exonerations []*pb.DistinctClusterFailure_Exoneration
   624  	for _, ex := range f.Exonerations {
   625  		exonerations = append(exonerations, &pb.DistinctClusterFailure_Exoneration{
   626  			Reason: analysis.FromBQExonerationReason(ex.Reason.StringVal),
   627  		})
   628  	}
   629  
   630  	var changelists []*pb.Changelist
   631  	for _, cl := range f.Changelists {
   632  		changelists = append(changelists, &pb.Changelist{
   633  			Host:     cl.Host.StringVal,
   634  			Change:   cl.Change.Int64,
   635  			Patchset: int32(cl.Patchset.Int64),
   636  		})
   637  	}
   638  
   639  	buildStatus := analysis.FromBQBuildStatus(f.BuildStatus.StringVal)
   640  
   641  	var presubmitRun *pb.DistinctClusterFailure_PresubmitRun
   642  	if f.PresubmitRunID != nil {
   643  		presubmitRun = &pb.DistinctClusterFailure_PresubmitRun{
   644  			PresubmitRunId: &pb.PresubmitRunId{
   645  				System: f.PresubmitRunID.System.StringVal,
   646  				Id:     f.PresubmitRunID.ID.StringVal,
   647  			},
   648  			Owner:  f.PresubmitRunOwner.StringVal,
   649  			Mode:   analysis.FromBQPresubmitRunMode(f.PresubmitRunMode.StringVal),
   650  			Status: analysis.FromBQPresubmitRunStatus(f.PresubmitRunStatus.StringVal),
   651  		}
   652  	}
   653  
   654  	return &pb.DistinctClusterFailure{
   655  		TestId:                      f.TestID.StringVal,
   656  		Variant:                     createVariantPB(f.Variant),
   657  		PartitionTime:               timestamppb.New(f.PartitionTime.Timestamp),
   658  		PresubmitRun:                presubmitRun,
   659  		IsBuildCritical:             f.IsBuildCritical.Bool,
   660  		Exonerations:                exonerations,
   661  		BuildStatus:                 buildStatus,
   662  		IngestedInvocationId:        f.IngestedInvocationID.StringVal,
   663  		IsIngestedInvocationBlocked: f.IsIngestedInvocationBlocked.Bool,
   664  		Changelists:                 changelists,
   665  		Count:                       f.Count,
   666  	}
   667  }
   668  
   669  func createVariantPB(variant []*analysis.Variant) *pb.Variant {
   670  	def := make(map[string]string)
   671  	for _, v := range variant {
   672  		def[v.Key.StringVal] = v.Value.StringVal
   673  	}
   674  	var result *pb.Variant
   675  	if len(def) > 0 {
   676  		result = &pb.Variant{Def: def}
   677  	}
   678  	return result
   679  }
   680  
   681  func (c *clustersServer) QueryExoneratedTestVariants(ctx context.Context, req *pb.QueryClusterExoneratedTestVariantsRequest) (*pb.QueryClusterExoneratedTestVariantsResponse, error) {
   682  	project, clusterID, err := parseClusterExoneratedTestVariantsName(req.Parent)
   683  	if err != nil {
   684  		return nil, invalidArgumentError(errors.Annotate(err, "parent").Err())
   685  	}
   686  
   687  	if err := perms.VerifyProjectPermissions(ctx, project, perms.PermGetCluster); err != nil {
   688  		return nil, err
   689  	}
   690  	opts := analysis.ReadClusterExoneratedTestVariantsOptions{
   691  		Project:   project,
   692  		ClusterID: clusterID,
   693  	}
   694  	opts.Realms, err = perms.QueryRealmsNonEmpty(ctx, project, nil, perms.ListTestResultsAndExonerations...)
   695  	if err != nil {
   696  		// If the user has permission in no realms, QueryRealmsNonEmpty
   697  		// will return an appstatus error PERMISSION_DENIED.
   698  		// Otherwise, e.g. in case AuthDB was unavailable, the error will
   699  		// not be an appstatus error and the client will get an internal
   700  		// server error.
   701  		return nil, err
   702  	}
   703  
   704  	testVariants, err := c.analysisClient.ReadClusterExoneratedTestVariants(ctx, opts)
   705  	if err != nil {
   706  		return nil, errors.Annotate(err, "query exonerated test variants").Err()
   707  	}
   708  	response := &pb.QueryClusterExoneratedTestVariantsResponse{}
   709  	for _, f := range testVariants {
   710  		response.TestVariants = append(response.TestVariants, createClusterExoneratedTestVariant(f))
   711  	}
   712  
   713  	return response, nil
   714  }
   715  
   716  func createClusterExoneratedTestVariant(tv *analysis.ExoneratedTestVariant) *pb.ClusterExoneratedTestVariant {
   717  	return &pb.ClusterExoneratedTestVariant{
   718  		TestId:                     tv.TestID.StringVal,
   719  		Variant:                    createVariantPB(tv.Variant),
   720  		CriticalFailuresExonerated: tv.CriticalFailuresExonerated,
   721  		LastExoneration:            timestamppb.New(tv.LastExoneration.Timestamp),
   722  	}
   723  }
   724  
   725  func (c *clustersServer) QueryExoneratedTestVariantBranches(ctx context.Context, req *pb.QueryClusterExoneratedTestVariantBranchesRequest) (*pb.QueryClusterExoneratedTestVariantBranchesResponse, error) {
   726  	project, clusterID, err := parseClusterExoneratedTestVariantBranchesName(req.Parent)
   727  	if err != nil {
   728  		return nil, invalidArgumentError(errors.Annotate(err, "parent").Err())
   729  	}
   730  
   731  	if err := perms.VerifyProjectPermissions(ctx, project, perms.PermGetCluster); err != nil {
   732  		return nil, err
   733  	}
   734  	opts := analysis.ReadClusterExoneratedTestVariantBranchesOptions{
   735  		Project:   project,
   736  		ClusterID: clusterID,
   737  	}
   738  	opts.Realms, err = perms.QueryRealmsNonEmpty(ctx, project, nil, perms.ListTestResultsAndExonerations...)
   739  	if err != nil {
   740  		// If the user has permission in no realms, QueryRealmsNonEmpty
   741  		// will return an appstatus error PERMISSION_DENIED.
   742  		// Otherwise, e.g. in case AuthDB was unavailable, the error will
   743  		// not be an appstatus error and the client will get an internal
   744  		// server error.
   745  		return nil, err
   746  	}
   747  
   748  	testVariantBranches, err := c.analysisClient.ReadClusterExoneratedTestVariantBranches(ctx, opts)
   749  	if err != nil {
   750  		return nil, errors.Annotate(err, "query exonerated test variant branches").Err()
   751  	}
   752  	response := &pb.QueryClusterExoneratedTestVariantBranchesResponse{}
   753  	for _, tvb := range testVariantBranches {
   754  		response.TestVariantBranches = append(response.TestVariantBranches, createClusterExoneratedTestVariantBranch(tvb))
   755  	}
   756  
   757  	return response, nil
   758  }
   759  
   760  func createClusterExoneratedTestVariantBranch(tv *analysis.ExoneratedTestVariantBranch) *pb.ClusterExoneratedTestVariantBranch {
   761  	return &pb.ClusterExoneratedTestVariantBranch{
   762  		Project:                    tv.Project.StringVal,
   763  		TestId:                     tv.TestID.StringVal,
   764  		Variant:                    createVariantPB(tv.Variant),
   765  		SourceRef:                  createSourceRef(tv.SourceRef),
   766  		CriticalFailuresExonerated: tv.CriticalFailuresExonerated,
   767  		LastExoneration:            timestamppb.New(tv.LastExoneration.Timestamp),
   768  	}
   769  }
   770  
   771  func createSourceRef(sourceRef analysis.SourceRef) *pb.SourceRef {
   772  	result := &pb.SourceRef{}
   773  	if sourceRef.Gitiles != nil {
   774  		result.System = &pb.SourceRef_Gitiles{
   775  			Gitiles: &pb.GitilesRef{
   776  				Host:    sourceRef.Gitiles.Host.StringVal,
   777  				Project: sourceRef.Gitiles.Project.StringVal,
   778  				Ref:     sourceRef.Gitiles.Ref.StringVal,
   779  			},
   780  		}
   781  	}
   782  	return result
   783  }
   784  
   785  // QueryHistory clusters a list of test failures. See proto definition for more.
   786  func (c *clustersServer) QueryHistory(ctx context.Context, req *pb.QueryClusterHistoryRequest) (*pb.QueryClusterHistoryResponse, error) {
   787  	if err := pbutil.ValidateProject(req.Project); err != nil {
   788  		return nil, invalidArgumentError(errors.Annotate(err, "project").Err())
   789  	}
   790  
   791  	if err := perms.VerifyProjectPermissions(ctx, req.Project, perms.PermGetConfig); err != nil {
   792  		return nil, err
   793  	}
   794  
   795  	cfg, err := readProjectConfig(ctx, req.Project)
   796  	if err != nil {
   797  		return nil, err
   798  	}
   799  
   800  	opts := analysis.ReadClusterHistoryOptions{
   801  		Project: req.Project,
   802  		Days:    req.Days,
   803  	}
   804  
   805  	opts.FailureFilter, err = aip.ParseFilter(req.FailureFilter)
   806  	if err != nil {
   807  		return nil, invalidArgumentError(errors.Annotate(err, "failure_filter").Err())
   808  	}
   809  
   810  	opts.Metrics, err = metricsByName(req.Project, cfg, req.Metrics)
   811  	if err != nil {
   812  		return nil, invalidArgumentError(err)
   813  	}
   814  
   815  	realms, err := perms.QueryRealmsNonEmpty(ctx, req.Project, nil, perms.ListTestResultsAndExonerations...)
   816  	if err != nil {
   817  		// If the user has permission in no realms, QueryRealmsNonEmpty
   818  		// will return an appstatus error PERMISSION_DENIED.
   819  		// Otherwise, e.g. in case AuthDB was unavailable, the error will
   820  		// not be an appstatus error and the client will get an internal
   821  		// server error.
   822  		return nil, err
   823  	}
   824  	opts.Realms = realms
   825  
   826  	days, err := c.analysisClient.ReadClusterHistory(ctx, opts)
   827  	if err != nil {
   828  		return nil, errors.Annotate(err, "cluster history").Err()
   829  	}
   830  
   831  	response := &pb.QueryClusterHistoryResponse{}
   832  	if len(days) == 0 {
   833  		return response, nil
   834  	}
   835  
   836  	for _, day := range days {
   837  		metrics := make(map[string]int32)
   838  		for id, value := range day.MetricValues {
   839  			metrics[id.String()] = value
   840  		}
   841  		response.Days = append(response.Days, &pb.ClusterHistoryDay{
   842  			Metrics: metrics,
   843  			Date:    day.Date.Format("2006-01-02"),
   844  		})
   845  	}
   846  	return response, nil
   847  }