go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/analysis/internal/clustering/clusterid.go

go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/analysis/internal/clustering/clusterid.go (about)

     1  // Copyright 2022 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package clustering
    16  
    17  import (
    18  	"fmt"
    19  	"strings"
    20  
    21  	"go.chromium.org/luci/common/errors"
    22  )
    23  
    24  // MaxClusterIDBytes is the maximum number of bytes the algorithm-determined
    25  // cluster ID may occupy. This is the raw number of bytes; if the ID is hex-
    26  // encoded (e.g. for use in a BigQuery table), its length in characters may
    27  // be double this number.
    28  const MaxClusterIDBytes = 16
    29  
    30  // RulesAlgorithmPrefix is the algorithm name prefix used by all versions
    31  // of the rules-based clustering algorithm.
    32  const RulesAlgorithmPrefix = "rules-"
    33  
    34  // TestNameAlgorithmPrefix is the algorithm name prefix used by all versions
    35  // of the test name clustering algorithm.
    36  const TestNameAlgorithmPrefix = "testname-"
    37  
    38  // FailureReasonAlgorithmPrefix is the algorithm name prefix used by all versions
    39  // of the failure reason clustering algorithm.
    40  const FailureReasonAlgorithmPrefix = "reason-"
    41  
    42  // ClusterID represents the identity of a cluster. The LUCI Project is
    43  // omitted as it is assumed to be implicit from the context.
    44  type ClusterID struct {
    45  	// Algorithm is the name of the clustering algorithm that identified
    46  	// the cluster.
    47  	Algorithm string `json:"algorithm"`
    48  	// ID is the cluster identifier returned by the algorithm. The underlying
    49  	// identifier is at most 16 bytes, but is represented here as a hexadecimal
    50  	// string of up to 32 lowercase hexadecimal characters.
    51  	ID string `json:"id"`
    52  }
    53  
    54  // Key returns a value that can be used to uniquely identify the Cluster.
    55  // This is designed for cases where it is desirable for cluster IDs
    56  // to be used as keys in a map.
    57  func (c ClusterID) Key() string {
    58  	return fmt.Sprintf("%s:%s", c.Algorithm, c.ID)
    59  }
    60  
    61  // String returns a string-representation of the cluster, for debugging.
    62  func (c ClusterID) String() string {
    63  	return c.Key()
    64  }
    65  
    66  // Validate validates the algorithm and ID parts
    67  // of the cluster ID are valid.
    68  func (c ClusterID) Validate() error {
    69  	if !AlgorithmRe.MatchString(c.Algorithm) {
    70  		return errors.New("algorithm not valid")
    71  	}
    72  	if err := c.ValidateIDPart(); err != nil {
    73  		return err
    74  	}
    75  	return nil
    76  }
    77  
    78  // ValidateIDPart validates that the ID part of the cluster ID is valid.
    79  func (c ClusterID) ValidateIDPart() error {
    80  	valid := true
    81  	for _, r := range c.ID {
    82  		// ID must be always be stored in lowercase, so that string equality can
    83  		// be used to determine if IDs are the same.
    84  		if !(('0' <= r && r <= '9') || ('a' <= r && r <= 'f')) {
    85  			valid = false
    86  		}
    87  	}
    88  	if !valid || (len(c.ID)%2 != 0) {
    89  		return errors.New("ID is not valid lowercase hexadecimal bytes")
    90  	}
    91  	bytes := len(c.ID) / 2
    92  	if bytes > MaxClusterIDBytes {
    93  		return fmt.Errorf("ID is too long (got %v bytes, want at most %v bytes)", bytes, MaxClusterIDBytes)
    94  	}
    95  	if bytes == 0 {
    96  		return errors.New("ID is empty")
    97  	}
    98  	return nil
    99  }
   100  
   101  // IsEmpty returns whether the cluster ID is equal to its
   102  // zero value.
   103  func (c ClusterID) IsEmpty() bool {
   104  	return c.Algorithm == "" && c.ID == ""
   105  }
   106  
   107  // IsBugCluster returns whether this cluster is backed by a failure
   108  // association rule, and produced by a version of the failure association
   109  // rule based clustering algorithm.
   110  func (c ClusterID) IsBugCluster() bool {
   111  	return strings.HasPrefix(c.Algorithm, RulesAlgorithmPrefix)
   112  }
   113  
   114  // IsTestNameCluster returns whether this cluster was made by a version
   115  // of the test name clustering algorithm.
   116  func (c ClusterID) IsTestNameCluster() bool {
   117  	return strings.HasPrefix(c.Algorithm, TestNameAlgorithmPrefix)
   118  }
   119  
   120  // IsFailureReasonCluster returns whether this cluster was made by a version
   121  // of the failure reason clustering algorithm.
   122  func (c ClusterID) IsFailureReasonCluster() bool {
   123  	return strings.HasPrefix(c.Algorithm, FailureReasonAlgorithmPrefix)
   124  
   125  }
   126  
   127  // SortClusters sorts the given clusters in ascending algorithm and then ID
   128  // order.
   129  func SortClusters(cs []ClusterID) {
   130  	// There are almost always a tiny number of clusters per test result,
   131  	// so a bubble-sort is surpringly faster than the built-in quicksort
   132  	// which has to make memory allocations.
   133  	for {
   134  		done := true
   135  		for i := 0; i < len(cs)-1; i++ {
   136  			if isClusterLess(cs[i+1], cs[i]) {
   137  				cs[i+1], cs[i] = cs[i], cs[i+1]
   138  				done = false
   139  			}
   140  		}
   141  		if done {
   142  			break
   143  		}
   144  	}
   145  }
   146  
   147  // ClustersAreSortedNoDuplicates verifies that clusters are in sorted order
   148  // and there are no duplicate clusters.
   149  func ClustersAreSortedNoDuplicates(cs []ClusterID) bool {
   150  	for i := 0; i < len(cs)-1; i++ {
   151  		if !isClusterLess(cs[i], cs[i+1]) {
   152  			return false
   153  		}
   154  	}
   155  	return true
   156  }
   157  
   158  func isClusterLess(a ClusterID, b ClusterID) bool {
   159  	if a.Algorithm == b.Algorithm {
   160  		return a.ID < b.ID
   161  	}
   162  	return a.Algorithm < b.Algorithm
   163  }