go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/analysis/internal/clustering/clusterid.go (about) 1 // Copyright 2022 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package clustering 16 17 import ( 18 "fmt" 19 "strings" 20 21 "go.chromium.org/luci/common/errors" 22 ) 23 24 // MaxClusterIDBytes is the maximum number of bytes the algorithm-determined 25 // cluster ID may occupy. This is the raw number of bytes; if the ID is hex- 26 // encoded (e.g. for use in a BigQuery table), its length in characters may 27 // be double this number. 28 const MaxClusterIDBytes = 16 29 30 // RulesAlgorithmPrefix is the algorithm name prefix used by all versions 31 // of the rules-based clustering algorithm. 32 const RulesAlgorithmPrefix = "rules-" 33 34 // TestNameAlgorithmPrefix is the algorithm name prefix used by all versions 35 // of the test name clustering algorithm. 36 const TestNameAlgorithmPrefix = "testname-" 37 38 // FailureReasonAlgorithmPrefix is the algorithm name prefix used by all versions 39 // of the failure reason clustering algorithm. 40 const FailureReasonAlgorithmPrefix = "reason-" 41 42 // ClusterID represents the identity of a cluster. The LUCI Project is 43 // omitted as it is assumed to be implicit from the context. 44 type ClusterID struct { 45 // Algorithm is the name of the clustering algorithm that identified 46 // the cluster. 47 Algorithm string `json:"algorithm"` 48 // ID is the cluster identifier returned by the algorithm. The underlying 49 // identifier is at most 16 bytes, but is represented here as a hexadecimal 50 // string of up to 32 lowercase hexadecimal characters. 51 ID string `json:"id"` 52 } 53 54 // Key returns a value that can be used to uniquely identify the Cluster. 55 // This is designed for cases where it is desirable for cluster IDs 56 // to be used as keys in a map. 57 func (c ClusterID) Key() string { 58 return fmt.Sprintf("%s:%s", c.Algorithm, c.ID) 59 } 60 61 // String returns a string-representation of the cluster, for debugging. 62 func (c ClusterID) String() string { 63 return c.Key() 64 } 65 66 // Validate validates the algorithm and ID parts 67 // of the cluster ID are valid. 68 func (c ClusterID) Validate() error { 69 if !AlgorithmRe.MatchString(c.Algorithm) { 70 return errors.New("algorithm not valid") 71 } 72 if err := c.ValidateIDPart(); err != nil { 73 return err 74 } 75 return nil 76 } 77 78 // ValidateIDPart validates that the ID part of the cluster ID is valid. 79 func (c ClusterID) ValidateIDPart() error { 80 valid := true 81 for _, r := range c.ID { 82 // ID must be always be stored in lowercase, so that string equality can 83 // be used to determine if IDs are the same. 84 if !(('0' <= r && r <= '9') || ('a' <= r && r <= 'f')) { 85 valid = false 86 } 87 } 88 if !valid || (len(c.ID)%2 != 0) { 89 return errors.New("ID is not valid lowercase hexadecimal bytes") 90 } 91 bytes := len(c.ID) / 2 92 if bytes > MaxClusterIDBytes { 93 return fmt.Errorf("ID is too long (got %v bytes, want at most %v bytes)", bytes, MaxClusterIDBytes) 94 } 95 if bytes == 0 { 96 return errors.New("ID is empty") 97 } 98 return nil 99 } 100 101 // IsEmpty returns whether the cluster ID is equal to its 102 // zero value. 103 func (c ClusterID) IsEmpty() bool { 104 return c.Algorithm == "" && c.ID == "" 105 } 106 107 // IsBugCluster returns whether this cluster is backed by a failure 108 // association rule, and produced by a version of the failure association 109 // rule based clustering algorithm. 110 func (c ClusterID) IsBugCluster() bool { 111 return strings.HasPrefix(c.Algorithm, RulesAlgorithmPrefix) 112 } 113 114 // IsTestNameCluster returns whether this cluster was made by a version 115 // of the test name clustering algorithm. 116 func (c ClusterID) IsTestNameCluster() bool { 117 return strings.HasPrefix(c.Algorithm, TestNameAlgorithmPrefix) 118 } 119 120 // IsFailureReasonCluster returns whether this cluster was made by a version 121 // of the failure reason clustering algorithm. 122 func (c ClusterID) IsFailureReasonCluster() bool { 123 return strings.HasPrefix(c.Algorithm, FailureReasonAlgorithmPrefix) 124 125 } 126 127 // SortClusters sorts the given clusters in ascending algorithm and then ID 128 // order. 129 func SortClusters(cs []ClusterID) { 130 // There are almost always a tiny number of clusters per test result, 131 // so a bubble-sort is surpringly faster than the built-in quicksort 132 // which has to make memory allocations. 133 for { 134 done := true 135 for i := 0; i < len(cs)-1; i++ { 136 if isClusterLess(cs[i+1], cs[i]) { 137 cs[i+1], cs[i] = cs[i], cs[i+1] 138 done = false 139 } 140 } 141 if done { 142 break 143 } 144 } 145 } 146 147 // ClustersAreSortedNoDuplicates verifies that clusters are in sorted order 148 // and there are no duplicate clusters. 149 func ClustersAreSortedNoDuplicates(cs []ClusterID) bool { 150 for i := 0; i < len(cs)-1; i++ { 151 if !isClusterLess(cs[i], cs[i+1]) { 152 return false 153 } 154 } 155 return true 156 } 157 158 func isClusterLess(a ClusterID, b ClusterID) bool { 159 if a.Algorithm == b.Algorithm { 160 return a.ID < b.ID 161 } 162 return a.Algorithm < b.Algorithm 163 }