go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/analysis/internal/changepoints/grouping.go (about) 1 // Copyright 2024 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package changepoints 16 17 import ( 18 "math" 19 "sort" 20 ) 21 22 // TestIDGroupingThreshold is the threshold to partition changepoints by test ID: two adjacent changepoints (in sorted order) are split into different partitions when their TestIDNum differs by more than this value. 23 // TODO: Set this threshold dynamically based on the total number of unique tests in the requested LUCI project and the number of regressions in this period, 24 // because the significance of seeing a gap of 64 in test ID number depends on the above two factors. 25 // A possible formula to derive this threshold is (total tests / # of regressions in period) * coefficient. 26 const TestIDGroupingThreshold = 64 27 28 // RegressionRangeOverlapPrecThreshold decides whether two changepoints can be grouped together. 29 // The regression range overlap percentage is calculated by (# of overlapped commits/# of commits in the narrower regression range); two changepoints are grouped only when it is strictly greater than this threshold. 30 const RegressionRangeOverlapPrecThreshold = 0.4 31 32 // GroupChangepoints returns a 2D array where each row represents a group of changepoints. 33 // The grouping result is deterministic, which means the same input always results in the same groups. 34 // The groups are generated with the following steps. 35 // 1. Partition changepoints based on test ID. 36 // 2. For changepoints in each partition, group based on percentage regression range overlap. 
37 func GroupChangepoints(rows []*ChangepointRow) [][]*ChangepointRow { 38 testIDGroups := groupByTestID(rows) 39 groups := [][]*ChangepointRow{} 40 for _, testIDGroup := range testIDGroups { 41 groupsForTestID := groupByRegressionRange(testIDGroup) 42 groups = append(groups, groupsForTestID...) 43 } 44 return groups 45 } 46 47 // groupByTestID groups changepoints base on their test ID. 48 func groupByTestID(rows []*ChangepointRow) [][]*ChangepointRow { 49 // Copy the input slice, so that we don't change the input. 50 cps := make([]*ChangepointRow, len(rows)) 51 copy(cps, rows) 52 sort.Slice(cps, func(i, j int) bool { 53 return CompareTestVariantBranchChangepoint(cps[i], cps[j]) 54 }) 55 testIDGroups := [][]*ChangepointRow{} 56 groupStart := 0 57 // Iterate the sorted list of changepoints, and create a split between two adjacent changepoints 58 // when the difference of their testIDNum is greater than TestIDGroupingThreshold. 59 for groupEnd := 1; groupEnd < len(cps); groupEnd++ { 60 if cps[groupEnd].TestIDNum-cps[groupEnd-1].TestIDNum > TestIDGroupingThreshold { 61 testIDGroups = append(testIDGroups, cps[groupStart:groupEnd]) 62 groupStart = groupEnd 63 } 64 } 65 testIDGroups = append(testIDGroups, cps[groupStart:]) 66 return testIDGroups 67 } 68 69 type testVariantKey struct { 70 TestID string 71 VariantHash string 72 } 73 74 func toTestVariantKey(changepoint *ChangepointRow) testVariantKey { 75 return testVariantKey{ 76 TestID: changepoint.TestID, 77 VariantHash: changepoint.VariantHash, 78 } 79 } 80 81 // groupByRegressionRange groups changepoints base on overlap of 99% confidence interval of start position (aka. Regression range). 82 // The same test variant branch can only appears once in a group. 83 func groupByRegressionRange(rows []*ChangepointRow) [][]*ChangepointRow { 84 // Copy the input slice, so that we don't change the input. 85 cps := make([]*ChangepointRow, len(rows)) 86 copy(cps, rows) 87 // Sort changepoints by regression range width ASC. 
88 // Regression range width is defined as start_position_upper_bound_99th - start_position_lower_bound_99th. 89 // This is to avoid grouping small non-overlapping regressions together because of a base changepoint with large regression width. 90 // For example, 91 // Below is the regression range of 4 changepoints (cp = changepoint). 92 // cp1 |-----------------------------------| 93 // cp2|------| cp3|-----| cp4|-----| 94 // If cp1 is picked as the base changepoint, all 4 changepoints will be grouped together. 95 // To avoid this, we should always pick changepoint with smaller regression width. 96 sort.Slice(cps, func(i, j int) bool { 97 wi := regressionRangeWidth(cps[i]) 98 wj := regressionRangeWidth(cps[j]) 99 if wi != wj { 100 return wi < wj 101 } 102 // This is required to make sure the sort is deterministic when regression range width equal. 103 return CompareTestVariantBranchChangepoint(cps[i], cps[j]) 104 }) 105 groups := [][]*ChangepointRow{} 106 grouped := make([]bool, len(cps)) 107 for i := range cps { 108 if grouped[i] { 109 continue 110 } 111 // The first encountered ungrouped changepoint is picked as the base changepoint. 112 // We find other ungrouped changepoints which has overlap greater than the threshold with the base changepoint, 113 // and group them together with the base changepoint. 114 // This implies that in each result group, all changepoints satisfy the overlap threshold with the base changepoint, 115 // but a random pair of changepoints in a group might not satisfy the overlap threshold with each other. 116 base := cps[i] 117 // We record whether a test variant branch has been added to this group. 118 // This is to avoid multiple changepoints from the same test variant branch being grouped into the same group. 
119 seenTestVariant := map[testVariantKey]bool{} 120 group := []*ChangepointRow{base} 121 seenTestVariant[toTestVariantKey(base)] = true 122 for j := i + 1; j < len(cps); j++ { 123 // Skip this changepoint when 124 // * it's already been grouped, OR 125 // * the test variant already exists in the group, OR 126 // * the changepoint happens on a different branch. 127 if grouped[j] || seenTestVariant[toTestVariantKey(cps[j])] || base.RefHash != cps[j].RefHash { 128 continue 129 } 130 overlap := numberOfOverlapCommit(base, cps[j]) 131 overlapPercentage := overlap / math.Min(float64(regressionRangeWidth(base)), float64(regressionRangeWidth(cps[j]))) 132 if overlapPercentage > RegressionRangeOverlapPrecThreshold { 133 grouped[j] = true 134 group = append(group, cps[j]) 135 seenTestVariant[toTestVariantKey(cps[j])] = true 136 } 137 } 138 groups = append(groups, group) 139 } 140 return groups 141 } 142 143 func numberOfOverlapCommit(cp1, cp2 *ChangepointRow) float64 { 144 return math.Min(float64(cp1.UpperBound99th), float64(cp2.UpperBound99th)) - math.Max(float64(cp1.LowerBound99th), float64(cp2.LowerBound99th)) + 1 145 } 146 147 func regressionRangeWidth(cp *ChangepointRow) int64 { 148 return cp.UpperBound99th - cp.LowerBound99th + 1 149 } 150 151 // CompareTestVariantBranchChangepoint returns whether element at i is smaller than element at j 152 // by comparing TestIDNum, VariantHash, RefHash, NominalStartPosition. 153 func CompareTestVariantBranchChangepoint(cpi, cpj *ChangepointRow) bool { 154 switch { 155 case cpi.TestIDNum != cpj.TestIDNum: 156 return cpi.TestIDNum < cpj.TestIDNum 157 case cpi.VariantHash != cpj.VariantHash: 158 return cpi.VariantHash < cpj.VariantHash 159 case cpi.RefHash != cpj.RefHash: 160 return cpi.RefHash < cpj.RefHash 161 default: 162 return cpi.NominalStartPosition < cpj.NominalStartPosition 163 } 164 }