go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/analysis/internal/changepoints/grouping.go (about)

     1  // Copyright 2024 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package changepoints
    16  
    17  import (
    18  	"math"
    19  	"sort"
    20  )
    21  
    22  // TestIDGroupingThreshold is the threshold to partition changepoints by test ID.
    23  // TODO: Set this threshold dynamically base on the total number of unique tests in the requested LUCI project and the number of regressions in this period.
    24  // Because the significance of seeing a gap of 64 in test ID number depends on the above two factors.
    25  // A possible formula to derive this threshold is (total tests / # of regressions in period) * coefficient.
    26  const TestIDGroupingThreshold = 64
    27  
    28  // RegressionRangeOverlapPrecThreshold decides whether two changepoints can be grouped together.
    29  // The regression range overlap percentage is calculated by (# of overlapped commits/# of commits in the narrower regression range).
    30  const RegressionRangeOverlapPrecThreshold = 0.4
    31  
    32  // GroupChangepoints returns a 2D array where each row represents a group of changepoints.
    33  // The grouping result is deterministic, which means same input always results in same groups.
    34  // The groups are generated with the following steps.
    35  //  1. partition changepoints base on test ID
    36  //  2. For changepoints in each partition, group base on percentage regression range overlap.
    37  func GroupChangepoints(rows []*ChangepointRow) [][]*ChangepointRow {
    38  	testIDGroups := groupByTestID(rows)
    39  	groups := [][]*ChangepointRow{}
    40  	for _, testIDGroup := range testIDGroups {
    41  		groupsForTestID := groupByRegressionRange(testIDGroup)
    42  		groups = append(groups, groupsForTestID...)
    43  	}
    44  	return groups
    45  }
    46  
    47  // groupByTestID groups changepoints base on their test ID.
    48  func groupByTestID(rows []*ChangepointRow) [][]*ChangepointRow {
    49  	// Copy the input slice, so that we don't change the input.
    50  	cps := make([]*ChangepointRow, len(rows))
    51  	copy(cps, rows)
    52  	sort.Slice(cps, func(i, j int) bool {
    53  		return CompareTestVariantBranchChangepoint(cps[i], cps[j])
    54  	})
    55  	testIDGroups := [][]*ChangepointRow{}
    56  	groupStart := 0
    57  	// Iterate the sorted list of changepoints, and create a split between two adjacent changepoints
    58  	// when the difference of their testIDNum is greater than TestIDGroupingThreshold.
    59  	for groupEnd := 1; groupEnd < len(cps); groupEnd++ {
    60  		if cps[groupEnd].TestIDNum-cps[groupEnd-1].TestIDNum > TestIDGroupingThreshold {
    61  			testIDGroups = append(testIDGroups, cps[groupStart:groupEnd])
    62  			groupStart = groupEnd
    63  		}
    64  	}
    65  	testIDGroups = append(testIDGroups, cps[groupStart:])
    66  	return testIDGroups
    67  }
    68  
    69  type testVariantKey struct {
    70  	TestID      string
    71  	VariantHash string
    72  }
    73  
    74  func toTestVariantKey(changepoint *ChangepointRow) testVariantKey {
    75  	return testVariantKey{
    76  		TestID:      changepoint.TestID,
    77  		VariantHash: changepoint.VariantHash,
    78  	}
    79  }
    80  
    81  // groupByRegressionRange groups changepoints base on overlap of 99% confidence interval of start position (aka. Regression range).
    82  // The same test variant branch can only appears once in a group.
    83  func groupByRegressionRange(rows []*ChangepointRow) [][]*ChangepointRow {
    84  	// Copy the input slice, so that we don't change the input.
    85  	cps := make([]*ChangepointRow, len(rows))
    86  	copy(cps, rows)
    87  	// Sort changepoints by regression range width ASC.
    88  	// Regression range width is defined as start_position_upper_bound_99th - start_position_lower_bound_99th.
    89  	// This is to avoid grouping small non-overlapping regressions together because of a base changepoint with large regression width.
    90  	// For example,
    91  	// Below is the regression range of 4 changepoints (cp = changepoint).
    92  	// cp1 |-----------------------------------|
    93  	//      cp2|------|  cp3|-----|  cp4|-----|
    94  	// If cp1 is picked as the base changepoint, all 4 changepoints will be grouped together.
    95  	// To avoid this, we should always pick changepoint with smaller regression width.
    96  	sort.Slice(cps, func(i, j int) bool {
    97  		wi := regressionRangeWidth(cps[i])
    98  		wj := regressionRangeWidth(cps[j])
    99  		if wi != wj {
   100  			return wi < wj
   101  		}
   102  		// This is required to make sure the sort is deterministic when regression range width equal.
   103  		return CompareTestVariantBranchChangepoint(cps[i], cps[j])
   104  	})
   105  	groups := [][]*ChangepointRow{}
   106  	grouped := make([]bool, len(cps))
   107  	for i := range cps {
   108  		if grouped[i] {
   109  			continue
   110  		}
   111  		// The first encountered ungrouped changepoint is picked as the base changepoint.
   112  		// We find other ungrouped changepoints which has overlap greater than the threshold with the base changepoint,
   113  		// and group them together with the base changepoint.
   114  		// This implies that in each result group, all changepoints satisfy the overlap threshold with the base changepoint,
   115  		// but a random pair of changepoints in a group might not satisfy the overlap threshold with each other.
   116  		base := cps[i]
   117  		// We record whether a test variant branch has been added to this group.
   118  		// This is to avoid multiple changepoints from the same test variant branch being grouped into the same group.
   119  		seenTestVariant := map[testVariantKey]bool{}
   120  		group := []*ChangepointRow{base}
   121  		seenTestVariant[toTestVariantKey(base)] = true
   122  		for j := i + 1; j < len(cps); j++ {
   123  			// Skip this changepoint when
   124  			//   * it's already been grouped, OR
   125  			//   * the test variant already exists in the group, OR
   126  			//   * the changepoint happens on a different branch.
   127  			if grouped[j] || seenTestVariant[toTestVariantKey(cps[j])] || base.RefHash != cps[j].RefHash {
   128  				continue
   129  			}
   130  			overlap := numberOfOverlapCommit(base, cps[j])
   131  			overlapPercentage := overlap / math.Min(float64(regressionRangeWidth(base)), float64(regressionRangeWidth(cps[j])))
   132  			if overlapPercentage > RegressionRangeOverlapPrecThreshold {
   133  				grouped[j] = true
   134  				group = append(group, cps[j])
   135  				seenTestVariant[toTestVariantKey(cps[j])] = true
   136  			}
   137  		}
   138  		groups = append(groups, group)
   139  	}
   140  	return groups
   141  }
   142  
   143  func numberOfOverlapCommit(cp1, cp2 *ChangepointRow) float64 {
   144  	return math.Min(float64(cp1.UpperBound99th), float64(cp2.UpperBound99th)) - math.Max(float64(cp1.LowerBound99th), float64(cp2.LowerBound99th)) + 1
   145  }
   146  
   147  func regressionRangeWidth(cp *ChangepointRow) int64 {
   148  	return cp.UpperBound99th - cp.LowerBound99th + 1
   149  }
   150  
   151  // CompareTestVariantBranchChangepoint returns whether element at i is smaller than element at j
   152  // by comparing TestIDNum, VariantHash, RefHash, NominalStartPosition.
   153  func CompareTestVariantBranchChangepoint(cpi, cpj *ChangepointRow) bool {
   154  	switch {
   155  	case cpi.TestIDNum != cpj.TestIDNum:
   156  		return cpi.TestIDNum < cpj.TestIDNum
   157  	case cpi.VariantHash != cpj.VariantHash:
   158  		return cpi.VariantHash < cpj.VariantHash
   159  	case cpi.RefHash != cpj.RefHash:
   160  		return cpi.RefHash < cpj.RefHash
   161  	default:
   162  		return cpi.NominalStartPosition < cpj.NominalStartPosition
   163  	}
   164  }