go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/bisection/nthsectionsnapshot/snapshot.go

go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/bisection/nthsectionsnapshot/snapshot.go (about)

     1  // Copyright 2023 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package nthsectionsnapshot contains the logic for getting the current state
    16  // for nthsection analysis and get the next commits to run.
    17  package nthsectionsnapshot
    18  
    19  import (
    20  	"fmt"
    21  	"math"
    22  	"sort"
    23  
    24  	"go.chromium.org/luci/bisection/model"
    25  	pb "go.chromium.org/luci/bisection/proto/v1"
    26  	"go.chromium.org/luci/common/errors"
    27  )
    28  
// Snapshot contains the current snapshot of the nth-section run,
// including the blamelist and the status of the reruns.
type Snapshot struct {
	// BlameList holds the commits under analysis. Run.Index values and the
	// indices returned by the methods of Snapshot refer to positions in
	// BlameList.Commits.
	BlameList *pb.BlameList
	// Runs are sorted by index
	Runs []*Run
	// We want a way to detect infinite loop where there is some "consistent" infra failure
	// for a builder, and nth section keep retrying for that builder, and
	// draining the resources.
	// In such cases, keep track of the number of infra failed rerun, and if
	// there are too many, don't run any more
	NumInfraFailed int
	// NumInProgress is the number of reruns that are currently running.
	NumInProgress int
	// NumTestSkipped is the number of reruns with the TEST_SKIPPED status.
	// It indicates that the primary test failure was not executed, so we
	// may not know the next commit for bisection.
	NumTestSkipped int
}
    48  
// Run describes a single rerun tracked by a Snapshot: where it sits on the
// blamelist, what its current status is, and what kind of rerun it is.
type Run struct {
	// Index of the run (on the blamelist).
	Index  int
	Commit string               // presumably the commit hash the rerun was built at; verify against the code populating Run
	Status pb.RerunStatus       // status of the run
	Type   model.RerunBuildType // Whether this is nth-section or culprit verification run
}
    56  
    57  func (snapshot *Snapshot) HasTooManyInfraFailure() bool {
    58  	return snapshot.NumInfraFailed > 1
    59  }
    60  
    61  // BadRangeError suggests the regression range is invalid.
    62  // For example, if a passed rerun is found that is more recent
    63  // than a failed rerun, the regression range is invalid.
    64  type BadRangeError struct {
    65  	FirstFailedIdx int
    66  	LastPassedIdx  int
    67  }
    68  
    69  func (b *BadRangeError) Error() string {
    70  	return fmt.Sprintf("invalid regression range - firstFailedIdx >= lastPassedIdx: (%d, %d)", b.FirstFailedIdx, b.LastPassedIdx)
    71  }
    72  
    73  // GetCurrentRegressionRange will return a pair of indices from the Snapshot
    74  // that contains the culprit, based on the results of the rerun.
    75  // Note: In the snapshot blamelist, index 0 refer to first failed,
    76  // and index (n-1) refer to the commit after last pass.
    77  // This function will return an BadRangeError if the regression range is invalid.
    78  func (snapshot *Snapshot) GetCurrentRegressionRange() (int, int, error) {
    79  	firstFailedIdx := 0
    80  	lastPassedIdx := len(snapshot.BlameList.Commits)
    81  	for _, run := range snapshot.Runs {
    82  		// The snapshot runs are sorted by index, so we don't need the (firstFailedIdx < run.Index) check here
    83  		if run.Status == pb.RerunStatus_RERUN_STATUS_FAILED {
    84  			firstFailedIdx = run.Index
    85  		}
    86  		if run.Status == pb.RerunStatus_RERUN_STATUS_PASSED {
    87  			if run.Index < lastPassedIdx {
    88  				lastPassedIdx = run.Index
    89  			}
    90  		}
    91  	}
    92  	if firstFailedIdx >= lastPassedIdx {
    93  		return 0, 0, &BadRangeError{
    94  			FirstFailedIdx: firstFailedIdx,
    95  			LastPassedIdx:  lastPassedIdx,
    96  		}
    97  	}
    98  	return firstFailedIdx, lastPassedIdx - 1, nil
    99  }
   100  
   101  // GetCulprit returns the result of NthSection
   102  // The first return value will be true iff there is a result
   103  // Second value will be the index of the culprit in the blamelist
   104  func (snapshot *Snapshot) GetCulprit() (bool, int) {
   105  	// GetCurrentRegressionRange returns the range that contain the culprit
   106  	start, end, err := snapshot.GetCurrentRegressionRange()
   107  	// If err != nil, it means last pass is later than first failed
   108  	// In such case, no culprit is found.
   109  	if err != nil {
   110  		return false, 0
   111  	}
   112  	// We haven't found the culprit yet
   113  	if start != end {
   114  		return false, 0
   115  	}
   116  	// The regression range only has 1 element: it is the culprit
   117  	return true, start
   118  }
   119  
   120  type NthSectionSnapshotChunk struct {
   121  	Begin int
   122  	End   int
   123  }
   124  
   125  func (chunk *NthSectionSnapshotChunk) length() int {
   126  	return chunk.End - chunk.Begin + 1
   127  }
   128  
   129  // FindNextIndicesToRun finds at most n next commits to run for nthsection.
   130  // At most n indices will be returned
   131  // The target is to minimize the biggest chunks
   132  // For example, if the regression is [0..9], and n=3,
   133  // We can run at indices 2, 5, 8 to break the range into 4 "chunks"
   134  // [0-1], [3-4], [6-7], [9]. The biggest chunk is of size 2.
   135  func (snapshot *Snapshot) FindNextIndicesToRun(n int) ([]int, error) {
   136  	hasCulprit, _ := snapshot.GetCulprit()
   137  	// There is a culprit, no need to run anymore
   138  	if hasCulprit {
   139  		return []int{}, nil
   140  	}
   141  
   142  	// Too many infra failure, we don't want to continue
   143  	if snapshot.HasTooManyInfraFailure() {
   144  		return []int{}, nil
   145  	}
   146  
   147  	if snapshot.NumTestSkipped > 0 {
   148  		return []int{}, nil
   149  	}
   150  
   151  	chunks, err := snapshot.findRegressionChunks()
   152  	if err != nil {
   153  		return nil, err
   154  	}
   155  
   156  	// Use n "dividers" to divide those chunks into even smaller chunks
   157  	// such that the max of those smaller chunks is minimized.
   158  	// We are not optimizing for speed here, because in reality, the number of chunks
   159  	// and n will be very small.
   160  	// We are using a brute force (recursive) method here.
   161  	allocations, _ := chunking(chunks, 0, n, n)
   162  	result := []int{}
   163  	for i, chunk := range chunks {
   164  		result = append(result, breakToSmallerChunks(chunk, allocations[i])...)
   165  	}
   166  	return result, nil
   167  }
   168  
   169  // FindNextCommitsToRun is similar to FindNextIndicesToRun,
   170  // but it returns the commit hashes instead of indices.
   171  func (snapshot *Snapshot) FindNextCommitsToRun(n int) ([]string, error) {
   172  	indices, err := snapshot.FindNextIndicesToRun(n)
   173  	if err != nil {
   174  		return nil, err
   175  	}
   176  	commits := make([]string, len(indices))
   177  	for i, index := range indices {
   178  		commits[i] = snapshot.BlameList.Commits[index].Commit
   179  	}
   180  	return commits, nil
   181  }
   182  
   183  // findRegressionChunks finds the regression range and breaks it into chunks
   184  // the result will be sorted (biggest chunk will come first)
   185  func (snapshot *Snapshot) findRegressionChunks() ([]*NthSectionSnapshotChunk, error) {
   186  	start, end, err := snapshot.GetCurrentRegressionRange()
   187  	if err != nil {
   188  		return nil, err
   189  	}
   190  
   191  	// Find the indices of running builds in the regression range
   192  	// We should not run again for those builds, but instead, we should
   193  	// use those builds to break the range into smaller chunks
   194  	chunks := []*NthSectionSnapshotChunk{}
   195  	for _, run := range snapshot.Runs {
   196  		// There is a special case where there is a failed run at the start
   197  		// In such case we don't want to include the failed run in any chunks
   198  		if run.Index == start && run.Status == pb.RerunStatus_RERUN_STATUS_FAILED {
   199  			start = run.Index + 1
   200  			continue
   201  		}
   202  		if run.Index >= start && run.Index <= end && run.Status == pb.RerunStatus_RERUN_STATUS_IN_PROGRESS {
   203  			if start <= run.Index-1 {
   204  				chunks = append(chunks, &NthSectionSnapshotChunk{Begin: start, End: run.Index - 1})
   205  			}
   206  			start = run.Index + 1
   207  		}
   208  	}
   209  	if start <= end {
   210  		chunks = append(chunks, &NthSectionSnapshotChunk{Begin: start, End: end})
   211  	}
   212  
   213  	// Sort the chunks descendingly based on length
   214  	// In general, the "bigger" chunks should be allocated more "dividers"
   215  	sort.Slice(chunks, func(i, j int) bool {
   216  		return chunks[i].length() > chunks[j].length()
   217  	})
   218  	return chunks, nil
   219  }
   220  
   221  // Use n dividers to divide chunks
   222  // The chunks are sorted by length descendingly
   223  // We only consider chunks from the start index
   224  // Return the array of allocation and the biggest chunk size
   225  // maxAllocationForEachChunk is to control the allocation: there is not cases
   226  // where we want to allocate more dividers to a smaller chunks
   227  func chunking(chunks []*NthSectionSnapshotChunk, start int, nDivider int, maxAllocationForEachChunk int) ([]int, int) {
   228  	// Base case: There is no chunk
   229  	// It may mean that all applicable commits for rerun are in progress
   230  	if len(chunks) == 0 {
   231  		return []int{0}, 0
   232  	}
   233  	// Base case: Only one chunk left
   234  	if start == len(chunks)-1 {
   235  		return []int{nDivider}, calculateChunkSize(chunks[start].length(), nDivider)
   236  	}
   237  	// Base case: No Divider left -> return the biggest chunk
   238  	if nDivider == 0 {
   239  		return zerosSlice(len(chunks) - start + 1), chunks[start].length()
   240  	}
   241  	// Recursive, k is the number of dividers allocated the "start" chunk
   242  	dividerLeft := minInt(nDivider, maxAllocationForEachChunk)
   243  	min := math.MaxInt64
   244  	allocation := []int{}
   245  	for k := dividerLeft; k > 0; k-- {
   246  		startSize := calculateChunkSize(chunks[start].length(), k)
   247  		// We passed k here because we don't want to allocate more dividers to a smaller chunk
   248  		// The recursion depth here is limited by the chunks length and nDivider, which in reality
   249  		// should be < 10
   250  		subAllocation, otherSize := chunking(chunks, start+1, nDivider-k, k)
   251  		if min > maxInt(startSize, otherSize) {
   252  			min = maxInt(startSize, otherSize)
   253  			allocation = append([]int{k}, subAllocation...)
   254  		}
   255  	}
   256  	return allocation, min
   257  }
   258  
   259  // Create a zeroes slice with length l
   260  func zerosSlice(l int) []int {
   261  	s := make([]int, l)
   262  	for i := range s {
   263  		s[i] = 0
   264  	}
   265  	return s
   266  }
   267  
   268  func minInt(a int, b int) int {
   269  	return int(math.Min(float64(a), float64(b)))
   270  }
   271  
   272  func maxInt(a int, b int) int {
   273  	return int(math.Max(float64(a), float64(b)))
   274  }
   275  
   276  // With the initial length, if we use n dividers to divide as equally as possible
   277  // then how long each chunk will be?
   278  // Example: initialLength = 10, nDivider = 3 -> chunk size = 2
   279  // Example: initialLength = 3, nDivider = 1 -> chunk size = 1
   280  func calculateChunkSize(initialLength int, nDivider int) int {
   281  	if nDivider >= initialLength {
   282  		return 0
   283  	}
   284  	return int(math.Ceil(float64(initialLength-nDivider) / (float64(nDivider + 1))))
   285  }
   286  
   287  // return the indices for the break points
   288  func breakToSmallerChunks(chunk *NthSectionSnapshotChunk, nDivider int) []int {
   289  	if nDivider > chunk.length() {
   290  		nDivider = chunk.length()
   291  	}
   292  	step := float64(chunk.length()-nDivider) / float64(nDivider+1)
   293  	result := []int{}
   294  	for i := 1; i <= nDivider; i++ {
   295  		next := int(math.Round(step*float64(i) + float64(i-1)))
   296  		// next >= chunk.length() should not happen, but just in case
   297  		if next < chunk.length() {
   298  			result = append(result, chunk.Begin+next)
   299  		}
   300  	}
   301  
   302  	return result
   303  }
   304  
   305  // FindNextSingleCommitToRun returns the next commit to run.
   306  // Used to get the new rerun when we get the update from recipe.
   307  // If we cannot find the next commit, we will return empty string.
   308  func (snapshot *Snapshot) FindNextSingleCommitToRun() (string, error) {
   309  	// We pass 1 as argument here because we only need to find one commit
   310  	// to replace the finishing one.
   311  	commits, err := snapshot.FindNextCommitsToRun(1)
   312  	if err != nil {
   313  		return "", errors.Annotate(err, "find next commits to run").Err()
   314  	}
   315  	// There is no commit to run, perhaps we already found a culprit, or we
   316  	// have already scheduled the necessary build to be run.
   317  	if len(commits) == 0 {
   318  		return "", nil
   319  	}
   320  	if len(commits) != 1 {
   321  		return "", errors.Annotate(err, "expect only 1 commits to rerun. Got %d", len(commits)).Err()
   322  	}
   323  	return commits[0], nil
   324  
   325  }