go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/bisection/nthsectionsnapshot/snapshot.go (about) 1 // Copyright 2023 The LUCI Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package nthsectionsnapshot contains the logic for getting the current state 16 // for nthsection analysis and get the next commits to run. 17 package nthsectionsnapshot 18 19 import ( 20 "fmt" 21 "math" 22 "sort" 23 24 "go.chromium.org/luci/bisection/model" 25 pb "go.chromium.org/luci/bisection/proto/v1" 26 "go.chromium.org/luci/common/errors" 27 ) 28 29 // Snapshot contains the current snapshot of the nth-section run 30 // Including the blamelist, status of the reruns... 31 type Snapshot struct { 32 BlameList *pb.BlameList 33 // Runs are sorted by index 34 Runs []*Run 35 // We want a way to detect infinite loop where there is some "consistent" infra failure 36 // for a builder, and nth section keep retrying for that builder, and 37 // draining the resources. 38 // In such cases, keep track of the number of infra failed rerun, and if 39 // there are too many, don't run any more 40 NumInfraFailed int 41 // NumInProgress is the number of reruns that are currently running. 42 NumInProgress int 43 // NumTestSkipped is the number of reruns with the TEST_SKIPPED status. 44 // It indicates that the primary test failure was not executed, so we 45 // may not know the next commit for bisection. 46 NumTestSkipped int 47 } 48 49 type Run struct { 50 // Index of the run (on the blamelist). 51 Index int 52 Commit string 53 Status pb.RerunStatus // status of the run 54 Type model.RerunBuildType // Whether this is nth-section or culprit verification run 55 } 56 57 func (snapshot *Snapshot) HasTooManyInfraFailure() bool { 58 return snapshot.NumInfraFailed > 1 59 } 60 61 // BadRangeError suggests the regression range is invalid. 62 // For example, if a passed rerun is found that is more recent 63 // than a failed rerun, the regression range is invalid. 64 type BadRangeError struct { 65 FirstFailedIdx int 66 LastPassedIdx int 67 } 68 69 func (b *BadRangeError) Error() string { 70 return fmt.Sprintf("invalid regression range - firstFailedIdx >= lastPassedIdx: (%d, %d)", b.FirstFailedIdx, b.LastPassedIdx) 71 } 72 73 // GetCurrentRegressionRange will return a pair of indices from the Snapshot 74 // that contains the culprit, based on the results of the rerun. 75 // Note: In the snapshot blamelist, index 0 refer to first failed, 76 // and index (n-1) refer to the commit after last pass. 77 // This function will return an BadRangeError if the regression range is invalid. 78 func (snapshot *Snapshot) GetCurrentRegressionRange() (int, int, error) { 79 firstFailedIdx := 0 80 lastPassedIdx := len(snapshot.BlameList.Commits) 81 for _, run := range snapshot.Runs { 82 // The snapshot runs are sorted by index, so we don't need the (firstFailedIdx < run.Index) check here 83 if run.Status == pb.RerunStatus_RERUN_STATUS_FAILED { 84 firstFailedIdx = run.Index 85 } 86 if run.Status == pb.RerunStatus_RERUN_STATUS_PASSED { 87 if run.Index < lastPassedIdx { 88 lastPassedIdx = run.Index 89 } 90 } 91 } 92 if firstFailedIdx >= lastPassedIdx { 93 return 0, 0, &BadRangeError{ 94 FirstFailedIdx: firstFailedIdx, 95 LastPassedIdx: lastPassedIdx, 96 } 97 } 98 return firstFailedIdx, lastPassedIdx - 1, nil 99 } 100 101 // GetCulprit returns the result of NthSection 102 // The first return value will be true iff there is a result 103 // Second value will be the index of the culprit in the blamelist 104 func (snapshot *Snapshot) GetCulprit() (bool, int) { 105 // GetCurrentRegressionRange returns the range that contain the culprit 106 start, end, err := snapshot.GetCurrentRegressionRange() 107 // If err != nil, it means last pass is later than first failed 108 // In such case, no culprit is found. 109 if err != nil { 110 return false, 0 111 } 112 // We haven't found the culprit yet 113 if start != end { 114 return false, 0 115 } 116 // The regression range only has 1 element: it is the culprit 117 return true, start 118 } 119 120 type NthSectionSnapshotChunk struct { 121 Begin int 122 End int 123 } 124 125 func (chunk *NthSectionSnapshotChunk) length() int { 126 return chunk.End - chunk.Begin + 1 127 } 128 129 // FindNextIndicesToRun finds at most n next commits to run for nthsection. 130 // At most n indices will be returned 131 // The target is to minimize the biggest chunks 132 // For example, if the regression is [0..9], and n=3, 133 // We can run at indices 2, 5, 8 to break the range into 4 "chunks" 134 // [0-1], [3-4], [6-7], [9]. The biggest chunk is of size 2. 135 func (snapshot *Snapshot) FindNextIndicesToRun(n int) ([]int, error) { 136 hasCulprit, _ := snapshot.GetCulprit() 137 // There is a culprit, no need to run anymore 138 if hasCulprit { 139 return []int{}, nil 140 } 141 142 // Too many infra failure, we don't want to continue 143 if snapshot.HasTooManyInfraFailure() { 144 return []int{}, nil 145 } 146 147 if snapshot.NumTestSkipped > 0 { 148 return []int{}, nil 149 } 150 151 chunks, err := snapshot.findRegressionChunks() 152 if err != nil { 153 return nil, err 154 } 155 156 // Use n "dividers" to divide those chunks into even smaller chunks 157 // such that the max of those smaller chunks is minimized. 158 // We are not optimizing for speed here, because in reality, the number of chunks 159 // and n will be very small. 160 // We are using a brute force (recursive) method here. 161 allocations, _ := chunking(chunks, 0, n, n) 162 result := []int{} 163 for i, chunk := range chunks { 164 result = append(result, breakToSmallerChunks(chunk, allocations[i])...) 165 } 166 return result, nil 167 } 168 169 // FindNextCommitsToRun is similar to FindNextIndicesToRun, 170 // but it returns the commit hashes instead of indices. 171 func (snapshot *Snapshot) FindNextCommitsToRun(n int) ([]string, error) { 172 indices, err := snapshot.FindNextIndicesToRun(n) 173 if err != nil { 174 return nil, err 175 } 176 commits := make([]string, len(indices)) 177 for i, index := range indices { 178 commits[i] = snapshot.BlameList.Commits[index].Commit 179 } 180 return commits, nil 181 } 182 183 // findRegressionChunks finds the regression range and breaks it into chunks 184 // the result will be sorted (biggest chunk will come first) 185 func (snapshot *Snapshot) findRegressionChunks() ([]*NthSectionSnapshotChunk, error) { 186 start, end, err := snapshot.GetCurrentRegressionRange() 187 if err != nil { 188 return nil, err 189 } 190 191 // Find the indices of running builds in the regression range 192 // We should not run again for those builds, but instead, we should 193 // use those builds to break the range into smaller chunks 194 chunks := []*NthSectionSnapshotChunk{} 195 for _, run := range snapshot.Runs { 196 // There is a special case where there is a failed run at the start 197 // In such case we don't want to include the failed run in any chunks 198 if run.Index == start && run.Status == pb.RerunStatus_RERUN_STATUS_FAILED { 199 start = run.Index + 1 200 continue 201 } 202 if run.Index >= start && run.Index <= end && run.Status == pb.RerunStatus_RERUN_STATUS_IN_PROGRESS { 203 if start <= run.Index-1 { 204 chunks = append(chunks, &NthSectionSnapshotChunk{Begin: start, End: run.Index - 1}) 205 } 206 start = run.Index + 1 207 } 208 } 209 if start <= end { 210 chunks = append(chunks, &NthSectionSnapshotChunk{Begin: start, End: end}) 211 } 212 213 // Sort the chunks descendingly based on length 214 // In general, the "bigger" chunks should be allocated more "dividers" 215 sort.Slice(chunks, func(i, j int) bool { 216 return chunks[i].length() > chunks[j].length() 217 }) 218 return chunks, nil 219 } 220 221 // Use n dividers to divide chunks 222 // The chunks are sorted by length descendingly 223 // We only consider chunks from the start index 224 // Return the array of allocation and the biggest chunk size 225 // maxAllocationForEachChunk is to control the allocation: there is not cases 226 // where we want to allocate more dividers to a smaller chunks 227 func chunking(chunks []*NthSectionSnapshotChunk, start int, nDivider int, maxAllocationForEachChunk int) ([]int, int) { 228 // Base case: There is no chunk 229 // It may mean that all applicable commits for rerun are in progress 230 if len(chunks) == 0 { 231 return []int{0}, 0 232 } 233 // Base case: Only one chunk left 234 if start == len(chunks)-1 { 235 return []int{nDivider}, calculateChunkSize(chunks[start].length(), nDivider) 236 } 237 // Base case: No Divider left -> return the biggest chunk 238 if nDivider == 0 { 239 return zerosSlice(len(chunks) - start + 1), chunks[start].length() 240 } 241 // Recursive, k is the number of dividers allocated the "start" chunk 242 dividerLeft := minInt(nDivider, maxAllocationForEachChunk) 243 min := math.MaxInt64 244 allocation := []int{} 245 for k := dividerLeft; k > 0; k-- { 246 startSize := calculateChunkSize(chunks[start].length(), k) 247 // We passed k here because we don't want to allocate more dividers to a smaller chunk 248 // The recursion depth here is limited by the chunks length and nDivider, which in reality 249 // should be < 10 250 subAllocation, otherSize := chunking(chunks, start+1, nDivider-k, k) 251 if min > maxInt(startSize, otherSize) { 252 min = maxInt(startSize, otherSize) 253 allocation = append([]int{k}, subAllocation...) 254 } 255 } 256 return allocation, min 257 } 258 259 // Create a zeroes slice with length l 260 func zerosSlice(l int) []int { 261 s := make([]int, l) 262 for i := range s { 263 s[i] = 0 264 } 265 return s 266 } 267 268 func minInt(a int, b int) int { 269 return int(math.Min(float64(a), float64(b))) 270 } 271 272 func maxInt(a int, b int) int { 273 return int(math.Max(float64(a), float64(b))) 274 } 275 276 // With the initial length, if we use n dividers to divide as equally as possible 277 // then how long each chunk will be? 278 // Example: initialLength = 10, nDivider = 3 -> chunk size = 2 279 // Example: initialLength = 3, nDivider = 1 -> chunk size = 1 280 func calculateChunkSize(initialLength int, nDivider int) int { 281 if nDivider >= initialLength { 282 return 0 283 } 284 return int(math.Ceil(float64(initialLength-nDivider) / (float64(nDivider + 1)))) 285 } 286 287 // return the indices for the break points 288 func breakToSmallerChunks(chunk *NthSectionSnapshotChunk, nDivider int) []int { 289 if nDivider > chunk.length() { 290 nDivider = chunk.length() 291 } 292 step := float64(chunk.length()-nDivider) / float64(nDivider+1) 293 result := []int{} 294 for i := 1; i <= nDivider; i++ { 295 next := int(math.Round(step*float64(i) + float64(i-1))) 296 // next >= chunk.length() should not happen, but just in case 297 if next < chunk.length() { 298 result = append(result, chunk.Begin+next) 299 } 300 } 301 302 return result 303 } 304 305 // FindNextSingleCommitToRun returns the next commit to run. 306 // Used to get the new rerun when we get the update from recipe. 307 // If we cannot find the next commit, we will return empty string. 308 func (snapshot *Snapshot) FindNextSingleCommitToRun() (string, error) { 309 // We pass 1 as argument here because we only need to find one commit 310 // to replace the finishing one. 311 commits, err := snapshot.FindNextCommitsToRun(1) 312 if err != nil { 313 return "", errors.Annotate(err, "find next commits to run").Err() 314 } 315 // There is no commit to run, perhaps we already found a culprit, or we 316 // have already scheduled the necessary build to be run. 317 if len(commits) == 0 { 318 return "", nil 319 } 320 if len(commits) != 1 { 321 return "", errors.Annotate(err, "expect only 1 commits to rerun. Got %d", len(commits)).Err() 322 } 323 return commits[0], nil 324 325 }