// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package main

import (
	"flag"
	"log"
	"os"
	"path/filepath"
	"regexp"
	"runtime"
	"runtime/pprof"
	"sort"
	"strings"
	"sync"

	"github.com/aclements/go-misc/internal/loganal"
)

var (
	flagRevDir = flag.String("dir", defaultRevDir(), "search logs under `directory`")
	flagBranch = flag.String("branch", "master", "analyze commits to `branch`")
	flagHTML   = flag.Bool("html", false, "print an HTML report")
	flagLimit  = flag.Int("limit", 0, "process only most recent `N` revisions")

	// TODO: Is this really just a separate mode? Should we have
	// subcommands?
	flagGrep  = flag.String("grep", "", "show analysis for logs matching `regexp`")
	flagPaths = flag.Bool("paths", false, "read dir-relative paths of logs with failures from stdin (useful with greplogs -l)")
)

func defaultRevDir() string {
	return filepath.Join(xdgCacheDir(), "fetchlogs", "rev")
}

// TODO: Tool you can point at a failure log to annotate each failure
// in the log with links to past instances of that failure. This just
// uses log analysis.

// TODO: If we were careful about merges, we could potentially use
// information from other branches to add additional samples between
// merge points.

// TODO: Consider each build a separate event, rather than each
// revision. It doesn't matter what "order" they're in, though we
// should randomize it for each revision. History subdivision should
// only happen on revision boundaries.
//
// OTOH, this makes deterministic failures on specific
// OSs/architectures look like non-deterministic failures.
//
// This would also mean it's more important to identify builds in
// which a test wasn't even executed (e.g., because an earlier test
// failed) so we don't count those as "successes". OTOH, it may be
// sufficient to consider a test executed unless we see a failure in
// that test or that build didn't happen (e.g., a gap in the history).
//
// This would also help with fixing the problem where hard build
// failures are considered successes of all tests.
//
// This would also be more sound when builders are added at some point
// in the history. If the probability of a failure is really constant
// per build, adding a builder will increase the probability of seeing
// the failure at the commit level. Of course, if it's conditional on
// OS or architecture or builder, this will make it look *less* likely
// at the build level.
//
// Along these lines, the culprit analysis should have the property
// that doing more runs around possible culprit commits improves the
// fidelity of the culprit results.

// TODO: Support pointing this at a set of stress test failures (along
// with the count of total runs, I guess) and having it classify and
// report failures. In this case there's no order or commit sequence
// involved, so there's no time series analysis or
// first/last/culprits, but the classification and failure probability
// are still useful.
//
// It also makes sense to point this at a stress test of a sequence of
// commits, in which case the culprit analysis is still useful. This
// probably integrates well with the previous TODO of considering each
// build a separate event, and it's closely related to the problem of
// new builders being added.
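
// To make the builder-count effect above concrete (illustrative
// numbers, not measurements): if a flake fails each build
// independently with probability p, a revision with k builds fails
// with probability 1-(1-p)^k. At p=0.1, going from k=5 to k=6
// builders raises the revision-level failure probability from ~41%
// to ~47% even though the flake itself did not change.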

func main() {
	var cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file")

	flag.Parse()
	if flag.NArg() > 0 {
		flag.Usage()
		os.Exit(2)
	}

	if *cpuprofile != "" {
		f, err := os.Create(*cpuprofile)
		if err != nil {
			log.Fatal(err)
		}
		pprof.StartCPUProfile(f)
		defer pprof.StopCPUProfile()
	}

	allRevs, err := LoadRevisions(*flagRevDir)
	if err != nil {
		log.Fatal(err)
	}

	// Filter to revisions on this branch.
	revs := []*Revision{}
	for _, rev := range allRevs {
		if rev.Branch == *flagBranch {
			revs = append(revs, rev)
		}
	}
	if len(revs) == 0 {
		log.Fatal("no revisions found")
	}

	// Limit to most recent N revisions.
	if *flagLimit > 0 && len(revs) > *flagLimit {
		revs = revs[len(revs)-*flagLimit:]
	}

	if *flagGrep != "" {
		// Grep mode.
		re, err := regexp.Compile(*flagGrep)
		if err != nil {
			log.Fatal(err)
		}
		failures := grepFailures(revs, re)
		if len(failures) == 0 {
			return
		}
		fc := newFailureClass(revs, failures)
		printTextFlakeReport(os.Stdout, fc)
		return
	}

	if *flagPaths {
		// Paths mode.
		paths, err := readPaths(os.Stdin)
		if err != nil {
			log.Fatal(err)
		}
		failures := pathFailures(revs, paths)
		if len(failures) == 0 {
			return
		}
		fc := newFailureClass(revs, failures)
		printTextFlakeReport(os.Stdout, fc)
		return
	}

	// Extract failures from logs.
	failures := extractFailures(revs)

	// Classify failures.
	lfailures := make([]*loganal.Failure, len(failures))
	for i, f := range failures {
		lfailures[i] = f.Failure
	}
	failureClasses := loganal.Classify(lfailures)

	// Gather failures from each class and perform flakiness
	// tests.
	classes := []*failureClass{}
	for class, indexes := range failureClasses {
		classFailures := []*failure{}
		for _, fi := range indexes {
			classFailures = append(classFailures, failures[fi])
		}
		fc := newFailureClass(revs, classFailures)
		fc.Class = class

		// Trim failure classes below thresholds. We leave out
		// classes with extremely low failure probabilities
		// because the chance that these are still happening
		// takes a long time to decay and there's almost
		// nothing we can do for culprit analysis.
		if fc.Current < 0.05 || fc.Latest.FailureProbability < 0.01 {
			continue
		}

		classes = append(classes, fc)
	}

	// Sort failure classes by likelihood that failure is still
	// happening.
	sort.Sort(sort.Reverse(currentSorter(classes)))

	if *flagHTML {
		printHTMLReport(os.Stdout, classes)
	} else {
		printTextReport(os.Stdout, classes)
	}
}

func processFailureLogs(revs []*Revision, process func(build *Build, data []byte) []*failure) []*failure {
	// Create log processing tasks.
	type Task struct {
		t     int
		build *Build
		res   []*failure
	}
	tasks := []Task{}
	for t, rev := range revs {
		for _, build := range rev.Builds {
			if build.Status != BuildFailed {
				continue
			}
			tasks = append(tasks, Task{t, build, nil})
		}
	}

	// Run failure processing.
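	// Feed task indexes to a fixed pool of workers (4*GOMAXPROCS
	// goroutines, presumably so log I/O can overlap with parsing).
	// Each worker writes its results back into the tasks slice, so
	// the final failure list comes out in revision order regardless
	// of the order in which tasks complete.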
	todo := make(chan int)
	go func() {
		for i := range tasks {
			todo <- i
		}
		close(todo)
	}()
	var wg sync.WaitGroup
	for i := 0; i < 4*runtime.GOMAXPROCS(-1); i++ {
		wg.Add(1)
		go func() {
			for i := range todo {
				task := tasks[i]

				data, err := task.build.ReadLog()
				if err != nil {
					log.Fatal(err)
				}

				failures := process(task.build, data)

				// Fill build-related fields.
				for _, failure := range failures {
					failure.T = task.t
					failure.CommitsAgo = len(revs) - task.t - 1
					failure.Rev = revs[task.t]
					failure.Build = task.build
				}
				tasks[i].res = failures
			}
			wg.Done()
		}()
	}
	wg.Wait()

	// Gather results.
	failures := []*failure{}
	for _, task := range tasks {
		failures = append(failures, task.res...)
	}
	return failures
}

func extractFailures(revs []*Revision) []*failure {
	return processFailureLogs(revs, func(build *Build, data []byte) []*failure {
		// TODO: OS/Arch
		lfailures, err := loganal.Extract(string(data), "", "")
		if err != nil {
			log.Printf("%s: %v\n", build.LogPath(), err)
			return nil
		}
		if len(lfailures) == 0 {
			return nil
		}

		failures := make([]*failure, 0, len(lfailures))
		for _, lf := range lfailures {
			// Ignore build failures.
			//
			// TODO: This has the effect of counting these
			// as successes for all tests. In the best
			// case, this cuts down on the number of
			// samples per revision. If we have an
			// across-the-board build failure, it will
			// drive down the failure rates of all
			// failures and may even affect timeline
			// subdivision.
			if strings.Contains(lf.Message, "build failed") {
				continue
			}

			failures = append(failures, &failure{
				Failure: lf,
			})
		}
		return failures
	})
}

func grepFailures(revs []*Revision, re *regexp.Regexp) []*failure {
	return processFailureLogs(revs, func(build *Build, data []byte) []*failure {
		if !re.Match(data) {
			return nil
		}
		return []*failure{new(failure)}
	})
}

type failure struct {
	*loganal.Failure

	T          int
	CommitsAgo int
	Rev        *Revision
	Build      *Build
}

type failureClass struct {
	// Class gives the common features of this failure class.
	Class loganal.Failure

	// Revs is the sequence of all revisions indexed by time (both
	// success and failure).
	Revs []*Revision

	// Failures is a slice of all failures, in order of increasing
	// time T. Note that there may be more than one failure at the
	// same time T.
	Failures []*failure

	// Test holds the results of the flake test for this failure
	// class.
	Test *FlakeTestResult

	// Latest is the latest flake region (Test.All[0]).
	Latest *FlakeRegion

	// Current is the probability that this failure is still
	// happening.
	Current float64
}
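
// newFailureClass runs the flake test over the distinct failure
// times in failures and records the most recent flake region along
// with the probability that it is still active. failures must be in
// non-decreasing order of T (as processFailureLogs produces them),
// since duplicate times are collapsed by comparing adjacent entries.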
func newFailureClass(revs []*Revision, failures []*failure) *failureClass {
	fc := failureClass{
		Revs:     revs,
		Failures: failures,
	}
	times := []int{}
	for i, f := range failures {
		t := f.T
		if i == 0 || times[len(times)-1] != t {
			times = append(times, t)
		}
	}
	fc.Test = FlakeTest(times)
	fc.Latest = &fc.Test.All[0]
	fc.Current = fc.Latest.StillHappening(len(revs) - 1)
	return &fc
}

// currentSorter sorts failure classes in ascending order of Current,
// breaking ties by the latest region's failure probability and then
// by class description.
type currentSorter []*failureClass

func (s currentSorter) Len() int {
	return len(s)
}

func (s currentSorter) Less(i, j int) bool {
	if s[i].Current != s[j].Current {
		return s[i].Current < s[j].Current
	}
	if s[i].Latest.FailureProbability != s[j].Latest.FailureProbability {
		return s[i].Latest.FailureProbability < s[j].Latest.FailureProbability
	}
	return s[i].Class.String() < s[j].Class.String()
}

func (s currentSorter) Swap(i, j int) {
	s[i], s[j] = s[j], s[i]
}
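
// Example invocations (illustrative; the flag values are made up,
// but the flags are declared above and the -paths help text notes
// the pairing with greplogs -l):
//
//	findflakes -limit 500
//	findflakes -grep 'test timed out'
//	greplogs -l -e FAILED | findflakes -paths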