golang.org/x/build@v0.0.0-20240506185731-218518f32b70/cmd/watchflakes/main.go (about) 1 // Copyright 2024 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Watchflakes is a program that triages apparent test flakes 6 // on the build.golang.org dashboards. See https://go.dev/wiki/Watchflakes. 7 package main 8 9 import ( 10 "context" 11 "flag" 12 "fmt" 13 "log" 14 "os" 15 "runtime" 16 "strings" 17 "time" 18 19 bbpb "go.chromium.org/luci/buildbucket/proto" 20 rdbpb "go.chromium.org/luci/resultdb/proto/v1" 21 "golang.org/x/build/buildenv" 22 "golang.org/x/build/cmd/watchflakes/internal/script" 23 "golang.org/x/build/internal/secret" 24 "rsc.io/github" 25 ) 26 27 // TODO: 28 // - subrepos by go commit 29 // - handle INFRA_FAILURE and CANCELED 30 31 var _ = fmt.Print 32 33 // Query failures within most recent timeLimit. 34 const timeLimit = 45 * 24 * time.Hour 35 36 const maxFailPerBuild = 3 37 38 var ( 39 md = flag.Bool("md", false, "print Markdown output suitable for GitHub issues") 40 post = flag.Bool("post", false, "post updates to GitHub issues") 41 repeat = flag.Duration("repeat", 0, "keep running with specified `period`; zero means to run once and exit") 42 verbose = flag.Bool("v", false, "print verbose posting decisions") 43 44 useSecretManager = flag.Bool("use-secret-manager", false, "fetch GitHub token from Secret Manager instead of $HOME/.netrc") 45 ) 46 47 func usage() { 48 fmt.Fprintf(os.Stderr, "Usage: watchflakes [options] [script]\n") 49 flag.PrintDefaults() 50 os.Exit(2) 51 } 52 53 func main() { 54 log.SetPrefix("watchflakes: ") 55 flag.Usage = usage 56 buildenv.RegisterStagingFlag() 57 flag.Parse() 58 if flag.NArg() > 1 { 59 usage() 60 } 61 62 var query *Issue 63 if flag.NArg() == 1 { 64 s, err := script.Parse("script", flag.Arg(0), fields) 65 if err != nil { 66 log.Fatalf("parsing query:\n%s", err) 67 } 68 query = &Issue{Issue: new(github.Issue), Script: s, ScriptText: flag.Arg(0)} 69 } 70 71 // Create an authenticated GitHub client. 72 if *useSecretManager { 73 // Fetch credentials from Secret Manager. 74 secretCl, err := secret.NewClientInProject(buildenv.FromFlags().ProjectName) 75 if err != nil { 76 log.Fatalln("failed to create a Secret Manager client:", err) 77 } 78 ghToken, err := secretCl.Retrieve(context.Background(), secret.NameWatchflakesGitHubToken) 79 if err != nil { 80 log.Fatalln("failed to retrieve GitHub token from Secret Manager:", err) 81 } 82 gh = github.NewClient(ghToken) 83 } else { 84 // Use credentials in $HOME/.netrc. 85 var err error 86 gh, err = github.Dial("") 87 if err != nil { 88 log.Fatalln("github.Dial:", err) 89 } 90 } 91 92 // Load LUCI dashboards 93 ctx := context.Background() 94 c := NewLUCIClient(runtime.GOMAXPROCS(0) * 4) 95 c.TraceSteps = true 96 97 var ticker *time.Ticker 98 if *repeat != 0 { 99 ticker = time.NewTicker(*repeat) 100 } 101 Repeat: 102 startTime := time.Now() 103 boards, err := c.ListBoards(ctx) 104 if err != nil { 105 log.Fatalln("ListBoards:", err) 106 } 107 c.ReadBoards(ctx, boards, startTime.Add(-timeLimit)) 108 skipBrokenCommits(boards) 109 skipBrokenBuilders(boards) 110 111 failRes := c.FindFailures(ctx, boards) 112 c.FetchLogs(failRes) 113 114 if *verbose { 115 for _, r := range failRes { 116 fmt.Printf("failure %s %s %s\n", r.Builder, shortHash(r.Commit), buildURL(r.ID)) 117 } 118 } 119 120 // Load GitHub issues 121 var issues []*Issue 122 issues, err = readIssues(issues) 123 if err != nil { 124 log.Fatal(err) 125 } 126 findScripts(issues) 127 if query == nil { 128 postIssueErrors(issues) 129 } 130 if query != nil { 131 issues = []*Issue{query} 132 } 133 134 for _, r := range failRes { 135 newIssue := 0 136 fs := r.Failures 137 fs = coalesceFailures(fs) 138 if len(fs) == 0 { 139 // No test failure, Probably a build failure. 140 // E.g. https://ci.chromium.org/ui/b/8759448820419452721 141 // Make a dummy failure. 142 f := &Failure{ 143 Status: rdbpb.TestStatus_FAIL, 144 LogText: r.StepLogText, 145 } 146 fs = []*Failure{f} 147 } 148 for _, f := range fs { 149 fp := NewFailurePost(r, f) 150 record := fp.Record() 151 action, targets := run(issues, record) 152 if *verbose { 153 printRecord(record, false) 154 fmt.Printf("\t%s %v\n", action, targets) 155 } 156 switch action { 157 case "skip": 158 // do nothing 159 if *verbose { 160 fmt.Printf("%s: skipped by #%d\n", fp.URL, targets[0].Number) 161 } 162 163 case "": 164 if newIssue > 0 { 165 // If we already opened a new issue for a build, don't open another one. 166 // It could be that the build is just broken. 167 // One can look at the issue and split if necessary. 168 break 169 } 170 171 // create a new issue 172 if query == nil { 173 if *verbose { 174 fmt.Printf("%s: new issue\n", fp.URL) 175 } 176 issue, err := reportNew(fp) 177 if err != nil { 178 log.Fatal(err) 179 } 180 issues = append(issues, issue) 181 newIssue++ 182 } 183 184 case "default", "post", "take": 185 for _, issue := range targets { 186 if !issue.Mentions[fp.URL] && issue.Stale { 187 readComments(issue) 188 } 189 if *verbose { 190 mentioned := "un" 191 if issue.Mentions[fp.URL] { 192 mentioned = "already " 193 } 194 fmt.Printf("%s: %s #%d, %smentioned\n", fp.URL, action, issue.Number, mentioned) 195 } 196 if !issue.Mentions[fp.URL] { 197 issue.Post = append(issue.Post, fp) 198 } 199 } 200 } 201 } 202 } 203 204 if query != nil { 205 format := (*FailurePost).Text 206 if *md { 207 format = (*FailurePost).Markdown 208 } 209 for i, fp := range query.Post { 210 if i > 0 { 211 fmt.Printf("\n") 212 } 213 os.Stdout.WriteString(format(fp)) 214 } 215 if *md { 216 os.Stdout.WriteString(signature) 217 } 218 return 219 } 220 221 posts := 0 222 for _, issue := range issues { 223 if len(issue.Post) > 0 { 224 fmt.Printf(" - new for #%d %s\n", issue.Number, issue.Title) 225 for _, fp := range issue.Post { 226 fmt.Printf(" - %s\n %s\n", fp, fp.URL) 227 } 228 msg := updateText(issue) 229 if *verbose { 230 fmt.Printf("\n%s\n", indent(spaces[:3], msg)) 231 } 232 if *post { 233 if err := postComment(issue, msg); err != nil { 234 log.Print(err) 235 continue 236 } 237 if issue.Mentions == nil { 238 issue.Mentions = make(map[string]bool) 239 } 240 for _, fp := range issue.Post { 241 issue.Mentions[fp.URL] = true 242 } 243 } 244 posts++ 245 } 246 } 247 248 log.Printf("Done. %d boards, %d failures, %d issues, %d posts, in %v\n", len(boards), len(failRes), len(issues), posts, time.Since(startTime)) 249 250 if *repeat != 0 { 251 <-ticker.C 252 goto Repeat 253 } 254 } 255 256 const SKIP = bbpb.Status_STATUS_UNSPECIFIED // for smashing the status to skip a non-flake failure 257 258 // skipBrokenCommits identifies broken commits, 259 // which are the ones that failed on at least 1/4 of builders, 260 // and then changes all results for those commits to SKIP. 261 func skipBrokenCommits(boards []*Dashboard) { 262 for _, dash := range boards { 263 builderThreshold := len(dash.Builders) / 4 264 for i := 0; i < len(dash.Commits); i++ { 265 bad := 0 266 good := 0 267 for _, rs := range dash.Results { 268 if rs[i] == nil { 269 continue 270 } 271 switch rs[i].Status { 272 case bbpb.Status_SUCCESS: 273 good++ 274 case bbpb.Status_FAILURE: 275 bad++ 276 // ignore other status 277 } 278 } 279 if bad > builderThreshold || good < builderThreshold { 280 fmt.Printf("skip: commit %s (%s %s) is broken (good=%d bad=%d)\n", shortHash(dash.Commits[i].Hash), dash.Repo, dash.GoBranch, good, bad) 281 for _, rs := range dash.Results { 282 if rs[i] != nil { 283 rs[i].Status = SKIP 284 } 285 } 286 } 287 } 288 } 289 } 290 291 // skipBrokenBuilders identifies builders that were consistently broken 292 // (at least tooManyToBeFlakes failures in a row) and then turned ok. 293 // It changes those consistent failures to SKIP. 294 func skipBrokenBuilders(boards []*Dashboard) { 295 const tooManyToBeFlakes = 4 296 297 for _, dash := range boards { 298 for _, rs := range dash.Results { 299 bad := 100 // squash failures at the top of the dashboard, which may turn out to be consistent 300 badStart := 0 301 skip := func(i int) { // skip the i-th result 302 if rs[i] != nil { 303 fmt.Printf("skip: builder %s was broken at %s (%s %s)\n", rs[i].Builder, shortHash(rs[i].Commit), dash.Repo, dash.GoBranch) 304 rs[i].Status = SKIP 305 } 306 } 307 for i, r := range rs { 308 if rs[i] == nil { 309 continue 310 } 311 switch r.Status { 312 case bbpb.Status_SUCCESS: 313 bad = 0 314 continue 315 case bbpb.Status_FAILURE: 316 bad++ 317 default: // ignore other status 318 continue 319 } 320 switch bad { 321 case 1: 322 badStart = i 323 case tooManyToBeFlakes: 324 // Skip the run so far. 325 for j := badStart; j < i; j++ { 326 skip(j) 327 } 328 } 329 if bad >= tooManyToBeFlakes { 330 skip(i) 331 } 332 } 333 334 // Bad entries ending just before the cutoff are not flakes 335 // even if there are just a few of them. Otherwise we get 336 // spurious flakes when there's one bad entry before the 337 // cutoff and lots after the cutoff. 338 if bad > 0 && badStart > 0 { 339 for j := badStart; j < len(rs); j++ { 340 skip(j) 341 } 342 } 343 } 344 } 345 } 346 347 // run runs the scripts in issues on record. 348 // It returns the desired action (skip, post, default) 349 // as well as the list of target issues (for post or default). 350 func run(issues []*Issue, record script.Record) (action string, targets []*Issue) { 351 var def, post []*Issue 352 353 for _, issue := range issues { 354 if issue.Script != nil { 355 switch issue.Script.Action(record) { 356 case "skip": 357 return "skip", []*Issue{issue} 358 case "take": 359 println("TAKE", issue.Number) 360 case "default": 361 def = append(def, issue) 362 case "post": 363 post = append(post, issue) 364 } 365 } 366 } 367 368 if len(post) > 0 { 369 return "post", post 370 } 371 if len(def) > 0 { 372 return "default", def[:1] 373 } 374 return "", nil 375 } 376 377 // FailurePost is a failure to be posted on an issue. 378 type FailurePost struct { 379 *BuildResult 380 *Failure 381 URL string // LUCI build page 382 Pkg string 383 Test string 384 Snippet string 385 } 386 387 func NewFailurePost(r *BuildResult, f *Failure) *FailurePost { 388 pkg, test := splitTestID(f.TestID) 389 snip := snippet(f.LogText) 390 if snip == "" { 391 snip = snippet(r.LogText) 392 } 393 fp := &FailurePost{ 394 BuildResult: r, 395 Failure: f, 396 URL: buildURL(r.ID), 397 Pkg: pkg, 398 Test: test, 399 Snippet: snip, 400 } 401 return fp 402 } 403 404 // fields is the list of known fields for use by script patterns. 405 // It must be in sync with the Record method below. 406 var fields = []string{ 407 "", 408 "section", // not used, keep for compatibility with old watchflakes 409 "pkg", 410 "test", 411 "mode", 412 "output", 413 "snippet", 414 "date", 415 "builder", 416 "repo", 417 "goos", 418 "goarch", 419 "log", 420 "status", 421 } 422 423 func (fp *FailurePost) Record() script.Record { 424 // Note: update fields above if any new fields are added to this record. 425 m := script.Record{ 426 "pkg": fp.Pkg, 427 "test": fp.Test, 428 "output": fp.Failure.LogText, 429 "snippet": fp.Snippet, 430 "date": fp.Time.Format(time.RFC3339), 431 "builder": fp.Builder, 432 "repo": fp.Repo, 433 "goos": fp.Target.GOOS, 434 "goarch": fp.Target.GOARCH, 435 "log": fp.BuildResult.LogText, 436 "status": fp.Failure.Status.String(), 437 } 438 m[""] = m["output"] // default field for `regexp` search (as opposed to field ~ `regexp`) 439 if fp.IsBuildFailure() { 440 m["mode"] = "build" 441 } 442 return m 443 } 444 445 func printRecord(r script.Record, verbose bool) { 446 fmt.Printf("%s %s %s %s %s %s\n", r["date"], r["builder"], r["goos"], r["goarch"], 447 r["pkg"], r["test"]) 448 if verbose { 449 fmt.Printf("%s\n", indent(spaces[:4], r["snippet"])) 450 } 451 } 452 453 func (fp *FailurePost) IsBuildFailure() bool { 454 // no test ID. dummy for build failure. 455 return fp.Failure.TestID == "" 456 } 457 458 // String returns a header to identify the log and failure. 459 func (fp *FailurePost) String() string { 460 repo := fp.Repo 461 sep := "" 462 if fp.GoCommit != "" { 463 sep = " go@" 464 } 465 if fp.GoBranch != "" && fp.GoBranch != "master" { 466 b := strings.TrimPrefix(fp.GoBranch, " release-branch.") 467 if repo == "go" { 468 repo = b 469 } 470 if sep == " go@" { 471 sep = " " + b + "@" 472 } 473 } 474 s := fmt.Sprintf("%s %s %s@%s%s%s", 475 fp.Time.Format("2006-01-02 15:04"), 476 fp.Builder, repo, shortHash(fp.Commit), 477 sep, shortHash(fp.GoCommit)) 478 479 if fp.Pkg != "" || fp.Test != "" { 480 s += " " + shortPkg(fp.Pkg) 481 if fp.Pkg != "" && fp.Test != "" { 482 s += "." 483 } 484 s += fp.Test 485 } 486 if fp.IsBuildFailure() { 487 s += " [build]" 488 } 489 if fp.Failure.Status != rdbpb.TestStatus_FAIL { 490 s += fmt.Sprintf(" [%s]", fp.Failure.Status) 491 } 492 return s 493 } 494 495 // Markdown returns Markdown suitable for posting to GitHub. 496 func (fp *FailurePost) Markdown() string { 497 return fmt.Sprintf("<details><summary>%s (<a href=\"%s\">log</a>)</summary>\n\n%s</details>\n", 498 fp.String(), fp.URL, indent(spaces[:4], fp.Snippet)) 499 } 500 501 // Text returns text suitable for reading in interactive use or debug logging. 502 func (fp *FailurePost) Text() string { 503 return fmt.Sprintf("%s\n%s\n%s\n", fp, fp.URL, strings.TrimRight(fp.Snippet, "\n")) 504 } 505 506 var spaces = strings.Repeat(" ", 100) 507 508 // indent returns a copy of text in which every line has been indented by prefix. 509 // It also ensures that, except when text is empty, text ends in a \n character. 510 func indent(prefix, text string) string { 511 if text == "" { 512 return "" 513 } 514 text = strings.TrimRight(text, "\n") 515 return prefix + strings.ReplaceAll(text, "\n", "\n"+prefix) + "\n" 516 } 517 518 // shortPkg shortens pkg by removing any leading 519 // golang.org/ (for packages like golang.org/x/sys/windows). 520 func shortPkg(pkg string) string { 521 pkg = strings.TrimPrefix(pkg, "golang.org/") 522 return pkg 523 } 524 525 // shorten the output lines to form a snippet 526 func snippet(log string) string { 527 lines := strings.SplitAfter(log, "\n") 528 529 // Remove beginning and trailing blank lines. 530 for len(lines) > 0 && strings.TrimSpace(lines[0]) == "" { 531 lines = lines[1:] 532 } 533 for len(lines) > 0 && strings.TrimSpace(lines[len(lines)-1]) == "" { 534 lines = lines[:len(lines)-1] 535 } 536 537 // If we have more than 30 lines, make the snippet by taking the first 10, 538 // the last 10, and possibly a middle 10. The middle 10 is included when 539 // the interior lines (between the first and last 10) contain an important-looking 540 // message like "panic:" or "--- FAIL:". The middle 10 start at the important-looking line. 541 // such as 542 if len(lines) > 30 { 543 var keep []string 544 keep = append(keep, lines[:10]...) 545 dots := true 546 for i := 10; i < len(lines)-10; i++ { 547 s := strings.TrimSpace(lines[i]) 548 if strings.HasPrefix(s, "panic:") || strings.HasPrefix(s, "fatal error:") || strings.HasPrefix(s, "--- FAIL:") || strings.Contains(s, ": internal compiler error:") { 549 if i > 10 { 550 keep = append(keep, "...\n") 551 } 552 end := i + 10 553 if end >= len(lines)-10 { 554 dots = false 555 end = len(lines) - 10 556 } 557 keep = append(keep, lines[i:end]...) 558 break 559 } 560 } 561 if dots { 562 keep = append(keep, "...\n") 563 } 564 keep = append(keep, lines[len(lines)-10:]...) 565 lines = keep 566 } 567 568 return strings.Join(lines, "") 569 } 570 571 // If a build that has too many failures, the build is probably broken 572 // (e.g. timeout, crash). Coalesce the failures and report maxFailPerBuild 573 // of them. 574 func coalesceFailures(fs []*Failure) []*Failure { 575 var res []*Failure 576 // A subtest fail may cause the parent test to fail, combine them. 577 var cur *Failure 578 for _, f := range fs { 579 if cur != nil && strings.HasPrefix(f.TestID, cur.TestID+"/") { 580 // f is a subtest of cur. Consume cur, replace with f. 581 res[len(res)-1] = f 582 cur = f 583 continue 584 } 585 cur = f 586 res = append(res, f) 587 } 588 if len(res) <= maxFailPerBuild { 589 return res 590 } 591 592 // If multiple subtests fail under the same parent, pick one that is 593 // more likely to be helpful. Prefer the one containing "FAIL", then 594 // the longer log message. 595 moreLikelyUseful := func(f, last *Failure) bool { 596 return strings.Contains(f.LogText, "--- FAIL") && 597 (!strings.Contains(last.LogText, "--- FAIL") || len(f.LogText) > len(last.LogText)) 598 } 599 cur = nil 600 res = fs[:0] 601 for _, f := range fs { 602 if cur != nil && strings.HasPrefix(f.TestID, cur.TestID+"/") { 603 if moreLikelyUseful(f, res[len(res)-1]) { 604 res[len(res)-1] = f 605 } 606 continue 607 } 608 cur = f 609 res = append(res, f) 610 } 611 if len(res) <= maxFailPerBuild { 612 return res 613 } 614 615 // If there are still too many failures, coalesce by package (pick one with longest log). 616 fs = res 617 res = fs[:0] 618 curpkg := "" 619 for _, f := range fs { 620 pkg, _ := splitTestID(f.TestID) 621 if curpkg != "" && curpkg == pkg { 622 if moreLikelyUseful(f, res[len(res)-1]) { 623 res[len(res)-1] = f 624 } 625 continue 626 } 627 curpkg = pkg 628 res = append(res, f) 629 } 630 if len(res) > maxFailPerBuild { 631 res = res[:maxFailPerBuild] 632 } 633 return res 634 }