golang.org/x/build@v0.0.0-20240506185731-218518f32b70/cmd/watchflakes/main.go (about)

     1  // Copyright 2024 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Watchflakes is a program that triages apparent test flakes
     6  // on the build.golang.org dashboards. See https://go.dev/wiki/Watchflakes.
     7  package main
     8  
     9  import (
    10  	"context"
    11  	"flag"
    12  	"fmt"
    13  	"log"
    14  	"os"
    15  	"runtime"
    16  	"strings"
    17  	"time"
    18  
    19  	bbpb "go.chromium.org/luci/buildbucket/proto"
    20  	rdbpb "go.chromium.org/luci/resultdb/proto/v1"
    21  	"golang.org/x/build/buildenv"
    22  	"golang.org/x/build/cmd/watchflakes/internal/script"
    23  	"golang.org/x/build/internal/secret"
    24  	"rsc.io/github"
    25  )
    26  
    27  // TODO:
    28  // - subrepos by go commit
    29  // - handle INFRA_FAILURE and CANCELED
    30  
    31  var _ = fmt.Print
    32  
    33  // Query failures within most recent timeLimit.
    34  const timeLimit = 45 * 24 * time.Hour
    35  
    36  const maxFailPerBuild = 3
    37  
    38  var (
    39  	md      = flag.Bool("md", false, "print Markdown output suitable for GitHub issues")
    40  	post    = flag.Bool("post", false, "post updates to GitHub issues")
    41  	repeat  = flag.Duration("repeat", 0, "keep running with specified `period`; zero means to run once and exit")
    42  	verbose = flag.Bool("v", false, "print verbose posting decisions")
    43  
    44  	useSecretManager = flag.Bool("use-secret-manager", false, "fetch GitHub token from Secret Manager instead of $HOME/.netrc")
    45  )
    46  
    47  func usage() {
    48  	fmt.Fprintf(os.Stderr, "Usage: watchflakes [options] [script]\n")
    49  	flag.PrintDefaults()
    50  	os.Exit(2)
    51  }
    52  
    53  func main() {
    54  	log.SetPrefix("watchflakes: ")
    55  	flag.Usage = usage
    56  	buildenv.RegisterStagingFlag()
    57  	flag.Parse()
    58  	if flag.NArg() > 1 {
    59  		usage()
    60  	}
    61  
    62  	var query *Issue
    63  	if flag.NArg() == 1 {
    64  		s, err := script.Parse("script", flag.Arg(0), fields)
    65  		if err != nil {
    66  			log.Fatalf("parsing query:\n%s", err)
    67  		}
    68  		query = &Issue{Issue: new(github.Issue), Script: s, ScriptText: flag.Arg(0)}
    69  	}
    70  
    71  	// Create an authenticated GitHub client.
    72  	if *useSecretManager {
    73  		// Fetch credentials from Secret Manager.
    74  		secretCl, err := secret.NewClientInProject(buildenv.FromFlags().ProjectName)
    75  		if err != nil {
    76  			log.Fatalln("failed to create a Secret Manager client:", err)
    77  		}
    78  		ghToken, err := secretCl.Retrieve(context.Background(), secret.NameWatchflakesGitHubToken)
    79  		if err != nil {
    80  			log.Fatalln("failed to retrieve GitHub token from Secret Manager:", err)
    81  		}
    82  		gh = github.NewClient(ghToken)
    83  	} else {
    84  		// Use credentials in $HOME/.netrc.
    85  		var err error
    86  		gh, err = github.Dial("")
    87  		if err != nil {
    88  			log.Fatalln("github.Dial:", err)
    89  		}
    90  	}
    91  
    92  	// Load LUCI dashboards
    93  	ctx := context.Background()
    94  	c := NewLUCIClient(runtime.GOMAXPROCS(0) * 4)
    95  	c.TraceSteps = true
    96  
    97  	var ticker *time.Ticker
    98  	if *repeat != 0 {
    99  		ticker = time.NewTicker(*repeat)
   100  	}
   101  Repeat:
   102  	startTime := time.Now()
   103  	boards, err := c.ListBoards(ctx)
   104  	if err != nil {
   105  		log.Fatalln("ListBoards:", err)
   106  	}
   107  	c.ReadBoards(ctx, boards, startTime.Add(-timeLimit))
   108  	skipBrokenCommits(boards)
   109  	skipBrokenBuilders(boards)
   110  
   111  	failRes := c.FindFailures(ctx, boards)
   112  	c.FetchLogs(failRes)
   113  
   114  	if *verbose {
   115  		for _, r := range failRes {
   116  			fmt.Printf("failure %s %s %s\n", r.Builder, shortHash(r.Commit), buildURL(r.ID))
   117  		}
   118  	}
   119  
   120  	// Load GitHub issues
   121  	var issues []*Issue
   122  	issues, err = readIssues(issues)
   123  	if err != nil {
   124  		log.Fatal(err)
   125  	}
   126  	findScripts(issues)
   127  	if query == nil {
   128  		postIssueErrors(issues)
   129  	}
   130  	if query != nil {
   131  		issues = []*Issue{query}
   132  	}
   133  
   134  	for _, r := range failRes {
   135  		newIssue := 0
   136  		fs := r.Failures
   137  		fs = coalesceFailures(fs)
   138  		if len(fs) == 0 {
   139  			// No test failure, Probably a build failure.
   140  			// E.g. https://ci.chromium.org/ui/b/8759448820419452721
   141  			// Make a dummy failure.
   142  			f := &Failure{
   143  				Status:  rdbpb.TestStatus_FAIL,
   144  				LogText: r.StepLogText,
   145  			}
   146  			fs = []*Failure{f}
   147  		}
   148  		for _, f := range fs {
   149  			fp := NewFailurePost(r, f)
   150  			record := fp.Record()
   151  			action, targets := run(issues, record)
   152  			if *verbose {
   153  				printRecord(record, false)
   154  				fmt.Printf("\t%s %v\n", action, targets)
   155  			}
   156  			switch action {
   157  			case "skip":
   158  				// do nothing
   159  				if *verbose {
   160  					fmt.Printf("%s: skipped by #%d\n", fp.URL, targets[0].Number)
   161  				}
   162  
   163  			case "":
   164  				if newIssue > 0 {
   165  					// If we already opened a new issue for a build, don't open another one.
   166  					// It could be that the build is just broken.
   167  					// One can look at the issue and split if necessary.
   168  					break
   169  				}
   170  
   171  				// create a new issue
   172  				if query == nil {
   173  					if *verbose {
   174  						fmt.Printf("%s: new issue\n", fp.URL)
   175  					}
   176  					issue, err := reportNew(fp)
   177  					if err != nil {
   178  						log.Fatal(err)
   179  					}
   180  					issues = append(issues, issue)
   181  					newIssue++
   182  				}
   183  
   184  			case "default", "post", "take":
   185  				for _, issue := range targets {
   186  					if !issue.Mentions[fp.URL] && issue.Stale {
   187  						readComments(issue)
   188  					}
   189  					if *verbose {
   190  						mentioned := "un"
   191  						if issue.Mentions[fp.URL] {
   192  							mentioned = "already "
   193  						}
   194  						fmt.Printf("%s: %s #%d, %smentioned\n", fp.URL, action, issue.Number, mentioned)
   195  					}
   196  					if !issue.Mentions[fp.URL] {
   197  						issue.Post = append(issue.Post, fp)
   198  					}
   199  				}
   200  			}
   201  		}
   202  	}
   203  
   204  	if query != nil {
   205  		format := (*FailurePost).Text
   206  		if *md {
   207  			format = (*FailurePost).Markdown
   208  		}
   209  		for i, fp := range query.Post {
   210  			if i > 0 {
   211  				fmt.Printf("\n")
   212  			}
   213  			os.Stdout.WriteString(format(fp))
   214  		}
   215  		if *md {
   216  			os.Stdout.WriteString(signature)
   217  		}
   218  		return
   219  	}
   220  
   221  	posts := 0
   222  	for _, issue := range issues {
   223  		if len(issue.Post) > 0 {
   224  			fmt.Printf(" - new for #%d %s\n", issue.Number, issue.Title)
   225  			for _, fp := range issue.Post {
   226  				fmt.Printf("    - %s\n      %s\n", fp, fp.URL)
   227  			}
   228  			msg := updateText(issue)
   229  			if *verbose {
   230  				fmt.Printf("\n%s\n", indent(spaces[:3], msg))
   231  			}
   232  			if *post {
   233  				if err := postComment(issue, msg); err != nil {
   234  					log.Print(err)
   235  					continue
   236  				}
   237  				if issue.Mentions == nil {
   238  					issue.Mentions = make(map[string]bool)
   239  				}
   240  				for _, fp := range issue.Post {
   241  					issue.Mentions[fp.URL] = true
   242  				}
   243  			}
   244  			posts++
   245  		}
   246  	}
   247  
   248  	log.Printf("Done. %d boards, %d failures, %d issues, %d posts, in %v\n", len(boards), len(failRes), len(issues), posts, time.Since(startTime))
   249  
   250  	if *repeat != 0 {
   251  		<-ticker.C
   252  		goto Repeat
   253  	}
   254  }
   255  
   256  const SKIP = bbpb.Status_STATUS_UNSPECIFIED // for smashing the status to skip a non-flake failure
   257  
   258  // skipBrokenCommits identifies broken commits,
   259  // which are the ones that failed on at least 1/4 of builders,
   260  // and then changes all results for those commits to SKIP.
   261  func skipBrokenCommits(boards []*Dashboard) {
   262  	for _, dash := range boards {
   263  		builderThreshold := len(dash.Builders) / 4
   264  		for i := 0; i < len(dash.Commits); i++ {
   265  			bad := 0
   266  			good := 0
   267  			for _, rs := range dash.Results {
   268  				if rs[i] == nil {
   269  					continue
   270  				}
   271  				switch rs[i].Status {
   272  				case bbpb.Status_SUCCESS:
   273  					good++
   274  				case bbpb.Status_FAILURE:
   275  					bad++
   276  					// ignore other status
   277  				}
   278  			}
   279  			if bad > builderThreshold || good < builderThreshold {
   280  				fmt.Printf("skip: commit %s (%s %s) is broken (good=%d bad=%d)\n", shortHash(dash.Commits[i].Hash), dash.Repo, dash.GoBranch, good, bad)
   281  				for _, rs := range dash.Results {
   282  					if rs[i] != nil {
   283  						rs[i].Status = SKIP
   284  					}
   285  				}
   286  			}
   287  		}
   288  	}
   289  }
   290  
   291  // skipBrokenBuilders identifies builders that were consistently broken
   292  // (at least tooManyToBeFlakes failures in a row) and then turned ok.
   293  // It changes those consistent failures to SKIP.
   294  func skipBrokenBuilders(boards []*Dashboard) {
   295  	const tooManyToBeFlakes = 4
   296  
   297  	for _, dash := range boards {
   298  		for _, rs := range dash.Results {
   299  			bad := 100 // squash failures at the top of the dashboard, which may turn out to be consistent
   300  			badStart := 0
   301  			skip := func(i int) { // skip the i-th result
   302  				if rs[i] != nil {
   303  					fmt.Printf("skip: builder %s was broken at %s (%s %s)\n", rs[i].Builder, shortHash(rs[i].Commit), dash.Repo, dash.GoBranch)
   304  					rs[i].Status = SKIP
   305  				}
   306  			}
   307  			for i, r := range rs {
   308  				if rs[i] == nil {
   309  					continue
   310  				}
   311  				switch r.Status {
   312  				case bbpb.Status_SUCCESS:
   313  					bad = 0
   314  					continue
   315  				case bbpb.Status_FAILURE:
   316  					bad++
   317  				default: // ignore other status
   318  					continue
   319  				}
   320  				switch bad {
   321  				case 1:
   322  					badStart = i
   323  				case tooManyToBeFlakes:
   324  					// Skip the run so far.
   325  					for j := badStart; j < i; j++ {
   326  						skip(j)
   327  					}
   328  				}
   329  				if bad >= tooManyToBeFlakes {
   330  					skip(i)
   331  				}
   332  			}
   333  
   334  			// Bad entries ending just before the cutoff are not flakes
   335  			// even if there are just a few of them. Otherwise we get
   336  			// spurious flakes when there's one bad entry before the
   337  			// cutoff and lots after the cutoff.
   338  			if bad > 0 && badStart > 0 {
   339  				for j := badStart; j < len(rs); j++ {
   340  					skip(j)
   341  				}
   342  			}
   343  		}
   344  	}
   345  }
   346  
   347  // run runs the scripts in issues on record.
   348  // It returns the desired action (skip, post, default)
   349  // as well as the list of target issues (for post or default).
   350  func run(issues []*Issue, record script.Record) (action string, targets []*Issue) {
   351  	var def, post []*Issue
   352  
   353  	for _, issue := range issues {
   354  		if issue.Script != nil {
   355  			switch issue.Script.Action(record) {
   356  			case "skip":
   357  				return "skip", []*Issue{issue}
   358  			case "take":
   359  				println("TAKE", issue.Number)
   360  			case "default":
   361  				def = append(def, issue)
   362  			case "post":
   363  				post = append(post, issue)
   364  			}
   365  		}
   366  	}
   367  
   368  	if len(post) > 0 {
   369  		return "post", post
   370  	}
   371  	if len(def) > 0 {
   372  		return "default", def[:1]
   373  	}
   374  	return "", nil
   375  }
   376  
   377  // FailurePost is a failure to be posted on an issue.
   378  type FailurePost struct {
   379  	*BuildResult
   380  	*Failure
   381  	URL     string // LUCI build page
   382  	Pkg     string
   383  	Test    string
   384  	Snippet string
   385  }
   386  
   387  func NewFailurePost(r *BuildResult, f *Failure) *FailurePost {
   388  	pkg, test := splitTestID(f.TestID)
   389  	snip := snippet(f.LogText)
   390  	if snip == "" {
   391  		snip = snippet(r.LogText)
   392  	}
   393  	fp := &FailurePost{
   394  		BuildResult: r,
   395  		Failure:     f,
   396  		URL:         buildURL(r.ID),
   397  		Pkg:         pkg,
   398  		Test:        test,
   399  		Snippet:     snip,
   400  	}
   401  	return fp
   402  }
   403  
   404  // fields is the list of known fields for use by script patterns.
   405  // It must be in sync with the Record method below.
   406  var fields = []string{
   407  	"",
   408  	"section", // not used, keep for compatibility with old watchflakes
   409  	"pkg",
   410  	"test",
   411  	"mode",
   412  	"output",
   413  	"snippet",
   414  	"date",
   415  	"builder",
   416  	"repo",
   417  	"goos",
   418  	"goarch",
   419  	"log",
   420  	"status",
   421  }
   422  
   423  func (fp *FailurePost) Record() script.Record {
   424  	// Note: update fields above if any new fields are added to this record.
   425  	m := script.Record{
   426  		"pkg":     fp.Pkg,
   427  		"test":    fp.Test,
   428  		"output":  fp.Failure.LogText,
   429  		"snippet": fp.Snippet,
   430  		"date":    fp.Time.Format(time.RFC3339),
   431  		"builder": fp.Builder,
   432  		"repo":    fp.Repo,
   433  		"goos":    fp.Target.GOOS,
   434  		"goarch":  fp.Target.GOARCH,
   435  		"log":     fp.BuildResult.LogText,
   436  		"status":  fp.Failure.Status.String(),
   437  	}
   438  	m[""] = m["output"] // default field for `regexp` search (as opposed to field ~ `regexp`)
   439  	if fp.IsBuildFailure() {
   440  		m["mode"] = "build"
   441  	}
   442  	return m
   443  }
   444  
   445  func printRecord(r script.Record, verbose bool) {
   446  	fmt.Printf("%s %s %s %s %s %s\n", r["date"], r["builder"], r["goos"], r["goarch"],
   447  		r["pkg"], r["test"])
   448  	if verbose {
   449  		fmt.Printf("%s\n", indent(spaces[:4], r["snippet"]))
   450  	}
   451  }
   452  
   453  func (fp *FailurePost) IsBuildFailure() bool {
   454  	// no test ID. dummy for build failure.
   455  	return fp.Failure.TestID == ""
   456  }
   457  
   458  // String returns a header to identify the log and failure.
   459  func (fp *FailurePost) String() string {
   460  	repo := fp.Repo
   461  	sep := ""
   462  	if fp.GoCommit != "" {
   463  		sep = " go@"
   464  	}
   465  	if fp.GoBranch != "" && fp.GoBranch != "master" {
   466  		b := strings.TrimPrefix(fp.GoBranch, " release-branch.")
   467  		if repo == "go" {
   468  			repo = b
   469  		}
   470  		if sep == " go@" {
   471  			sep = " " + b + "@"
   472  		}
   473  	}
   474  	s := fmt.Sprintf("%s %s %s@%s%s%s",
   475  		fp.Time.Format("2006-01-02 15:04"),
   476  		fp.Builder, repo, shortHash(fp.Commit),
   477  		sep, shortHash(fp.GoCommit))
   478  
   479  	if fp.Pkg != "" || fp.Test != "" {
   480  		s += " " + shortPkg(fp.Pkg)
   481  		if fp.Pkg != "" && fp.Test != "" {
   482  			s += "."
   483  		}
   484  		s += fp.Test
   485  	}
   486  	if fp.IsBuildFailure() {
   487  		s += " [build]"
   488  	}
   489  	if fp.Failure.Status != rdbpb.TestStatus_FAIL {
   490  		s += fmt.Sprintf(" [%s]", fp.Failure.Status)
   491  	}
   492  	return s
   493  }
   494  
   495  // Markdown returns Markdown suitable for posting to GitHub.
   496  func (fp *FailurePost) Markdown() string {
   497  	return fmt.Sprintf("<details><summary>%s (<a href=\"%s\">log</a>)</summary>\n\n%s</details>\n",
   498  		fp.String(), fp.URL, indent(spaces[:4], fp.Snippet))
   499  }
   500  
   501  // Text returns text suitable for reading in interactive use or debug logging.
   502  func (fp *FailurePost) Text() string {
   503  	return fmt.Sprintf("%s\n%s\n%s\n", fp, fp.URL, strings.TrimRight(fp.Snippet, "\n"))
   504  }
   505  
   506  var spaces = strings.Repeat(" ", 100)
   507  
   508  // indent returns a copy of text in which every line has been indented by prefix.
   509  // It also ensures that, except when text is empty, text ends in a \n character.
   510  func indent(prefix, text string) string {
   511  	if text == "" {
   512  		return ""
   513  	}
   514  	text = strings.TrimRight(text, "\n")
   515  	return prefix + strings.ReplaceAll(text, "\n", "\n"+prefix) + "\n"
   516  }
   517  
   518  // shortPkg shortens pkg by removing any leading
   519  // golang.org/ (for packages like golang.org/x/sys/windows).
   520  func shortPkg(pkg string) string {
   521  	pkg = strings.TrimPrefix(pkg, "golang.org/")
   522  	return pkg
   523  }
   524  
   525  // shorten the output lines to form a snippet
   526  func snippet(log string) string {
   527  	lines := strings.SplitAfter(log, "\n")
   528  
   529  	// Remove beginning and trailing blank lines.
   530  	for len(lines) > 0 && strings.TrimSpace(lines[0]) == "" {
   531  		lines = lines[1:]
   532  	}
   533  	for len(lines) > 0 && strings.TrimSpace(lines[len(lines)-1]) == "" {
   534  		lines = lines[:len(lines)-1]
   535  	}
   536  
   537  	// If we have more than 30 lines, make the snippet by taking the first 10,
   538  	// the last 10, and possibly a middle 10. The middle 10 is included when
   539  	// the interior lines (between the first and last 10) contain an important-looking
   540  	// message like "panic:" or "--- FAIL:". The middle 10 start at the important-looking line.
   541  	// such as
   542  	if len(lines) > 30 {
   543  		var keep []string
   544  		keep = append(keep, lines[:10]...)
   545  		dots := true
   546  		for i := 10; i < len(lines)-10; i++ {
   547  			s := strings.TrimSpace(lines[i])
   548  			if strings.HasPrefix(s, "panic:") || strings.HasPrefix(s, "fatal error:") || strings.HasPrefix(s, "--- FAIL:") || strings.Contains(s, ": internal compiler error:") {
   549  				if i > 10 {
   550  					keep = append(keep, "...\n")
   551  				}
   552  				end := i + 10
   553  				if end >= len(lines)-10 {
   554  					dots = false
   555  					end = len(lines) - 10
   556  				}
   557  				keep = append(keep, lines[i:end]...)
   558  				break
   559  			}
   560  		}
   561  		if dots {
   562  			keep = append(keep, "...\n")
   563  		}
   564  		keep = append(keep, lines[len(lines)-10:]...)
   565  		lines = keep
   566  	}
   567  
   568  	return strings.Join(lines, "")
   569  }
   570  
   571  // If a build that has too many failures, the build is probably broken
   572  // (e.g. timeout, crash). Coalesce the failures and report maxFailPerBuild
   573  // of them.
   574  func coalesceFailures(fs []*Failure) []*Failure {
   575  	var res []*Failure
   576  	// A subtest fail may cause the parent test to fail, combine them.
   577  	var cur *Failure
   578  	for _, f := range fs {
   579  		if cur != nil && strings.HasPrefix(f.TestID, cur.TestID+"/") {
   580  			// f is a subtest of cur. Consume cur, replace with f.
   581  			res[len(res)-1] = f
   582  			cur = f
   583  			continue
   584  		}
   585  		cur = f
   586  		res = append(res, f)
   587  	}
   588  	if len(res) <= maxFailPerBuild {
   589  		return res
   590  	}
   591  
   592  	// If multiple subtests fail under the same parent, pick one that is
   593  	// more likely to be helpful. Prefer the one containing "FAIL", then
   594  	// the longer log message.
   595  	moreLikelyUseful := func(f, last *Failure) bool {
   596  		return strings.Contains(f.LogText, "--- FAIL") &&
   597  			(!strings.Contains(last.LogText, "--- FAIL") || len(f.LogText) > len(last.LogText))
   598  	}
   599  	cur = nil
   600  	res = fs[:0]
   601  	for _, f := range fs {
   602  		if cur != nil && strings.HasPrefix(f.TestID, cur.TestID+"/") {
   603  			if moreLikelyUseful(f, res[len(res)-1]) {
   604  				res[len(res)-1] = f
   605  			}
   606  			continue
   607  		}
   608  		cur = f
   609  		res = append(res, f)
   610  	}
   611  	if len(res) <= maxFailPerBuild {
   612  		return res
   613  	}
   614  
   615  	// If there are still too many failures, coalesce by package (pick one with longest log).
   616  	fs = res
   617  	res = fs[:0]
   618  	curpkg := ""
   619  	for _, f := range fs {
   620  		pkg, _ := splitTestID(f.TestID)
   621  		if curpkg != "" && curpkg == pkg {
   622  			if moreLikelyUseful(f, res[len(res)-1]) {
   623  				res[len(res)-1] = f
   624  			}
   625  			continue
   626  		}
   627  		curpkg = pkg
   628  		res = append(res, f)
   629  	}
   630  	if len(res) > maxFailPerBuild {
   631  		res = res[:maxFailPerBuild]
   632  	}
   633  	return res
   634  }