golang.org/x/build@v0.0.0-20240506185731-218518f32b70/cmd/watchflakes/luci.go (about)

     1  // Copyright 2024 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package main
     6  
     7  import (
     8  	"context"
     9  	"encoding/json"
    10  	"fmt"
    11  	"io"
    12  	"log"
    13  	"net/http"
    14  	"regexp"
    15  	"slices"
    16  	"strings"
    17  	"sync"
    18  	"time"
    19  
    20  	bbpb "go.chromium.org/luci/buildbucket/proto"
    21  	"go.chromium.org/luci/common/api/gitiles"
    22  	gpb "go.chromium.org/luci/common/proto/gitiles"
    23  	"go.chromium.org/luci/grpc/prpc"
    24  	rdbpb "go.chromium.org/luci/resultdb/proto/v1"
    25  	"golang.org/x/sync/errgroup"
    26  	"google.golang.org/protobuf/types/known/fieldmaskpb"
    27  	"google.golang.org/protobuf/types/known/timestamppb"
    28  )
    29  
    30  const resultDBHost = "results.api.cr.dev"
    31  const crBuildBucketHost = "cr-buildbucket.appspot.com"
    32  const gitilesHost = "go.googlesource.com"
    33  
// LUCIClient is a LUCI client.
//
// It groups the HTTP client together with the Gitiles, Buildbucket
// (builds and builders) and ResultDB clients that NewLUCIClient creates
// on top of it. The unexported nProc field is the concurrency limit the
// client applies when it fans work out to goroutines.
type LUCIClient struct {
	HTTPClient     *http.Client
	GitilesClient  gpb.GitilesClient
	BuildsClient   bbpb.BuildsClient
	BuildersClient bbpb.BuildersClient
	ResultDBClient rdbpb.ResultDBClient

	// TraceSteps controls whether to log each step name as it's executed.
	TraceSteps bool

	nProc int
}
    47  
    48  // NewLUCIClient creates a LUCI client.
    49  // nProc controls concurrency. NewLUCIClient panics if nProc is non-positive.
    50  func NewLUCIClient(nProc int) *LUCIClient {
    51  	if nProc < 1 {
    52  		panic(fmt.Errorf("nProc is %d, want 1 or higher", nProc))
    53  	}
    54  	c := new(http.Client)
    55  	gitilesClient, err := gitiles.NewRESTClient(c, gitilesHost, false)
    56  	if err != nil {
    57  		log.Fatal(err)
    58  	}
    59  	buildsClient := bbpb.NewBuildsClient(&prpc.Client{
    60  		C:    c,
    61  		Host: crBuildBucketHost,
    62  	})
    63  	buildersClient := bbpb.NewBuildersClient(&prpc.Client{
    64  		C:    c,
    65  		Host: crBuildBucketHost,
    66  	})
    67  	resultDBClient := rdbpb.NewResultDBClient(&prpc.Client{
    68  		C:    c,
    69  		Host: resultDBHost,
    70  	})
    71  	return &LUCIClient{
    72  		HTTPClient:     c,
    73  		GitilesClient:  gitilesClient,
    74  		BuildsClient:   buildsClient,
    75  		BuildersClient: buildersClient,
    76  		ResultDBClient: resultDBClient,
    77  		nProc:          nProc,
    78  	}
    79  }
    80  
// BuilderConfigProperties is the subset of a builder's configuration
// properties that this package decodes from JSON (ListBuilders unmarshals
// the builder config's properties string into it).
//
// Note that Repo is read from the JSON key "project", not "repo".
type BuilderConfigProperties struct {
	Repo     string `json:"project,omitempty"`
	GoBranch string `json:"go_branch,omitempty"`
	Target   struct {
		GOARCH string `json:"goarch,omitempty"`
		GOOS   string `json:"goos,omitempty"`
	} `json:"target"`
	KnownIssue int `json:"known_issue,omitempty"`
}
    90  
// Builder is a builder name paired with the configuration properties
// decoded for that builder. The properties are embedded, so their fields
// (Repo, GoBranch, ...) are accessible directly on a Builder.
type Builder struct {
	Name string
	*BuilderConfigProperties
}
    95  
// BuildResult describes a single build of one builder at one commit.
//
// ReadBoard creates the value and later fills in Time with the commit
// time; FindFailures fills in Failures for failed builds; and
// fetchLogsForBuild fills in LogText, StepLogText and each Failure's
// LogText from the corresponding URLs.
type BuildResult struct {
	ID        int64
	Status    bbpb.Status
	Commit    string    // commit hash
	Time      time.Time // commit time
	GoCommit  string    // for subrepo build, go commit hash
	BuildTime time.Time // build end time
	Builder   string
	*BuilderConfigProperties
	InvocationID string // ResultDB invocation ID
	LogURL       string // textual log of the whole run
	LogText      string
	StepLogURL   string // textual log of the (last) failed step, if any
	StepLogText  string
	Failures     []*Failure
}
   112  
// Commit identifies a commit by its hash, together with its time
// (ListCommits stores the committer time here).
type Commit struct {
	Hash string
	Time time.Time
}
   117  
// Project is a (repo, Go branch) pair. It identifies a dashboard and is
// comparable, so it can be used as a map key (as ListBoards does).
type Project struct {
	Repo     string
	GoBranch string
}
   122  
// Dashboard holds the build results for one Project.
//
// After ReadBoard has filled it in, Results[i][j] is the build of
// Builders[i] at Commits[j], or nil if no build was recorded for that
// combination.
type Dashboard struct {
	Project
	Builders []Builder
	Commits  []Commit
	Results  [][]*BuildResult // indexed by builder, then by commit
}
   129  
// Failure describes one test result reported for a failed build: the
// test's ID and status, the URL of its "output" artifact (LogURL), and
// the text fetched from that URL (LogText, set by fetchLogsForBuild).
type Failure struct {
	TestID  string
	Status  rdbpb.TestStatus
	LogURL  string
	LogText string
}
   136  
   137  // ListCommits fetches the list of commits from Gerrit.
   138  func (c *LUCIClient) ListCommits(ctx context.Context, repo, goBranch string, since time.Time) []Commit {
   139  	if c.TraceSteps {
   140  		log.Println("ListCommits", repo, goBranch)
   141  	}
   142  	branch := "master"
   143  	if repo == "go" {
   144  		branch = goBranch
   145  	}
   146  	var commits []Commit
   147  	var pageToken string
   148  nextPage:
   149  	resp, err := c.GitilesClient.Log(ctx, &gpb.LogRequest{
   150  		Project:    repo,
   151  		Committish: "refs/heads/" + branch,
   152  		PageSize:   1000,
   153  		PageToken:  pageToken,
   154  	})
   155  	if err != nil {
   156  		log.Fatal(err)
   157  	}
   158  	for _, c := range resp.GetLog() {
   159  		commitTime := c.GetCommitter().GetTime().AsTime()
   160  		if commitTime.Before(since) {
   161  			goto done
   162  		}
   163  		commits = append(commits, Commit{
   164  			Hash: c.GetId(),
   165  			Time: commitTime,
   166  		})
   167  	}
   168  	if resp.GetNextPageToken() != "" {
   169  		pageToken = resp.GetNextPageToken()
   170  		goto nextPage
   171  	}
   172  done:
   173  	return commits
   174  }
   175  
   176  // ListBuilders fetches the list of builders, on the given repo and goBranch.
   177  // If repo and goBranch are empty, it fetches all builders.
   178  func (c *LUCIClient) ListBuilders(ctx context.Context, repo, goBranch string) ([]Builder, error) {
   179  	if c.TraceSteps {
   180  		log.Println("ListBuilders", repo, goBranch)
   181  	}
   182  	all := repo == "" && goBranch == ""
   183  	var builders []Builder
   184  	var pageToken string
   185  nextPage:
   186  	resp, err := c.BuildersClient.ListBuilders(ctx, &bbpb.ListBuildersRequest{
   187  		Project:   "golang",
   188  		Bucket:    "ci",
   189  		PageSize:  1000,
   190  		PageToken: pageToken,
   191  	})
   192  	if err != nil {
   193  		return nil, err
   194  	}
   195  	for _, b := range resp.GetBuilders() {
   196  		var p BuilderConfigProperties
   197  		json.Unmarshal([]byte(b.GetConfig().GetProperties()), &p)
   198  		if all || (p.Repo == repo && p.GoBranch == goBranch) {
   199  			builders = append(builders, Builder{b.GetId().GetBuilder(), &p})
   200  		}
   201  	}
   202  	if resp.GetNextPageToken() != "" {
   203  		pageToken = resp.GetNextPageToken()
   204  		goto nextPage
   205  	}
   206  	slices.SortFunc(builders, func(a, b Builder) int {
   207  		return strings.Compare(a.Name, b.Name)
   208  	})
   209  	return builders, nil
   210  }
   211  
   212  func (c *LUCIClient) ListBoards(ctx context.Context) ([]*Dashboard, error) {
   213  	builders, err := c.ListBuilders(ctx, "", "")
   214  	if err != nil {
   215  		return nil, err
   216  	}
   217  	repoMap := make(map[Project]bool)
   218  	for _, b := range builders {
   219  		repoMap[Project{b.Repo, b.GoBranch}] = true
   220  	}
   221  	boards := make([]*Dashboard, 0, len(repoMap))
   222  	for p := range repoMap {
   223  		d := &Dashboard{Project: p}
   224  		boards = append(boards, d)
   225  	}
   226  	slices.SortFunc(boards, func(d1, d2 *Dashboard) int {
   227  		if d1.Repo != d2.Repo {
   228  			// put main repo first
   229  			if d1.Repo == "go" {
   230  				return -1
   231  			}
   232  			if d2.Repo == "go" {
   233  				return 1
   234  			}
   235  			return strings.Compare(d1.Repo, d2.Repo)
   236  		}
   237  		return strings.Compare(d1.GoBranch, d2.GoBranch)
   238  	})
   239  	return boards, nil
   240  }
   241  
   242  // GetBuilds fetches builds from one builder.
   243  func (c *LUCIClient) GetBuilds(ctx context.Context, builder string, since time.Time) ([]*bbpb.Build, error) {
   244  	if c.TraceSteps {
   245  		log.Println("GetBuilds", builder)
   246  	}
   247  	pred := &bbpb.BuildPredicate{
   248  		Builder:    &bbpb.BuilderID{Project: "golang", Bucket: "ci", Builder: builder},
   249  		CreateTime: &bbpb.TimeRange{StartTime: timestamppb.New(since)},
   250  	}
   251  	mask, err := fieldmaskpb.New((*bbpb.Build)(nil), "id", "builder", "output", "status", "steps", "infra", "end_time")
   252  	if err != nil {
   253  		return nil, err
   254  	}
   255  	var builds []*bbpb.Build
   256  	var pageToken string
   257  nextPage:
   258  	resp, err := c.BuildsClient.SearchBuilds(ctx, &bbpb.SearchBuildsRequest{
   259  		Predicate: pred,
   260  		Mask:      &bbpb.BuildMask{Fields: mask},
   261  		PageSize:  1000,
   262  		PageToken: pageToken,
   263  	})
   264  	if err != nil {
   265  		return nil, err
   266  	}
   267  	builds = append(builds, resp.GetBuilds()...)
   268  	if resp.GetNextPageToken() != "" {
   269  		pageToken = resp.GetNextPageToken()
   270  		goto nextPage
   271  	}
   272  	return builds, nil
   273  }
   274  
// ReadBoard reads the build dashboard dash, then fills in the content.
//
// It sets dash.Commits (commits no older than since), dash.Builders (the
// builders for dash.Repo and dash.GoBranch) and dash.Results. Builds are
// fetched per builder in goroutines, at most c.nProc at a time.
//
// Errors from ListBuilders and GetBuilds are returned. Several
// inconsistencies in the build data (unexpected source repo, successful
// build without a commit, unexpected ResultDB host, builder name
// mismatch) terminate the process via log.Fatalf instead.
func (c *LUCIClient) ReadBoard(ctx context.Context, dash *Dashboard, since time.Time) error {
	if c.TraceSteps {
		log.Println("ReadBoard", dash.Repo, dash.GoBranch)
	}
	dash.Commits = c.ListCommits(ctx, dash.Repo, dash.GoBranch, since)
	var err error
	dash.Builders, err = c.ListBuilders(ctx, dash.Repo, dash.GoBranch)
	if err != nil {
		return err
	}

	dashMap := make([]map[string]*BuildResult, len(dash.Builders)) // indexed by builder, then keyed by commit hash

	// Get builds from builders.
	g, groupContext := errgroup.WithContext(ctx)
	g.SetLimit(c.nProc)
	for i, builder := range dash.Builders {
		builder := builder
		// Each goroutine gets its own map and writes only to that map;
		// the slot in dashMap is assigned here, before the goroutine starts.
		buildMap := make(map[string]*BuildResult)
		dashMap[i] = buildMap
		g.Go(func() error {
			bName := builder.Name
			builds, err := c.GetBuilds(groupContext, bName, since)
			if err != nil {
				return err
			}
			for _, b := range builds {
				id := b.GetId()
				// Work out which commit(s) this build was run at, from the
				// "sources" output property.
				var commit, goCommit string
				prop := b.GetOutput().GetProperties().GetFields()
				for _, s := range prop["sources"].GetListValue().GetValues() {
					x := s.GetStructValue().GetFields()["gitilesCommit"].GetStructValue().GetFields()
					// Note: this c shadows the method receiver for the
					// rest of this loop body.
					c := x["id"].GetStringValue()
					switch repo := x["project"].GetStringValue(); repo {
					case dash.Repo:
						commit = c
					case "go":
						// Only reached when dash.Repo is not "go".
						goCommit = c
					default:
						log.Fatalf("repo mismatch: %s %s %s", repo, dash.Repo, buildURL(id))
					}
				}
				if commit == "" {
					switch b.GetStatus() {
					case bbpb.Status_SUCCESS:
						log.Fatalf("empty commit: %s", buildURL(id))
					default:
						// unfinished build, or infra failure, ignore
						continue
					}
				}
				buildTime := b.GetEndTime().AsTime()
				if r0 := buildMap[commit]; r0 != nil {
					// A build already exists for the same builder and commit.
					// Maybe manually retried, or different go commits on same subrepo commit.
					// Pick the one ended at later time.
					const printDup = false // flip to true to print duplicates while debugging
					if printDup {
						fmt.Printf("skip duplicate build: %s %s %d %d\n", bName, shortHash(commit), id, r0.ID)
					}
					if buildTime.Before(r0.BuildTime) {
						continue
					}
				}
				rdb := b.GetInfra().GetResultdb()
				if rdb.GetHostname() != resultDBHost {
					log.Fatalf("ResultDB host mismatch: %s %s %s", rdb.GetHostname(), resultDBHost, buildURL(id))
				}
				if b.GetBuilder().GetBuilder() != bName { // sanity check
					log.Fatalf("builder mismatch: %s %s %s", b.GetBuilder().GetBuilder(), bName, buildURL(id))
				}
				// Time (the commit time) is left unset here; it is filled
				// in when the results are gathered below.
				r := &BuildResult{
					ID:                      id,
					Status:                  b.GetStatus(),
					Commit:                  commit,
					GoCommit:                goCommit,
					BuildTime:               buildTime,
					Builder:                 bName,
					BuilderConfigProperties: builder.BuilderConfigProperties,
					InvocationID:            rdb.GetInvocation(),
				}
				if r.Status == bbpb.Status_FAILURE {
					// Prefer the failure link whose name mentions
					// "(combined output)" as the log of the whole run.
					links := prop["failure"].GetStructValue().GetFields()["links"].GetListValue().GetValues()
					for _, l := range links {
						m := l.GetStructValue().GetFields()
						if strings.Contains(m["name"].GetStringValue(), "(combined output)") {
							r.LogURL = m["url"].GetStringValue()
							break
						}
					}
					if r.LogURL == "" {
						// No log URL, Probably a build failure.
						// E.g. https://ci.chromium.org/ui/b/8759448820419452721
						// Use the build's stderr instead.
						for _, l := range b.GetOutput().GetLogs() {
							if l.GetName() == "stderr" {
								r.LogURL = l.GetViewUrl()
								break
							}
						}
					}

					// Fetch the stderr of the failed step.
					// Steps are scanned from last to first; the first failed
					// step found that has a "stderr" or "output" log wins.
					steps := b.GetSteps()
				stepLoop:
					for i := len(steps) - 1; i >= 0; i-- {
						s := steps[i]
						if s.GetStatus() == bbpb.Status_FAILURE {
							for _, l := range s.GetLogs() {
								if l.GetName() == "stderr" || l.GetName() == "output" {
									r.StepLogURL = l.GetViewUrl()
									break stepLoop
								}
							}
						}
					}
				}
				buildMap[commit] = r
			}
			return nil
		})
	}
	if err := g.Wait(); err != nil {
		return err
	}

	// Gather into dashboard.
	// Builds whose commit is not in dash.Commits are not included;
	// commits without a build leave a nil entry.
	dash.Results = make([][]*BuildResult, len(dash.Builders))
	for i, m := range dashMap {
		dash.Results[i] = make([]*BuildResult, len(dash.Commits))
		for j, c := range dash.Commits {
			r := m[c.Hash]
			if r == nil {
				continue
			}
			r.Time = c.Time // fill in commit time
			dash.Results[i][j] = r
		}
	}

	return nil
}
   418  
   419  func (c *LUCIClient) ReadBoards(ctx context.Context, boards []*Dashboard, since time.Time) error {
   420  	for _, dash := range boards {
   421  		err := c.ReadBoard(ctx, dash, since)
   422  		if err != nil {
   423  			return err
   424  		}
   425  	}
   426  	return nil
   427  }
   428  
   429  // GetResultAndArtifacts fetches the failed tests and artifacts for the failed run r.
   430  func (c *LUCIClient) GetResultAndArtifacts(ctx context.Context, r *BuildResult) []*Failure {
   431  	if c.TraceSteps {
   432  		log.Println("GetResultAndArtifacts", r.Builder, shortHash(r.Commit), r.ID)
   433  	}
   434  	req := &rdbpb.QueryTestResultsRequest{
   435  		Invocations: []string{r.InvocationID},
   436  		Predicate:   &rdbpb.TestResultPredicate{Expectancy: rdbpb.TestResultPredicate_VARIANTS_WITH_UNEXPECTED_RESULTS},
   437  		PageSize:    1000,
   438  		// TODO: paging? Not sure we want to handle more than 1000 failures in a run...
   439  	}
   440  	resp, err := c.ResultDBClient.QueryTestResults(ctx, req)
   441  	if err != nil {
   442  		log.Fatal(err)
   443  	}
   444  
   445  	var failures []*Failure
   446  	for _, rr := range resp.GetTestResults() {
   447  		testID := rr.GetTestId()
   448  		resp, err := c.ResultDBClient.QueryArtifacts(ctx, &rdbpb.QueryArtifactsRequest{
   449  			Invocations: []string{r.InvocationID},
   450  			Predicate: &rdbpb.ArtifactPredicate{
   451  				TestResultPredicate: &rdbpb.TestResultPredicate{
   452  					TestIdRegexp: regexp.QuoteMeta(testID),
   453  					Expectancy:   rdbpb.TestResultPredicate_VARIANTS_WITH_UNEXPECTED_RESULTS,
   454  				},
   455  			},
   456  			PageSize: 1000,
   457  		})
   458  		if err != nil {
   459  			log.Fatal(err)
   460  		}
   461  		for _, a := range resp.GetArtifacts() {
   462  			if a.GetArtifactId() != "output" {
   463  				continue
   464  			}
   465  			url := a.GetFetchUrl()
   466  			f := &Failure{
   467  				TestID: testID,
   468  				Status: rr.GetStatus(),
   469  				LogURL: url,
   470  			}
   471  			failures = append(failures, f)
   472  		}
   473  	}
   474  	slices.SortFunc(failures, func(f1, f2 *Failure) int {
   475  		return strings.Compare(f1.TestID, f2.TestID)
   476  	})
   477  	return failures
   478  }
   479  
   480  // split TestID to package and test name.
   481  func splitTestID(testid string) (string, string) {
   482  	// TestId is <package path>.<test name>.
   483  	// Both package path and test name could contain "." and "/" (due to subtests).
   484  	// So looking for "." or "/" are not reliable.
   485  	// Tests are always start with ".Test" (or ".Example", ".Benchmark" (do we
   486  	// run benchmarks?)). Looking for them instead.
   487  	// TODO: handle test flavors (e.g. -cpu=1,2,4, -linkmode=internal, etc.)
   488  	for _, sep := range []string{".Test", ".Example", ".Benchmark"} {
   489  		pkg, test, ok := strings.Cut(testid, sep)
   490  		if ok {
   491  			return pkg, sep[1:] + test // add back "Test" prefix (without ".")
   492  		}
   493  	}
   494  	return "", testid
   495  }
   496  
   497  func buildURL(buildID int64) string { // keep in sync with buildUrlRE in github.go
   498  	return fmt.Sprintf("https://ci.chromium.org/b/%d", buildID)
   499  }
   500  
   501  func shortHash(s string) string {
   502  	if len(s) > 8 {
   503  		return s[:8]
   504  	}
   505  	return s
   506  }
   507  
   508  // FindFailures returns the failures listed in the dashboards.
   509  // The result is sorted by commit date, then repo, then builder.
   510  // Pupulate the failure contents (the .Failures fields) for the
   511  // failures.
   512  func (c *LUCIClient) FindFailures(ctx context.Context, boards []*Dashboard) []*BuildResult {
   513  	var res []*BuildResult
   514  	var wg sync.WaitGroup
   515  	sem := make(chan int, c.nProc)
   516  	for _, dash := range boards {
   517  		for i, b := range dash.Builders {
   518  			for _, r := range dash.Results[i] {
   519  				if r == nil {
   520  					continue
   521  				}
   522  				if r.Builder != b.Name { // sanity check
   523  					log.Fatalf("builder mismatch: %s %s", b.Name, r.Builder)
   524  				}
   525  
   526  				if r.Status == bbpb.Status_FAILURE {
   527  					wg.Add(1)
   528  					sem <- 1
   529  					go func(r *BuildResult) {
   530  						defer func() { wg.Done(); <-sem }()
   531  						r.Failures = c.GetResultAndArtifacts(ctx, r)
   532  					}(r)
   533  					res = append(res, r)
   534  				}
   535  			}
   536  		}
   537  	}
   538  	wg.Wait()
   539  
   540  	slices.SortFunc(res, func(a, b *BuildResult) int {
   541  		if !a.Time.Equal(b.Time) {
   542  			return a.Time.Compare(b.Time)
   543  		}
   544  		if a.Repo != b.Repo {
   545  			return strings.Compare(a.Repo, b.Repo)
   546  		}
   547  		if a.Builder != b.Builder {
   548  			return strings.Compare(a.Builder, b.Builder)
   549  		}
   550  		return strings.Compare(a.Commit, b.Commit)
   551  	})
   552  
   553  	return res
   554  }
   555  
   556  // PrintDashboard prints the dashboard.
   557  // For each builder, it prints a list of commits and status.
   558  func PrintDashboard(dash *Dashboard) {
   559  	for i, b := range dash.Builders {
   560  		fmt.Println(b.Name)
   561  		for _, r := range dash.Results[i] {
   562  			if r == nil {
   563  				continue
   564  			}
   565  			fmt.Printf("\t%s %v %v\n", shortHash(r.Commit), r.Time, r.Status)
   566  		}
   567  	}
   568  }
   569  
   570  // FetchLogs fetches logs for build results.
   571  func (c *LUCIClient) FetchLogs(res []*BuildResult) {
   572  	// TODO: caching?
   573  	g := new(errgroup.Group)
   574  	g.SetLimit(c.nProc)
   575  	for _, r := range res {
   576  		r := r
   577  		g.Go(func() error {
   578  			c.fetchLogsForBuild(r)
   579  			return nil
   580  		})
   581  	}
   582  	g.Wait()
   583  }
   584  
   585  func (c *LUCIClient) fetchLogsForBuild(r *BuildResult) {
   586  	if c.TraceSteps {
   587  		log.Println("fetchLogsForBuild", r.Builder, shortHash(r.Commit), r.ID)
   588  	}
   589  	if r.LogURL == "" {
   590  		fmt.Printf("no log url: %s\n", buildURL(r.ID))
   591  	} else {
   592  		r.LogText = fetchURL(r.LogURL + "?format=raw")
   593  	}
   594  	if r.StepLogURL != "" {
   595  		r.StepLogText = fetchURL(r.StepLogURL + "?format=raw")
   596  	}
   597  	for _, f := range r.Failures {
   598  		if f.LogURL == "" {
   599  			fmt.Printf("no log url: %s %s\n", buildURL(r.ID), f.TestID)
   600  		} else {
   601  			f.LogText = fetchURL(f.LogURL)
   602  		}
   603  	}
   604  }
   605  
   606  func fetchURL(url string) string {
   607  	resp, err := http.Get(url)
   608  	if err != nil {
   609  		log.Fatal(err)
   610  	}
   611  	defer resp.Body.Close()
   612  	if resp.StatusCode == http.StatusNotFound {
   613  		return ""
   614  	} else if resp.StatusCode != http.StatusOK {
   615  		body, _ := io.ReadAll(io.LimitReader(resp.Body, 4<<10))
   616  		log.Fatal(fmt.Errorf("GET %s: non-200 OK status code: %v body: %q", url, resp.Status, body))
   617  	}
   618  	body, err := io.ReadAll(resp.Body)
   619  	if err != nil {
   620  		log.Fatal(fmt.Errorf("GET %s: failed to read body: %v body: %q", url, err, body))
   621  	}
   622  	return string(body)
   623  }