golang.org/x/build@v0.0.0-20240506185731-218518f32b70/cmd/coordinator/buildstatus.go

golang.org/x/build@v0.0.0-20240506185731-218518f32b70/cmd/coordinator/buildstatus.go (about)

     1  // Copyright 2021 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build linux || darwin
     6  
     7  package main
     8  
     9  import (
    10  	"bytes"
    11  	"context"
    12  	"errors"
    13  	"fmt"
    14  	"html"
    15  	"html/template"
    16  	"io"
    17  	"log"
    18  	"os"
    19  	"path"
    20  	"strings"
    21  	"sync"
    22  	"sync/atomic"
    23  	"time"
    24  
    25  	"cloud.google.com/go/errorreporting"
    26  	"go4.org/syncutil"
    27  	"golang.org/x/build/buildenv"
    28  	"golang.org/x/build/buildlet"
    29  	"golang.org/x/build/dashboard"
    30  	"golang.org/x/build/internal/buildgo"
    31  	"golang.org/x/build/internal/buildstats"
    32  	"golang.org/x/build/internal/coordinator/pool"
    33  	"golang.org/x/build/internal/coordinator/pool/queue"
    34  	"golang.org/x/build/internal/coordinator/schedule"
    35  	"golang.org/x/build/internal/singleflight"
    36  	"golang.org/x/build/internal/sourcecache"
    37  	"golang.org/x/build/internal/spanlog"
    38  	"golang.org/x/build/livelog"
    39  	"golang.org/x/build/maintner/maintnerd/apipb"
    40  	"golang.org/x/build/types"
    41  	"golang.org/x/mod/semver"
    42  	perfstorage "golang.org/x/perf/storage"
    43  )
    44  
    45  // newBuild constructs a new *buildStatus from rev and commit details.
    46  // detail may be only partially populated, but it must have at least RevBranch set.
    47  // If rev.SubRev is set, then detail.SubRevBranch must also be set.
    48  func newBuild(rev buildgo.BuilderRev, detail commitDetail) (*buildStatus, error) {
    49  	// Note: can't acquire statusMu in newBuild, as this is called
    50  	// from findTryWork -> newTrySet, which holds statusMu.
    51  
    52  	conf, ok := dashboard.Builders[rev.Name]
    53  	if !ok {
    54  		return nil, fmt.Errorf("unknown builder type %q", rev.Name)
    55  	}
    56  	if rev.Rev == "" {
    57  		return nil, fmt.Errorf("required field Rev is empty; got %+v", rev)
    58  	}
    59  	if detail.RevBranch == "" {
    60  		return nil, fmt.Errorf("required field RevBranch is empty; got %+v", detail)
    61  	}
    62  	if rev.SubRev != "" && detail.SubRevBranch == "" {
    63  		return nil, fmt.Errorf("field SubRevBranch is empty, required because SubRev is present; got %+v", detail)
    64  	}
    65  
    66  	ctx, cancel := context.WithCancel(context.Background())
    67  	return &buildStatus{
    68  		buildID:      "B" + randHex(9),
    69  		BuilderRev:   rev,
    70  		commitDetail: detail,
    71  		conf:         conf,
    72  		startTime:    time.Now(),
    73  		ctx:          ctx,
    74  		cancel:       cancel,
    75  	}, nil
    76  }
    77  
// buildStatus is the status of a build.
//
// Fields in the "Immutable" group are not guarded by mu. Fields declared
// after mu are guarded by mu.
type buildStatus struct {
	// Immutable:
	buildgo.BuilderRev
	commitDetail
	buildID   string // "B" + randHex(9); assigned in newBuild
	conf      *dashboard.BuildConfig
	startTime time.Time // actually time of newBuild (~same thing)
	trySet    *trySet   // or nil; non-nil means this is a try build (see isTry)

	onceInitHelpers sync.Once              // guards call of onceInitHelpersFunc
	helpers         <-chan buildlet.Client // set once by onceInitHelpersFunc; read via getHelpers
	ctx             context.Context        // used to start the build
	cancel          context.CancelFunc     // cancels ctx; called by setDone and cancelBuild

	hasBuildlet int32 // atomic: non-zero if this build has a buildlet; for status.go.

	mu              sync.Mutex       // guards following
	canceled        bool             // whether this build was forcefully canceled, so errors should be ignored
	schedItem       *queue.SchedItem // for the initial buildlet (ignoring helpers for now)
	logURL          string           // if non-empty, permanent URL of log
	bc              buildlet.Client  // nil initially, until pool returns one
	done            time.Time        // finished running; zero while the build is still running
	succeeded       bool             // set when done
	output          livelog.Buffer   // stdout and stderr
	events          []eventAndTime
	useSnapshotMemo map[string]bool // memoized result of useSnapshotFor(rev), where the key is rev
}
   106  
   107  func (st *buildStatus) NameAndBranch() string {
   108  	result := st.Name
   109  	if st.RevBranch != "master" {
   110  		// For the common and currently-only case of
   111  		// "release-branch.go1.15" say "linux-amd64 (Go 1.15.x)"
   112  		const releasePrefix = "release-branch.go"
   113  		if strings.HasPrefix(st.RevBranch, releasePrefix) {
   114  			result = fmt.Sprintf("%s (Go %s.x)", st.Name, strings.TrimPrefix(st.RevBranch, releasePrefix))
   115  		} else {
   116  			// But if we ever support building other branches,
   117  			// fall back to something verbose until we add a
   118  			// special case:
   119  			result = fmt.Sprintf("%s (go branch %s)", st.Name, st.RevBranch)
   120  		}
   121  	}
   122  	// For an x repo running on a CL in a different repo,
   123  	// add a prefix specifying the name of the x repo.
   124  	if st.SubName != "" && st.trySet != nil && st.SubName != st.trySet.Project {
   125  		result = "(x/" + st.SubName + ") " + result
   126  	}
   127  	return result
   128  }
   129  
// cancelBuild marks a build as no longer wanted, cancels its context,
// and tears down its buildlet.
//
// It is safe to call more than once; calls after the first return
// immediately.
func (st *buildStatus) cancelBuild() {
	st.mu.Lock()
	if st.canceled {
		// Already done. Shouldn't happen currently, but make
		// it safe for duplicate calls in the future.
		st.mu.Unlock()
		return
	}

	st.canceled = true
	st.output.Close()
	// cancel the context, which stops the creation of helper
	// buildlets, etc. The context isn't plumbed everywhere yet,
	// so we also forcefully close its buildlet out from under it
	// to trigger a failure. When we get the failure later, we
	// just ignore it (knowing that the canceled bit was set
	// true).
	st.cancel()
	// Snapshot bc while still holding st.mu so it can be closed
	// after the lock is released.
	bc := st.bc
	st.mu.Unlock()

	if bc != nil {
		// Closing the buildlet may be slow (up to ~10 seconds
		// on a wedged buildlet), so it is done here, after
		// st.mu has been released, to avoid holding st.mu for
		// too long. Note that the call is synchronous: the
		// caller blocks until Close returns.
		bc.Close()
	}
}
   160  
   161  func (st *buildStatus) setDone(succeeded bool) {
   162  	st.mu.Lock()
   163  	defer st.mu.Unlock()
   164  	if st.canceled {
   165  		return
   166  	}
   167  	st.succeeded = succeeded
   168  	st.done = time.Now()
   169  	st.output.Close()
   170  	st.cancel()
   171  }
   172  
   173  func (st *buildStatus) isRunning() bool {
   174  	st.mu.Lock()
   175  	defer st.mu.Unlock()
   176  	return st.isRunningLocked()
   177  }
   178  
   179  func (st *buildStatus) isRunningLocked() bool { return st.done.IsZero() }
   180  
   181  func (st *buildStatus) logf(format string, args ...interface{}) {
   182  	log.Printf("[build %s %s]: %s", st.Name, st.Rev, fmt.Sprintf(format, args...))
   183  }
   184  
   185  // start starts the build in a new goroutine.
   186  // The buildStatus's context is closed when the build is complete,
   187  // successfully or not.
   188  func (st *buildStatus) start() {
   189  	setStatus(st.BuilderRev, st)
   190  	go func() {
   191  		err := st.build()
   192  		if err == errSkipBuildDueToDeps {
   193  			st.setDone(true)
   194  		} else {
   195  			if err != nil {
   196  				fmt.Fprintf(st, "\n\nError: %v\n", err)
   197  				log.Println(st.BuilderRev, "failed:", err)
   198  			}
   199  			st.setDone(err == nil)
   200  			pool.CoordinatorProcess().PutBuildRecord(st.buildRecord())
   201  		}
   202  		markDone(st.BuilderRev)
   203  	}()
   204  }
   205  
   206  func (st *buildStatus) buildletPool() pool.Buildlet {
   207  	return pool.ForHost(st.conf.HostConfig())
   208  }
   209  
   210  func (st *buildStatus) expectedMakeBashDuration() time.Duration {
   211  	// TODO: base this on historical measurements, instead of statically configured.
   212  	// TODO: move this to dashboard/builders.go? But once we based on on historical
   213  	// measurements, it'll need GCE services (bigtable/bigquery?), so it's probably
   214  	// better in this file.
   215  	goos, goarch := st.conf.GOOS(), st.conf.GOARCH()
   216  
   217  	if goos == "linux" {
   218  		if goarch == "arm" {
   219  			return 4 * time.Minute
   220  		}
   221  		return 45 * time.Second
   222  	}
   223  	return 60 * time.Second
   224  }
   225  
   226  func (st *buildStatus) expectedBuildletStartDuration() time.Duration {
   227  	// TODO: move this to dashboard/builders.go? But once we based on on historical
   228  	// measurements, it'll need GCE services (bigtable/bigquery?), so it's probably
   229  	// better in this file.
   230  	p := st.buildletPool()
   231  	switch p.(type) {
   232  	case *pool.GCEBuildlet:
   233  		if strings.HasPrefix(st.Name, "android-") {
   234  			// about a minute for buildlet + minute for Android emulator to be usable
   235  			return 2 * time.Minute
   236  		}
   237  		return time.Minute
   238  	case *pool.EC2Buildlet:
   239  		// lack of historical data. 2 * time.Minute is a safe overestimate
   240  		return 2 * time.Minute
   241  	case *pool.ReverseBuildletPool:
   242  		goos, arch := st.conf.GOOS(), st.conf.GOARCH()
   243  		if goos == "darwin" {
   244  			if arch == "arm" || arch == "arm64" {
   245  				// iOS; idle or it's not.
   246  				return 0
   247  			}
   248  			if arch == "amd64" || arch == "386" {
   249  				return 0 // TODO: remove this once we're using VMware
   250  				// return 1 * time.Minute // VMware boot of hermetic OS X
   251  			}
   252  		}
   253  	}
   254  	return 0
   255  }
   256  
   257  // getHelpersReadySoon waits a bit (as a function of the build
   258  // configuration) and starts getting the buildlets for test sharding
   259  // ready, such that they're ready when make.bash is done. But we don't
   260  // want to start too early, lest we waste idle resources during make.bash.
   261  func (st *buildStatus) getHelpersReadySoon() {
   262  	if st.IsSubrepo() || st.conf.NumTestHelpers(st.isTry()) == 0 || st.conf.IsReverse() {
   263  		return
   264  	}
   265  	time.AfterFunc(st.expectedMakeBashDuration()-st.expectedBuildletStartDuration(),
   266  		func() {
   267  			st.LogEventTime("starting_helpers")
   268  			st.getHelpers() // and ignore the result.
   269  		})
   270  }
   271  
   272  // getHelpers returns a channel of buildlet test helpers, with an item
   273  // sent as they become available. The channel is closed at the end.
   274  func (st *buildStatus) getHelpers() <-chan buildlet.Client {
   275  	st.onceInitHelpers.Do(st.onceInitHelpersFunc)
   276  	return st.helpers
   277  }
   278  
   279  func (st *buildStatus) onceInitHelpersFunc() {
   280  	schedTmpl := &queue.SchedItem{
   281  		BuilderRev: st.BuilderRev,
   282  		HostType:   st.conf.HostType,
   283  		IsTry:      st.isTry(),
   284  		CommitTime: st.commitTime(),
   285  		Branch:     st.RevBranch,
   286  		Repo:       st.RepoOrGo(),
   287  		User:       st.AuthorEmail,
   288  	}
   289  	st.helpers = getBuildlets(st.ctx, st.conf.NumTestHelpers(st.isTry()), schedTmpl, st)
   290  }
   291  
   292  // useSnapshot reports whether this type of build uses a snapshot of
   293  // make.bash if it exists and that the snapshot exists.
   294  func (st *buildStatus) useSnapshot() bool {
   295  	return st.useSnapshotFor(st.Rev)
   296  }
   297  
   298  func (st *buildStatus) useSnapshotFor(rev string) bool {
   299  	if st.conf.SkipSnapshot {
   300  		return false
   301  	}
   302  	st.mu.Lock()
   303  	defer st.mu.Unlock()
   304  	if b, ok := st.useSnapshotMemo[rev]; ok {
   305  		return b
   306  	}
   307  	br := st.BuilderRev
   308  	br.Rev = rev
   309  	b := br.SnapshotExists(context.TODO(), pool.NewGCEConfiguration().BuildEnv())
   310  	if st.useSnapshotMemo == nil {
   311  		st.useSnapshotMemo = make(map[string]bool)
   312  	}
   313  	st.useSnapshotMemo[rev] = b
   314  	return b
   315  }
   316  
   317  func (st *buildStatus) forceSnapshotUsage() {
   318  	st.mu.Lock()
   319  	defer st.mu.Unlock()
   320  	if st.useSnapshotMemo == nil {
   321  		st.useSnapshotMemo = make(map[string]bool)
   322  	}
   323  	st.useSnapshotMemo[st.Rev] = true
   324  }
   325  
   326  func (st *buildStatus) checkDep(ctx context.Context, dep string) (have bool, err error) {
   327  	span := st.CreateSpan("ask_maintner_has_ancestor")
   328  	defer func() { span.Done(err) }()
   329  	fails := 0
   330  	for {
   331  		res, err := maintnerClient.HasAncestor(ctx, &apipb.HasAncestorRequest{
   332  			Commit:   st.Rev,
   333  			Ancestor: dep,
   334  		})
   335  		if err != nil {
   336  			fails++
   337  			if fails == 3 {
   338  				span.Done(err)
   339  				return false, err
   340  			}
   341  			select {
   342  			case <-ctx.Done():
   343  				return false, ctx.Err()
   344  			case <-time.After(1 * time.Second):
   345  			}
   346  			continue
   347  		}
   348  		if res.UnknownCommit {
   349  			select {
   350  			case <-ctx.Done():
   351  				return false, ctx.Err()
   352  			case <-time.After(1 * time.Second):
   353  			}
   354  			continue
   355  		}
   356  		return res.HasAncestor, nil
   357  	}
   358  }
   359  
// errSkipBuildDueToDeps is returned by (*buildStatus).build when the commit
// under test lacks one of the builder's required GoDeps ancestors.
// (*buildStatus).start treats it as a successful, skipped build.
var errSkipBuildDueToDeps = errors.New("build was skipped due to missing deps")
   361  
   362  func (st *buildStatus) getBuildlet() (buildlet.Client, error) {
   363  	schedItem := &queue.SchedItem{
   364  		HostType:   st.conf.HostType,
   365  		IsTry:      st.trySet != nil,
   366  		BuilderRev: st.BuilderRev,
   367  		CommitTime: st.commitTime(),
   368  		Repo:       st.RepoOrGo(),
   369  		Branch:     st.RevBranch,
   370  		User:       st.AuthorEmail,
   371  	}
   372  	st.mu.Lock()
   373  	st.schedItem = schedItem
   374  	st.mu.Unlock()
   375  
   376  	sp := st.CreateSpan("get_buildlet")
   377  	bc, err := sched.GetBuildlet(st.ctx, schedItem)
   378  	sp.Done(err)
   379  	if err != nil {
   380  		err = fmt.Errorf("failed to get a buildlet: %v", err)
   381  		go st.reportErr(err)
   382  		return nil, err
   383  	}
   384  	atomic.StoreInt32(&st.hasBuildlet, 1)
   385  
   386  	st.mu.Lock()
   387  	st.bc = bc
   388  	st.mu.Unlock()
   389  	st.LogEventTime("using_buildlet", bc.IPPort())
   390  
   391  	return bc, nil
   392  }
   393  
// build runs the whole build synchronously: it checks the builder's GoDeps,
// obtains a buildlet, populates it with either a snapshot or the Go source
// plus bootstrap toolchain, runs make and the tests via runAllSharded, and,
// for non-try builds, records the result on the dashboard.
//
// It returns errSkipBuildDueToDeps if the commit lacks a required
// ancestor, a non-nil error for communication or test failures, and nil on
// success.
func (st *buildStatus) build() error {
	// Skip the build if the commit doesn't include every required
	// dependency commit. The whole check is bounded to 30 seconds.
	if deps := st.conf.GoDeps; len(deps) > 0 {
		ctx, cancel := context.WithTimeout(st.ctx, 30*time.Second)
		defer cancel()
		for _, dep := range deps {
			has, err := st.checkDep(ctx, dep)
			if err != nil {
				fmt.Fprintf(st, "Error checking whether commit %s includes ancestor %s: %v\n", st.Rev, dep, err)
				return err
			}
			if !has {
				st.LogEventTime(eventSkipBuildMissingDep)
				fmt.Fprintf(st, "skipping build; commit %s lacks ancestor %s\n", st.Rev, dep)
				return errSkipBuildDueToDeps
			}
		}
		// Release the timeout context's resources now rather than
		// waiting for the deferred call at function exit.
		cancel()
	}

	// Store an initial (not yet finished) build record.
	pool.CoordinatorProcess().PutBuildRecord(st.buildRecord())

	bc, err := st.getBuildlet()
	if err != nil {
		return err
	}
	defer bc.Close()

	if st.useSnapshot() {
		if err := st.writeGoSnapshot(); err != nil {
			return err
		}
	} else {
		// Write the Go source and bootstrap tool chain in parallel.
		var grp syncutil.Group
		grp.Go(st.writeGoSource)
		grp.Go(st.writeBootstrapToolchain)
		if err := grp.Err(); err != nil {
			return err
		}
	}

	execStartTime := time.Now()
	// Banner at the top of the build log.
	fmt.Fprintf(st, "%s at %v", st.Name, st.Rev)
	if st.IsSubrepo() {
		fmt.Fprintf(st, " building %v at %v", st.SubName, st.SubRev)
	}
	fmt.Fprint(st, "\n\n")

	makeTest := st.CreateSpan("make_and_test") // warning: magic event named used by handleLogs

	remoteErr, err := st.runAllSharded()
	makeTest.Done(err)

	// bc (aka st.bc) may be invalid past this point, so let's
	// close it to make sure we don't accidentally use it.
	bc.Close()

	doneMsg := "all tests passed"
	if remoteErr != nil {
		doneMsg = "with test failures"
	} else if err != nil {
		doneMsg = "comm error: " + err.Error()
	}
	// If a build fails multiple times due to communication
	// problems with the buildlet, assume something's wrong with
	// the buildlet or machine and fail the build, rather than
	// looping forever. This promotes the err (communication
	// error) to a remoteErr (an error that occurred remotely and
	// is terminal).
	if rerr := st.repeatedCommunicationError(err); rerr != nil {
		remoteErr = rerr
		err = nil
		doneMsg = "communication error to buildlet (promoted to terminal error): " + rerr.Error()
		fmt.Fprintf(st, "\n%s\n", doneMsg)
	}
	if err != nil {
		// Return the error *before* we create the magic
		// "done" event. (which the try coordinator looks for)
		return err
	}
	st.LogEventTime(eventDone, doneMsg)

	if devPause {
		st.LogEventTime("DEV_MAIN_SLEEP")
		time.Sleep(5 * time.Minute)
	}

	// Only non-try (post-submit) builds are reported to the dashboard.
	if st.trySet == nil {
		buildLog := st.logs()
		if remoteErr != nil {
			// If we just have the line-or-so little
			// banner at top, that means we didn't get any
			// interesting output from the remote side, so
			// include the remoteErr text.  Otherwise,
			// assume that remoteErr is redundant with the
			// buildlog text itself.
			if strings.Count(buildLog, "\n") < 10 {
				buildLog += "\n" + remoteErr.Error()
			}
		}
		if err := recordResult(st.BuilderRev, remoteErr == nil, buildLog, time.Since(execStartTime)); err != nil {
			if remoteErr != nil {
				return fmt.Errorf("Remote error was %q but failed to report it to the dashboard: %v", remoteErr, err)
			}
			return fmt.Errorf("Build succeeded but failed to report it to the dashboard: %v", err)
		}
	}
	// A terminal remote failure makes the build fail even though
	// it ran to completion.
	if remoteErr != nil {
		return remoteErr
	}
	return nil
}
   506  
   507  func (st *buildStatus) HasBuildlet() bool { return atomic.LoadInt32(&st.hasBuildlet) != 0 }
   508  
   509  // useKeepGoingFlag reports whether this build should use -k flag of 'go tool
   510  // dist test', which makes it keep going even when some tests have failed.
   511  func (st *buildStatus) useKeepGoingFlag() bool {
   512  	// For now, keep going for post-submit builders on release branches,
   513  	// because we prioritize seeing more complete test results over failing fast.
   514  	// Later on, we may start doing this all post-submit builders on all branches.
   515  	// See golang.org/issue/14305.
   516  	//
   517  	// TODO(golang.org/issue/36181): A more ideal long term solution is one that reports
   518  	// a failure fast, but still keeps going to make all other test results available.
   519  	return !st.isTry() && strings.HasPrefix(st.branch(), "release-branch.go")
   520  }
   521  
   522  // isTry reports whether the build is a part of a TryBot (pre-submit) run.
   523  // It may be a normal TryBot (part of the default try set) or a SlowBot.
   524  func (st *buildStatus) isTry() bool { return st.trySet != nil }
   525  
   526  // isSlowBot reports whether the build is an explicitly requested SlowBot.
   527  func (st *buildStatus) isSlowBot() bool {
   528  	if st.trySet == nil {
   529  		return false
   530  	}
   531  	for _, conf := range st.trySet.slowBots {
   532  		if st.conf == conf {
   533  			return true
   534  		}
   535  	}
   536  	return false
   537  }
   538  
   539  func (st *buildStatus) buildRecord() *types.BuildRecord {
   540  	rec := &types.BuildRecord{
   541  		ID:        st.buildID,
   542  		ProcessID: processID,
   543  		StartTime: st.startTime,
   544  		IsTry:     st.isTry(),
   545  		IsSlowBot: st.isSlowBot(),
   546  		GoRev:     st.Rev,
   547  		Rev:       st.SubRevOrGoRev(),
   548  		Repo:      st.RepoOrGo(),
   549  		Builder:   st.Name,
   550  		OS:        st.conf.GOOS(),
   551  		Arch:      st.conf.GOARCH(),
   552  	}
   553  
   554  	// Log whether we used COS, so we can do queries to analyze
   555  	// Kubernetes vs COS performance for containers.
   556  	if st.conf.IsContainer() && pool.ForHost(st.conf.HostConfig()) == pool.NewGCEConfiguration().BuildletPool() {
   557  		rec.ContainerHost = "cos"
   558  	}
   559  
   560  	st.mu.Lock()
   561  	defer st.mu.Unlock()
   562  	// TODO: buildlet instance name
   563  	if !st.done.IsZero() {
   564  		rec.EndTime = st.done
   565  		rec.LogURL = st.logURL
   566  		rec.Seconds = rec.EndTime.Sub(rec.StartTime).Seconds()
   567  		if st.succeeded {
   568  			rec.Result = "ok"
   569  		} else {
   570  			rec.Result = "fail"
   571  		}
   572  	}
   573  	return rec
   574  }
   575  
   576  func (st *buildStatus) SpanRecord(sp *schedule.Span, err error) *types.SpanRecord {
   577  	rec := &types.SpanRecord{
   578  		BuildID: st.buildID,
   579  		IsTry:   st.isTry(),
   580  		GoRev:   st.Rev,
   581  		Rev:     st.SubRevOrGoRev(),
   582  		Repo:    st.RepoOrGo(),
   583  		Builder: st.Name,
   584  		OS:      st.conf.GOOS(),
   585  		Arch:    st.conf.GOARCH(),
   586  
   587  		Event:     sp.Event(),
   588  		Detail:    sp.OptText(),
   589  		StartTime: sp.Start(),
   590  		EndTime:   sp.End(),
   591  		Seconds:   sp.End().Sub(sp.Start()).Seconds(),
   592  	}
   593  	if err != nil {
   594  		rec.Error = err.Error()
   595  	}
   596  	return rec
   597  }
   598  
   599  // goBuilder returns a GoBuilder for this buildStatus.
   600  func (st *buildStatus) goBuilder() buildgo.GoBuilder {
   601  	forceMake := true
   602  	if st.RevBranch == "release-branch.go1.20" {
   603  		// The concept of "broken ports" and -force flag didn't
   604  		// exist prior to Go 1.21. See go.dev/issue/56679.
   605  		// TODO: Remove this condition when Go 1.20 is no longer supported.
   606  		forceMake = false
   607  	}
   608  	return buildgo.GoBuilder{
   609  		Logger:     st,
   610  		BuilderRev: st.BuilderRev,
   611  		Conf:       st.conf,
   612  		Goroot:     "go",
   613  		Force:      forceMake,
   614  	}
   615  }
   616  
   617  // runAllSharded runs make.bash and then shards the test execution.
   618  // remoteErr and err are as described at the top of this file.
   619  //
   620  // After runAllSharded returns, the caller must assume that st.bc
   621  // might be invalid (It's possible that only one of the helper
   622  // buildlets survived).
   623  func (st *buildStatus) runAllSharded() (remoteErr, err error) {
   624  	st.getHelpersReadySoon()
   625  
   626  	if !st.useSnapshot() {
   627  		remoteErr, err = st.goBuilder().RunMake(st.ctx, st.bc, st)
   628  		if err != nil {
   629  			return nil, err
   630  		}
   631  		if remoteErr != nil {
   632  			return fmt.Errorf("build failed: %v", remoteErr), nil
   633  		}
   634  	}
   635  	if st.conf.StopAfterMake {
   636  		return nil, nil
   637  	}
   638  
   639  	if err := st.doSnapshot(st.bc); err != nil {
   640  		return nil, err
   641  	}
   642  
   643  	switch {
   644  	case st.conf.RunBench:
   645  		remoteErr, err = st.runBenchmarkTests()
   646  	case st.IsSubrepo():
   647  		remoteErr, err = st.runSubrepoTests()
   648  	case st.conf.IsCrossCompileOnly():
   649  		remoteErr, err = st.buildTestPackages()
   650  	default:
   651  		// Only run platform tests if we're not cross-compiling.
   652  		// dist can't actually build test packages without running them yet.
   653  		// See #58297.
   654  		remoteErr, err = st.runTests(st.getHelpers())
   655  	}
   656  
   657  	if err == errBuildletsGone {
   658  		// Don't wrap this error. TODO: use xerrors.
   659  		return nil, errBuildletsGone
   660  	}
   661  	if err != nil {
   662  		return nil, fmt.Errorf("runTests: %v", err)
   663  	}
   664  	if remoteErr != nil {
   665  		return fmt.Errorf("tests failed: %v", remoteErr), nil
   666  	}
   667  	return nil, nil
   668  }
   669  
   670  // buildTestPackages runs `go tool dist test -compile-only`, which builds all standard
   671  // library test packages but does not run any tests. Used in cross-compilation modes.
   672  func (st *buildStatus) buildTestPackages() (remoteErr, err error) {
   673  	if st.RevBranch == "release-branch.go1.20" {
   674  		// Go 1.20 doesn't support `go tool dist test -compile-only` very well.
   675  		// TODO(mknyszek): Remove this condition when Go 1.20 is no longer supported.
   676  		return nil, nil
   677  	}
   678  	sp := st.CreateSpan("build_test_pkgs")
   679  	remoteErr, err = st.bc.Exec(st.ctx, path.Join("go", "bin", "go"), buildlet.ExecOpts{
   680  		Output: st,
   681  		Debug:  true,
   682  		Args:   []string{"tool", "dist", "test", "-compile-only"},
   683  	})
   684  	if err != nil {
   685  		sp.Done(err)
   686  		return nil, err
   687  	}
   688  	if remoteErr != nil {
   689  		sp.Done(remoteErr)
   690  		return fmt.Errorf("go tool dist test -compile-only failed: %v", remoteErr), nil
   691  	}
   692  	sp.Done(nil)
   693  	return nil, nil
   694  }
   695  
   696  func (st *buildStatus) doSnapshot(bc buildlet.Client) error {
   697  	// If we're using a pre-built snapshot, don't make another.
   698  	if st.useSnapshot() {
   699  		return nil
   700  	}
   701  	if st.conf.SkipSnapshot {
   702  		return nil
   703  	}
   704  	if pool.NewGCEConfiguration().BuildEnv().SnapBucket == "" {
   705  		// Build environment isn't configured to do snapshots.
   706  		return nil
   707  	}
   708  	if err := st.cleanForSnapshot(bc); err != nil {
   709  		return fmt.Errorf("cleanForSnapshot: %v", err)
   710  	}
   711  	if err := st.writeSnapshot(bc); err != nil {
   712  		return fmt.Errorf("writeSnapshot: %v", err)
   713  	}
   714  	return nil
   715  }
   716  
   717  func (st *buildStatus) writeGoSnapshot() (err error) {
   718  	return st.writeGoSnapshotTo(st.Rev, "go")
   719  }
   720  
   721  func (st *buildStatus) writeGoSnapshotTo(rev, dir string) (err error) {
   722  	sp := st.CreateSpan("write_snapshot_tar")
   723  	defer func() { sp.Done(err) }()
   724  
   725  	snapshotURL := pool.NewGCEConfiguration().BuildEnv().SnapshotURL(st.Name, rev)
   726  
   727  	if err := st.bc.PutTarFromURL(st.ctx, snapshotURL, dir); err != nil {
   728  		return fmt.Errorf("failed to put baseline snapshot to buildlet: %v", err)
   729  	}
   730  	return nil
   731  }
   732  
   733  func (st *buildStatus) writeGoSource() error {
   734  	return st.writeGoSourceTo(st.bc, st.Rev, "go")
   735  }
   736  
   737  func (st *buildStatus) writeGoSourceTo(bc buildlet.Client, rev, dir string) error {
   738  	// Write the VERSION file.
   739  	sp := st.CreateSpan("write_version_tar")
   740  	if err := bc.PutTar(st.ctx, buildgo.VersionTgz(rev), dir); err != nil {
   741  		return sp.Done(fmt.Errorf("writing VERSION tgz: %v", err))
   742  	}
   743  
   744  	srcTar, err := sourcecache.GetSourceTgz(st, "go", rev)
   745  	if err != nil {
   746  		return err
   747  	}
   748  	sp = st.CreateSpan("write_go_src_tar")
   749  	if err := bc.PutTar(st.ctx, srcTar, dir); err != nil {
   750  		return sp.Done(fmt.Errorf("writing tarball from Gerrit: %v", err))
   751  	}
   752  	return sp.Done(nil)
   753  }
   754  
   755  func (st *buildStatus) writeBootstrapToolchain() error {
   756  	u := st.conf.GoBootstrapURL(pool.NewGCEConfiguration().BuildEnv())
   757  	if u == "" {
   758  		return nil
   759  	}
   760  	const bootstrapDir = "go1.4" // might be newer; name is the default
   761  	sp := st.CreateSpan("write_go_bootstrap_tar")
   762  	return sp.Done(st.bc.PutTarFromURL(st.ctx, u, bootstrapDir))
   763  }
   764  
   765  func (st *buildStatus) cleanForSnapshot(bc buildlet.Client) error {
   766  	sp := st.CreateSpan("clean_for_snapshot")
   767  	return sp.Done(bc.RemoveAll(st.ctx,
   768  		"go/doc/gopher",
   769  		"go/pkg/bootstrap",
   770  	))
   771  }
   772  
   773  func (st *buildStatus) writeSnapshot(bc buildlet.Client) (err error) {
   774  	sp := st.CreateSpan("write_snapshot_to_gcs")
   775  	defer func() { sp.Done(err) }()
   776  	// A typical Go snapshot tarball in April 2022 is around 150 MB in size.
   777  	// Builders with a fast uplink speed can upload the tar within seconds or minutes.
   778  	// Reverse builders might be far away on the network, so be more lenient for them.
   779  	// (Fast builds require a sufficiently fast uplink speed or turning off snapshots,
   780  	// so the timeout here is mostly an upper bound to prevent infinite hangs.)
   781  	timeout := 5 * time.Minute
   782  	if st.conf.IsReverse() {
   783  		timeout *= 3
   784  	}
   785  	ctx, cancel := context.WithTimeout(st.ctx, timeout)
   786  	defer cancel()
   787  
   788  	tsp := st.CreateSpan("fetch_snapshot_reader_from_buildlet")
   789  	tgz, err := bc.GetTar(ctx, "go")
   790  	tsp.Done(err)
   791  	if err != nil {
   792  		return err
   793  	}
   794  	defer tgz.Close()
   795  
   796  	sc := pool.NewGCEConfiguration().StorageClient()
   797  	if sc == nil {
   798  		return errors.New("GCE configuration missing storage client")
   799  	}
   800  	bucket := pool.NewGCEConfiguration().BuildEnv().SnapBucket
   801  	if bucket == "" {
   802  		return errors.New("build environment missing snapshot bucket")
   803  	}
   804  	wr := sc.Bucket(bucket).Object(st.SnapshotObjectName()).NewWriter(ctx)
   805  	wr.ContentType = "application/octet-stream"
   806  	if n, err := io.Copy(wr, tgz); err != nil {
   807  		st.logf("failed to write snapshot to GCS after copying %d bytes: %v", n, err)
   808  		return err
   809  	}
   810  
   811  	return wr.Close()
   812  }
   813  
   814  // toolchainBaselineCommit determines the toolchain baseline commit for this
   815  // benchmark run.
   816  func (st *buildStatus) toolchainBaselineCommit() (baseline string, err error) {
   817  	sp := st.CreateSpan("list_go_releases")
   818  	defer func() { sp.Done(err) }()
   819  
   820  	// TODO(prattmic): Cache responses for a while. These won't change often.
   821  	res, err := maintnerClient.ListGoReleases(st.ctx, &apipb.ListGoReleasesRequest{})
   822  	if err != nil {
   823  		return "", err
   824  	}
   825  
   826  	releases := res.GetReleases()
   827  	if len(releases) == 0 {
   828  		return "", fmt.Errorf("no Go releases: %v", res)
   829  	}
   830  
   831  	if st.RevBranch == "master" {
   832  		// Testing master, baseline is latest release.
   833  		return releases[0].GetTagCommit(), nil
   834  	}
   835  
   836  	// Testing release branch. Baseline is latest patch version of this
   837  	// release.
   838  	for _, r := range releases {
   839  		if st.RevBranch == r.GetBranchName() {
   840  			return r.GetTagCommit(), nil
   841  		}
   842  	}
   843  
   844  	return "", fmt.Errorf("cannot find latest release for %s", st.RevBranch)
   845  }
   846  
// subrepoBaselines maps a subrepo name (the value compared against
// buildStatus.SubName by subrepoBaselineCommit) to the commit hash used as
// the baseline when benchmarking that subrepo.
//
// Temporarily hard-code the subrepo baseline commits to use.
//
// TODO(rfindley): in the future, we should use the latestRelease method to
// automatically choose the latest patch release of the previous minor version
// (e.g. v0.11.x while we're working on v0.12.y).
var subrepoBaselines = map[string]string{
	"tools": "6ce74ceaddcc4ff081d22ae134f4264a667d394f", // gopls@v0.11.0, with additional instrumentation for memory and CPU usage
}
   855  
   856  // subrepoBaselineCommit determines the baseline commit for this subrepo benchmark run.
   857  func (st *buildStatus) subrepoBaselineCommit() (baseline string, err error) {
   858  	commit, ok := subrepoBaselines[st.SubName]
   859  	if !ok {
   860  		return "", fmt.Errorf("unknown subrepo for benchmarking %q", st.SubName)
   861  	}
   862  	return commit, nil
   863  }
   864  
   865  // latestRelease returns the latest release version for a module in subrepo. If
   866  // submodule is non-empty, it is the path to a subdirectory containing the
   867  // submodule of interest (for example submodule is "gopls" if we are
   868  // considering the module golang.org/x/tools/gopls). Otherwise the module is
   869  // assumed to be at repo root.
   870  //
   871  // It is currently unused, but preserved for future use by the
   872  // subrepoBaselineCommit method.
   873  func (st *buildStatus) latestRelease(submodule string) (string, error) {
   874  	// Baseline is the latest gopls release tag (but not prerelease).
   875  	gerritClient := pool.NewGCEConfiguration().GerritClient()
   876  	tags, err := gerritClient.GetProjectTags(st.ctx, st.SubName)
   877  	if err != nil {
   878  		return "", fmt.Errorf("error fetching tags for %q: %w", st.SubName, err)
   879  	}
   880  
   881  	var versions []string
   882  	revisions := make(map[string]string)
   883  	prefix := "refs/tags"
   884  	if submodule != "" {
   885  		prefix += "/" + submodule // e.g. gopls tags are "gopls/vX.Y.Z"
   886  	}
   887  	for ref, ti := range tags {
   888  		if !strings.HasPrefix(ref, prefix) {
   889  			continue
   890  		}
   891  		version := ref[len(prefix):]
   892  		versions = append(versions, version)
   893  		revisions[version] = ti.Revision
   894  	}
   895  
   896  	semver.Sort(versions)
   897  
   898  	// Return latest non-prerelease version.
   899  	for i := len(versions) - 1; i >= 0; i-- {
   900  		ver := versions[i]
   901  		if !semver.IsValid(ver) {
   902  			continue
   903  		}
   904  		if semver.Prerelease(ver) != "" {
   905  			continue
   906  		}
   907  		return revisions[ver], nil
   908  	}
   909  
   910  	return "", fmt.Errorf("no valid versions found in %+v", versions)
   911  }
   912  
   913  // reportErr reports an error to Stackdriver.
   914  func (st *buildStatus) reportErr(err error) {
   915  	gceErrsClient := pool.NewGCEConfiguration().ErrorsClient()
   916  	if gceErrsClient == nil {
   917  		// errorsClient is nil in dev environments.
   918  		return
   919  	}
   920  
   921  	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
   922  	defer cancel()
   923  
   924  	err = fmt.Errorf("buildID: %v, name: %s, hostType: %s, error: %v", st.buildID, st.conf.Name, st.conf.HostType, err)
   925  	gceErrsClient.ReportSync(ctx, errorreporting.Entry{Error: err})
   926  }
   927  
   928  // distTestList uses 'go tool dist test -list' to get a list of dist test names.
   929  //
   930  // As of Go 1.21, the dist test naming pattern has changed to always be in the
   931  // form of "<pkg>[:<variant>]", where "<pkg>" means what used to be previously
   932  // named "go_test:<pkg>". distTestList maps those new dist test names back to
   933  // that previous format, a combination of "go_test[_bench]:<pkg>" and others.
   934  func (st *buildStatus) distTestList() (names []distTestName, remoteErr, err error) {
   935  	workDir, err := st.bc.WorkDir(st.ctx)
   936  	if err != nil {
   937  		err = fmt.Errorf("distTestList, WorkDir: %v", err)
   938  		return
   939  	}
   940  	goroot := st.conf.FilePathJoin(workDir, "go")
   941  
   942  	args := []string{"tool", "dist", "test", "--no-rebuild", "--list"}
   943  	if st.conf.IsRace() {
   944  		args = append(args, "--race")
   945  	}
   946  	if st.conf.CompileOnly {
   947  		args = append(args, "--compile-only")
   948  	}
   949  	var buf bytes.Buffer
   950  	remoteErr, err = st.bc.Exec(st.ctx, "./go/bin/go", buildlet.ExecOpts{
   951  		Output:      &buf,
   952  		ExtraEnv:    append(st.conf.Env(), "GOROOT="+goroot),
   953  		OnStartExec: func() { st.LogEventTime("discovering_tests") },
   954  		Path:        []string{st.conf.FilePathJoin("$WORKDIR", "go", "bin"), "$PATH"},
   955  		Args:        args,
   956  	})
   957  	if remoteErr != nil {
   958  		remoteErr = fmt.Errorf("Remote error: %v, %s", remoteErr, buf.Bytes())
   959  		err = nil
   960  		return
   961  	}
   962  	if err != nil {
   963  		err = fmt.Errorf("Exec error: %v, %s", err, buf.Bytes())
   964  		return
   965  	}
   966  	// To avoid needing to update all the existing dist test adjust policies,
   967  	// it's easier to remap new dist test names in "<pkg>[:<variant>]" format
   968  	// to ones used in Go 1.20 and prior. Do that for now.
   969  	for _, test := range go120DistTestNames(strings.Fields(buf.String())) {
   970  		isNormalTry := st.isTry() && !st.isSlowBot()
   971  		if !st.conf.ShouldRunDistTest(test.Old, isNormalTry) {
   972  			continue
   973  		}
   974  		names = append(names, test)
   975  	}
   976  	return names, nil, nil
   977  }
   978  
   979  // go120DistTestNames converts a list of dist test names from
   980  // an arbitrary Go distribution to the format used in Go 1.20
   981  // and prior versions. (Go 1.21 introduces a simpler format.)
   982  //
   983  // This exists only to avoid rewriting current dist adjust policies.
   984  // We wish to avoid new dist adjust policies, but if they're truly needed,
   985  // they can choose to start using new dist test names instead.
   986  func go120DistTestNames(names []string) []distTestName {
   987  	if len(names) == 0 {
   988  		// Only happens if there's a problem, but no need to panic.
   989  		return nil
   990  	} else if strings.HasPrefix(names[0], "go_test:") {
   991  		// In Go 1.21 and newer no dist tests have a "go_test:" prefix.
   992  		// In Go 1.20 and older, go tool dist test -list always returns
   993  		// at least one "go_test:*" test first.
   994  		// So if we see it, the list is already in Go 1.20 format.
   995  		var s []distTestName
   996  		for _, old := range names {
   997  			s = append(s, distTestName{old, old})
   998  		}
   999  		return s
  1000  	}
  1001  	// Remap the new Go 1.21+ dist test names to old ones.
  1002  	var s []distTestName
  1003  	for _, new := range names {
  1004  		var old string
  1005  		switch pkg, variant, _ := strings.Cut(new, ":"); {
  1006  		// Special cases. Enough to cover what's used by old dist
  1007  		// adjust policies. Not much use in going far beyond that.
  1008  		case variant == "nolibgcc":
  1009  			old = "nolibgcc:" + pkg
  1010  		case variant == "race":
  1011  			old = "race"
  1012  		case variant == "moved_goroot":
  1013  			old = "moved_goroot"
  1014  		case pkg == "cmd/internal/testdir":
  1015  			if variant == "" {
  1016  				// Handle this too for when we stop doing special-case sharding only for testdir inside dist.
  1017  				variant = "0_1"
  1018  			}
  1019  			old = "test:" + variant
  1020  		case pkg == "cmd/api" && variant == "check":
  1021  			old = "api"
  1022  		case pkg == "cmd/internal/bootstrap_test":
  1023  			old = "reboot"
  1024  
  1025  		// Easy regular cases.
  1026  		case variant == "":
  1027  			old = "go_test:" + pkg
  1028  		case variant == "racebench":
  1029  			old = "go_test_bench:" + pkg
  1030  
  1031  		// Neither a known special case nor a regular case.
  1032  		default:
  1033  			old = new // Less bad than leaving it empty.
  1034  		}
  1035  		s = append(s, distTestName{Old: old, Raw: new})
  1036  	}
  1037  	return s
  1038  }
  1039  
// token is an empty value used as the element type of channels that carry
// only a signal and no data (see the take and done channels that newTestSet
// creates for each testItem).
type token struct{}
  1041  
  1042  // newTestSet returns a new testSet given the dist test names (from "go tool dist test -list")
  1043  // and benchmark items.
  1044  func (st *buildStatus) newTestSet(testStats *buildstats.TestStats, names []distTestName) (*testSet, error) {
  1045  	set := &testSet{
  1046  		st:        st,
  1047  		testStats: testStats,
  1048  	}
  1049  	for _, name := range names {
  1050  		set.items = append(set.items, &testItem{
  1051  			set:      set,
  1052  			name:     name,
  1053  			duration: testStats.Duration(st.BuilderRev.Name, name.Old),
  1054  			take:     make(chan token, 1),
  1055  			done:     make(chan token),
  1056  		})
  1057  	}
  1058  	return set, nil
  1059  }
  1060  
// Package-level state backing getTestStats.
var (
	// testStats holds the most recently loaded test statistics.
	// getTestStats stores a value here after each successful query.
	testStats       atomic.Value // of *buildstats.TestStats
	// testStatsLoader is the singleflight group through which getTestStats
	// issues its reload queries (always under the same key).
	testStatsLoader singleflight.Group
)
  1065  
  1066  func getTestStats(sl spanlog.Logger) *buildstats.TestStats {
  1067  	sp := sl.CreateSpan("get_test_stats")
  1068  	ts, ok := testStats.Load().(*buildstats.TestStats)
  1069  	if ok && ts.AsOf.After(time.Now().Add(-1*time.Hour)) {
  1070  		sp.Done(nil)
  1071  		return ts
  1072  	}
  1073  	v, err, _ := testStatsLoader.Do("", func() (interface{}, error) {
  1074  		log.Printf("getTestStats: reloading from BigQuery...")
  1075  		sp := sl.CreateSpan("query_test_stats")
  1076  		ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
  1077  		defer cancel()
  1078  		ts, err := buildstats.QueryTestStats(ctx, pool.NewGCEConfiguration().BuildEnv())
  1079  		sp.Done(err)
  1080  		if err != nil {
  1081  			log.Printf("getTestStats: error: %v", err)
  1082  			return nil, err
  1083  		}
  1084  		testStats.Store(ts)
  1085  		return ts, nil
  1086  	})
  1087  	if err != nil {
  1088  		sp.Done(err)
  1089  		return nil
  1090  	}
  1091  	sp.Done(nil)
  1092  	return v.(*buildstats.TestStats)
  1093  }
  1094  
// runSubrepoTests uploads the subrepo st.SubName at st.SubRev to the buildlet
// and runs 'go test ./...' at the repository root and in every nested module
// found in it.
//
// remoteErr carries failures that are terminal (the source being too big, or
// one or more failing 'go test' invocations, combined into a multiError).
// err carries errors such as failing to reach the buildlet, which the caller
// may choose to retry.
func (st *buildStatus) runSubrepoTests() (remoteErr, err error) {
	st.LogEventTime("fetching_subrepo", st.SubName)

	workDir, err := st.bc.WorkDir(st.ctx)
	if err != nil {
		err = fmt.Errorf("error discovering workdir for helper %s: %v", st.bc.IPPort(), err)
		return nil, err
	}
	goroot := st.conf.FilePathJoin(workDir, "go")
	gopath := st.conf.FilePathJoin(workDir, "gopath")

	// A goTestRun represents a single invocation of the 'go test' command.
	type goTestRun struct {
		Dir      string   // Directory where 'go test' should be executed.
		Patterns []string // Import path patterns to provide to 'go test'.
	}
	// Test all packages selected by the "./..." pattern at the repository root.
	// (If there are modules in subdirectories, they'll be found and handled below.)
	repoPath := importPathOfRepo(st.SubName)
	testRuns := []goTestRun{{
		Dir:      "gopath/src/" + repoPath,
		Patterns: []string{"./..."},
	}}

	// Check out the provided sub-repo to the buildlet's workspace so we
	// can find go.mod files and run tests in it.
	{
		tgz, err := sourcecache.GetSourceTgz(st, st.SubName, st.SubRev)
		if errors.As(err, new(sourcecache.TooBigError)) {
			// Source being too big is a non-retryable error.
			// It is therefore returned in the remoteErr position.
			return err, nil
		} else if err != nil {
			return nil, err
		}
		err = st.bc.PutTar(st.ctx, tgz, "gopath/src/"+repoPath)
		if err != nil {
			return nil, err
		}
	}

	// Look for inner modules, in order to test them too. See golang.org/issue/32528.
	sp := st.CreateSpan("listing_subrepo_modules", st.SubName)
	err = st.bc.ListDir(st.ctx, "gopath/src/"+repoPath, buildlet.ListDirOpts{Recursive: true}, func(e buildlet.DirEntry) {
		goModFile := path.Base(e.Name()) == "go.mod" && !e.IsDir()
		if !goModFile {
			return
		}
		// Found a go.mod file in a subdirectory, which indicates the root of a module.
		modulePath := path.Join(repoPath, path.Dir(e.Name()))
		if modulePath == repoPath {
			// This is the go.mod file at the repository root.
			// It's already a part of testRuns, so skip it.
			return
		} else if ignoredByGoTool(modulePath) || isVendored(modulePath) {
			// go.mod file is in a directory we're not looking to support, so skip it.
			return
		}
		// Add an additional test run entry that will test all packages in this module.
		testRuns = append(testRuns, goTestRun{
			Dir:      "gopath/src/" + modulePath,
			Patterns: []string{"./..."},
		})
	})
	sp.Done(err)
	if err != nil {
		return nil, err
	}

	// Finally, execute all of the test runs.
	// If any fail, keep going so that all test results are included in the output.

	sp = st.CreateSpan("running_subrepo_tests", st.SubName)
	// err here is the named result, so the span sees whatever value the
	// return statements below assign to it.
	defer func() { sp.Done(err) }()

	env := append(st.conf.Env(),
		"GOROOT="+goroot,
		"GOPATH="+gopath,
	)
	env = append(env, st.modulesEnv()...)

	args := []string{"test"}
	if st.conf.CompileOnly {
		// Build all packages, but avoid running the binary by executing /bin/true for the tests.
		// We assume for a compile-only build we're just running on a Linux system.
		args = append(args, "-exec", "/bin/true")
	} else {
		if !st.conf.IsLongTest() {
			args = append(args, "-short")
		}
		if st.conf.IsRace() {
			args = append(args, "-race")
		}
		if scale := st.conf.GoTestTimeoutScale(); scale != 1 {
			const goTestDefaultTimeout = 10 * time.Minute // Default value taken from Go 1.20.
			args = append(args, "-timeout="+(goTestDefaultTimeout*time.Duration(scale)).String())
		}
	}

	var remoteErrors []error
	for _, tr := range testRuns {
		// NOTE(review): append(args, ...) may reuse args' backing array on
		// every iteration; that is only safe if Exec does not retain Args
		// after it returns — confirm against buildlet.Client.Exec.
		//
		// rErr and err below are scoped to the loop body and shadow the
		// named results; "return nil, err" copies err into the named result.
		rErr, err := st.bc.Exec(st.ctx, "./go/bin/go", buildlet.ExecOpts{
			Debug:    true, // make buildlet print extra debug in output for failures
			Output:   st,
			Dir:      tr.Dir,
			ExtraEnv: env,
			Path:     []string{st.conf.FilePathJoin("$WORKDIR", "go", "bin"), "$PATH"},
			Args:     append(args, tr.Patterns...),
		})
		if err != nil {
			// A network/communication error. Give up here;
			// the caller can retry as it sees fit.
			return nil, err
		} else if rErr != nil {
			// An error occurred remotely and is terminal, but we want to
			// keep testing other packages and report their failures too,
			// rather than stopping short.
			remoteErrors = append(remoteErrors, rErr)
		}
	}
	if len(remoteErrors) > 0 {
		return multiError(remoteErrors), nil
	}
	return nil, nil
}
  1219  
  1220  // ignoredByGoTool reports whether the given import path corresponds
  1221  // to a directory that would be ignored by the go tool.
  1222  //
  1223  // The logic of the go tool for ignoring directories is documented at
  1224  // https://golang.org/cmd/go/#hdr-Package_lists_and_patterns:
  1225  //
  1226  //	Directory and file names that begin with "." or "_" are ignored
  1227  //	by the go tool, as are directories named "testdata".
  1228  func ignoredByGoTool(importPath string) bool {
  1229  	for _, el := range strings.Split(importPath, "/") {
  1230  		if strings.HasPrefix(el, ".") || strings.HasPrefix(el, "_") || el == "testdata" {
  1231  			return true
  1232  		}
  1233  	}
  1234  	return false
  1235  }
  1236  
  1237  // isVendored reports whether the given import path corresponds
  1238  // to a Go package that is inside a vendor directory.
  1239  //
  1240  // The logic for what is considered a vendor directory is documented at
  1241  // https://golang.org/cmd/go/#hdr-Vendor_Directories.
  1242  func isVendored(importPath string) bool {
  1243  	return strings.HasPrefix(importPath, "vendor/") ||
  1244  		strings.Contains(importPath, "/vendor/")
  1245  }
  1246  
  1247  // multiError is a concatenation of multiple errors.
  1248  // There must be one or more errors, and all must be non-nil.
  1249  type multiError []error
  1250  
  1251  // Error concatenates all error strings into a single string,
  1252  // using a semicolon and space as a separator.
  1253  func (m multiError) Error() string {
  1254  	if len(m) == 1 {
  1255  		return m[0].Error()
  1256  	}
  1257  
  1258  	var b strings.Builder
  1259  	for i, e := range m {
  1260  		if i != 0 {
  1261  			b.WriteString("; ")
  1262  		}
  1263  		b.WriteString(e.Error())
  1264  	}
  1265  	return b.String()
  1266  }
  1267  
  1268  // internalModuleProxy returns the GOPROXY environment value to use for
  1269  // most module-enabled tests.
  1270  //
  1271  // We go through an internal (10.0.0.0/8) proxy that then hits
  1272  // https://proxy.golang.org/ so we're still able to firewall
  1273  // non-internal outbound connections on builder nodes.
  1274  //
  1275  // This internalModuleProxy func in prod mode (when running on GKE) returns an
  1276  // http URL to the current GKE pod's IP with a Kubernetes NodePort service port
  1277  // that forwards back to the coordinator's 8123. See comment below.
  1278  func internalModuleProxy() string {
  1279  	// We run a NodePort service on each GKE node
  1280  	// (cmd/coordinator/module-proxy-service.yaml) on port 30157
  1281  	// that maps back the coordinator's port 8123. (We could round
  1282  	// robin over all the GKE nodes' IPs if we wanted, but the
  1283  	// coordinator is running on GKE so our node by definition is
  1284  	// up, so just use it. It won't be much traffic.)
  1285  	// TODO: migrate to a GKE internal load balancer with an internal static IP
  1286  	// once we migrate symbolic-datum-552 off a Legacy VPC network to the modern
  1287  	// scheme that supports internal static IPs.
  1288  	return "http://" + pool.NewGCEConfiguration().GKENodeHostname() + ":30157"
  1289  }
  1290  
  1291  // modulesEnv returns the extra module-specific environment variables
  1292  // to append to tests.
  1293  func (st *buildStatus) modulesEnv() (env []string) {
  1294  	// GOPROXY
  1295  	switch {
  1296  	case st.SubName == "" && !st.conf.OutboundNetworkAllowed():
  1297  		env = append(env, "GOPROXY=off")
  1298  	case st.conf.PrivateGoProxy():
  1299  		// Don't add GOPROXY, the builder is pre-configured.
  1300  	case pool.NewGCEConfiguration().BuildEnv() == nil || !pool.NewGCEConfiguration().BuildEnv().IsProd:
  1301  		// Dev mode; use the system default.
  1302  		env = append(env, "GOPROXY="+os.Getenv("GOPROXY"))
  1303  	case st.conf.IsGCE():
  1304  		// On GCE; the internal proxy is accessible, prefer that.
  1305  		env = append(env, "GOPROXY="+internalModuleProxy())
  1306  	default:
  1307  		// Everything else uses the public proxy.
  1308  		env = append(env, "GOPROXY=https://proxy.golang.org")
  1309  	}
  1310  
  1311  	return env
  1312  }
  1313  
// runBenchmarkTests runs benchmarks from x/benchmarks when RunBench is set.
//
// It installs a baseline Go toolchain next to the experiment toolchain,
// uploads x/benchmarks (and, for subrepo benchmarks, the subrepo and its
// baseline) to the buildlet, runs golang.org/x/benchmarks/cmd/bench there,
// and on success uploads the results via uploadBenchResults.
//
// remoteErr carries failures reported by the helper steps or by the remote
// benchmark command; err carries all other errors, including a failed
// results upload.
func (st *buildStatus) runBenchmarkTests() (remoteErr, err error) {
	if st.SubName == "" {
		return nil, fmt.Errorf("benchmark tests must run on a subrepo")
	}

	// Repository under test.
	//
	// When running benchmarks, there are numerous variables:
	//
	// * Go experiment version
	// * Go baseline version
	// * Subrepo experiment version (if benchmarking subrepo)
	// * Subrepo baseline version (if benchmarking subrepo)
	// * x/benchmarks version (which defines which benchmarks run and how
	//   regardless of which repo is under test)
	//
	// For benchmarking of the main Go repo, the first three are used.
	// Ideally, the coordinator scheduler would handle the combinatorics on
	// testing these. Unfortunately, the coordinator doesn't handle
	// three-way combinations. By running Go benchmarks as a "subrepo test"
	// for x/benchmarks, we can at least get the scheduler to handle the
	// x/benchmarks version (st.SubRev) and Go experiment version (st.Rev).
	// The Go baseline version is simply selected as the most recent
	// previous release tag (e.g., 1.18.x on release-branch.go1.18) at the
	// time this test runs (st.installBaselineToolchain below).
	//
	// When benchmarking a subrepo, we want to compare a subrepo experiment
	// version vs subrepo baseline version (_not_ compare a single subrepo
	// version vs baseline/experiment Go versions). We do need to build the
	// subrepo with some version of Go, so we choose to use the latest
	// released version at the time of testing (same as Go baseline above).
	// We'd like the coordinator to handle the combination of x/benchmarks
	// and x/<subrepo>, however the coordinator can't do multiple subrepo
	// combinations.
	//
	// Thus, we run these as typical subrepo builders, which gives us the
	// subrepo experiment version and a Go experiment version (which we
	// will ignore). The Go baseline version is selected as above, and the
	// subrepo baseline version is currently taken from the hard-coded
	// subrepoBaselines table (see subrepoBaselineCommit); the intent is to
	// eventually select a (non-pre-release) release tag in the subrepo
	// automatically.
	//
	// This setup is suboptimal because the caller is installing an
	// experiment Go version that we won't use when building the subrepo
	// (we'll use the Go baseline version). We'll also end up with
	// duplicate runs with identical subrepo experiment/baseline and
	// x/benchmarks versions, as builds will trigger on every commit to the
	// Go repo. Limiting subrepo builders to release branches can
	// significantly reduce the number of Go commit triggers.
	//
	// TODO(prattmic): Cleaning this up is good future work, but these
	// deficiencies are not particularly problematic and avoid the need for
	// major changes in other parts of the coordinator.
	repo := st.SubName
	if repo == "benchmarks" {
		// A build of x/benchmarks itself benchmarks the main Go repo.
		repo = "go"
	}

	// Directory names used for each component on the buildlet.
	const (
		baselineDir        = "go-baseline"
		benchmarksDir      = "benchmarks"
		subrepoDir         = "subrepo"
		subrepoBaselineDir = "subrepo-baseline"
	)

	workDir, err := st.bc.WorkDir(st.ctx)
	if err != nil {
		err = fmt.Errorf("error discovering workdir for helper %s: %v", st.bc.IPPort(), err)
		return nil, err
	}
	goroot := st.conf.FilePathJoin(workDir, "go")
	baselineGoroot := st.conf.FilePathJoin(workDir, baselineDir)
	gopath := st.conf.FilePathJoin(workDir, "gopath")

	// Install baseline toolchain in addition to the experiment toolchain.
	toolchainBaselineCommit, remoteErr, err := st.installBaselineToolchain(goroot, baselineDir)
	if remoteErr != nil || err != nil {
		return remoteErr, err
	}

	// Install x/benchmarks.
	benchmarksCommit, remoteErr, err := st.fetchBenchmarksSource(benchmarksDir)
	if remoteErr != nil || err != nil {
		return remoteErr, err
	}

	// If testing a repo other than Go, install the subrepo and its baseline.
	// subrepoBaselineCommit stays empty when benchmarking the main Go repo;
	// uploadBenchResults uses that to pick which metadata to emit.
	var subrepoBaselineCommit string
	if repo != "go" {
		subrepoBaselineCommit, remoteErr, err = st.fetchSubrepoAndBaseline(subrepoDir, subrepoBaselineDir)
		if remoteErr != nil || err != nil {
			return remoteErr, err
		}
	}

	// Run golang.org/x/benchmarks/cmd/bench to perform benchmarks.
	sp := st.CreateSpan("running_benchmark_tests", st.SubName)
	// err is the named result, so the span records the final outcome,
	// including a failed upload below.
	defer func() { sp.Done(err) }()

	env := append(st.conf.Env(),
		"BENCH_BASELINE_GOROOT="+baselineGoroot,
		"BENCH_BRANCH="+st.RevBranch,
		"BENCH_REPOSITORY="+repo,
		"GOROOT="+goroot,
		"GOPATH="+gopath, // For module cache storage
	)
	env = append(env, st.modulesEnv()...)
	if repo != "go" {
		env = append(env, "BENCH_SUBREPO_PATH="+st.conf.FilePathJoin(workDir, subrepoDir))
		env = append(env, "BENCH_SUBREPO_BASELINE_PATH="+st.conf.FilePathJoin(workDir, subrepoBaselineDir))
	}
	rErr, err := st.bc.Exec(st.ctx, "./go/bin/go", buildlet.ExecOpts{
		Debug:    true, // make buildlet print extra debug in output for failures
		Output:   st,
		Dir:      benchmarksDir,
		ExtraEnv: env,
		Path:     []string{st.conf.FilePathJoin("$WORKDIR", "go", "bin"), "$PATH"},
		Args:     []string{"run", "golang.org/x/benchmarks/cmd/bench"},
	})
	if err != nil || rErr != nil {
		return rErr, err
	}

	// Upload benchmark results on success.
	if err := st.uploadBenchResults(toolchainBaselineCommit, subrepoBaselineCommit, benchmarksCommit); err != nil {
		return nil, err
	}
	return nil, nil
}
  1443  
  1444  func (st *buildStatus) uploadBenchResults(toolchainBaselineCommit, subrepoBaselineCommit, benchmarksCommit string) (err error) {
  1445  	sp := st.CreateSpan("upload_bench_results")
  1446  	defer func() { sp.Done(err) }()
  1447  
  1448  	s := pool.NewGCEConfiguration().BuildEnv().PerfDataURL
  1449  	if s == "" {
  1450  		log.Printf("No perfdata URL, skipping benchmark upload")
  1451  		return nil
  1452  	}
  1453  	client := &perfstorage.Client{BaseURL: s, HTTPClient: pool.NewGCEConfiguration().OAuthHTTPClient()}
  1454  	u := client.NewUpload(st.ctx)
  1455  	w, err := u.CreateFile("results")
  1456  	if err != nil {
  1457  		u.Abort()
  1458  		return fmt.Errorf("error creating perfdata file: %w", err)
  1459  	}
  1460  
  1461  	// Prepend some useful metadata.
  1462  	var b strings.Builder
  1463  	if subrepoBaselineCommit != "" {
  1464  		// Subrepos compare two subrepo commits.
  1465  		fmt.Fprintf(&b, "experiment-commit: %s\n", st.SubRev)
  1466  		fmt.Fprintf(&b, "experiment-commit-time: %s\n", st.SubRevCommitTime.In(time.UTC).Format(time.RFC3339Nano))
  1467  		fmt.Fprintf(&b, "baseline-commit: %s\n", subrepoBaselineCommit)
  1468  		// Subrepo benchmarks typically don't care about the toolchain
  1469  		// version, but we should still provide the data as toolchain
  1470  		// version changes may cause a performance discontinuity.
  1471  		fmt.Fprintf(&b, "toolchain-commit: %s\n", toolchainBaselineCommit)
  1472  	} else {
  1473  		// Go repo compares two main repo commits.
  1474  		fmt.Fprintf(&b, "experiment-commit: %s\n", st.Rev)
  1475  		fmt.Fprintf(&b, "experiment-commit-time: %s\n", st.RevCommitTime.In(time.UTC).Format(time.RFC3339Nano))
  1476  		fmt.Fprintf(&b, "baseline-commit: %s\n", toolchainBaselineCommit)
  1477  	}
  1478  	fmt.Fprintf(&b, "benchmarks-commit: %s\n", benchmarksCommit)
  1479  	fmt.Fprintf(&b, "post-submit: %t\n", st.trySet == nil)
  1480  	if _, err := w.Write([]byte(b.String())); err != nil {
  1481  		u.Abort()
  1482  		return fmt.Errorf("error writing perfdata metadata with contents %q: %w", b.String(), err)
  1483  	}
  1484  
  1485  	// TODO(prattmic): Full log output may contain non-benchmark output
  1486  	// that can be erroneously parsed as benchfmt.
  1487  	if _, err := w.Write([]byte(st.logs())); err != nil {
  1488  		u.Abort()
  1489  		return fmt.Errorf("error writing perfdata file with contents %q: %w", st.logs(), err)
  1490  	}
  1491  	status, err := u.Commit()
  1492  	if err != nil {
  1493  		return fmt.Errorf("error committing perfdata file: %w", err)
  1494  	}
  1495  	st.LogEventTime("bench_upload", status.UploadID)
  1496  	return nil
  1497  }
  1498  
  1499  func (st *buildStatus) installBaselineToolchain(goroot, baselineDir string) (baselineCommit string, remoteErr, err error) {
  1500  	sp := st.CreateSpan("install_baseline")
  1501  	defer func() { sp.Done(err) }()
  1502  
  1503  	commit, err := st.toolchainBaselineCommit()
  1504  	if err != nil {
  1505  		return "", nil, fmt.Errorf("error finding baseline commit: %w", err)
  1506  	}
  1507  	fmt.Fprintf(st, "Baseline toolchain %s\n", commit)
  1508  
  1509  	if st.useSnapshotFor(commit) {
  1510  		if err := st.writeGoSnapshotTo(commit, baselineDir); err != nil {
  1511  			return "", nil, fmt.Errorf("error writing baseline snapshot: %w", err)
  1512  		}
  1513  		return commit, nil, nil
  1514  	}
  1515  
  1516  	if err := st.writeGoSourceTo(st.bc, commit, baselineDir); err != nil {
  1517  		return "", nil, fmt.Errorf("error writing baseline source: %w", err)
  1518  	}
  1519  
  1520  	br := st.BuilderRev
  1521  	br.Rev = commit
  1522  
  1523  	builder := buildgo.GoBuilder{
  1524  		Logger:     st,
  1525  		BuilderRev: br,
  1526  		Conf:       st.conf,
  1527  		Goroot:     baselineDir,
  1528  		// Use the primary GOROOT as GOROOT_BOOTSTRAP. The
  1529  		// typical bootstrap toolchain may not be available if
  1530  		// the primary toolchain was installed from a snapshot.
  1531  		GorootBootstrap: goroot,
  1532  	}
  1533  	remoteErr, err = builder.RunMake(st.ctx, st.bc, st)
  1534  	if err != nil {
  1535  		return "", nil, err
  1536  	}
  1537  	if remoteErr != nil {
  1538  		return "", remoteErr, nil
  1539  	}
  1540  	return commit, nil, nil
  1541  }
  1542  
  1543  func (st *buildStatus) fetchBenchmarksSource(benchmarksDir string) (rev string, remoteErr, err error) {
  1544  	if st.SubName == "benchmarks" {
  1545  		rev = st.SubRev
  1546  	} else {
  1547  		rev, err = getRepoHead("benchmarks")
  1548  		if err != nil {
  1549  			return "", nil, fmt.Errorf("error finding x/benchmarks HEAD: %w", err)
  1550  		}
  1551  	}
  1552  
  1553  	sp := st.CreateSpan("fetching_benchmarks")
  1554  	defer func() { sp.Done(err) }()
  1555  
  1556  	tgz, err := sourcecache.GetSourceTgz(st, "benchmarks", rev)
  1557  	if errors.As(err, new(sourcecache.TooBigError)) {
  1558  		// Source being too big is a non-retryable error.
  1559  		return "", err, nil
  1560  	} else if err != nil {
  1561  		return "", nil, err
  1562  	}
  1563  
  1564  	err = st.bc.PutTar(st.ctx, tgz, benchmarksDir)
  1565  	if err != nil {
  1566  		return "", nil, err
  1567  	}
  1568  
  1569  	return rev, nil, nil
  1570  }
  1571  
  1572  func (st *buildStatus) fetchSubrepoAndBaseline(repoDir, baselineDir string) (baselineRev string, remoteErr, err error) {
  1573  	st.LogEventTime("fetching_subrepo", st.SubName)
  1574  
  1575  	tgz, err := sourcecache.GetSourceTgz(st, st.SubName, st.SubRev)
  1576  	if errors.As(err, new(sourcecache.TooBigError)) {
  1577  		// Source being too big is a non-retryable error.
  1578  		return "", err, nil
  1579  	} else if err != nil {
  1580  		return "", nil, err
  1581  	}
  1582  
  1583  	err = st.bc.PutTar(st.ctx, tgz, repoDir)
  1584  	if err != nil {
  1585  		return "", nil, err
  1586  	}
  1587  
  1588  	baselineRev, err = st.subrepoBaselineCommit()
  1589  	if err != nil {
  1590  		return "", nil, err
  1591  	}
  1592  
  1593  	fmt.Fprintf(st, "Baseline subrepo %s\n", baselineRev)
  1594  
  1595  	tgz, err = sourcecache.GetSourceTgz(st, st.SubName, baselineRev)
  1596  	if errors.As(err, new(sourcecache.TooBigError)) {
  1597  		// Source being too big is a non-retryable error.
  1598  		return "", err, nil
  1599  	} else if err != nil {
  1600  		return "", nil, err
  1601  	}
  1602  
  1603  	err = st.bc.PutTar(st.ctx, tgz, baselineDir)
  1604  	if err != nil {
  1605  		return "", nil, err
  1606  	}
  1607  
  1608  	return baselineRev, nil, nil
  1609  }
  1610  
// errBuildletsGone is returned by runTests when every buildlet goroutine
// (the main buildlet and all helpers) has exited while tests are still
// being waited on. repeatedCommunicationError compares against it.
var errBuildletsGone = errors.New("runTests: dist test failed: all buildlets had network errors or timeouts, yet tests remain")
  1612  
// runTests runs tests for the main Go repo.
//
// The list of dist tests is fetched, turned into a test set, and then
// executed concurrently: the main buildlet (st.bc) runs tests in order,
// while every buildlet received from helpers runs the biggest remaining
// tests first. Results are then written to st in test-set order.
//
// remoteErr is non-nil when listing the tests failed remotely or when a
// test itself failed. err is non-nil for execution problems: listing the
// tests, building the test set, discovering the work directory, or
// errBuildletsGone when all buildlets stopped while tests remained.
//
// After runTests completes, the caller must assume that st.bc might be invalid
// (It's possible that only one of the helper buildlets survived).
func (st *buildStatus) runTests(helpers <-chan buildlet.Client) (remoteErr, err error) {
	testNames, remoteErr, err := st.distTestList()
	if remoteErr != nil {
		return fmt.Errorf("distTestList remote: %v", remoteErr), nil
	}
	if err != nil {
		return nil, fmt.Errorf("distTestList exec: %v", err)
	}
	testStats := getTestStats(st)

	set, err := st.newTestSet(testStats, testNames)
	if err != nil {
		return nil, err
	}
	st.LogEventTime("starting_tests", fmt.Sprintf("%d tests", len(set.items)))
	startTime := time.Now()

	workDir, err := st.bc.WorkDir(st.ctx)
	if err != nil {
		return nil, fmt.Errorf("error discovering workdir for main buildlet, %s: %v", st.bc.Name(), err)
	}

	mainBuildletGoroot := st.conf.FilePathJoin(workDir, "go")
	mainBuildletGopath := st.conf.FilePathJoin(workDir, "gopath")

	// We use our original buildlet to run the tests in order, to
	// make the streaming somewhat smooth and not incredibly
	// lumpy.  The rest of the buildlets run the largest tests
	// first (critical path scheduling).
	// The buildletActivity WaitGroup is used to track when all
	// the buildlets are dead or done.
	var buildletActivity sync.WaitGroup
	buildletActivity.Add(2) // one per goroutine below (main + helper launcher goroutine)
	// Main buildlet goroutine: keeps pulling in-order work until the
	// buildlet is marked broken or the build context is done.
	go func() {
		defer buildletActivity.Done() // for the per-goroutine Add(2) above
		for !st.bc.IsBroken() {
			tis, ok := set.testsToRunInOrder()
			if !ok {
				// Nothing to run right now; poll again in 5 seconds
				// unless the build context ends first.
				select {
				case <-st.ctx.Done():
					return
				case <-time.After(5 * time.Second):
				}
				continue
			}
			st.runTestsOnBuildlet(st.bc, tis, mainBuildletGoroot, mainBuildletGopath)
		}
		st.LogEventTime("main_buildlet_broken", st.bc.Name())
	}()
	// Helper launcher goroutine: starts one worker goroutine per helper
	// buildlet received, until the helpers channel is closed.
	go func() {
		defer buildletActivity.Done() // for the per-goroutine Add(2) above
		for helper := range helpers {
			buildletActivity.Add(1)
			go func(bc buildlet.Client) {
				// Deferred calls run in reverse order: the optional dev
				// pause first, then bc.Close, the "closed_helper" event,
				// and finally the WaitGroup Done.
				defer buildletActivity.Done() // for the per-helper Add(1) above
				defer st.LogEventTime("closed_helper", bc.Name())
				defer bc.Close()
				if devPause {
					defer time.Sleep(5 * time.Minute)
					defer st.LogEventTime("DEV_HELPER_SLEEP", bc.Name())
				}
				st.LogEventTime("got_empty_test_helper", bc.String())
				// Set up the helper by extracting the build snapshot into "go".
				if err := bc.PutTarFromURL(st.ctx, st.SnapshotURL(pool.NewGCEConfiguration().BuildEnv()), "go"); err != nil {
					log.Printf("failed to extract snapshot for helper %s: %v", bc.Name(), err)
					return
				}
				workDir, err := bc.WorkDir(st.ctx)
				if err != nil {
					log.Printf("error discovering workdir for helper %s: %v", bc.Name(), err)
					return
				}
				st.LogEventTime("test_helper_set_up", bc.Name())
				goroot := st.conf.FilePathJoin(workDir, "go")
				gopath := st.conf.FilePathJoin(workDir, "gopath")
				// Unlike the main buildlet, a helper exits as soon as no
				// new tests remain rather than polling for more.
				for !bc.IsBroken() {
					tis, ok := set.testsToRunBiggestFirst()
					if !ok {
						st.LogEventTime("no_new_tests_remain", bc.Name())
						return
					}
					st.runTestsOnBuildlet(bc, tis, goroot, gopath)
				}
				st.LogEventTime("test_helper_is_broken", bc.Name())
			}(helper)
		}
	}()

	// Convert a sync.WaitGroup into a channel.
	// Aside: https://groups.google.com/forum/#!topic/golang-dev/7fjGWuImu5k
	buildletsGone := make(chan struct{})
	go func() {
		buildletActivity.Wait()
		close(buildletsGone)
	}()

	// Report results in test-set order, waiting for each test to finish.
	var lastMetadata string
	var lastHeader string
	var serialDuration time.Duration // sum of the execution durations of all tests
	for _, ti := range set.items {
	AwaitDone:
		for {
			// Log a progress event for every 30 seconds spent waiting.
			timer := time.NewTimer(30 * time.Second)
			select {
			case <-ti.done: // wait for success
				timer.Stop()
				break AwaitDone
			case <-timer.C:
				st.LogEventTime("still_waiting_on_test", ti.name.Old)
			case <-buildletsGone:
				// No buildlet goroutine is left to finish this test.
				set.cancelAll()
				return nil, errBuildletsGone
			}
		}

		serialDuration += ti.execDuration
		if len(ti.output) > 0 {
			metadata, header, out := parseOutputAndHeader(ti.output)
			printHeader := false
			if metadata != lastMetadata {
				lastMetadata = metadata
				fmt.Fprintf(st, "\n%s\n", metadata)
				// Always include the test header after
				// metadata changes. This is a readability
				// optimization that ensures that tests are
				// always immediately preceded by their test
				// banner, even if it is duplicate banner
				// because the test metadata changed.
				printHeader = true
			}
			if header != lastHeader {
				lastHeader = header
				printHeader = true
			}
			if printHeader {
				fmt.Fprintf(st, "\n%s\n", header)
			}
			if pool.NewGCEConfiguration().InStaging() {
				// In staging, annotate the output with the shard that
				// ran the test and the size of its test group.
				out = bytes.TrimSuffix(out, nl)
				st.Write(out)
				fmt.Fprintf(st, " (shard %s; par=%d)\n", ti.shardIPPort, ti.groupSize)
			} else {
				st.Write(out)
			}
		}

		// The first failing test ends the run; remaining tests are canceled.
		if ti.remoteErr != nil {
			set.cancelAll()
			return fmt.Errorf("dist test failed: %s: %v", ti.name, ti.remoteErr), nil
		}
	}
	elapsed := time.Since(startTime)
	var msg string
	if st.conf.NumTestHelpers(st.isTry()) > 0 {
		msg = fmt.Sprintf("took %v; aggregate %v; saved %v", elapsed, serialDuration, serialDuration-elapsed)
	} else {
		msg = fmt.Sprintf("took %v", elapsed)
	}
	st.LogEventTime("tests_complete", msg)
	fmt.Fprintf(st, "\nAll tests passed.\n")
	return nil, nil
}
  1778  
// Banner markers used to split raw dist test output into metadata,
// header and body (see parseOutputAndHeader).
const (
	banner       = "XXXBANNERXXX:" // flag passed to dist
	bannerPrefix = "\n" + banner   // with the newline added by dist

	// metadataBannerPrefix is the banner that opens the optional
	// "Test execution environment." metadata block.
	metadataBannerPrefix = bannerPrefix + "Test execution environment."

	outputBanner = "##### " // banner to display in output.
)

// Byte-slice forms of the banner prefixes, for matching against raw
// test output with the bytes package.
var (
	bannerPrefixBytes         = []byte(bannerPrefix)
	metadataBannerPrefixBytes = []byte(metadataBannerPrefix)
)
  1792  
  1793  // parseOutputAndHeader parses b and returns the test (optional) environment
  1794  // metaadata, display header (e.g., "##### Testing packages.") and the
  1795  // following output.
  1796  //
  1797  // metadata is the optional execution environment metadata block. e.g.,
  1798  //
  1799  // ##### Test execution environment.
  1800  // # GOARCH: amd64
  1801  // # CPU: Intel(R) Xeon(R) W-2135 CPU @ 3.70GHz
  1802  func parseOutputAndHeader(b []byte) (metadata, header string, out []byte) {
  1803  	if !bytes.HasPrefix(b, bannerPrefixBytes) {
  1804  		return "", "", b
  1805  	}
  1806  
  1807  	if bytes.HasPrefix(b, metadataBannerPrefixBytes) {
  1808  		// Header includes everything up to and including the next
  1809  		// banner.
  1810  		rem := b[len(metadataBannerPrefixBytes):]
  1811  		i := bytes.Index(rem, bannerPrefixBytes)
  1812  		if i == -1 {
  1813  			// Metadata block without a following block doesn't
  1814  			// make sense. Bail.
  1815  			return "", "", b
  1816  		}
  1817  		bi := i + len(metadataBannerPrefixBytes)
  1818  		// Metadata portion of header, skipping initial and trailing newlines.
  1819  		metadata = strings.Trim(string(b[:bi]), "\n")
  1820  		metadata = strings.Replace(metadata, banner, outputBanner, 1)
  1821  		b = b[bi+1:] // skip newline at start of next banner.
  1822  	} else {
  1823  		b = b[1:] // skip newline
  1824  	}
  1825  
  1826  	// Find end of primary test banner.
  1827  	nl := bytes.IndexByte(b, '\n')
  1828  	if nl == -1 {
  1829  		// No newline, everything is header.
  1830  		header = string(b)
  1831  		b = nil
  1832  	} else {
  1833  		header = string(b[:nl])
  1834  		b = b[nl+1:]
  1835  	}
  1836  
  1837  	// Replace internal marker banner with the human-friendly version.
  1838  	header = strings.Replace(header, banner, outputBanner, 1)
  1839  	return metadata, header, b
  1840  }
  1841  
// maxTestExecErrors is the number of test execution failures at which
// we give up and stop trying and instead permanently fail the test.
// Note that this is not related to whether the test failed remotely,
// but whether we were unable to start or complete watching it run.
// (A communication error)
const maxTestExecErrors = 3
  1848  
// runTestsOnBuildlet runs tis on bc, using the optional goroot & gopath environment variables.
//
// All tests in tis are run by a single "go tool dist test" invocation.
// It panics if tis holds more than one test and any of them is not a
// "go_test:*" test.
//
// On an execution error (as opposed to a remote test failure), bc is
// marked broken and each test is either failed (on timeout, or after
// maxTestExecErrors failed attempts) or queued for retry. Otherwise the
// results are recorded on the test items and each item's done channel
// is closed.
func (st *buildStatus) runTestsOnBuildlet(bc buildlet.Client, tis []*testItem, goroot, gopath string) {
	names, rawNames := make([]string, len(tis)), make([]string, len(tis))
	for i, ti := range tis {
		names[i], rawNames[i] = ti.name.Old, ti.name.Raw
		if i > 0 && (!strings.HasPrefix(ti.name.Old, "go_test:") || !strings.HasPrefix(names[0], "go_test:")) {
			panic("only go_test:* tests may be merged")
		}
	}
	var spanName string
	var detail string
	if len(names) == 1 {
		spanName = "run_test:" + names[0]
		detail = bc.Name()
	} else {
		spanName = "run_tests_multi"
		detail = fmt.Sprintf("%s: %v", bc.Name(), names)
	}
	sp := st.CreateSpan(spanName, detail)

	args := []string{"tool", "dist", "test", "--no-rebuild", "--banner=" + banner}
	if st.conf.IsRace() {
		args = append(args, "--race")
	}
	if st.conf.CompileOnly {
		args = append(args, "--compile-only")
	}
	if st.useKeepGoingFlag() {
		args = append(args, "-k")
	}
	args = append(args, rawNames...)
	var buf bytes.Buffer
	t0 := time.Now()
	timeout := st.conf.DistTestsExecTimeout(names)

	// Bound the whole execution by the timeout configured for these tests.
	ctx, cancel := context.WithTimeout(st.ctx, timeout)
	defer cancel()

	env := append(st.conf.Env(),
		"GOROOT="+goroot,
		"GOPATH="+gopath,
	)
	env = append(env, st.modulesEnv()...)

	remoteErr, err := bc.Exec(ctx, "./go/bin/go", buildlet.ExecOpts{
		// We set Dir to "." instead of the default ("go/bin") so when the dist tests
		// try to run os/exec.Command("go", "test", ...), the LookPath of "go" doesn't
		// return "./go.exe" (which exists in the current directory: "go/bin") and then
		// fail when dist tries to run the binary in dir "$GOROOT/src", since
		// "$GOROOT/src" + "./go.exe" doesn't exist. Perhaps LookPath should return
		// an absolute path.
		Dir:      ".",
		Output:   &buf, // see "maybe stream lines" TODO below
		ExtraEnv: env,
		Path:     []string{st.conf.FilePathJoin("$WORKDIR", "go", "bin"), "$PATH"},
		Args:     args,
	})
	execDuration := time.Since(t0)
	sp.Done(err)
	if err != nil {
		// Execution (communication) error: the tests did not report a
		// result, so decide per test whether to fail it or retry it.
		bc.MarkBroken() // prevents reuse
		for _, ti := range tis {
			ti.numFail++
			st.logf("Execution error running %s on %s: %v (numFails = %d)", ti.name, bc, err, ti.numFail)
			if err == buildlet.ErrTimeout {
				ti.failf("Test %q ran over %v limit (%v); saw output:\n%s", ti.name, timeout, execDuration, buf.Bytes())
			} else if ti.numFail >= maxTestExecErrors {
				ti.failf("Failed to schedule %q test after %d tries.\n", ti.name, maxTestExecErrors)
			} else {
				ti.retry()
			}
		}
		return
	}

	// Strip dist's trailing success line from the captured output.
	out := buf.Bytes()
	out = bytes.Replace(out, []byte("\nALL TESTS PASSED (some were excluded)\n"), nil, 1)
	out = bytes.Replace(out, []byte("\nALL TESTS PASSED\n"), nil, 1)

	// The first test item carries the combined output, remote error and
	// duration of the whole invocation; the rest are recorded as empty.
	for _, ti := range tis {
		ti.output = out
		ti.remoteErr = remoteErr
		ti.execDuration = execDuration
		ti.groupSize = len(tis)
		ti.shardIPPort = bc.IPPort()
		close(ti.done)

		// After the first one, make the rest succeed with no output.
		// TODO: maybe stream lines (set Output to a line-reading
		// Writer instead of &buf). for now we just wait for them in
		// ~10 second batches.  Doesn't look as smooth on the output,
		// though.
		out = nil
		remoteErr = nil
		execDuration = 0
	}
}
  1946  
// CreateSpan starts a span for event on this build by delegating to
// schedule.CreateSpan, passing optText through unchanged.
func (st *buildStatus) CreateSpan(event string, optText ...string) spanlog.Span {
	return schedule.CreateSpan(st, event, optText...)
}
  1950  
  1951  func (st *buildStatus) LogEventTime(event string, optText ...string) {
  1952  	if len(optText) > 1 {
  1953  		panic("usage")
  1954  	}
  1955  	if pool.NewGCEConfiguration().InStaging() {
  1956  		st.logf("%s %v", event, optText)
  1957  	}
  1958  	st.mu.Lock()
  1959  	defer st.mu.Unlock()
  1960  	var text string
  1961  	if len(optText) > 0 {
  1962  		text = optText[0]
  1963  	}
  1964  	st.events = append(st.events, eventAndTime{
  1965  		t:    time.Now(),
  1966  		evt:  event,
  1967  		text: text,
  1968  	})
  1969  }
  1970  
  1971  func (st *buildStatus) hasEvent(event string) bool {
  1972  	st.mu.Lock()
  1973  	defer st.mu.Unlock()
  1974  	for _, e := range st.events {
  1975  		if e.evt == event {
  1976  			return true
  1977  		}
  1978  	}
  1979  	return false
  1980  }
  1981  
// HTMLStatusLine returns the HTML to show within the <pre> block on
// the main page's list of active builds.
//
// HTMLStatusTruncated and HTMLStatus render the same status line via
// htmlStatus, with the truncated and full detail levels respectively.
func (st *buildStatus) HTMLStatusLine() template.HTML      { return st.htmlStatus(singleLine) }
func (st *buildStatus) HTMLStatusTruncated() template.HTML { return st.htmlStatus(truncated) }
func (st *buildStatus) HTMLStatus() template.HTML          { return st.htmlStatus(full) }
  1987  
  1988  func strSliceTo(s string, n int) string {
  1989  	if len(s) <= n {
  1990  		return s
  1991  	}
  1992  	return s[:n]
  1993  }
  1994  
// buildStatusDetail selects how much detail htmlStatus renders.
type buildStatusDetail int

const (
	// singleLine renders only the status line, without events.
	singleLine buildStatusDetail = iota
	// truncated renders the status line plus the last 3 events.
	truncated
	// full renders the status line plus all events.
	full
)
  2002  
  2003  func (st *buildStatus) htmlStatus(detail buildStatusDetail) template.HTML {
  2004  	if st == nil {
  2005  		return "[nil]"
  2006  	}
  2007  	st.mu.Lock()
  2008  	defer st.mu.Unlock()
  2009  
  2010  	urlPrefix := "https://go-review.googlesource.com/#/q/"
  2011  
  2012  	if st.Rev == "" {
  2013  		log.Printf("warning: st.Rev is empty")
  2014  	}
  2015  
  2016  	var buf bytes.Buffer
  2017  	fmt.Fprintf(&buf, "<a href='https://github.com/golang/go/wiki/DashboardBuilders'>%s</a> rev <a href='%s%s'>%s</a>",
  2018  		st.Name, urlPrefix, st.Rev, strSliceTo(st.Rev, 8))
  2019  	if st.IsSubrepo() {
  2020  		if st.SubRev == "" {
  2021  			log.Printf("warning: st.SubRev is empty on subrepo")
  2022  		}
  2023  		fmt.Fprintf(&buf, " (sub-repo %s rev <a href='%s%s'>%s</a>)",
  2024  			st.SubName, urlPrefix, st.SubRev, strSliceTo(st.SubRev, 8))
  2025  	}
  2026  	if ts := st.trySet; ts != nil {
  2027  		if ts.ChangeID == "" {
  2028  			log.Printf("warning: ts.ChangeID is empty")
  2029  		}
  2030  		fmt.Fprintf(&buf, " (<a href='/try?commit=%v'>trybot set</a> for <a href='https://go-review.googlesource.com/#/q/%s'>%s</a>)",
  2031  			strSliceTo(ts.Commit, 8),
  2032  			ts.ChangeTriple(), strSliceTo(ts.ChangeID, 8))
  2033  	}
  2034  
  2035  	var state string
  2036  	if st.canceled {
  2037  		state = "canceled"
  2038  	} else if st.done.IsZero() {
  2039  		if st.HasBuildlet() {
  2040  			state = "running"
  2041  		} else {
  2042  			state = "waiting_for_machine"
  2043  		}
  2044  	} else if st.succeeded {
  2045  		state = "succeeded"
  2046  	} else {
  2047  		state = "<font color='#700000'>failed</font>"
  2048  	}
  2049  	if detail > singleLine && st.bc != nil {
  2050  		fmt.Fprintf(&buf, "; <a href='%s'>%s</a>; %s", html.EscapeString(st.logsURLLocked()), state, html.EscapeString(st.bc.String()))
  2051  	} else {
  2052  		fmt.Fprintf(&buf, "; <a href='%s'>%s</a>", html.EscapeString(st.logsURLLocked()), state)
  2053  	}
  2054  
  2055  	t := st.done
  2056  	if t.IsZero() {
  2057  		t = st.startTime
  2058  	}
  2059  	fmt.Fprintf(&buf, ", %v ago", time.Since(t).Round(time.Second))
  2060  	if detail > singleLine {
  2061  		buf.WriteByte('\n')
  2062  		lastLines := 0
  2063  		if detail == truncated {
  2064  			lastLines = 3
  2065  		}
  2066  		st.writeEventsLocked(&buf, true, lastLines)
  2067  	}
  2068  	return template.HTML(buf.String())
  2069  }
  2070  
  2071  func (st *buildStatus) logsURLLocked() string {
  2072  	if st.logURL != "" {
  2073  		return st.logURL
  2074  	}
  2075  	var urlPrefix string
  2076  	if pool.NewGCEConfiguration().BuildEnv() == buildenv.Production {
  2077  		urlPrefix = "https://farmer.golang.org"
  2078  	} else {
  2079  		urlPrefix = "http://" + pool.NewGCEConfiguration().BuildEnv().StaticIP
  2080  	}
  2081  	if *mode == "dev" {
  2082  		urlPrefix = "https://localhost:8119"
  2083  	}
  2084  	u := fmt.Sprintf("%v/temporarylogs?name=%s&rev=%s&st=%p", urlPrefix, st.Name, st.Rev, st)
  2085  	if st.IsSubrepo() {
  2086  		u += fmt.Sprintf("&subName=%v&subRev=%v", st.SubName, st.SubRev)
  2087  	}
  2088  	return u
  2089  }
  2090  
  2091  // st.mu must be held.
  2092  // If numLines is greater than zero, it's the number of final lines to truncate to.
  2093  func (st *buildStatus) writeEventsLocked(w io.Writer, htmlMode bool, numLines int) {
  2094  	startAt := 0
  2095  	if numLines > 0 {
  2096  		startAt = len(st.events) - numLines
  2097  		if startAt > 0 {
  2098  			io.WriteString(w, "...\n")
  2099  		} else {
  2100  			startAt = 0
  2101  		}
  2102  	}
  2103  
  2104  	for i := startAt; i < len(st.events); i++ {
  2105  		evt := st.events[i]
  2106  		e := evt.evt
  2107  		text := evt.text
  2108  		if htmlMode {
  2109  			if e == "running_exec" {
  2110  				e = fmt.Sprintf("<a href='%s'>%s</a>", html.EscapeString(st.logsURLLocked()), e)
  2111  			}
  2112  			e = "<b>" + e + "</b>"
  2113  			text = "<i>" + html.EscapeString(text) + "</i>"
  2114  		}
  2115  		fmt.Fprintf(w, "  %v %s %s\n", evt.t.Format(time.RFC3339), e, text)
  2116  	}
  2117  	if st.isRunningLocked() && len(st.events) > 0 {
  2118  		lastEvt := st.events[len(st.events)-1]
  2119  		fmt.Fprintf(w, " %7s (now)\n", fmt.Sprintf("+%0.1fs", time.Since(lastEvt.t).Seconds()))
  2120  	}
  2121  }
  2122  
// logs returns the build output accumulated so far in st.output, as a string.
func (st *buildStatus) logs() string {
	return st.output.String()
}
  2126  
// Write appends p to the build's output by forwarding to st.output.Write.
// It gives *buildStatus a Write method, so it can be passed as the
// destination to calls such as fmt.Fprintf(st, ...).
func (st *buildStatus) Write(p []byte) (n int, err error) {
	return st.output.Write(p)
}
  2130  
  2131  // repeatedCommunicationError takes a buildlet execution error (a
  2132  // network/communication error, as opposed to a remote execution that
  2133  // ran and had a non-zero exit status and we heard about) and
  2134  // conditionally promotes it to a terminal error. If this returns a
  2135  // non-nil value, the execErr should be considered terminal with the
  2136  // returned error.
  2137  func (st *buildStatus) repeatedCommunicationError(execErr error) error {
  2138  	if execErr == nil {
  2139  		return nil
  2140  	}
  2141  	// For now, only do this for plan9, which is flaky (Issue 31261),
  2142  	// but not for plan9-arm (Issue 52677)
  2143  	if strings.HasPrefix(st.Name, "plan9-") && st.Name != "plan9-arm" && execErr == errBuildletsGone {
  2144  		// TODO: give it two tries at least later (store state
  2145  		// somewhere; global map?). But for now we're going to
  2146  		// only give it one try.
  2147  		return fmt.Errorf("network error promoted to terminal error: %v", execErr)
  2148  	}
  2149  	return nil
  2150  }
  2151  
  2152  // commitTime returns the greater of Rev and SubRev's commit times.
  2153  func (st *buildStatus) commitTime() time.Time {
  2154  	if st.RevCommitTime.Before(st.SubRevCommitTime) {
  2155  		return st.SubRevCommitTime
  2156  	}
  2157  	return st.RevCommitTime
  2158  }
  2159  
  2160  // branch returns branch for either Rev, or SubRev if it exists.
  2161  func (st *buildStatus) branch() string {
  2162  	if st.SubRev != "" {
  2163  		return st.SubRevBranch
  2164  	}
  2165  	return st.RevBranch
  2166  }