github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachtest/test_runner.go

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package main
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"io"
    17  	"math/rand"
    18  	"net"
    19  	"net/http"
    20  	"os"
    21  	"os/exec"
    22  	"path/filepath"
    23  	"runtime"
    24  	"sort"
    25  	"strconv"
    26  	"strings"
    27  	"sync"
    28  	"time"
    29  
    30  	"github.com/cockroachdb/cockroach/pkg/cmd/internal/issues"
    31  	"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
    32  	"github.com/cockroachdb/cockroach/pkg/util/log"
    33  	"github.com/cockroachdb/cockroach/pkg/util/quotapool"
    34  	"github.com/cockroachdb/cockroach/pkg/util/stop"
    35  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    36  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    37  	"github.com/cockroachdb/cockroach/pkg/util/version"
    38  	"github.com/cockroachdb/errors"
    39  	"github.com/cockroachdb/logtags"
    40  	"github.com/petermattis/goid"
    41  )
    42  
    43  // testRunner runs tests.
    44  type testRunner struct {
    45  	// buildVersion is the version of the Cockroach binary that tests will run against.
    46  	buildVersion version.Version
    47  
    48  	config struct {
    49  		// skipClusterValidationOnAttach skips validation on existing clusters that
    50  		// the registry uses for running tests.
    51  		skipClusterValidationOnAttach bool
    52  		// skipClusterStopOnAttach skips stopping existing clusters that
    53  		// the registry uses for running tests. It implies skipClusterWipeOnAttach.
    54  		skipClusterStopOnAttach bool
    55  		skipClusterWipeOnAttach bool
    56  	}
    57  
    58  	status struct {
    59  		syncutil.Mutex
    60  		running map[*test]struct{}
    61  		pass    map[*test]struct{}
    62  		fail    map[*test]struct{}
    63  		skip    map[*test]struct{}
    64  	}
    65  
    66  	// cr keeps track of all live clusters.
    67  	cr *clusterRegistry
    68  
    69  	workersMu struct {
    70  		syncutil.Mutex
    71  		// workers is a map from worker name to information about each worker currently running tests.
    72  		workers map[string]*workerStatus
    73  	}
    74  
    75  	// work maintains the remaining tests to run.
    76  	work *workPool
    77  
    78  	completedTestsMu struct {
    79  		syncutil.Mutex
    80  		// completed maintains information on all completed test runs.
    81  		completed []completedTestInfo
    82  	}
    83  }
    84  
    85  // newTestRunner constructs a testRunner.
    86  //
    87  // cr: The cluster registry with which all clusters will be registered. The
    88  //   caller provides this as the caller needs to be able to shut clusters down
    89  //   on Ctrl+C.
    90  // buildVersion: The version of the Cockroach binary against which tests will run.
    91  func newTestRunner(cr *clusterRegistry, buildVersion version.Version) *testRunner {
    92  	r := &testRunner{
    93  		cr:           cr,
    94  		buildVersion: buildVersion,
    95  	}
    96  	r.config.skipClusterWipeOnAttach = !clusterWipe
    97  	r.workersMu.workers = make(map[string]*workerStatus)
    98  	return r
    99  }
   100  
   101  // clustersOpt groups options for the clusters to be used by the tests.
   102  type clustersOpt struct {
   103  	// The type of cluster to use. If localCluster, then no other fields can be
   104  	// set.
   105  	typ clusterType
   106  
   107  	// If set, all the tests will run against this roachprod cluster.
   108  	clusterName string
   109  
   110  	// If set, all the clusters will use this ID as part of their name. When
    111  	// roachtest is invoked by TeamCity, this will be the build ID.
   112  	clusterID string
   113  	// The name of the user running the tests. This will be part of cluster names.
   114  	user string
   115  
   116  	// cpuQuota specifies how many CPUs can be used concurrently by the roachprod
    117  	// clusters. When no quota is available for creating a new cluster, the
    118  	// test runner will wait for other tests to finish and their clusters to be
    119  	// destroyed (or reused). Note that this limit is global, not per zone.
   120  	cpuQuota int
   121  	// If set, clusters will not be wiped or destroyed when a test using the
    122  	// respective cluster fails. These clusters will linger around and they'll
    123  	// continue counting towards the cpuQuota.
   124  	keepClustersOnTestFailure bool
   125  }
   126  
   127  func (c clustersOpt) validate() error {
   128  	if c.typ == localCluster {
   129  		if c.clusterName != "" {
   130  			return errors.New("clusterName cannot be set when typ=localCluster")
   131  		}
   132  		if c.clusterID != "" {
   133  			return errors.New("clusterID cannot be set when typ=localCluster")
   134  		}
   135  	}
   136  	return nil
   137  }
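
// exampleLocalClustersOpt is an illustrative sketch, not part of the original
// file: it shows how a caller might populate clustersOpt for a --local run and
// surface the validation error early. The function name and the literal values
// are hypothetical.
func exampleLocalClustersOpt() (clustersOpt, error) {
	opt := clustersOpt{
		typ:      localCluster,
		user:     "roachtest", // becomes part of cluster names
		cpuQuota: 8,           // global CPU budget shared by all concurrent clusters
		// clusterName and clusterID must stay empty when typ == localCluster;
		// validate() rejects them otherwise.
	}
	if err := opt.validate(); err != nil {
		return clustersOpt{}, err
	}
	return opt, nil
}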
   138  
   139  // Run runs tests.
   140  //
   141  // Args:
   142  // tests: The tests to run.
   143  // count: How many times to run each test selected by filter.
   144  // parallelism: How many workers to use for running tests. Tests are run
   145  //   locally (although generally they run against remote roachprod clusters).
   146  //   parallelism bounds the maximum number of tests that run concurrently. Note
   147  //   that the concurrency is also affected by cpuQuota.
    148  // clustersOpt: Options for the clusters to be used by tests.
   149  // lopt: Options for logging.
   150  func (r *testRunner) Run(
   151  	ctx context.Context,
   152  	tests []testSpec,
   153  	count int,
   154  	parallelism int,
   155  	clustersOpt clustersOpt,
   156  	artifactsDir string,
   157  	lopt loggingOpt,
   158  ) error {
   159  	// Validate options.
   160  	if len(tests) == 0 {
   161  		return fmt.Errorf("no test matched filters")
   162  	}
   163  	if err := clustersOpt.validate(); err != nil {
   164  		return err
   165  	}
   166  	if parallelism != 1 {
   167  		if clustersOpt.clusterName != "" {
   168  			return fmt.Errorf("--cluster incompatible with --parallelism. Use --parallelism=1")
   169  		}
   170  		if clustersOpt.typ == localCluster {
   171  			return fmt.Errorf("--local incompatible with --parallelism. Use --parallelism=1")
   172  		}
   173  	}
   174  
   175  	if name := clustersOpt.clusterName; name != "" {
   176  		// Since we were given a cluster, check that all tests we have to run have compatible specs.
   177  		// We should also check against the spec of the cluster, but we don't
   178  		// currently have a way of doing that; we're relying on the fact that attaching to the cluster
   179  		// will fail if the cluster is incompatible.
   180  		spec := tests[0].Cluster
   181  		spec.Lifetime = 0
   182  		for i := 1; i < len(tests); i++ {
   183  			spec2 := tests[i].Cluster
   184  			spec2.Lifetime = 0
   185  			if spec != spec2 {
   186  				return errors.Errorf("cluster specified but found tests "+
   187  					"with incompatible specs: %s (%s) - %s (%s)",
   188  					tests[0].Name, spec, tests[i].Name, spec2,
   189  				)
   190  			}
   191  		}
   192  	}
   193  
   194  	var numConcurrentClusterCreations int
   195  	if cloud == "aws" {
    196  	// AWS has ridiculously low API call limits, so we're going to create one cluster
   197  		// at a time. Internally, roachprod has throttling for the calls required to
   198  		// create a single cluster.
   199  		numConcurrentClusterCreations = 1
   200  	} else {
   201  		numConcurrentClusterCreations = 1000
   202  	}
   203  	clusterFactory := newClusterFactory(
   204  		clustersOpt.user, clustersOpt.clusterID, lopt.artifactsDir, r.cr, numConcurrentClusterCreations)
   205  
   206  	// allocateCluster will be used by workers to create new clusters (or to attach
   207  	// to an existing one).
   208  	allocateCluster := func(
   209  		ctx context.Context,
   210  		t testSpec,
   211  		alloc *quotapool.IntAlloc,
   212  		artifactsDir string,
   213  		wStatus *workerStatus,
   214  	) (*cluster, error) {
   215  		wStatus.SetStatus("creating cluster")
   216  		defer wStatus.SetStatus("")
   217  
   218  		lopt.l.PrintfCtx(ctx, "Creating new cluster for test %s: %s", t.Name, t.Cluster)
   219  
   220  		existingClusterName := clustersOpt.clusterName
   221  		if existingClusterName != "" {
   222  			// Logs for attaching to a cluster go to a dedicated log file.
   223  			logPath := filepath.Join(artifactsDir, runnerLogsDir, "cluster-create", existingClusterName+".log")
    224  			clusterL, err := rootLogger(logPath, lopt.tee)
    225  			if err != nil {
    226  				return nil, err
    227  			}
    228  			defer clusterL.close()
   229  			opt := attachOpt{
   230  				skipValidation: r.config.skipClusterValidationOnAttach,
   231  				skipStop:       r.config.skipClusterStopOnAttach,
   232  				skipWipe:       r.config.skipClusterWipeOnAttach,
   233  			}
   234  			return attachToExistingCluster(ctx, existingClusterName, clusterL, t.Cluster, opt, r.cr)
   235  		}
   236  
   237  		cfg := clusterConfig{
   238  			spec:         t.Cluster,
   239  			artifactsDir: artifactsDir,
   240  			localCluster: clustersOpt.typ == localCluster,
   241  			alloc:        alloc,
   242  		}
   243  		return clusterFactory.newCluster(ctx, cfg, wStatus.SetStatus, lopt.tee)
   244  	}
   245  
   246  	// Seed the default rand source so that different runs get different cluster
   247  	// IDs.
   248  	rand.Seed(timeutil.Now().UnixNano())
   249  
   250  	n := len(tests)
   251  	if n*count < parallelism {
   252  		// Don't spin up more workers than necessary.
   253  		parallelism = n * count
   254  	}
   255  
   256  	r.status.running = make(map[*test]struct{})
   257  	r.status.pass = make(map[*test]struct{})
   258  	r.status.fail = make(map[*test]struct{})
   259  	r.status.skip = make(map[*test]struct{})
   260  
   261  	r.work = newWorkPool(tests, count)
   262  	stopper := stop.NewStopper()
   263  	errs := &workerErrors{}
   264  
   265  	qp := quotapool.NewIntPool("cloud cpu", uint64(clustersOpt.cpuQuota))
   266  	l := lopt.l
   267  
   268  	var wg sync.WaitGroup
   269  	for i := 0; i < parallelism; i++ {
   270  		i := i // Copy for closure.
   271  		wg.Add(1)
   272  		stopper.RunWorker(ctx, func(ctx context.Context) {
   273  			defer wg.Done()
   274  
   275  			if err := r.runWorker(
   276  				ctx, fmt.Sprintf("w%d", i) /* name */, r.work, qp,
   277  				stopper.ShouldQuiesce(),
   278  				clustersOpt.keepClustersOnTestFailure,
   279  				lopt.artifactsDir, lopt.runnerLogPath, lopt.tee, lopt.stdout,
   280  				allocateCluster,
   281  				l,
   282  			); err != nil {
   283  				// A worker returned an error. Let's shut down.
   284  				msg := fmt.Sprintf("Worker %d returned with error. Quiescing. Error: %+v", i, err)
   285  				shout(ctx, l, lopt.stdout, msg)
   286  				errs.AddErr(err)
   287  				// Quiesce the stopper. This will cause all workers to not pick up more
   288  				// tests after finishing the currently running one.
   289  				stopper.Quiesce(ctx)
   290  				// Interrupt everybody waiting for resources.
   291  				if qp != nil {
   292  					qp.Close(msg)
   293  				}
   294  			}
   295  		})
   296  	}
   297  
   298  	// Wait for all the workers to finish.
   299  	wg.Wait()
   300  	r.cr.destroyAllClusters(ctx, l)
   301  
   302  	if errs.Err() != nil {
   303  		shout(ctx, l, lopt.stdout, "FAIL (err: %s)", errs.Err())
   304  		return errs.Err()
   305  	}
   306  	passFailLine := r.generateReport()
   307  	shout(ctx, l, lopt.stdout, passFailLine)
   308  	if len(r.status.fail) > 0 {
   309  		return fmt.Errorf("some tests failed")
   310  	}
   311  	return nil
   312  }
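
// runBoundedWorkers is a minimal, stdlib-only sketch (not part of the original
// file) of the concurrency shape used by Run above: `parallelism` workers drain
// a shared queue while a counting semaphore caps how much quota is in use at
// once. The real Run uses stop.Stopper for graceful shutdown and
// quotapool.IntPool for a CPU-denominated quota; here the quota is simply one
// slot per running unit of work to keep the sketch short and deadlock-free.
// All names are hypothetical.
func runBoundedWorkers(parallelism, quotaSlots int, work []string, run func(string)) {
	workCh := make(chan string, len(work))
	for _, w := range work {
		workCh <- w
	}
	close(workCh)

	quota := make(chan struct{}, quotaSlots) // buffered channel used as a counting semaphore
	var wg sync.WaitGroup
	for i := 0; i < parallelism; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for w := range workCh {
				quota <- struct{}{} // acquire one quota slot
				run(w)
				<-quota // release it
			}
		}()
	}
	wg.Wait()
}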
   313  
   314  type clusterAllocatorFn func(
   315  	ctx context.Context,
   316  	t testSpec,
   317  	alloc *quotapool.IntAlloc,
   318  	artifactsDir string,
   319  	wStatus *workerStatus,
   320  ) (*cluster, error)
   321  
   322  // runWorker runs tests in a loop until work is exhausted.
   323  //
   324  // Errors are returned in exceptional circumstances, like when a cluster failed
   325  // to be created or when a test timed out and failed to react to its
   326  // cancellation. Upon return, an attempt is performed to destroy the cluster used
    327  // by this worker. If an error is returned, we might have "leaked" CPU quota
    328  // because the cluster destruction might have failed but we've still released
    329  // the quota. Also, we might have "leaked" a test goroutine (if the test was
    330  // nonresponsive to its timeout) which might still be running and doing
    331  // arbitrary things to the cluster it was using.
   332  //
   333  // Args:
   334  // name: The worker's name, to be used as a prefix for log messages.
   335  // artifactsRootDir: The artifacts dir. Each test's logs are going to be under a
   336  //   run_<n> dir. If empty, test log files will not be created.
   337  // testRunnerLogPath: The path to the test runner's log. It will be copied to
   338  //  	failing tests' artifacts dir if running under TeamCity.
   339  // stdout: The Writer to use for messages that need to go to stdout (e.g. the
   340  // 	 "=== RUN" and "--- FAIL" lines).
   341  // teeOpt: The teeing option for future test loggers.
   342  // l: The logger to use for more verbose messages.
   343  func (r *testRunner) runWorker(
   344  	ctx context.Context,
   345  	name string,
   346  	work *workPool,
   347  	qp *quotapool.IntPool,
   348  	interrupt <-chan struct{},
   349  	debug bool,
   350  	artifactsRootDir string,
   351  	testRunnerLogPath string,
   352  	teeOpt teeOptType,
   353  	stdout io.Writer,
   354  	allocateCluster clusterAllocatorFn,
   355  	l *logger,
   356  ) error {
   357  	ctx = logtags.AddTag(ctx, name, nil /* value */)
   358  	wStatus := r.addWorker(ctx, name)
   359  	defer func() {
   360  		r.removeWorker(ctx, name)
   361  	}()
   362  
   363  	var c *cluster // The cluster currently being used.
   364  	var err error
   365  	// When this method returns we'll destroy the cluster we had at the time.
   366  	// Note that, if debug was set, c has been set to nil.
   367  	defer func() {
   368  		wStatus.SetTest(nil /* test */, testToRunRes{noWork: true})
   369  		wStatus.SetStatus("worker done")
   370  		wStatus.SetCluster(nil)
   371  
   372  		if c == nil {
   373  			return
   374  		}
   375  		doDestroy := ctx.Err() == nil
   376  		if doDestroy {
   377  			l.PrintfCtx(ctx, "Worker exiting; destroying cluster.")
   378  			c.Destroy(context.Background(), closeLogger, l)
   379  		} else {
   380  			l.PrintfCtx(ctx, "Worker exiting with canceled ctx. Not destroying cluster.")
   381  		}
   382  	}()
   383  
   384  	// Loop until there's no more work in the pool, we get interrupted, or an
   385  	// error occurs.
   386  	for {
   387  		select {
   388  		case <-interrupt:
   389  			l.ErrorfCtx(ctx, "worker detected interruption")
   390  			return errors.Errorf("interrupted")
   391  		default:
   392  			if ctx.Err() != nil {
   393  				// The context has been canceled. No need to continue.
   394  				return errors.Wrap(ctx.Err(), "worker ctx done")
   395  			}
   396  		}
   397  
   398  		if c != nil {
   399  			if _, ok := c.spec.ReusePolicy.(reusePolicyNone); ok {
   400  				wStatus.SetStatus("destroying cluster")
    401  				// We use a context that can't be canceled for the Destroy().
    402  				c.Destroy(context.Background(), closeLogger, l)
   403  				c = nil
   404  			}
   405  		}
   406  		var testToRun testToRunRes
   407  		wStatus.SetTest(nil /* test */, testToRunRes{})
   408  		wStatus.SetStatus("getting work")
   409  		testToRun, c, err = r.getWork(
   410  			ctx, work, qp, c, interrupt, l,
   411  			getWorkCallbacks{
   412  				createCluster: func(ctx context.Context, ttr testToRunRes) (*cluster, error) {
   413  					wStatus.SetTest(nil /* test */, ttr)
   414  					return allocateCluster(ctx, ttr.spec, ttr.alloc, artifactsRootDir, wStatus)
   415  				},
   416  				onDestroy: func() {
   417  					wStatus.SetCluster(nil)
   418  				},
   419  			})
   420  		if err != nil || testToRun.noWork {
   421  			return err
   422  		}
   423  		c.status("running test")
   424  
   425  		// Prepare the test's logger.
   426  		logPath := ""
   427  		var artifactsDir string
   428  		var artifactsSpec string
   429  		if artifactsRootDir != "" {
   430  			escapedTestName := teamCityNameEscape(testToRun.spec.Name)
   431  			runSuffix := "run_" + strconv.Itoa(testToRun.runNum)
   432  
   433  			base := filepath.Join(artifactsRootDir, escapedTestName)
   434  
   435  			artifactsDir = filepath.Join(base, runSuffix)
   436  			logPath = filepath.Join(artifactsDir, "test.log")
   437  
   438  			// Map artifacts/TestFoo/** => TestFoo/**, i.e. collect the artifacts
   439  			// for this test exactly as they are laid out on disk (when the time
   440  			// comes).
   441  			artifactsSpec = fmt.Sprintf("%s/** => %s", base, escapedTestName)
   442  		}
   443  		testL, err := rootLogger(logPath, teeOpt)
   444  		if err != nil {
   445  			return err
   446  		}
   447  		t := &test{
   448  			spec:          &testToRun.spec,
   449  			buildVersion:  r.buildVersion,
   450  			artifactsDir:  artifactsDir,
   451  			artifactsSpec: artifactsSpec,
   452  			l:             testL,
   453  		}
   454  		// Tell the cluster that, from now on, it will be run "on behalf of this
   455  		// test".
   456  		c.setTest(t)
   457  		wStatus.SetCluster(c)
   458  		wStatus.SetTest(t, testToRun)
   459  		wStatus.SetStatus("running test")
   460  
   461  		// Now run the test.
   462  		l.PrintfCtx(ctx, "starting test: %s:%d", testToRun.spec.Name, testToRun.runNum)
   463  		var success bool
   464  		success, err = r.runTest(ctx, t, testToRun.runNum, c, testRunnerLogPath, stdout, testL)
   465  		if err != nil {
   466  			shout(ctx, l, stdout, "test returned error: %s: %s", t.Name(), err)
   467  			// Mark the test as failed if it isn't already.
   468  			if !t.Failed() {
   469  				t.printAndFail(0 /* skip */, err)
   470  			}
   471  		} else {
   472  			msg := "test passed"
   473  			if !success {
   474  				msg = fmt.Sprintf("test failed: %s (run %d)", t.Name(), testToRun.runNum)
   475  			}
    476  			l.PrintfCtx(ctx, "%s", msg)
   477  		}
   478  		testL.close()
   479  		if err != nil || t.Failed() {
   480  			failureMsg := fmt.Sprintf("%s (%d) - ", testToRun.spec.Name, testToRun.runNum)
   481  			if err != nil {
   482  				failureMsg += fmt.Sprintf("%+v", err)
   483  			} else {
   484  				failureMsg += t.FailureMsg()
   485  			}
   486  
   487  			if debug {
   488  				// Save the cluster for future debugging.
   489  				c.Save(ctx, failureMsg, l)
   490  
   491  				// Continue with a fresh cluster.
   492  				c = nil
   493  			} else {
   494  				// On any test failure or error, we destroy the cluster. We could be
   495  				// more selective, but this sounds safer.
   496  				l.PrintfCtx(ctx, "destroying cluster %s because: %s", c, failureMsg)
   497  				c.Destroy(ctx, closeLogger, l)
   498  				c = nil
   499  			}
   500  
   501  			if err != nil {
   502  				return err
   503  			}
   504  		} else {
   505  			// Upon success fetch the perf artifacts from the remote hosts.
   506  			getPerfArtifacts(ctx, l, c, t)
   507  		}
   508  	}
   509  }
   510  
   511  // getPerfArtifacts retrieves the perf artifacts for the test.
   512  // If there's an error, oh well, don't do anything rash like fail a test
   513  // which already passed.
   514  func getPerfArtifacts(ctx context.Context, l *logger, c *cluster, t *test) {
   515  	g := ctxgroup.WithContext(ctx)
   516  	fetchNode := func(node int) func(context.Context) error {
   517  		return func(ctx context.Context) error {
   518  			testCmd := `'PERF_ARTIFACTS="` + perfArtifactsDir + `"
   519  if [[ -d "${PERF_ARTIFACTS}" ]]; then
   520      echo true
   521  elif [[ -e "${PERF_ARTIFACTS}" ]]; then
   522      ls -la "${PERF_ARTIFACTS}"
   523      exit 1
   524  else
   525      echo false
   526  fi'`
   527  			out, err := c.RunWithBuffer(ctx, l, c.Node(node), "bash", "-c", testCmd)
   528  			if err != nil {
   529  				return errors.Wrapf(err, "failed to check for perf artifacts: %v", string(out))
   530  			}
   531  			switch out := strings.TrimSpace(string(out)); out {
   532  			case "true":
   533  				dst := fmt.Sprintf("%s/%d.%s", t.artifactsDir, node, perfArtifactsDir)
   534  				return c.Get(ctx, l, perfArtifactsDir, dst, c.Node(node))
   535  			case "false":
   536  				l.PrintfCtx(ctx, "no perf artifacts exist on node %v", c.Node(node))
   537  				return nil
   538  			default:
   539  				return errors.Errorf("unexpected output when checking for perf artifacts: %s", out)
   540  			}
   541  		}
   542  	}
   543  	for _, i := range c.All() {
   544  		g.GoCtx(fetchNode(i))
   545  	}
   546  	if err := g.Wait(); err != nil {
   547  		l.PrintfCtx(ctx, "failed to get perf artifacts: %v", err)
   548  	}
   549  }
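
// fetchFromAllNodes is an illustrative sketch (not part of the original file)
// of the fan-out shape getPerfArtifacts uses: one goroutine per node via
// ctxgroup, with the first error returned by Wait(). The node list and the
// fetch callback are hypothetical placeholders for the per-node work above.
func fetchFromAllNodes(ctx context.Context, nodes []int, fetch func(context.Context, int) error) error {
	g := ctxgroup.WithContext(ctx)
	for _, node := range nodes {
		node := node // copy for the closure
		g.GoCtx(func(ctx context.Context) error {
			return fetch(ctx, node)
		})
	}
	// Wait returns the first error (if any) from the spawned goroutines.
	return g.Wait()
}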
   550  
   551  func allStacks() []byte {
    552  	// Collect up to 5MB worth of stacks.
   553  	b := make([]byte, 5*(1<<20))
   554  	return b[:runtime.Stack(b, true /* all */)]
   555  }
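
// allStacksGrowing is an illustrative alternative (not part of the original
// file) to the fixed 5MB buffer above: runtime.Stack reports how many bytes it
// wrote, so retrying with a doubled buffer until the dump fits avoids
// truncating very large goroutine dumps.
func allStacksGrowing() []byte {
	buf := make([]byte, 1<<20) // start with 1MB
	for {
		n := runtime.Stack(buf, true /* all */)
		if n < len(buf) {
			return buf[:n]
		}
		buf = make([]byte, 2*len(buf)) // possibly truncated; double and retry
	}
}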
   556  
    557  // runTest runs a single test on the given cluster. Returns true if the test is
    558  // considered to have passed. An error is returned if the test is still running
    559  // (on another goroutine) when this returns, i.e. it didn't respond to cancellation.
   560  //
   561  // Args:
   562  // c: The cluster on which the test will run. runTest() does not wipe or destroy
   563  //    the cluster.
   564  // testRunnerLogPath: The path to the test runner's log. It will be copied to
   565  //  	the test's artifacts dir if the test fails and we're running under
   566  //  	TeamCity.
   567  func (r *testRunner) runTest(
   568  	ctx context.Context,
   569  	t *test,
   570  	runNum int,
   571  	c *cluster,
   572  	testRunnerLogPath string,
   573  	stdout io.Writer,
   574  	l *logger,
   575  ) (bool, error) {
   576  	if t.spec.Skip != "" {
   577  		return false, fmt.Errorf("can't run skipped test: %s: %s", t.Name(), t.spec.Skip)
   578  	}
   579  
   580  	if teamCity {
   581  		shout(ctx, l, stdout, "##teamcity[testStarted name='%s' flowId='%s']", t.Name(), t.Name())
   582  	} else {
   583  		shout(ctx, l, stdout, "=== RUN   %s", t.Name())
   584  	}
   585  
   586  	r.status.Lock()
   587  	r.status.running[t] = struct{}{}
   588  	r.status.Unlock()
   589  
   590  	t.runner = callerName()
   591  	t.runnerID = goid.Get()
   592  
   593  	defer func() {
   594  		t.end = timeutil.Now()
   595  
   596  		if err := recover(); err != nil {
   597  			t.mu.Lock()
   598  			t.mu.failed = true
   599  			t.mu.output = append(t.mu.output, t.decorate(0 /* skip */, fmt.Sprint(err))...)
   600  			t.mu.Unlock()
   601  		}
   602  
   603  		t.mu.Lock()
   604  		t.mu.done = true
   605  		t.mu.Unlock()
   606  
   607  		durationStr := fmt.Sprintf("%.2fs", t.duration().Seconds())
   608  		if t.Failed() {
   609  			t.mu.Lock()
   610  			output := fmt.Sprintf("test artifacts and logs in: %s\n", t.ArtifactsDir()) + string(t.mu.output)
   611  			failLoc := t.mu.failLoc
   612  			t.mu.Unlock()
   613  
   614  			if teamCity {
   615  				shout(ctx, l, stdout, "##teamcity[testFailed name='%s' details='%s' flowId='%s']",
   616  					t.Name(), teamCityEscape(output), t.Name(),
   617  				)
   618  
    619  				// Copy a snapshot of the test runner's log to the test's artifacts dir
   620  				// so that we collect it below.
   621  				cp := exec.Command("cp", testRunnerLogPath, t.artifactsDir)
   622  				if err := cp.Run(); err != nil {
   623  					l.ErrorfCtx(ctx, "failed to copy test runner's logs to test artifacts: %s", err)
   624  				}
   625  			}
   626  
   627  			shout(ctx, l, stdout, "--- FAIL: %s (%s)\n%s", t.Name(), durationStr, output)
   628  			// NB: check NodeCount > 0 to avoid posting issues from this pkg's unit tests.
   629  			if issues.CanPost() && t.spec.Run != nil && t.spec.Cluster.NodeCount > 0 {
   630  				authorEmail := getAuthorEmail(t.spec.Tags, failLoc.file, failLoc.line)
   631  				if authorEmail == "" {
   632  					ownerInfo, ok := roachtestOwners[t.spec.Owner]
   633  					if ok {
   634  						authorEmail = ownerInfo.ContactEmail
   635  					}
   636  				}
   637  				branch := "<unknown branch>"
   638  				if b := os.Getenv("TC_BUILD_BRANCH"); b != "" {
   639  					branch = b
   640  				}
   641  				msg := fmt.Sprintf("The test failed on branch=%s, cloud=%s:\n%s",
   642  					branch, cloud, output)
   643  				artifacts := fmt.Sprintf("/%s", t.Name())
   644  
   645  				req := issues.PostRequest{
   646  					// TODO(tbg): actually use this as a template.
   647  					TitleTemplate: fmt.Sprintf("roachtest: %s failed", t.Name()),
   648  					// TODO(tbg): make a template better adapted to roachtest.
   649  					BodyTemplate: issues.UnitTestFailureBody,
   650  					PackageName:  "roachtest",
   651  					TestName:     t.Name(),
   652  					Message:      msg,
   653  					Artifacts:    artifacts,
   654  					AuthorEmail:  authorEmail,
   655  					// Issues posted from roachtest are identifiable as such and
   656  					// they are also release blockers (this label may be removed
   657  					// by a human upon closer investigation).
   658  					ExtraLabels: []string{"O-roachtest", "release-blocker"},
   659  				}
   660  				if err := issues.Post(
   661  					context.Background(),
   662  					req,
   663  				); err != nil {
   664  					shout(ctx, l, stdout, "failed to post issue: %s", err)
   665  				}
   666  			}
   667  		} else {
   668  			shout(ctx, l, stdout, "--- PASS: %s (%s)", t.Name(), durationStr)
    669  			// If `##teamcity[testFailed ...]` is not present before `##teamcity[testFinished ...]`,
   670  			// TeamCity regards the test as successful.
   671  		}
   672  
   673  		if teamCity {
   674  			shout(ctx, l, stdout, "##teamcity[testFinished name='%s' flowId='%s']", t.Name(), t.Name())
   675  
   676  			if t.artifactsSpec != "" {
   677  				// Tell TeamCity to collect this test's artifacts now. The TC job
   678  				// also collects the artifacts directory wholesale at the end, but
   679  				// here we make sure that the artifacts for any test that has already
   680  				// finished are available in the UI even before the job as a whole
   681  				// has completed. We're using the exact same destination to avoid
   682  				// duplication of any of the artifacts.
   683  				shout(ctx, l, stdout, "##teamcity[publishArtifacts '%s']", t.artifactsSpec)
   684  			}
   685  		}
   686  
   687  		r.recordTestFinish(completedTestInfo{
   688  			test:    t.Name(),
   689  			run:     runNum,
   690  			start:   t.start,
   691  			end:     t.end,
   692  			pass:    !t.Failed(),
   693  			failure: t.FailureMsg(),
   694  		})
   695  		r.status.Lock()
   696  		delete(r.status.running, t)
   697  		// Only include tests with a Run function in the summary output.
   698  		if t.spec.Run != nil {
   699  			if t.Failed() {
   700  				r.status.fail[t] = struct{}{}
   701  			} else if t.spec.Skip == "" {
   702  				r.status.pass[t] = struct{}{}
   703  			} else {
   704  				r.status.skip[t] = struct{}{}
   705  			}
   706  		}
   707  		r.status.Unlock()
   708  	}()
   709  
   710  	t.start = timeutil.Now()
   711  
   712  	timeout := 10 * time.Hour
   713  	if t.spec.Timeout != 0 {
   714  		timeout = t.spec.Timeout
   715  	}
   716  	// Make sure the cluster has enough life left for the test plus enough headroom
   717  	// after the test finishes so that the next test can be selected. If it
   718  	// doesn't, extend it.
   719  	minExp := timeutil.Now().Add(timeout + time.Hour)
   720  	if c.expiration.Before(minExp) {
   721  		extend := minExp.Sub(c.expiration)
   722  		l.PrintfCtx(ctx, "cluster needs to survive until %s, but has expiration: %s. Extending.",
   723  			minExp, c.expiration)
   724  		if err := c.Extend(ctx, extend, l); err != nil {
   725  			t.printfAndFail(0 /* skip */, "failed to extend cluster: %s", err)
   726  			return false, nil
   727  		}
   728  	}
   729  
   730  	runCtx, cancel := context.WithCancel(ctx)
   731  	defer cancel()
   732  	t.mu.Lock()
   733  	// t.Fatal() will cancel this context.
   734  	t.mu.cancel = cancel
   735  	t.mu.Unlock()
   736  
   737  	// We run the actual test in a different goroutine because it might call
   738  	// t.Fatal() which kills the goroutine, and also because we want to enforce a
   739  	// timeout.
   740  	success := false
   741  	done := make(chan struct{})
   742  	go func() {
   743  		defer close(done) // closed only after we've grabbed the debug info below
   744  
   745  		// This is the call to actually run the test.
   746  		defer func() {
   747  			if r := recover(); r != nil {
   748  				// TODO(andreimatei): prevent the cluster from being reused.
   749  				t.Fatalf("test panicked: %v", r)
   750  			}
   751  		}()
   752  
   753  		t.spec.Run(runCtx, t, c)
   754  	}()
   755  
   756  	select {
   757  	case <-done:
   758  	case <-time.After(timeout):
    759  		// Timeouts are often opaque. Improve our chances by dumping the stack
   760  		// so that at least we can piece together what the test is trying to
   761  		// do at this very moment.
   762  		// In large roachtest runs, this will dump everyone else's stack as well,
   763  		// but this is just hard to avoid. Moving to a worker model in which
   764  		// each roachtest is spawned off as a separate process would address
   765  		// this at the expense of ease of communication between the runner and
   766  		// the test.
   767  		//
   768  		// Do this before we cancel the context, which might (hopefully) unblock
   769  		// the test. We want to see where it got stuck.
   770  		const stacksFile = "__stacks"
   771  		if cl, err := t.l.ChildLogger(stacksFile, quietStderr, quietStdout); err == nil {
   772  			cl.PrintfCtx(ctx, "all stacks:\n\n%s\n", allStacks())
   773  			t.l.PrintfCtx(ctx, "dumped stacks to %s", stacksFile)
   774  		}
   775  		// Now kill anything going on in this cluster while collecting stacks
   776  		// to the logs, to get the server side of the hang.
   777  		//
   778  		// TODO(tbg): send --sig=3 followed by a hard kill after we've fixed
   779  		// https://github.com/cockroachdb/cockroach/issues/45875.
   780  		// Signal 11 will dump stacks, but it might be confusing to folks
   781  		// who debug from the artifacts only.
   782  		//
    783  		// Don't use the surrounding context, which is likely already canceled.
    784  		if nodes := c.All(); len(nodes) > 0 { // guard against clusters with no nodes (unit tests)
   785  			innerCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
   786  			_ = c.StopE(innerCtx, c.All(), stopArgs("--sig=11"))
   787  			cancel()
   788  		}
   789  
   790  		// Mark the test as failed (which will also cancel its context). Then
   791  		// we'll wait up to 5 minutes in the hope that the test reacts either to
   792  		// the ctx cancelation or to the fact that it was marked as failed (and
   793  		// all processes nuked out from under it).
   794  		// If that happens, great - we return normally and so the cluster can be
    795  		// reused. If the test does not react to anything, then we return an
   796  		// error, which will cause the caller to stop everything and destroy
   797  		// this cluster (as well as all the others). The cluster cannot be
   798  		// reused since we have a runaway test goroutine that's presumably going
   799  		// to continue using the cluster.
   800  		t.printfAndFail(0 /* skip */, "test timed out (%s)", timeout)
   801  		select {
   802  		case <-done:
   803  			if success {
   804  				panic("expected success=false after a timeout")
   805  			}
   806  		case <-time.After(5 * time.Minute):
   807  			// We really shouldn't get here unless the test code somehow managed
   808  			// to deadlock without blocking on anything remote - since we killed
   809  			// everything.
   810  			const msg = "test timed out and afterwards failed to respond to cancelation"
   811  			t.l.PrintfCtx(ctx, msg)
   812  			r.collectClusterLogs(ctx, c, t.l)
   813  			// We return an error here because the test goroutine is still running, so
   814  			// we want to alert the caller of this unusual situation.
   815  			return false, errors.New(msg)
   816  		}
   817  	}
   818  
   819  	// Detect replica divergence (i.e. ranges in which replicas have arrived
   820  	// at the same log position with different states).
   821  	c.FailOnReplicaDivergence(ctx, t)
    822  	// Detect dead nodes. Note that this will call t.printfAndFail() when
    823  	// appropriate, which will cause the code below to enter the t.Failed()
    824  	// branch.
   825  	c.FailOnDeadNodes(ctx, t)
   826  
   827  	if t.Failed() {
   828  		r.collectClusterLogs(ctx, c, t.l)
   829  		return false, nil
   830  	}
   831  	return true, nil
   832  }
   833  
   834  func (r *testRunner) collectClusterLogs(ctx context.Context, c *cluster, l *logger) {
   835  	// NB: fetch the logs even when we have a debug zip because
   836  	// debug zip can't ever get the logs for down nodes.
   837  	// We only save artifacts for failed tests in CI, so this
   838  	// duplication is acceptable.
   839  	// NB: fetch the logs *first* in case one of the other steps
   840  	// below has problems. For example, `debug zip` is known to
   841  	// hang sometimes at the time of writing, see:
   842  	// https://github.com/cockroachdb/cockroach/issues/39620
   843  	l.PrintfCtx(ctx, "collecting cluster logs")
   844  	if err := c.FetchLogs(ctx); err != nil {
   845  		l.Printf("failed to download logs: %s", err)
   846  	}
   847  	if err := c.FetchDmesg(ctx); err != nil {
   848  		l.Printf("failed to fetch dmesg: %s", err)
   849  	}
   850  	if err := c.FetchJournalctl(ctx); err != nil {
   851  		l.Printf("failed to fetch journalctl: %s", err)
   852  	}
   853  	if err := c.FetchCores(ctx); err != nil {
   854  		l.Printf("failed to fetch cores: %s", err)
   855  	}
   856  	if err := c.CopyRoachprodState(ctx); err != nil {
   857  		l.Printf("failed to copy roachprod state: %s", err)
   858  	}
   859  	if err := c.FetchDebugZip(ctx); err != nil {
   860  		l.Printf("failed to collect zip: %s", err)
   861  	}
   862  }
   863  
   864  func callerName() string {
   865  	// Make room for the skip PC.
   866  	var pc [2]uintptr
   867  	n := runtime.Callers(2, pc[:]) // runtime.Callers + callerName
   868  	if n == 0 {
   869  		panic("zero callers found")
   870  	}
   871  	frames := runtime.CallersFrames(pc[:n])
   872  	frame, _ := frames.Next()
   873  	return frame.Function
   874  }
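
// callerAt is an illustrative helper (not part of the original file) spelling
// out the skip arithmetic behind callerName: for runtime.Callers, skip=0 is the
// runtime.Callers frame itself and skip=1 is its caller, which is why
// callerName passes 2 to report its own caller. Here skip=0 means "the caller
// of callerAt".
func callerAt(skip int) string {
	var pc [1]uintptr
	n := runtime.Callers(skip+2, pc[:]) // +2 skips runtime.Callers and callerAt
	if n == 0 {
		return "unknown"
	}
	frame, _ := runtime.CallersFrames(pc[:n]).Next()
	return frame.Function
}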
   875  
    876  // generateReport produces the final pass/fail line and posts a Slack report
    877  // if configured.
   878  func (r *testRunner) generateReport() string {
   879  	r.status.Lock()
   880  	defer r.status.Unlock()
   881  	postSlackReport(r.status.pass, r.status.fail, r.status.skip)
   882  
   883  	fails := len(r.status.fail)
   884  	var msg string
   885  	if fails > 0 {
   886  		msg = fmt.Sprintf("FAIL (%d fails)\n", fails)
   887  	} else {
   888  		msg = "PASS"
   889  	}
   890  	return msg
   891  }
   892  
   893  type getWorkCallbacks struct {
   894  	createCluster func(context.Context, testToRunRes) (*cluster, error)
   895  	onDestroy     func()
   896  }
   897  
   898  // getWork selects the next test to run and creates a suitable cluster for it if
   899  // need be. If a new cluster needs to be created, the method blocks until there
   900  // are enough resources available to run it.
   901  // getWork takes in a cluster; if not nil, tests that can reuse it are
   902  // preferred. If a test that can reuse it is not found (or if there's no more
   903  // work), the cluster is destroyed (and so its resources are released).
   904  //
   905  // If the cluster is to be reused, getWork() wipes it.
   906  func (r *testRunner) getWork(
   907  	ctx context.Context,
   908  	work *workPool,
   909  	qp *quotapool.IntPool,
   910  	c *cluster,
   911  	interrupt <-chan struct{},
   912  	l *logger,
   913  	callbacks getWorkCallbacks,
   914  ) (testToRunRes, *cluster, error) {
   915  
   916  	select {
   917  	case <-interrupt:
   918  		return testToRunRes{}, nil, fmt.Errorf("interrupted")
   919  	default:
   920  	}
   921  
   922  	testToRun, err := work.getTestToRun(ctx, c, qp, r.cr, callbacks.onDestroy, l)
   923  	if err != nil {
   924  		return testToRunRes{}, nil, err
   925  	}
   926  	if !testToRun.noWork {
   927  		l.PrintfCtx(ctx, "Selected test: %s run: %d.", testToRun.spec.Name, testToRun.runNum)
   928  	}
   929  	// Are we done?
   930  	if testToRun.noWork {
   931  		return testToRun, nil, nil
   932  	}
   933  
    934  	// Reuse the existing cluster if we can; otherwise create a new one.
   935  	if testToRun.canReuseCluster {
   936  		l.PrintfCtx(ctx, "Using existing cluster: %s. Wiping", c.name)
   937  		if err := c.WipeE(ctx, l); err != nil {
   938  			return testToRunRes{}, nil, err
   939  		}
   940  		if err := c.RunL(ctx, l, c.All(), "rm -rf "+perfArtifactsDir); err != nil {
   941  			return testToRunRes{}, nil, errors.Wrapf(err, "failed to remove perf artifacts dir")
   942  		}
   943  		// Overwrite the spec of the cluster with the one coming from the test. In
   944  		// particular, this overwrites the reuse policy to reflect what the test
   945  		// intends to do with it.
   946  		c.spec = testToRun.spec.Cluster
   947  	} else {
   948  		var err error
   949  		c, err = callbacks.createCluster(ctx, testToRun)
   950  		if err != nil {
   951  			return testToRunRes{}, nil, err
   952  		}
   953  	}
   954  	return testToRun, c, nil
   955  }
   956  
   957  // addWorker updates the bookkeeping for one more worker.
   958  func (r *testRunner) addWorker(ctx context.Context, name string) *workerStatus {
   959  	r.workersMu.Lock()
   960  	defer r.workersMu.Unlock()
   961  	w := &workerStatus{name: name}
   962  	if _, ok := r.workersMu.workers[name]; ok {
   963  		log.Fatalf(ctx, "worker %q already exists", name)
   964  	}
   965  	r.workersMu.workers[name] = w
   966  	return w
   967  }
   968  
    969  // removeWorker deletes the bookkeeping for a worker that has finished running.
   970  func (r *testRunner) removeWorker(ctx context.Context, name string) {
   971  	r.workersMu.Lock()
   972  	delete(r.workersMu.workers, name)
   973  	r.workersMu.Unlock()
   974  }
   975  
   976  // runHTTPServer starts a server running in the background.
   977  //
   978  // httpPort: The port on which to serve the web interface. Pass 0 for allocating
   979  // 	 a port automatically (which will be printed to stdout).
   980  func (r *testRunner) runHTTPServer(httpPort int, stdout io.Writer) error {
   981  	http.HandleFunc("/", r.serveHTTP)
   982  	// Run an http server in the background.
   983  	// We handle the case where httpPort is 0, which means we automatically
   984  	// allocate a port.
   985  	listener, err := net.Listen("tcp", fmt.Sprintf(":%d", httpPort))
   986  	if err != nil {
   987  		return err
   988  	}
   989  	httpPort = listener.Addr().(*net.TCPAddr).Port
   990  	go func() {
   991  		if err := http.Serve(listener, nil /* handler */); err != nil {
   992  			panic(err)
   993  		}
   994  	}()
   995  	fmt.Fprintf(stdout, "HTTP server listening on all network interfaces, port %d.\n", httpPort)
   996  	return nil
   997  }
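
// listenOnFreePort is an illustrative sketch (not part of the original file) of
// the port-0 trick used by runHTTPServer: ask the kernel for any free TCP port,
// then read back the port it actually assigned from the listener's address.
func listenOnFreePort() (net.Listener, int, error) {
	listener, err := net.Listen("tcp", ":0")
	if err != nil {
		return nil, 0, err
	}
	port := listener.Addr().(*net.TCPAddr).Port // the concrete port chosen by the kernel
	return listener, port, nil
}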
   998  
   999  // serveHTTP is the handler for the test runner's web server.
  1000  func (r *testRunner) serveHTTP(wr http.ResponseWriter, req *http.Request) {
  1001  	fmt.Fprintf(wr, "<html><body>")
  1002  	fmt.Fprintf(wr, "<a href='debug/pprof'>pprof</a>")
  1003  	fmt.Fprintf(wr, "<p>")
  1004  	// Print the workers report.
  1005  	fmt.Fprintf(wr, "<h2>Workers:</h2>")
  1006  	fmt.Fprintf(wr, `<table border='1'>
  1007  	<tr><th>Worker</th>
  1008  	<th>Worker Status</th>
  1009  	<th>Test</th>
  1010  	<th>Cluster</th>
  1011  	<th>Cluster reused</th>
  1012  	<th>Test Status</th>
  1013  	</tr>`)
  1014  	r.workersMu.Lock()
  1015  	workers := make([]*workerStatus, len(r.workersMu.workers))
  1016  	i := 0
  1017  	for _, w := range r.workersMu.workers {
  1018  		workers[i] = w
  1019  		i++
  1020  	}
  1021  	r.workersMu.Unlock()
  1022  	sort.Slice(workers, func(i int, j int) bool {
  1023  		l := workers[i]
  1024  		r := workers[j]
  1025  		return strings.Compare(l.name, r.name) < 0
  1026  	})
  1027  	for _, w := range workers {
  1028  		var testName string
  1029  		ttr := w.TestToRun()
  1030  		clusterReused := ""
  1031  		if ttr.noWork {
  1032  			testName = "done"
  1033  		} else if ttr.spec.Name == "" {
  1034  			testName = "N/A"
  1035  		} else {
  1036  			testName = fmt.Sprintf("%s (run %d)", ttr.spec.Name, ttr.runNum)
  1037  			if ttr.canReuseCluster {
  1038  				clusterReused = "yes"
  1039  			} else {
  1040  				clusterReused = "no"
  1041  			}
  1042  		}
  1043  		var clusterName string
  1044  		if w.Cluster() != nil {
  1045  			clusterName = w.Cluster().name
  1046  		}
  1047  		t := w.Test()
  1048  		testStatus := "N/A"
  1049  		if t != nil {
  1050  			testStatus = t.GetStatus()
  1051  		}
  1052  		fmt.Fprintf(wr, "<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>\n",
  1053  			w.name, w.Status(), testName, clusterName, clusterReused, testStatus)
  1054  	}
  1055  	fmt.Fprintf(wr, "</table>")
  1056  
  1057  	// Print the finished tests report.
  1058  	fmt.Fprintf(wr, "<p>")
  1059  	fmt.Fprintf(wr, "<h2>Finished tests:</h2>")
  1060  	fmt.Fprintf(wr, `<table border='1'>
  1061  	<tr><th>Test</th>
  1062  	<th>Status</th>
  1063  	<th>Duration</th>
  1064  	</tr>`)
  1065  	for _, t := range r.getCompletedTests() {
  1066  		name := fmt.Sprintf("%s (run %d)", t.test, t.run)
  1067  		status := "PASS"
  1068  		if !t.pass {
  1069  			status = "FAIL " + t.failure
  1070  		}
  1071  		duration := fmt.Sprintf("%s (%s - %s)", t.end.Sub(t.start), t.start, t.end)
   1072  		fmt.Fprintf(wr, "<tr><td>%s</td><td>%s</td><td>%s</td></tr>", name, status, duration)
  1073  	}
  1074  	fmt.Fprintf(wr, "</table>")
  1075  
  1076  	// Print the saved clusters report.
  1077  	fmt.Fprintf(wr, "<p>")
  1078  	fmt.Fprintf(wr, "<h2>Clusters left alive for further debugging "+
  1079  		"(if --debug was specified):</h2>")
  1080  	fmt.Fprintf(wr, `<table border='1'>
  1081  	<tr><th>Cluster</th>
  1082  	<th>Test</th>
  1083  	</tr>`)
  1084  	for _, c := range r.cr.savedClusters() {
   1085  		fmt.Fprintf(wr, "<tr><td>%s</td><td>%s</td></tr>", c.name, c.savedMsg)
  1086  	}
  1087  	fmt.Fprintf(wr, "</table>")
  1088  
  1089  	fmt.Fprintf(wr, "<p>")
  1090  	fmt.Fprintf(wr, "<h2>Tests left:</h2>")
  1091  	fmt.Fprintf(wr, `<table border='1'>
  1092  	<tr><th>Test</th>
  1093  	<th>Runs</th>
  1094  	</tr>`)
  1095  	for _, t := range r.work.workRemaining() {
   1096  		fmt.Fprintf(wr, "<tr><td>%s</td><td>%d</td></tr>", t.spec.Name, t.count)
   1097  	}
   1098  	fmt.Fprintf(wr, "</table>")
   1099  	fmt.Fprintf(wr, "</body></html>")
  1100  }
  1101  
   1102  // recordTestFinish updates bookkeeping when a test finishes.
  1103  func (r *testRunner) recordTestFinish(info completedTestInfo) {
  1104  	r.completedTestsMu.Lock()
  1105  	r.completedTestsMu.completed = append(r.completedTestsMu.completed, info)
  1106  	r.completedTestsMu.Unlock()
  1107  }
  1108  
  1109  // getCompletedTests returns info on all tests that finished running.
  1110  func (r *testRunner) getCompletedTests() []completedTestInfo {
  1111  	r.completedTestsMu.Lock()
  1112  	defer r.completedTestsMu.Unlock()
  1113  	res := make([]completedTestInfo, len(r.completedTestsMu.completed))
  1114  	copy(res, r.completedTestsMu.completed)
  1115  	return res
  1116  }
  1117  
  1118  // completedTestInfo represents information on a completed test run.
  1119  type completedTestInfo struct {
  1120  	test    string
  1121  	run     int
  1122  	start   time.Time
  1123  	end     time.Time
  1124  	pass    bool
  1125  	failure string
  1126  }
  1127  
  1128  // PredecessorVersion returns a recent predecessor of the build version (i.e.
  1129  // the build tag of the main binary). For example, if the running binary is from
  1130  // the master branch prior to releasing 19.2.0, this will return a recent
  1131  // (ideally though not necessarily the latest) 19.1 patch release.
  1132  func PredecessorVersion(buildVersion version.Version) (string, error) {
  1133  	if buildVersion == (version.Version{}) {
  1134  		return "", errors.Errorf("buildVersion not set")
  1135  	}
  1136  
  1137  	buildVersionMajorMinor := fmt.Sprintf("%d.%d", buildVersion.Major(), buildVersion.Minor())
  1138  
  1139  	// NB: you can update the values in this map to point at newer patch
  1140  	// releases. You will need to run acceptance/version-upgrade with the
  1141  	// checkpoint option enabled to create the missing store directory fixture
  1142  	// (see runVersionUpgrade). The same is true for adding a new key to this
  1143  	// map.
  1144  	verMap := map[string]string{
  1145  		"20.2": "20.1.0",
  1146  		"20.1": "19.2.7",
  1147  		"19.2": "19.1.5",
  1148  		"19.1": "2.1.9",
  1149  		"2.2":  "2.1.9",
  1150  		"2.1":  "2.0.7",
  1151  	}
  1152  	v, ok := verMap[buildVersionMajorMinor]
  1153  	if !ok {
  1154  		return "", errors.Errorf("prev version not set for version: %s", buildVersionMajorMinor)
  1155  	}
  1156  	return v, nil
  1157  }
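
// examplePredecessor is an illustrative sketch (not part of the original file)
// of how a caller might consume PredecessorVersion: look up the fixture release
// for the binary under test and fail fast when verMap has no entry for its
// major.minor version. The function name is hypothetical.
func examplePredecessor(buildVersion version.Version) (string, error) {
	pred, err := PredecessorVersion(buildVersion)
	if err != nil {
		// e.g. "prev version not set for version: 21.1"
		return "", err
	}
	// pred is a release tag such as "19.2.7", typically used to download the
	// matching cockroach binary and its store fixtures.
	return pred, nil
}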
  1158  
  1159  type workerErrors struct {
  1160  	mu struct {
  1161  		syncutil.Mutex
  1162  		errs []error
  1163  	}
  1164  }
  1165  
  1166  func (we *workerErrors) AddErr(err error) {
  1167  	we.mu.Lock()
  1168  	defer we.mu.Unlock()
  1169  	we.mu.errs = append(we.mu.errs, err)
  1170  }
  1171  
  1172  func (we *workerErrors) Err() error {
  1173  	we.mu.Lock()
  1174  	defer we.mu.Unlock()
  1175  	if len(we.mu.errs) == 0 {
  1176  		return nil
  1177  	}
  1178  	// TODO(andrei): Maybe we should do something other than return the first
  1179  	// error...
  1180  	return we.mu.errs[0]
  1181  }