github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachtest/test.go

github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachtest/test.go (about)

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package main
    12  
    13  import (
    14  	"bytes"
    15  	"context"
    16  	"fmt"
    17  	"io"
    18  	// For the debug http handlers.
    19  	_ "net/http/pprof"
    20  	"os/exec"
    21  	"regexp"
    22  	"runtime"
    23  	"strings"
    24  	"time"
    25  
    26  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    27  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    28  	"github.com/cockroachdb/cockroach/pkg/util/version"
    29  	"github.com/petermattis/goid"
    30  )
    31  
    32  type testSpec struct {
    33  	Skip string // if non-empty, test will be skipped
    34  	// When Skip is set, this can contain more text to be printed in the logs
    35  	// after the "--- SKIP" line.
    36  	SkipDetails string
    37  
    38  	Name string
    39  	// Owner is the name of the team responsible for signing off on failures of
    40  	// this test that happen in the release process. This must be one of a limited
    41  	// set of values (the keys in the roachtestTeams map).
    42  	Owner Owner
    43  	// The maximum duration the test is allowed to run before it is considered
    44  	// failed. If not specified, the default timeout is 10m before the test's
    45  	// associated cluster expires. The timeout is always truncated to 10m before
    46  	// the test's cluster expires.
    47  	Timeout time.Duration
    48  	// MinVersion indicates the minimum cockroach version that is required for
    49  	// the test to be run. If MinVersion is less than the version specified
    50  	// --cockroach-version, Skip will be populated causing the test to be
    51  	// skipped.
    52  	MinVersion string
    53  	minVersion *version.Version
    54  	// Tags is a set of tags associated with the test that allow grouping
    55  	// tests. If no tags are specified, the set ["default"] is automatically
    56  	// given.
    57  	Tags []string
    58  	// Cluster provides the specification for the cluster to use for the test.
    59  	Cluster clusterSpec
    60  
    61  	// UseIOBarrier controls the local-ssd-no-ext4-barrier flag passed to
    62  	// roachprod when creating a cluster. If set, the flag is not passed, and so
    63  	// you get durable writes. If not set (the default!), the filesystem is
    64  	// mounted without the barrier.
    65  	//
    66  	// The default (false) is chosen because it the no-barrier option is needed
    67  	// explicitly by some tests (particularly benchmarks, ironically, since they'd
    68  	// rather measure other things than I/O) and the vast majority of other tests
    69  	// don't care - there's no durability across machine crashes that roachtests
    70  	// care about.
    71  	UseIOBarrier bool
    72  
    73  	// Run is the test function.
    74  	Run func(ctx context.Context, t *test, c *cluster)
    75  }
    76  
    77  // perfArtifactsDir is the directory on cluster nodes in which perf artifacts
    78  // reside. Upon success this directory is copied into test artifactsDir from
    79  // each node in the cluster.
    80  const perfArtifactsDir = "perf"
    81  
    82  // matchOrSkip returns true if the filter matches the test. If the filter does
    83  // not match the test because the tag filter does not match, the test is
    84  // matched, but marked as skipped.
    85  func (t *testSpec) matchOrSkip(filter *testFilter) bool {
    86  	if !filter.name.MatchString(t.Name) {
    87  		return false
    88  	}
    89  	if len(t.Tags) == 0 {
    90  		if !filter.tag.MatchString("default") {
    91  			t.Skip = fmt.Sprintf("%s does not match [default]", filter.rawTag)
    92  		}
    93  		return true
    94  	}
    95  	for _, t := range t.Tags {
    96  		if filter.tag.MatchString(t) {
    97  			return true
    98  		}
    99  	}
   100  	t.Skip = fmt.Sprintf("%s does not match %s", filter.rawTag, t.Tags)
   101  	return true
   102  }
   103  
   104  type testStatus struct {
   105  	msg      string
   106  	time     time.Time
   107  	progress float64
   108  }
   109  
   110  type test struct {
   111  	spec *testSpec
   112  
   113  	// buildVersion is the version of the Cockroach binary that the test will run
   114  	// against.
   115  	buildVersion version.Version
   116  
   117  	// l is the logger that the test will use for its output.
   118  	l *logger
   119  
   120  	runner   string
   121  	runnerID int64
   122  	start    time.Time
   123  	end      time.Time
   124  
   125  	// artifactsDir is the path to the directory holding all the artifacts for
   126  	// this test. It will contain a test.log file and cluster logs.
   127  	artifactsDir string
   128  	// artifactsSpec is a TeamCity artifacts spec used to publish this test's
   129  	// artifacts. See:
   130  	// https://www.jetbrains.com/help/teamcity/2019.1/configuring-general-settings.html#Artifact-Paths
   131  	artifactsSpec string
   132  
   133  	mu struct {
   134  		syncutil.RWMutex
   135  		done   bool
   136  		failed bool
   137  		// cancel, if set, is called from the t.Fatal() family of functions when the
   138  		// test is being marked as failed (i.e. when the failed field above is also
   139  		// set). This is used to cancel the context passed to t.spec.Run(), so async
   140  		// test goroutines can be notified.
   141  		cancel  func()
   142  		failLoc struct {
   143  			file string
   144  			line int
   145  		}
   146  		failureMsg string
   147  		status     map[int64]testStatus
   148  		output     []byte
   149  	}
   150  }
   151  
   152  func (t *test) Name() string {
   153  	return t.spec.Name
   154  }
   155  
   156  func (t *test) logger() *logger {
   157  	return t.l
   158  }
   159  
   160  func (t *test) status(ctx context.Context, id int64, args ...interface{}) {
   161  	t.mu.Lock()
   162  	defer t.mu.Unlock()
   163  
   164  	if t.mu.status == nil {
   165  		t.mu.status = make(map[int64]testStatus)
   166  	}
   167  	if len(args) == 0 {
   168  		delete(t.mu.status, id)
   169  		return
   170  	}
   171  	msg := fmt.Sprint(args...)
   172  	t.mu.status[id] = testStatus{
   173  		msg:  msg,
   174  		time: timeutil.Now(),
   175  	}
   176  	if !t.l.closed() {
   177  		if id == t.runnerID {
   178  			t.l.PrintfCtxDepth(ctx, 2, "test status: %s", msg)
   179  		} else {
   180  			t.l.PrintfCtxDepth(ctx, 2, "test worker status: %s", msg)
   181  		}
   182  	}
   183  }
   184  
   185  // Status sets the main status message for the test. When called from the main
   186  // test goroutine (i.e. the goroutine on which testSpec.Run is invoked), this
   187  // is equivalent to calling WorkerStatus. If no arguments are specified, the
   188  // status message is erased.
   189  func (t *test) Status(args ...interface{}) {
   190  	t.status(context.TODO(), t.runnerID, args...)
   191  }
   192  
   193  // GetStatus returns the status of the tests's main goroutine.
   194  func (t *test) GetStatus() string {
   195  	t.mu.Lock()
   196  	defer t.mu.Unlock()
   197  	status, ok := t.mu.status[t.runnerID]
   198  	if ok {
   199  		return fmt.Sprintf("%s (set %s ago)", status.msg, timeutil.Now().Sub(status.time).Round(time.Second))
   200  	}
   201  	return "N/A"
   202  }
   203  
   204  // WorkerStatus sets the status message for a worker goroutine associated with
   205  // the test. The status message should be cleared before the goroutine exits by
   206  // calling WorkerStatus with no arguments.
   207  func (t *test) WorkerStatus(args ...interface{}) {
   208  	t.status(context.TODO(), goid.Get(), args...)
   209  }
   210  
   211  func (t *test) progress(id int64, frac float64) {
   212  	t.mu.Lock()
   213  	defer t.mu.Unlock()
   214  
   215  	if t.mu.status == nil {
   216  		t.mu.status = make(map[int64]testStatus)
   217  	}
   218  	status := t.mu.status[id]
   219  	status.progress = frac
   220  	t.mu.status[id] = status
   221  }
   222  
   223  // Progress sets the progress (a fraction in the range [0,1]) associated with
   224  // the main test status messasge. When called from the main test goroutine
   225  // (i.e. the goroutine on which testSpec.Run is invoked), this is equivalent to
   226  // calling WorkerProgress.
   227  func (t *test) Progress(frac float64) {
   228  	t.progress(t.runnerID, frac)
   229  }
   230  
   231  // WorkerProgress sets the progress (a fraction in the range [0,1]) associated
   232  // with the a worker status messasge.
   233  func (t *test) WorkerProgress(frac float64) {
   234  	t.progress(goid.Get(), frac)
   235  }
   236  
   237  // Skip records msg into t.spec.Skip and calls runtime.Goexit() - thus
   238  // interrupting the running of the test.
   239  func (t *test) Skip(msg string, details string) {
   240  	t.spec.Skip = msg
   241  	t.spec.SkipDetails = details
   242  	runtime.Goexit()
   243  }
   244  
   245  // Fatal marks the test as failed, prints the args to t.l, and calls
   246  // runtime.GoExit(). It can be called multiple times.
   247  //
   248  // If the only argument is an error, it is formatted by "%+v", so it will show
   249  // stack traces and such.
   250  //
   251  // ATTENTION: Since this calls runtime.GoExit(), it should only be called from a
   252  // test's closure. The test runner itself should never call this.
   253  func (t *test) Fatal(args ...interface{}) {
   254  	t.fatalfInner("" /* format */, args...)
   255  }
   256  
   257  // Fatalf is like Fatal, but takes a format string.
   258  func (t *test) Fatalf(format string, args ...interface{}) {
   259  	t.fatalfInner(format, args...)
   260  }
   261  
   262  // FailNow implements the TestingT interface.
   263  func (t *test) FailNow() {
   264  	t.Fatal()
   265  }
   266  
   267  // Errorf implements the TestingT interface.
   268  func (t *test) Errorf(format string, args ...interface{}) {
   269  	t.Fatalf(format, args...)
   270  }
   271  
   272  func (t *test) fatalfInner(format string, args ...interface{}) {
   273  	// Skip two frames: our own and the caller.
   274  	if format != "" {
   275  		t.printfAndFail(2 /* skip */, format, args...)
   276  	} else {
   277  		t.printAndFail(2 /* skip */, args...)
   278  	}
   279  	runtime.Goexit()
   280  }
   281  
   282  // FatalIfErr calls t.Fatal() if err != nil.
   283  func FatalIfErr(t *test, err error) {
   284  	if err != nil {
   285  		t.fatalfInner("" /* format */, err)
   286  	}
   287  }
   288  
   289  func (t *test) printAndFail(skip int, args ...interface{}) {
   290  	var msg string
   291  	if len(args) == 1 {
   292  		// If we were passed only an error, then format it with "%+v" in order to
   293  		// get any stack traces.
   294  		if err, ok := args[0].(error); ok {
   295  			msg = fmt.Sprintf("%+v", err)
   296  		}
   297  	}
   298  	if msg == "" {
   299  		msg = fmt.Sprint(args...)
   300  	}
   301  	t.failWithMsg(t.decorate(skip+1, msg))
   302  }
   303  
   304  func (t *test) printfAndFail(skip int, format string, args ...interface{}) {
   305  	if format == "" {
   306  		panic(fmt.Sprintf("invalid empty format. args: %s", args))
   307  	}
   308  	t.failWithMsg(t.decorate(skip+1, fmt.Sprintf(format, args...)))
   309  }
   310  
   311  func (t *test) failWithMsg(msg string) {
   312  	t.mu.Lock()
   313  	defer t.mu.Unlock()
   314  
   315  	prefix := ""
   316  	if t.mu.failed {
   317  		prefix = "[not the first failure] "
   318  		// NB: the first failure is not always the relevant one due to:
   319  		// https://github.com/cockroachdb/cockroach/issues/44436
   320  		//
   321  		// So we chain all failures together in the order in which we see
   322  		// them.
   323  		msg = "\n" + msg
   324  	}
   325  	t.l.Printf("%stest failure: %s", prefix, msg)
   326  
   327  	t.mu.failed = true
   328  	t.mu.failureMsg += msg
   329  	t.mu.output = append(t.mu.output, msg...)
   330  	if t.mu.cancel != nil {
   331  		t.mu.cancel()
   332  	}
   333  }
   334  
   335  // Args:
   336  // skip: The number of stack frames to exclude from the result. 0 means that
   337  //   the caller will be the first frame identified. 1 means the caller's caller
   338  //   will be the first, etc.
   339  func (t *test) decorate(skip int, s string) string {
   340  	// Skip two extra frames to account for this function and runtime.Callers
   341  	// itself.
   342  	var pc [50]uintptr
   343  	n := runtime.Callers(2+skip, pc[:])
   344  	if n == 0 {
   345  		panic("zero callers found")
   346  	}
   347  
   348  	buf := new(bytes.Buffer)
   349  	frames := runtime.CallersFrames(pc[:n])
   350  	sep := "\t"
   351  	runnerFound := false
   352  	for {
   353  		if runnerFound {
   354  			break
   355  		}
   356  
   357  		frame, more := frames.Next()
   358  		if !more {
   359  			break
   360  		}
   361  		if frame.Function == t.runner {
   362  			runnerFound = true
   363  
   364  			// Handle the special case of the runner function being the caller of
   365  			// t.Fatal(). In that case, that's the line to be used for issue creation.
   366  			if t.mu.failLoc.file == "" {
   367  				t.mu.failLoc.file = frame.File
   368  				t.mu.failLoc.line = frame.Line
   369  			}
   370  		}
   371  		if !t.mu.failed && !runnerFound {
   372  			// Keep track of the highest stack frame that is lower than the t.runner
   373  			// stack frame. This is used to determine the author of that line of code
   374  			// and issue assignment.
   375  			t.mu.failLoc.file = frame.File
   376  			t.mu.failLoc.line = frame.Line
   377  		}
   378  		file := frame.File
   379  		if index := strings.LastIndexByte(file, '/'); index >= 0 {
   380  			file = file[index+1:]
   381  		}
   382  		fmt.Fprintf(buf, "%s%s:%d", sep, file, frame.Line)
   383  		sep = ","
   384  	}
   385  	buf.WriteString(": ")
   386  
   387  	lines := strings.Split(s, "\n")
   388  	if l := len(lines); l > 1 && lines[l-1] == "" {
   389  		lines = lines[:l-1]
   390  	}
   391  	for i, line := range lines {
   392  		if i > 0 {
   393  			buf.WriteString("\n\t\t")
   394  		}
   395  		buf.WriteString(line)
   396  	}
   397  	buf.WriteByte('\n')
   398  	return buf.String()
   399  }
   400  
   401  func (t *test) duration() time.Duration {
   402  	return t.end.Sub(t.start)
   403  }
   404  
   405  func (t *test) Failed() bool {
   406  	t.mu.RLock()
   407  	defer t.mu.RUnlock()
   408  	return t.mu.failed
   409  }
   410  
   411  func (t *test) FailureMsg() string {
   412  	t.mu.RLock()
   413  	defer t.mu.RUnlock()
   414  	return t.mu.failureMsg
   415  }
   416  
   417  func (t *test) ArtifactsDir() string {
   418  	return t.artifactsDir
   419  }
   420  
   421  // IsBuildVersion returns true if the build version is greater than or equal to
   422  // minVersion. This allows a test to optionally perform additional checks
   423  // depending on the cockroach version it is running against. Note that the
   424  // versions are Cockroach build tag version numbers, not the internal cluster
   425  // version number.
   426  func (t *test) IsBuildVersion(minVersion string) bool {
   427  	vers, err := version.Parse(minVersion)
   428  	if err != nil {
   429  		t.Fatal(err)
   430  	}
   431  	if p := vers.PreRelease(); p != "" {
   432  		panic("cannot specify a prerelease: " + p)
   433  	}
   434  	// We append "-0" to the min-version spec so that we capture all
   435  	// prereleases of the specified version. Otherwise, "v2.1.0" would compare
   436  	// greater than "v2.1.0-alpha.x".
   437  	vers = version.MustParse(minVersion + "-0")
   438  	return t.buildVersion.AtLeast(vers)
   439  }
   440  
   441  // teamCityEscape escapes a string for use as <value> in a key='<value>' attribute
   442  // in TeamCity build output marker.
   443  // Documentation here: https://confluence.jetbrains.com/display/TCD10/Build+Script+Interaction+with+TeamCity#BuildScriptInteractionwithTeamCity-Escapedvalues
   444  func teamCityEscape(s string) string {
   445  	r := strings.NewReplacer(
   446  		"\n", "|n",
   447  		"'", "|'",
   448  		"|", "||",
   449  		"[", "|[",
   450  		"]", "|]",
   451  	)
   452  	return r.Replace(s)
   453  }
   454  
   455  func teamCityNameEscape(name string) string {
   456  	return strings.Replace(name, ",", "_", -1)
   457  }
   458  
   459  // getAuthorEmail retrieves the author of a line of code. Returns the empty
   460  // string if the author cannot be determined. Some test tags override this
   461  // behavior and have a hardcoded author email.
   462  func getAuthorEmail(tags []string, file string, line int) string {
   463  	for _, tag := range tags {
   464  		if tag == `orm` || tag == `driver` {
   465  			return `rafi@cockroachlabs.com`
   466  		}
   467  	}
   468  	const repo = "github.com/cockroachdb/cockroach/"
   469  	i := strings.Index(file, repo)
   470  	if i == -1 {
   471  		return ""
   472  	}
   473  	file = file[i+len(repo):]
   474  
   475  	cmd := exec.Command(`/bin/bash`, `-c`,
   476  		fmt.Sprintf(`git blame --porcelain -L%d,+1 $(git rev-parse --show-toplevel)/%s | grep author-mail`,
   477  			line, file))
   478  	// This command returns output such as:
   479  	// author-mail <jordan@cockroachlabs.com>
   480  	out, err := cmd.CombinedOutput()
   481  	if err != nil {
   482  		return ""
   483  	}
   484  	re := regexp.MustCompile("author-mail <(.*)>")
   485  	matches := re.FindSubmatch(out)
   486  	if matches == nil {
   487  		return ""
   488  	}
   489  	return string(matches[1])
   490  }
   491  
   492  type testWithCount struct {
   493  	spec testSpec
   494  	// count maintains the number of runs remaining for a test.
   495  	count int
   496  }
   497  
   498  type clusterType int
   499  
   500  const (
   501  	localCluster clusterType = iota
   502  	roachprodCluster
   503  )
   504  
   505  type loggingOpt struct {
   506  	// l is the test runner logger.
   507  	// Note that individual test runs will use a different logger.
   508  	l *logger
   509  	// tee controls whether test logs (not test runner logs) also go to stdout or
   510  	// not.
   511  	tee            teeOptType
   512  	stdout, stderr io.Writer
   513  	// artifactsDir is that path to the dir that will contain the artifacts for
   514  	// all the tests.
   515  	artifactsDir string
   516  	// runnerLogPath is that path to the runner's log file.
   517  	runnerLogPath string
   518  }
   519  
   520  type workerStatus struct {
   521  	// name is the worker's identifier.
   522  	name string
   523  	mu   struct {
   524  		syncutil.Mutex
   525  
   526  		// status is presented in the HTML progress page.
   527  		status string
   528  
   529  		ttr testToRunRes
   530  		t   *test
   531  		c   *cluster
   532  	}
   533  }
   534  
   535  func (w *workerStatus) Status() string {
   536  	w.mu.Lock()
   537  	defer w.mu.Unlock()
   538  	return w.mu.status
   539  }
   540  
   541  func (w *workerStatus) SetStatus(status string) {
   542  	w.mu.Lock()
   543  	w.mu.status = status
   544  	w.mu.Unlock()
   545  }
   546  
   547  func (w *workerStatus) Cluster() *cluster {
   548  	w.mu.Lock()
   549  	defer w.mu.Unlock()
   550  	return w.mu.c
   551  }
   552  
   553  func (w *workerStatus) SetCluster(c *cluster) {
   554  	w.mu.Lock()
   555  	w.mu.c = c
   556  	w.mu.Unlock()
   557  }
   558  
   559  func (w *workerStatus) TestToRun() testToRunRes {
   560  	w.mu.Lock()
   561  	defer w.mu.Unlock()
   562  	return w.mu.ttr
   563  }
   564  
   565  func (w *workerStatus) Test() *test {
   566  	w.mu.Lock()
   567  	defer w.mu.Unlock()
   568  	return w.mu.t
   569  }
   570  
   571  func (w *workerStatus) SetTest(t *test, ttr testToRunRes) {
   572  	w.mu.Lock()
   573  	w.mu.t = t
   574  	w.mu.ttr = ttr
   575  	w.mu.Unlock()
   576  }
   577  
   578  // shout logs a message both to a logger and to an io.Writer.
   579  // If format doesn't end with a new line, one will be automatically added.
   580  func shout(ctx context.Context, l *logger, stdout io.Writer, format string, args ...interface{}) {
   581  	if len(format) == 0 || format[len(format)-1] != '\n' {
   582  		format += "\n"
   583  	}
   584  	msg := fmt.Sprintf(format, args...)
   585  	l.PrintfCtxDepth(ctx, 2 /* depth */, msg)
   586  	fmt.Fprint(stdout, msg)
   587  }