github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachtest/cluster.go

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package main
    12  
    13  import (
    14  	"bufio"
    15  	"bytes"
    16  	"context"
    17  	gosql "database/sql"
    18  	"encoding/json"
    19  	"fmt"
    20  	"io"
    21  	"io/ioutil"
    22  	"math/rand"
    23  	"net"
    24  	"net/url"
    25  	"os"
    26  	"os/exec"
    27  	"os/user"
    28  	"path/filepath"
    29  	"regexp"
    30  	"sort"
    31  	"strconv"
    32  	"strings"
    33  	"sync"
    34  	"sync/atomic"
    35  	"time"
    36  
    37  	"github.com/armon/circbuf"
    38  	"github.com/cockroachdb/cockroach/pkg/util/contextutil"
    39  	"github.com/cockroachdb/cockroach/pkg/util/log"
    40  	"github.com/cockroachdb/cockroach/pkg/util/quotapool"
    41  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    42  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    43  	"github.com/cockroachdb/errors"
    44  	_ "github.com/lib/pq"
    45  	"golang.org/x/sync/errgroup"
    46  )
    47  
    48  const (
    49  	aws   = "aws"
    50  	gce   = "gce"
    51  	azure = "azure"
    52  )
    53  
    54  var (
    55  	local        bool
    56  	cockroach    string
    57  	cloud                     = gce
    58  	encrypt      encryptValue = "false"
    59  	instanceType string
    60  	workload     string
    61  	roachprod    string
    62  	buildTag     string
    63  	clusterName  string
    64  	clusterWipe  bool
    65  	zonesF       string
    66  	teamCity     bool
    67  )
    68  
    69  type encryptValue string
    70  
    71  func (v *encryptValue) String() string {
    72  	return string(*v)
    73  }
    74  
    75  func (v *encryptValue) Set(s string) error {
    76  	if s == "random" {
    77  		*v = encryptValue(s)
    78  		return nil
    79  	}
    80  	t, err := strconv.ParseBool(s)
    81  	if err != nil {
    82  		return err
    83  	}
    84  	*v = encryptValue(fmt.Sprint(t))
    85  	return nil
    86  }
    87  
    88  func (v *encryptValue) asBool() bool {
    89  	if *v == "random" {
    90  		return rand.Intn(2) == 0
    91  	}
    92  	t, err := strconv.ParseBool(string(*v))
    93  	if err != nil {
    94  		return false
    95  	}
    96  	return t
    97  }
    98  
    99  func (v *encryptValue) Type() string {
   100  	return "string"
   101  }
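
// encryptValue implements the String/Set/Type trio expected of a command-line
// flag value (e.g. pflag.Value). A minimal sketch of how it could be wired up,
// assuming pflag-style registration (the actual flag registration lives
// elsewhere):
//
//	flags.Var(&encrypt, "encrypt",
//		`encryption-at-rest setting: "true", "false", or "random"`)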
   102  
   103  func ifLocal(trueVal, falseVal string) string {
   104  	if local {
   105  		return trueVal
   106  	}
   107  	return falseVal
   108  }
   109  
   110  func filepathAbs(path string) (string, error) {
   111  	path, err := filepath.Abs(path)
   112  	if err != nil {
   113  		return "", errors.WithStack(err)
   114  	}
   115  	return path, nil
   116  }
   117  
   118  func findBinary(binary, defValue string) (string, error) {
   119  	if binary == "" {
   120  		binary = defValue
   121  	}
   122  
   123  	// Check to see if binary exists and is a regular file and executable.
   124  	if fi, err := os.Stat(binary); err == nil && fi.Mode().IsRegular() && (fi.Mode()&0111) != 0 {
   125  		return filepathAbs(binary)
   126  	}
   127  
   128  	// Find the binary to run and translate it to an absolute path. First, look
   129  	// for the binary in PATH.
   130  	path, err := exec.LookPath(binary)
   131  	if err != nil {
   132  		if strings.HasPrefix(binary, "/") {
   133  			return "", errors.WithStack(err)
   134  		}
   135  		// We're unable to find the binary in PATH and "binary" is a relative path:
   136  		// look in the cockroach repo.
   137  		gopath := os.Getenv("GOPATH")
   138  		if gopath == "" {
   139  			gopath = filepath.Join(os.Getenv("HOME"), "go")
   140  		}
   141  
   142  		var binSuffix string
   143  		if !local {
   144  			binSuffix = ".docker_amd64"
   145  		}
   146  		dirs := []string{
   147  			filepath.Join(gopath, "/src/github.com/cockroachdb/cockroach/"),
   148  			filepath.Join(gopath, "/src/github.com/cockroachdb/cockroach/bin"+binSuffix),
   149  			filepath.Join(os.ExpandEnv("$PWD"), "bin"+binSuffix),
   150  		}
   151  		for _, dir := range dirs {
   152  			path = filepath.Join(dir, binary)
   153  			var err2 error
   154  			path, err2 = exec.LookPath(path)
   155  			if err2 == nil {
   156  				return filepathAbs(path)
   157  			}
   158  		}
   159  		return "", fmt.Errorf("failed to find %q in $PATH or any of %s", binary, dirs)
   160  	}
   161  	return filepathAbs(path)
   162  }
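
// Illustrative lookup order: with local == false and GOPATH unset (so it
// defaults to $HOME/go),
//
//	findBinary("", "roachprod")
//
// first checks whether ./roachprod is an executable regular file, then
// consults $PATH, and finally tries
// $HOME/go/src/github.com/cockroachdb/cockroach/{,bin.docker_amd64}/roachprod
// and $PWD/bin.docker_amd64/roachprod.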
   163  
   164  func initBinaries() {
   165  	// If we're running against an existing "local" cluster, force the local flag
   166  	// to true in order to get the "local" test configurations.
   167  	if clusterName == "local" {
   168  		local = true
   169  	}
   170  
   171  	cockroachDefault := "cockroach"
   172  	if !local {
   173  		cockroachDefault = "cockroach-linux-2.6.32-gnu-amd64"
   174  	}
   175  	var err error
   176  	cockroach, err = findBinary(cockroach, cockroachDefault)
   177  	if err != nil {
   178  		fmt.Fprintf(os.Stderr, "%+v\n", err)
   179  		os.Exit(1)
   180  	}
   181  
   182  	workload, err = findBinary(workload, "workload")
   183  	if err != nil {
   184  		fmt.Fprintf(os.Stderr, "%+v\n", err)
   185  		os.Exit(1)
   186  	}
   187  
   188  	roachprod, err = findBinary(roachprod, "roachprod")
   189  	if err != nil {
   190  		fmt.Fprintf(os.Stderr, "%+v\n", err)
   191  		os.Exit(1)
   192  	}
   193  }
   194  
   195  type clusterRegistry struct {
   196  	mu struct {
   197  		syncutil.Mutex
   198  		clusters map[string]*cluster
   199  		tagCount map[string]int
   200  		// savedClusters keeps track of clusters that have been saved for further
   201  		// debugging. Each cluster comes with a message about the test failure
   202  		// causing it to be saved for debugging.
   203  		savedClusters map[*cluster]string
   204  	}
   205  }
   206  
   207  func newClusterRegistry() *clusterRegistry {
   208  	cr := &clusterRegistry{}
   209  	cr.mu.clusters = make(map[string]*cluster)
   210  	cr.mu.savedClusters = make(map[*cluster]string)
   211  	return cr
   212  }
   213  
   214  func (r *clusterRegistry) registerCluster(c *cluster) error {
   215  	r.mu.Lock()
   216  	defer r.mu.Unlock()
   217  	if r.mu.clusters[c.name] != nil {
   218  		return fmt.Errorf("cluster named %q already exists in registry", c.name)
   219  	}
   220  	r.mu.clusters[c.name] = c
   221  	return nil
   222  }
   223  
   224  func (r *clusterRegistry) unregisterCluster(c *cluster) bool {
   225  	r.mu.Lock()
   226  	defer r.mu.Unlock()
   227  	if _, ok := r.mu.clusters[c.name]; !ok {
   228  		// If the cluster is not registered, no-op. This allows the
   229  		// method to be called defensively.
   230  		return false
   231  	}
   232  	delete(r.mu.clusters, c.name)
   233  	if c.tag != "" {
   234  		if _, ok := r.mu.tagCount[c.tag]; !ok {
   235  			panic(fmt.Sprintf("tagged cluster not accounted for: %s", c))
   236  		}
   237  		r.mu.tagCount[c.tag]--
   238  	}
   239  	return true
   240  }
   241  
   242  func (r *clusterRegistry) countForTag(tag string) int {
   243  	r.mu.Lock()
   244  	defer r.mu.Unlock()
   245  	return r.mu.tagCount[tag]
   246  }
   247  
   248  // markClusterAsSaved marks c such that it will not be destroyed by
   249  // destroyAllClusters.
   250  // msg is a message recording the reason why the cluster is being saved (i.e.
   251  // generally a test failure error).
   252  func (r *clusterRegistry) markClusterAsSaved(c *cluster, msg string) {
   253  	r.mu.Lock()
   254  	r.mu.savedClusters[c] = msg
   255  	r.mu.Unlock()
   256  }
   257  
   258  type clusterWithMsg struct {
   259  	*cluster
   260  	savedMsg string
   261  }
   262  
   263  // savedClusters returns the list of clusters that have been saved for
   264  // debugging.
   265  func (r *clusterRegistry) savedClusters() []clusterWithMsg {
   266  	r.mu.Lock()
   267  	defer r.mu.Unlock()
   268  	res := make([]clusterWithMsg, len(r.mu.savedClusters))
   269  	i := 0
   270  	for c, msg := range r.mu.savedClusters {
   271  		res[i] = clusterWithMsg{
   272  			cluster:  c,
   273  			savedMsg: msg,
   274  		}
   275  		i++
   276  	}
   277  	sort.Slice(res, func(i, j int) bool {
   278  		return strings.Compare(res[i].name, res[j].name) < 0
   279  	})
   280  	return res
   281  }
   282  
   283  // destroyAllClusters destroys all the clusters (except for "saved" ones) and
   284  // blocks until they're destroyed. It responds to context cancelation by
   285  // interrupting the waiting; the cluster destruction itself does not inherit the
   286  // cancelation.
   287  func (r *clusterRegistry) destroyAllClusters(ctx context.Context, l *logger) {
   288  	// Fire off a goroutine to destroy all of the clusters.
   289  	done := make(chan struct{})
   290  	go func() {
   291  		defer close(done)
   292  
   293  		var clusters []*cluster
   294  		savedClusters := make(map[*cluster]struct{})
   295  		r.mu.Lock()
   296  		for _, c := range r.mu.clusters {
   297  			clusters = append(clusters, c)
   298  		}
   299  		for c := range r.mu.savedClusters {
   300  			savedClusters[c] = struct{}{}
   301  		}
   302  		r.mu.Unlock()
   303  
   304  		var wg sync.WaitGroup
   305  		wg.Add(len(clusters))
   306  		for _, c := range clusters {
   307  			go func(c *cluster) {
   308  				defer wg.Done()
   309  				if _, ok := savedClusters[c]; !ok {
   310  					// We don't close the logger here since the cluster may be still in use
   311  					// by a test, and so the logger might still be needed.
   312  					c.Destroy(ctx, dontCloseLogger, l)
   313  				}
   314  			}(c)
   315  		}
   316  
   317  		wg.Wait()
   318  	}()
   319  
   320  	select {
   321  	case <-done:
   322  	case <-ctx.Done():
   323  	}
   324  }
   325  
   326  // execCmd is like execCmdEx, but doesn't return the command's output.
   327  func execCmd(ctx context.Context, l *logger, args ...string) error {
   328  	return execCmdEx(ctx, l, args...).err
   329  }
   330  
   331  type cmdRes struct {
   332  	err error
    333  	// stdout and stderr are the command's output. Note that this is truncated and
   334  	// only a tail is returned.
   335  	stdout, stderr string
   336  }
   337  
   338  // execCmdEx runs a command and returns its error and output.
   339  //
   340  // Note that the output is truncated; only a tail is returned.
   341  // Also note that if the command exits with an error code, its output is also
   342  // included in cmdRes.err.
   343  func execCmdEx(ctx context.Context, l *logger, args ...string) cmdRes {
   344  	var cancel func()
   345  	ctx, cancel = context.WithCancel(ctx)
   346  	defer cancel()
   347  
   348  	l.Printf("> %s\n", strings.Join(args, " "))
   349  	cmd := exec.CommandContext(ctx, args[0], args[1:]...)
   350  
   351  	debugStdoutBuffer, _ := circbuf.NewBuffer(4096)
   352  	debugStderrBuffer, _ := circbuf.NewBuffer(1024)
   353  
   354  	// Do a dance around https://github.com/golang/go/issues/23019.
   355  	// When the command we run launches a subprocess, that subprocess receives
   356  	// a copy of our Command's Stdout/Stderr file descriptor, which effectively
   357  	// means that the file descriptors close only when that subcommand returns.
   358  	// However, proactively killing the subcommand is not really possible - we
   359  	// will only manage to kill the parent process that we launched directly.
   360  	// In practice this means that if we try to react to context cancellation,
   361  	// the pipes we read the output from will wait for the *subprocess* to
   362  	// terminate, leaving us hanging, potentially indefinitely.
   363  	// To work around it, use pipes and set a read deadline on our (read) end of
   364  	// the pipes when we detect a context cancellation.
   365  	//
   366  	// See TestExecCmd for a test.
   367  	var closePipes func(ctx context.Context)
   368  	var wg sync.WaitGroup
   369  	{
   370  
   371  		var wOut, wErr, rOut, rErr *os.File
   372  		var cwOnce sync.Once
   373  		closePipes = func(ctx context.Context) {
   374  			// Idempotently closes the writing end of the pipes. This is called either
   375  			// when the process returns or when it was killed due to context
   376  			// cancellation. In the former case, close the writing ends of the pipe
   377  			// so that the copy goroutines started below return (without missing any
   378  			// output). In the context cancellation case, we set a deadline to force
   379  			// the goroutines to quit eagerly. This is important since the command
   380  			// may have duplicated wOut and wErr to its possible subprocesses, which
   381  			// may continue to run for long periods of time, and would otherwise
   382  			// block this command. In theory this is possible also when the command
   383  			// returns on its own accord, so we set a (more lenient) deadline in the
   384  			// first case as well.
   385  			//
   386  			// NB: there's also the option (at least on *nix) to use a process group,
   387  			// but it doesn't look portable:
   388  			// https://medium.com/@felixge/killing-a-child-process-and-all-of-its-children-in-go-54079af94773
   389  			cwOnce.Do(func() {
   390  				if wOut != nil {
   391  					_ = wOut.Close()
   392  				}
   393  				if wErr != nil {
   394  					_ = wErr.Close()
   395  				}
   396  				dur := 10 * time.Second // wait up to 10s for subprocesses
   397  				if ctx.Err() != nil {
   398  					dur = 10 * time.Millisecond
   399  				}
   400  				deadline := timeutil.Now().Add(dur)
   401  				if rOut != nil {
   402  					_ = rOut.SetReadDeadline(deadline)
   403  				}
   404  				if rErr != nil {
   405  					_ = rErr.SetReadDeadline(deadline)
   406  				}
   407  			})
   408  		}
   409  		defer closePipes(ctx)
   410  
   411  		var err error
   412  		rOut, wOut, err = os.Pipe()
   413  		if err != nil {
   414  			return cmdRes{err: err}
   415  		}
   416  
   417  		rErr, wErr, err = os.Pipe()
   418  		if err != nil {
   419  			return cmdRes{err: err}
   420  		}
   421  
   422  		cmd.Stdout = wOut
   423  		wg.Add(1)
   424  		go func() {
   425  			defer wg.Done()
   426  			_, _ = io.Copy(l.stdout, io.TeeReader(rOut, debugStdoutBuffer))
   427  		}()
   428  
   429  		if l.stderr == l.stdout {
   430  			// If l.stderr == l.stdout, we use only one pipe to avoid
   431  			// duplicating everything.
   432  			cmd.Stderr = wOut
   433  		} else {
   434  			cmd.Stderr = wErr
   435  			wg.Add(1)
   436  			go func() {
   437  				defer wg.Done()
   438  				_, _ = io.Copy(l.stderr, io.TeeReader(rErr, debugStderrBuffer))
   439  			}()
   440  		}
   441  	}
   442  
   443  	err := cmd.Run()
   444  	closePipes(ctx)
   445  	wg.Wait()
   446  
   447  	if err != nil {
   448  		// Context errors opaquely appear as "signal killed" when manifested.
   449  		// We surface this error explicitly.
   450  		if ctx.Err() != nil {
   451  			err = errors.CombineErrors(ctx.Err(), err)
   452  		}
   453  
   454  		if err != nil {
   455  			err = &withCommandDetails{
   456  				cause:  err,
   457  				cmd:    strings.Join(args, " "),
   458  				stderr: debugStderrBuffer.String(),
   459  				stdout: debugStdoutBuffer.String(),
   460  			}
   461  		}
   462  	}
   463  
   464  	return cmdRes{
   465  		err:    err,
   466  		stdout: debugStdoutBuffer.String(),
   467  		stderr: debugStderrBuffer.String(),
   468  	}
   469  }
   470  
   471  type withCommandDetails struct {
   472  	cause  error
   473  	cmd    string
   474  	stderr string
   475  	stdout string
   476  }
   477  
   478  var _ error = (*withCommandDetails)(nil)
   479  var _ errors.Formatter = (*withCommandDetails)(nil)
   480  
   481  // Error implements error.
   482  func (e *withCommandDetails) Error() string { return e.cause.Error() }
   483  
   484  // Cause implements causer.
   485  func (e *withCommandDetails) Cause() error { return e.cause }
   486  
   487  // Format implements fmt.Formatter.
   488  func (e *withCommandDetails) Format(s fmt.State, verb rune) { errors.FormatError(e, s, verb) }
   489  
   490  // FormatError implements errors.Formatter.
   491  func (e *withCommandDetails) FormatError(p errors.Printer) error {
   492  	p.Printf("%s returned", e.cmd)
   493  	if p.Detail() {
   494  		p.Printf("stderr:\n%s\nstdout:\n%s", e.stderr, e.stdout)
   495  	}
   496  	return e.cause
   497  }
   498  
   499  // GetStderr retrieves the stderr output of a command that
   500  // returned with an error, or the empty string if there was no stderr.
   501  func GetStderr(err error) string {
   502  	var c *withCommandDetails
   503  	if errors.As(err, &c) {
   504  		return c.stderr
   505  	}
   506  	return ""
   507  }
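
// Illustrative usage: callers typically run commands via execCmd and, on
// failure, pull the captured stderr tail back out of the wrapped error:
//
//	if err := execCmd(ctx, l, roachprod, "wipe", clusterName); err != nil {
//		l.Printf("wipe failed; stderr tail:\n%s", GetStderr(err))
//	}
//
// newCluster below uses the same pattern to detect "already exists" errors.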
   508  
   509  // execCmdWithBuffer executes the given command and returns its stdout/stderr
   510  // output. If the return code is not 0, an error is also returned.
   511  // l is used to log the command before running it. No output is logged.
   512  func execCmdWithBuffer(ctx context.Context, l *logger, args ...string) ([]byte, error) {
   513  	l.Printf("> %s\n", strings.Join(args, " "))
   514  	cmd := exec.CommandContext(ctx, args[0], args[1:]...)
   515  
   516  	out, err := cmd.CombinedOutput()
   517  	if err != nil {
   518  		return out, errors.Wrapf(err, `%s`, strings.Join(args, ` `))
   519  	}
   520  	return out, nil
   521  }
   522  
   523  func makeGCEClusterName(name string) string {
   524  	name = strings.ToLower(name)
   525  	name = regexp.MustCompile(`[^-a-z0-9]+`).ReplaceAllString(name, "-")
   526  	name = regexp.MustCompile(`-+`).ReplaceAllString(name, "-")
   527  	return name
   528  }
   529  
   530  func makeClusterName(name string) string {
   531  	return makeGCEClusterName(name)
   532  }
   533  
    534  // MachineTypeToCPUs returns a CPU count for a GCE, AWS, or Azure
    535  // machine type.
   536  func MachineTypeToCPUs(s string) int {
   537  	{
   538  		// GCE machine types.
   539  		var v int
   540  		if _, err := fmt.Sscanf(s, "n1-standard-%d", &v); err == nil {
   541  			return v
   542  		}
   543  		if _, err := fmt.Sscanf(s, "n1-highcpu-%d", &v); err == nil {
   544  			return v
   545  		}
   546  		if _, err := fmt.Sscanf(s, "n1-highmem-%d", &v); err == nil {
   547  			return v
   548  		}
   549  	}
   550  
   551  	typeAndSize := strings.Split(s, ".")
   552  
   553  	if len(typeAndSize) == 2 {
   554  		size := typeAndSize[1]
   555  
   556  		switch size {
   557  		case "large":
   558  			return 2
   559  		case "xlarge":
   560  			return 4
   561  		case "2xlarge":
   562  			return 8
   563  		case "4xlarge":
   564  			return 16
   565  		case "9xlarge":
   566  			return 36
   567  		case "12xlarge":
   568  			return 48
   569  		case "18xlarge":
   570  			return 72
   571  		case "24xlarge":
   572  			return 96
   573  		}
   574  	}
   575  
   576  	// Azure doesn't have a standard way to size machines.
   577  	// This method is implemented for the default machine type.
    578  	// Not all Azure machine types encode the number of vCPUs in the size, and
    579  	// the naming scheme depends on the machine type family.
   580  	switch s {
   581  	case "Standard_D2_v3":
   582  		return 2
   583  	case "Standard_D4_v3":
   584  		return 4
   585  	case "Standard_D8_v3":
   586  		return 8
   587  	case "Standard_D16_v3":
   588  		return 16
   589  	case "Standard_D32_v3":
   590  		return 32
   591  	case "Standard_D48_v3":
   592  		return 48
   593  	case "Standard_D64_v3":
   594  		return 64
   595  	}
   596  
   597  	// TODO(pbardea): Non-default Azure machine types are not supported
    598  	// and will trigger the "unknown machine type" failure below.
   599  	fmt.Fprintf(os.Stderr, "unknown machine type: %s\n", s)
   600  	os.Exit(1)
   601  	return -1
   602  }
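
// Examples derived from the cases above:
//
//	MachineTypeToCPUs("n1-highcpu-16")  == 16 // GCE
//	MachineTypeToCPUs("c5d.4xlarge")    == 16 // AWS
//	MachineTypeToCPUs("Standard_D8_v3") == 8  // Azure
//
// Unknown machine types terminate the process rather than returning an error.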
   603  
   604  func awsMachineType(cpus int) string {
   605  	switch {
   606  	case cpus <= 2:
   607  		return "c5d.large"
   608  	case cpus <= 4:
   609  		return "c5d.xlarge"
   610  	case cpus <= 8:
   611  		return "c5d.2xlarge"
   612  	case cpus <= 16:
   613  		return "c5d.4xlarge"
   614  	case cpus <= 36:
   615  		return "c5d.9xlarge"
   616  	case cpus <= 72:
   617  		return "c5d.18xlarge"
   618  	case cpus <= 96:
   619  		// There is no c5d.24xlarge.
   620  		return "m5d.24xlarge"
   621  	default:
   622  		panic(fmt.Sprintf("no aws machine type with %d cpus", cpus))
   623  	}
   624  }
   625  
    626  // gceMachineType returns the default GCE machine type for the given CPU count.
   627  func gceMachineType(cpus int) string {
   628  	// TODO(peter): This is awkward: below 16 cpus, use n1-standard so that the
    629  	// machines have a decent amount of RAM. We could use custom machine
   630  	// configurations, but the rules for the amount of RAM per CPU need to be
   631  	// determined (you can't request any arbitrary amount of RAM).
   632  	if cpus < 16 {
   633  		return fmt.Sprintf("n1-standard-%d", cpus)
   634  	}
   635  	return fmt.Sprintf("n1-highcpu-%d", cpus)
   636  }
   637  
   638  func azureMachineType(cpus int) string {
   639  	switch {
   640  	case cpus <= 2:
   641  		return "Standard_D2_v3"
   642  	case cpus <= 4:
   643  		return "Standard_D4_v3"
   644  	case cpus <= 8:
   645  		return "Standard_D8_v3"
   646  	case cpus <= 16:
   647  		return "Standard_D16_v3"
   648  	case cpus <= 36:
   649  		return "Standard_D32_v3"
   650  	case cpus <= 48:
   651  		return "Standard_D48_v3"
   652  	case cpus <= 64:
   653  		return "Standard_D64_v3"
   654  	default:
   655  		panic(fmt.Sprintf("no azure machine type with %d cpus", cpus))
   656  	}
   657  }
   658  
   659  func machineTypeFlag(machineType string) string {
   660  	switch cloud {
   661  	case aws:
   662  		if isSSD(machineType) {
   663  			return "--aws-machine-type-ssd"
   664  		}
   665  		return "--aws-machine-type"
   666  	case gce:
   667  		return "--gce-machine-type"
   668  	case azure:
   669  		return "--azure-machine-type"
   670  	default:
   671  		panic(fmt.Sprintf("unsupported cloud: %s\n", cloud))
   672  	}
   673  }
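
// Illustrative examples: with cloud == gce, machineTypeFlag("n1-highcpu-16")
// returns "--gce-machine-type"; with cloud == aws, machineTypeFlag("c5d.4xlarge")
// returns "--aws-machine-type-ssd", since c5d instances use local SSDs (see
// isSSD below).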
   674  
   675  func isSSD(machineType string) bool {
   676  	if cloud != aws {
   677  		panic("can only differentiate SSDs based on machine type on AWS")
   678  	}
   679  
   680  	typeAndSize := strings.Split(machineType, ".")
   681  	if len(typeAndSize) == 2 {
   682  		awsType := typeAndSize[0]
    683  		// All SSD machine types that we use end in 'd' or begin with i3 (e.g. i3, i3en).
   684  		return strings.HasPrefix(awsType, "i3") || strings.HasSuffix(awsType, "d")
   685  	}
   686  
    687  	fmt.Fprintf(os.Stderr, "aws machine type %q does not match expected format 'type.size' (e.g. c5d.4xlarge)\n", machineType)
   688  	os.Exit(1)
   689  	return false
   690  }
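
// Examples (AWS only): isSSD("c5d.4xlarge") and isSSD("i3.2xlarge") return
// true; isSSD("m5.large") returns false. Calling isSSD when cloud != aws
// panics.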
   691  
   692  type testI interface {
   693  	Name() string
   694  	Fatal(args ...interface{})
   695  	Fatalf(format string, args ...interface{})
   696  	Failed() bool
   697  	// Path to a directory where the test is supposed to store its log and other
   698  	// artifacts.
   699  	ArtifactsDir() string
   700  	logger() *logger
   701  }
   702  
   703  // TODO(tschottdorf): Consider using a more idiomatic approach in which options
   704  // act upon a config struct:
   705  // https://dave.cheney.net/2014/10/17/functional-options-for-friendly-apis
   706  type option interface {
   707  	option()
   708  }
   709  
   710  type nodeSelector interface {
   711  	option
   712  	merge(nodeListOption) nodeListOption
   713  }
   714  
   715  type nodeListOption []int
   716  
   717  func (n nodeListOption) option() {}
   718  
   719  func (n nodeListOption) merge(o nodeListOption) nodeListOption {
   720  	t := make(nodeListOption, 0, len(n)+len(o))
   721  	t = append(t, n...)
   722  	t = append(t, o...)
   723  	sort.Ints([]int(t))
   724  	r := t[:1]
   725  	for i := 1; i < len(t); i++ {
   726  		if r[len(r)-1] != t[i] {
   727  			r = append(r, t[i])
   728  		}
   729  	}
   730  	return r
   731  }
   732  
   733  func (n nodeListOption) randNode() nodeListOption {
   734  	return nodeListOption{n[rand.Intn(len(n))]}
   735  }
   736  
   737  func (n nodeListOption) String() string {
   738  	if len(n) == 0 {
   739  		return ""
   740  	}
   741  
   742  	var buf bytes.Buffer
   743  	buf.WriteByte(':')
   744  
   745  	appendRange := func(start, end int) {
   746  		if buf.Len() > 1 {
   747  			buf.WriteByte(',')
   748  		}
   749  		if start == end {
   750  			fmt.Fprintf(&buf, "%d", start)
   751  		} else {
   752  			fmt.Fprintf(&buf, "%d-%d", start, end)
   753  		}
   754  	}
   755  
   756  	start, end := -1, -1
   757  	for _, i := range n {
   758  		if start != -1 && end == i-1 {
   759  			end = i
   760  			continue
   761  		}
   762  		if start != -1 {
   763  			appendRange(start, end)
   764  		}
   765  		start, end = i, i
   766  	}
   767  	if start != -1 {
   768  		appendRange(start, end)
   769  	}
   770  	return buf.String()
   771  }
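
// Examples derived from merge and String above:
//
//	nodeListOption{1, 2}.merge(nodeListOption{2, 3}) // {1, 2, 3}
//	nodeListOption{1, 2, 3, 5}.String()              // ":1-3,5"
//
// The string form matches the node-selector syntax that roachprod accepts
// after a cluster name (e.g. "mycluster:1-3,5").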
   772  
   773  // clusterSpec represents a test's description of what its cluster needs to
   774  // look like. It becomes part of a clusterConfig when the cluster is created.
   775  type clusterSpec struct {
   776  	NodeCount int
   777  	// CPUs is the number of CPUs per node.
   778  	CPUs        int
   779  	Zones       string
   780  	Geo         bool
   781  	Lifetime    time.Duration
   782  	ReusePolicy clusterReusePolicy
   783  }
   784  
   785  func makeClusterSpec(nodeCount int, opts ...createOption) clusterSpec {
   786  	spec := clusterSpec{NodeCount: nodeCount}
   787  	defaultOpts := []createOption{cpu(4), nodeLifetimeOption(12 * time.Hour), reuseAny()}
   788  	for _, o := range append(defaultOpts, opts...) {
   789  		o.apply(&spec)
   790  	}
   791  	return spec
   792  }
   793  
   794  func clustersCompatible(s1, s2 clusterSpec) bool {
   795  	s1.Lifetime = 0
   796  	s2.Lifetime = 0
   797  	return s1 == s2
   798  }
   799  
   800  func (s clusterSpec) String() string {
   801  	str := fmt.Sprintf("n%dcpu%d", s.NodeCount, s.CPUs)
   802  	if s.Geo {
   803  		str += "-geo"
   804  	}
   805  	return str
   806  }
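
// Illustrative example: a test wanting a 4-node, 16-CPU, geo-distributed
// cluster would build its spec as
//
//	spec := makeClusterSpec(4, cpu(16), geo())
//
// which applies the makeClusterSpec defaults (cpu(4), a 12h lifetime,
// reuseAny()) and then the explicit options, yielding NodeCount=4, CPUs=16,
// Geo=true, Lifetime=12h and String() == "n4cpu16-geo".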
   807  
   808  func firstZone(zones string) string {
   809  	return strings.SplitN(zones, ",", 2)[0]
   810  }
   811  
   812  func (s *clusterSpec) args() []string {
   813  	var args []string
   814  
   815  	switch cloud {
   816  	case aws:
   817  		if s.Zones != "" {
   818  			fmt.Fprintf(os.Stderr, "zones spec not yet supported on AWS: %s\n", s.Zones)
   819  			os.Exit(1)
   820  		}
   821  		if s.Geo {
   822  			fmt.Fprintf(os.Stderr, "geo-distributed clusters not yet supported on AWS\n")
   823  			os.Exit(1)
   824  		}
   825  
   826  		args = append(args, "--clouds=aws")
   827  	case azure:
   828  		args = append(args, "--clouds=azure")
   829  	}
   830  
   831  	if !local && s.CPUs != 0 {
   832  		// Use the machine type specified as a CLI flag.
   833  		machineType := instanceType
   834  		if len(machineType) == 0 {
   835  			// If no machine type was specified, choose one
   836  			// based on the cloud and CPU count.
   837  			switch cloud {
   838  			case aws:
   839  				machineType = awsMachineType(s.CPUs)
   840  			case gce:
   841  				machineType = gceMachineType(s.CPUs)
   842  			case azure:
   843  				machineType = azureMachineType(s.CPUs)
   844  			}
   845  		}
   846  		if cloud == aws {
   847  			if isSSD(machineType) {
   848  				args = append(args, "--local-ssd=true")
   849  			} else {
   850  				args = append(args, "--local-ssd=false")
   851  			}
   852  		}
   853  		machineTypeArg := machineTypeFlag(machineType) + "=" + machineType
   854  		args = append(args, machineTypeArg)
   855  	}
   856  	if s.Zones != "" {
   857  		switch cloud {
   858  		case gce:
   859  			if s.Geo {
   860  				args = append(args, "--gce-zones="+s.Zones)
   861  			} else {
   862  				args = append(args, "--gce-zones="+firstZone(s.Zones))
   863  			}
   864  		case azure:
   865  			args = append(args, "--azure-locations="+s.Zones)
   866  		default:
   867  			fmt.Fprintf(os.Stderr, "specifying zones is not yet supported on %s", cloud)
   868  			os.Exit(1)
   869  		}
   870  	}
   871  	if s.Geo {
   872  		args = append(args, "--geo")
   873  	}
   874  	if s.Lifetime != 0 {
   875  		args = append(args, "--lifetime="+s.Lifetime.String())
   876  	}
   877  	return args
   878  }
   879  
   880  func (s *clusterSpec) expiration() time.Time {
   881  	l := s.Lifetime
   882  	if l == 0 {
   883  		l = 12 * time.Hour
   884  	}
   885  	return timeutil.Now().Add(l)
   886  }
   887  
   888  type createOption interface {
   889  	apply(spec *clusterSpec)
   890  }
   891  
   892  type nodeCPUOption int
   893  
   894  func (o nodeCPUOption) apply(spec *clusterSpec) {
   895  	spec.CPUs = int(o)
   896  }
   897  
   898  // cpu is a node option which requests nodes with the specified number of CPUs.
   899  func cpu(n int) nodeCPUOption {
   900  	return nodeCPUOption(n)
   901  }
   902  
   903  type nodeGeoOption struct{}
   904  
   905  func (o nodeGeoOption) apply(spec *clusterSpec) {
   906  	spec.Geo = true
   907  }
   908  
   909  // geo is a node option which requests geo-distributed nodes.
   910  func geo() nodeGeoOption {
   911  	return nodeGeoOption{}
   912  }
   913  
   914  type nodeZonesOption string
   915  
   916  func (o nodeZonesOption) apply(spec *clusterSpec) {
   917  	spec.Zones = string(o)
   918  }
   919  
    920  // zones is a node option which requests that nodes be placed in the specified
    921  // zones. Note that this overrides the --zones flag and is useful for tests
    922  // that require running in specific zones.
   923  func zones(s string) nodeZonesOption {
   924  	return nodeZonesOption(s)
   925  }
   926  
   927  type nodeLifetimeOption time.Duration
   928  
   929  func (o nodeLifetimeOption) apply(spec *clusterSpec) {
   930  	spec.Lifetime = time.Duration(o)
   931  }
   932  
   933  // clusterReusePolicy indicates what clusters a particular test can run on and
   934  // who (if anybody) can reuse the cluster after the test has finished running
   935  // (either passing or failing). See the individual policies for details.
   936  //
   937  // Only tests whose cluster spec matches can ever run on the same
   938  // cluster, regardless of this policy.
   939  //
   940  // Clean clusters (freshly-created clusters or cluster on which a test with the
   941  // Any policy ran) are accepted by all policies.
   942  //
   943  // Note that not all combinations of "what cluster can I accept" and "how am I
   944  // soiling this cluster" can be expressed. For example, there's no way to
   945  // express that I'll accept a cluster that was tagged a certain way but after me
   946  // nobody else can reuse the cluster at all.
   947  type clusterReusePolicy interface {
   948  	clusterReusePolicy()
   949  }
   950  
   951  // reusePolicyAny means that only clean clusters are accepted and the cluster
   952  // can be used by any other test (i.e. the cluster remains "clean").
   953  type reusePolicyAny struct{}
   954  
   955  // reusePolicyNone means that only clean clusters are accepted and the cluster
   956  // cannot be reused afterwards.
   957  type reusePolicyNone struct{}
   958  
   959  // reusePolicyTagged means that clusters left over by similarly-tagged tests are
    960  // accepted in addition to clean clusters and, regardless of how the cluster
   961  // started up, it will be tagged with the given tag at the end (so only
   962  // similarly-tagged tests can use it afterwards).
   963  //
   964  // The idea is that a tag identifies a particular way in which a test is soiled,
   965  // since it's common for groups of tests to mess clusters up in similar ways and
   966  // to also be able to reset the cluster when the test starts. It's like a virus
   967  // - if you carry it, you infect a clean host and can otherwise intermingle with
   968  // other hosts that are already infected. Note that using this policy assumes
   969  // that the way in which every test soils the cluster is idempotent.
   970  type reusePolicyTagged struct{ tag string }
   971  
   972  func (reusePolicyAny) clusterReusePolicy()    {}
   973  func (reusePolicyNone) clusterReusePolicy()   {}
   974  func (reusePolicyTagged) clusterReusePolicy() {}
   975  
   976  type clusterReusePolicyOption struct {
   977  	p clusterReusePolicy
   978  }
   979  
   980  func reuseAny() clusterReusePolicyOption {
   981  	return clusterReusePolicyOption{p: reusePolicyAny{}}
   982  }
   983  func reuseNone() clusterReusePolicyOption {
   984  	return clusterReusePolicyOption{p: reusePolicyNone{}}
   985  }
   986  func reuseTagged(tag string) clusterReusePolicyOption {
   987  	return clusterReusePolicyOption{p: reusePolicyTagged{tag: tag}}
   988  }
   989  
   990  func (p clusterReusePolicyOption) apply(spec *clusterSpec) {
   991  	spec.ReusePolicy = p.p
   992  }
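
// Illustrative example: a clusterReusePolicyOption is itself a createOption,
// so a test can request its reuse policy together with the rest of its spec:
//
//	makeClusterSpec(3, reuseTagged("acceptance")) // "acceptance" is a hypothetical tag
//
// Destructive tests would pass reuseNone(); the makeClusterSpec default is
// reuseAny().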
   993  
   994  // cluster provides an interface for interacting with a set of machines,
   995  // starting and stopping a cockroach cluster on a subset of those machines, and
   996  // running load generators and other operations on the machines.
   997  //
   998  // A cluster is safe for concurrent use by multiple goroutines.
   999  type cluster struct {
  1000  	name string
  1001  	tag  string
  1002  	spec clusterSpec
  1003  	// status is used to communicate the test's status. The callback is a noop
  1004  	// until the cluster is passed to a test, at which point it's hooked up to
  1005  	// test.Status().
  1006  	status func(...interface{})
  1007  	t      testI
  1008  	// r is the registry tracking this cluster. Destroying the cluster will
  1009  	// unregister it.
  1010  	r *clusterRegistry
  1011  	// l is the logger used to log various cluster operations.
  1012  	// DEPRECATED for use outside of cluster methods: Use a test's t.l instead.
  1013  	// This is generally set to the current test's logger.
  1014  	l          *logger
  1015  	expiration time.Time
  1016  	// encryptDefault is true if the cluster should default to having encryption
  1017  	// at rest enabled. The default only applies if encryption is not explicitly
  1018  	// enabled or disabled by options passed to Start.
  1019  	encryptDefault bool
  1020  
  1021  	// destroyState contains state related to the cluster's destruction.
  1022  	destroyState destroyState
  1023  }
  1024  
  1025  func (c *cluster) String() string {
  1026  	return fmt.Sprintf("%s [tag:%s] (%d nodes)", c.name, c.tag, c.spec.NodeCount)
  1027  }
  1028  
  1029  type destroyState struct {
  1030  	// owned is set if this instance is responsible for `roachprod destroy`ing the
  1031  	// cluster. It is set when a new cluster is created, but not when we attach to
  1032  	// an existing roachprod cluster.
  1033  	// If not set, Destroy() only wipes the cluster.
  1034  	owned bool
  1035  
  1036  	// alloc is set if owned is set. If set, it represents resources in a
  1037  	// QuotaPool that need to be released when the cluster is destroyed.
  1038  	alloc *quotapool.IntAlloc
  1039  
  1040  	mu struct {
  1041  		syncutil.Mutex
  1042  		loggerClosed bool
  1043  		// destroyed is used to coordinate between different goroutines that want to
   1044  		// destroy a cluster. It is set once the destroy process starts. It is
  1045  		// closed when the destruction is complete.
  1046  		destroyed chan struct{}
  1047  		// saved is set if this cluster should not be wiped or destroyed. It should
  1048  		// be left alone for further debugging. This is kept in sync with the
  1049  		// clusterRegistry which maintains a list of all saved clusters.
  1050  		saved bool
  1051  		// savedMsg records a message describing the reason why the cluster is being
  1052  		// saved.
  1053  		savedMsg string
  1054  	}
  1055  }
  1056  
  1057  // closeLogger closes c.l. It can be called multiple times.
  1058  func (c *cluster) closeLogger() {
  1059  	c.destroyState.mu.Lock()
  1060  	defer c.destroyState.mu.Unlock()
  1061  	if c.destroyState.mu.loggerClosed {
  1062  		return
  1063  	}
  1064  	c.destroyState.mu.loggerClosed = true
  1065  	c.l.close()
  1066  }
  1067  
  1068  type clusterConfig struct {
  1069  	spec clusterSpec
   1070  	// artifactsDir is the path where log files will be stored.
  1071  	artifactsDir string
  1072  	localCluster bool
  1073  	useIOBarrier bool
  1074  	alloc        *quotapool.IntAlloc
  1075  }
  1076  
  1077  // clusterFactory is a creator of clusters.
  1078  type clusterFactory struct {
  1079  	// namePrefix is prepended to all cluster names.
  1080  	namePrefix string
  1081  	// counter is incremented with every new cluster. It's used as part of the cluster's name.
  1082  	// Accessed atomically.
  1083  	counter uint64
   1084  	// The registry with which all clusters will be registered.
  1085  	r *clusterRegistry
  1086  	// artifactsDir is the directory in which the cluster creation log file will be placed.
  1087  	artifactsDir string
  1088  	// sem is a semaphore throttling the creation of clusters (because AWS has
  1089  	// ridiculous API calls limits).
   1090  	// ridiculous API call limits).
  1091  }
  1092  
  1093  func newClusterFactory(
  1094  	user string, clustersID string, artifactsDir string, r *clusterRegistry, concurrentCreations int,
  1095  ) *clusterFactory {
  1096  	secs := timeutil.Now().Unix()
  1097  	var prefix string
  1098  	if clustersID != "" {
  1099  		prefix = fmt.Sprintf("%s-%s-%d-", user, clustersID, secs)
  1100  	} else {
  1101  		prefix = fmt.Sprintf("%s-%d-", user, secs)
  1102  	}
  1103  	return &clusterFactory{
  1104  		sem:          make(chan struct{}, concurrentCreations),
  1105  		namePrefix:   prefix,
  1106  		artifactsDir: artifactsDir,
  1107  		r:            r,
  1108  	}
  1109  }
  1110  
  1111  // acquireSem blocks until the semaphore allows a new cluster creation. The
   1112  // returned function needs to be called when cluster creation has finished.
  1113  func (f *clusterFactory) acquireSem() func() {
  1114  	f.sem <- struct{}{}
  1115  	return f.releaseSem
  1116  }
  1117  
  1118  func (f *clusterFactory) releaseSem() {
  1119  	<-f.sem
  1120  }
  1121  
  1122  // newCluster creates a new roachprod cluster.
  1123  //
  1124  // setStatus is called with status messages indicating the stage of cluster
  1125  // creation.
  1126  //
  1127  // NOTE: setTest() needs to be called before a test can use this cluster.
  1128  func (f *clusterFactory) newCluster(
  1129  	ctx context.Context, cfg clusterConfig, setStatus func(string), teeOpt teeOptType,
  1130  ) (*cluster, error) {
  1131  	if ctx.Err() != nil {
  1132  		return nil, errors.Wrap(ctx.Err(), "newCluster")
  1133  	}
  1134  
  1135  	var name string
  1136  	if cfg.localCluster {
  1137  		name = "local" // The roachprod tool understands this magic name.
  1138  	} else {
  1139  		count := atomic.AddUint64(&f.counter, 1)
  1140  		name = makeClusterName(
  1141  			fmt.Sprintf("%s-%02d-%s", f.namePrefix, count, cfg.spec.String()))
  1142  	}
  1143  
  1144  	if cfg.spec.NodeCount == 0 {
  1145  		// For tests. Return the minimum that makes them happy.
  1146  		c := &cluster{
  1147  			name:       name,
  1148  			expiration: timeutil.Now().Add(24 * time.Hour),
  1149  			status:     func(...interface{}) {},
  1150  			r:          f.r,
  1151  		}
  1152  		if err := f.r.registerCluster(c); err != nil {
  1153  			return nil, err
  1154  		}
  1155  		return c, nil
  1156  	}
  1157  
  1158  	exp := cfg.spec.expiration()
  1159  	if cfg.localCluster {
  1160  		// Local clusters never expire.
  1161  		exp = timeutil.Now().Add(100000 * time.Hour)
  1162  	}
  1163  	c := &cluster{
  1164  		name:           name,
  1165  		spec:           cfg.spec,
  1166  		status:         func(...interface{}) {},
  1167  		expiration:     exp,
  1168  		encryptDefault: encrypt.asBool(),
  1169  		r:              f.r,
  1170  		destroyState: destroyState{
  1171  			owned: true,
  1172  			alloc: cfg.alloc,
  1173  		},
  1174  	}
  1175  
  1176  	sargs := []string{roachprod, "create", c.name, "-n", fmt.Sprint(c.spec.NodeCount)}
  1177  	sargs = append(sargs, cfg.spec.args()...)
  1178  	if !local && zonesF != "" && cfg.spec.Zones == "" {
  1179  		if cfg.spec.Geo {
  1180  			sargs = append(sargs, "--gce-zones="+zonesF)
  1181  		} else {
  1182  			sargs = append(sargs, "--gce-zones="+firstZone(zonesF))
  1183  		}
  1184  	}
  1185  	if !cfg.useIOBarrier {
  1186  		sargs = append(sargs, "--local-ssd-no-ext4-barrier")
  1187  	}
  1188  
   1189  	setStatus("acquiring cluster creation semaphore")
  1190  	release := f.acquireSem()
  1191  	defer release()
  1192  	setStatus("roachprod create")
  1193  	c.status("creating cluster")
  1194  
  1195  	// Logs for creating a new cluster go to a dedicated log file.
  1196  	logPath := filepath.Join(f.artifactsDir, runnerLogsDir, "cluster-create", name+".log")
  1197  	l, err := rootLogger(logPath, teeOpt)
  1198  	if err != nil {
  1199  		log.Fatalf(ctx, "%v", err)
  1200  	}
  1201  
  1202  	success := false
   1203  	// Attempt to create the cluster several times; cloud provider APIs can be
   1204  	// flaky and creation occasionally fails transiently.
  1205  	for i := 0; i < 3; i++ {
  1206  		err = execCmd(ctx, l, sargs...)
  1207  		if err == nil {
  1208  			success = true
  1209  			break
  1210  		}
  1211  		l.PrintfCtx(ctx, "Failed to create cluster.")
  1212  		if !strings.Contains(GetStderr(err), "already exists") {
  1213  			l.PrintfCtx(ctx, "Cleaning up in case it was partially created.")
  1214  			c.Destroy(ctx, closeLogger, l)
  1215  		} else {
  1216  			break
  1217  		}
  1218  	}
  1219  	if !success {
  1220  		return nil, err
  1221  	}
  1222  
  1223  	if err := f.r.registerCluster(c); err != nil {
  1224  		return nil, err
  1225  	}
  1226  
  1227  	c.status("idle")
  1228  	return c, nil
  1229  }
  1230  
  1231  type attachOpt struct {
  1232  	skipValidation bool
  1233  	// Implies skipWipe.
  1234  	skipStop bool
  1235  	skipWipe bool
  1236  }
  1237  
  1238  // attachToExistingCluster creates a cluster object based on machines that have
   1239  // already been allocated by roachprod.
  1240  //
  1241  // NOTE: setTest() needs to be called before a test can use this cluster.
  1242  func attachToExistingCluster(
  1243  	ctx context.Context, name string, l *logger, spec clusterSpec, opt attachOpt, r *clusterRegistry,
  1244  ) (*cluster, error) {
  1245  	exp := spec.expiration()
  1246  	if name == "local" {
  1247  		exp = timeutil.Now().Add(100000 * time.Hour)
  1248  	}
  1249  	c := &cluster{
  1250  		name:           name,
  1251  		spec:           spec,
  1252  		status:         func(...interface{}) {},
  1253  		l:              l,
  1254  		expiration:     exp,
  1255  		encryptDefault: encrypt.asBool(),
  1256  		destroyState: destroyState{
   1257  			// If we're attaching to an existing cluster, we're not going to destroy it.
  1258  			owned: false,
  1259  		},
  1260  		r: r,
  1261  	}
  1262  
  1263  	if err := r.registerCluster(c); err != nil {
  1264  		return nil, err
  1265  	}
  1266  
  1267  	if !opt.skipValidation {
  1268  		if err := c.validate(ctx, spec, l); err != nil {
  1269  			return nil, err
  1270  		}
  1271  	}
  1272  
  1273  	if !opt.skipStop {
  1274  		c.status("stopping cluster")
  1275  		if err := c.StopE(ctx, c.All()); err != nil {
  1276  			return nil, err
  1277  		}
  1278  		if !opt.skipWipe {
  1279  			if clusterWipe {
  1280  				if err := c.WipeE(ctx, l, c.All()); err != nil {
  1281  					return nil, err
  1282  				}
  1283  			} else {
  1284  				l.Printf("skipping cluster wipe\n")
  1285  			}
  1286  		}
  1287  	}
  1288  
  1289  	c.status("idle")
  1290  	return c, nil
  1291  }
  1292  
  1293  // setTest prepares c for being used on behalf of t.
  1294  //
  1295  // TODO(andrei): Get rid of c.t, c.l and of this method.
  1296  func (c *cluster) setTest(t testI) {
  1297  	c.t = t
  1298  	c.l = t.logger()
  1299  	if impl, ok := t.(*test); ok {
  1300  		c.status = impl.Status
  1301  	}
  1302  }
  1303  
  1304  // StopCockroachGracefullyOnNode stops a running cockroach instance on the requested
  1305  // node before a version upgrade.
  1306  func (c *cluster) StopCockroachGracefullyOnNode(ctx context.Context, node int) error {
  1307  	port := fmt.Sprintf("{pgport:%d}", node)
  1308  	// Note that the following command line needs to run against both v2.1
  1309  	// and the current branch. Do not change it in a manner that is
  1310  	// incompatible with 2.1.
  1311  	if err := c.RunE(ctx, c.Node(node), "./cockroach quit --insecure --port="+port); err != nil {
  1312  		return err
  1313  	}
  1314  	// TODO (rohany): This comment below might be out of date.
  1315  	// NB: we still call Stop to make sure the process is dead when we try
  1316  	// to restart it (or we'll catch an error from the RocksDB dir being
  1317  	// locked). This won't happen unless run with --local due to timing.
  1318  	// However, it serves as a reminder that `./cockroach quit` doesn't yet
  1319  	// work well enough -- ideally all listeners and engines are closed by
  1320  	// the time it returns to the client.
  1321  	c.Stop(ctx, c.Node(node))
  1322  	// TODO(tschottdorf): should return an error. I doubt that we want to
  1323  	//  call these *testing.T-style methods on goroutines.
  1324  	return nil
  1325  }
  1326  
  1327  // Save marks the cluster as "saved" so that it doesn't get destroyed.
  1328  func (c *cluster) Save(ctx context.Context, msg string, l *logger) {
  1329  	l.PrintfCtx(ctx, "saving cluster %s for debugging (--debug specified)", c)
  1330  	// TODO(andrei): should we extend the cluster here? For how long?
  1331  	if c.destroyState.owned { // we won't have an alloc for an unowned cluster
  1332  		c.destroyState.alloc.Freeze()
  1333  	}
  1334  	c.r.markClusterAsSaved(c, msg)
  1335  	c.destroyState.mu.Lock()
  1336  	c.destroyState.mu.saved = true
  1337  	c.destroyState.mu.savedMsg = msg
  1338  	c.destroyState.mu.Unlock()
  1339  }
  1340  
   1341  // validate takes a cluster and checks that reality corresponds to the
   1342  // cluster's spec. It's intended to be used with clusters created by
   1343  // attachToExistingCluster(); clusters created with newCluster() are already
   1344  // known to be up to spec.
  1345  func (c *cluster) validate(ctx context.Context, nodes clusterSpec, l *logger) error {
  1346  	// Perform validation on the existing cluster.
  1347  	c.status("checking that existing cluster matches spec")
  1348  	sargs := []string{roachprod, "list", c.name, "--json", "--quiet"}
  1349  	out, err := execCmdWithBuffer(ctx, l, sargs...)
  1350  	if err != nil {
  1351  		return err
  1352  	}
  1353  
  1354  	// jsonOutput matches the structure of the output from `roachprod list`
  1355  	// when in json mode.
  1356  	type jsonOutput struct {
  1357  		Clusters map[string]struct {
  1358  			VMs []struct {
  1359  				MachineType string `json:"machine_type"`
  1360  			} `json:"vms"`
  1361  		} `json:"clusters"`
  1362  	}
  1363  	var details jsonOutput
  1364  	if err := json.Unmarshal(out, &details); err != nil {
  1365  		return err
  1366  	}
  1367  
  1368  	cDetails, ok := details.Clusters[c.name]
  1369  	if !ok {
  1370  		return fmt.Errorf("cluster %q not found", c.name)
  1371  	}
  1372  	if len(cDetails.VMs) < c.spec.NodeCount {
  1373  		return fmt.Errorf("cluster has %d nodes, test requires at least %d", len(cDetails.VMs), c.spec.NodeCount)
  1374  	}
  1375  	if cpus := nodes.CPUs; cpus != 0 {
  1376  		for i, vm := range cDetails.VMs {
  1377  			vmCPUs := MachineTypeToCPUs(vm.MachineType)
  1378  			// vmCPUs will be negative if the machine type is unknown. Give unknown
  1379  			// machine types the benefit of the doubt.
  1380  			if vmCPUs > 0 && vmCPUs < cpus {
  1381  				return fmt.Errorf("node %d has %d CPUs, test requires %d", i, vmCPUs, cpus)
  1382  			}
  1383  		}
  1384  	}
  1385  	return nil
  1386  }
  1387  
  1388  // All returns a node list containing all of the nodes in the cluster.
  1389  func (c *cluster) All() nodeListOption {
  1390  	return c.Range(1, c.spec.NodeCount)
  1391  }
  1392  
   1393  // Range returns a node list containing the nodes in the range [begin, end].
  1394  func (c *cluster) Range(begin, end int) nodeListOption {
  1395  	if begin < 1 || end > c.spec.NodeCount {
  1396  		c.t.Fatalf("invalid node range: %d-%d (1-%d)", begin, end, c.spec.NodeCount)
  1397  	}
  1398  	r := make(nodeListOption, 0, 1+end-begin)
  1399  	for i := begin; i <= end; i++ {
  1400  		r = append(r, i)
  1401  	}
  1402  	return r
  1403  }
  1404  
   1405  // Node returns a node list containing only node i.
  1406  func (c *cluster) Node(i int) nodeListOption {
  1407  	return c.Range(i, i)
  1408  }
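
// Illustrative example: on a 4-node cluster, c.All() is {1, 2, 3, 4},
// c.Range(2, 3) is {2, 3} (String() == ":2-3"), and c.Node(1) is {1}. An
// out-of-range request fails the test via c.t.Fatalf.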
  1409  
  1410  // FetchLogs downloads the logs from the cluster using `roachprod get`.
  1411  // The logs will be placed in the test's artifacts dir.
  1412  func (c *cluster) FetchLogs(ctx context.Context) error {
  1413  	if c.spec.NodeCount == 0 {
   1414  		// A cluster with no nodes can happen during unit tests and implies nothing to do.
  1415  		return nil
  1416  	}
  1417  
  1418  	c.l.Printf("fetching logs\n")
  1419  	c.status("fetching logs")
  1420  
  1421  	// Don't hang forever if we can't fetch the logs.
  1422  	return contextutil.RunWithTimeout(ctx, "fetch logs", 2*time.Minute, func(ctx context.Context) error {
  1423  		path := filepath.Join(c.t.ArtifactsDir(), "logs")
  1424  		if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
  1425  			return err
  1426  		}
  1427  
  1428  		return execCmd(ctx, c.l, roachprod, "get", c.name, "logs" /* src */, path /* dest */)
  1429  	})
  1430  }
  1431  
   1432  // CopyRoachprodState copies the roachprod state directory into the test
  1433  // artifacts.
  1434  func (c *cluster) CopyRoachprodState(ctx context.Context) error {
  1435  	if c.spec.NodeCount == 0 {
   1436  		// A cluster with no nodes can happen during unit tests and implies nothing to do.
  1437  		return nil
  1438  	}
  1439  
  1440  	const roachprodStateDirName = ".roachprod"
  1441  	const roachprodStateName = "roachprod_state"
  1442  	u, err := user.Current()
  1443  	if err != nil {
  1444  		return errors.Wrap(err, "failed to get current user")
  1445  	}
  1446  	src := filepath.Join(u.HomeDir, roachprodStateDirName)
  1447  	dest := filepath.Join(c.t.ArtifactsDir(), roachprodStateName)
  1448  	cmd := exec.CommandContext(ctx, "cp", "-r", src, dest)
  1449  	output, err := cmd.CombinedOutput()
  1450  	return errors.Wrapf(err, "command %q failed: output: %v", cmd.Args, string(output))
  1451  }
  1452  
  1453  // FetchDebugZip downloads the debug zip from the cluster using `roachprod ssh`.
   1454  // The resulting zip file will be placed in the test's artifacts dir.
  1455  func (c *cluster) FetchDebugZip(ctx context.Context) error {
  1456  	if c.spec.NodeCount == 0 {
   1457  		// A cluster with no nodes can happen during unit tests and implies nothing to do.
  1458  		return nil
  1459  	}
  1460  
  1461  	c.l.Printf("fetching debug zip\n")
  1462  	c.status("fetching debug zip")
  1463  
  1464  	// Don't hang forever if we can't fetch the debug zip.
  1465  	return contextutil.RunWithTimeout(ctx, "debug zip", 5*time.Minute, func(ctx context.Context) error {
  1466  		const zipName = "debug.zip"
  1467  		path := filepath.Join(c.t.ArtifactsDir(), zipName)
  1468  		if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
  1469  			return err
  1470  		}
  1471  		// Some nodes might be down, so try to find one that works. We make the
  1472  		// assumption that a down node will refuse the connection, so it won't
  1473  		// waste our time.
  1474  		for i := 1; i <= c.spec.NodeCount; i++ {
  1475  			// `./cockroach debug zip` is noisy. Suppress the output unless it fails.
  1476  			si := strconv.Itoa(i)
  1477  			output, err := execCmdWithBuffer(ctx, c.l, roachprod, "ssh", c.name+":"+si, "--",
  1478  				"./cockroach", "debug", "zip", "--url", "{pgurl:"+si+"}", zipName)
  1479  			if err != nil {
  1480  				c.l.Printf("./cockroach debug zip failed: %s", output)
  1481  				if i < c.spec.NodeCount {
  1482  					continue
  1483  				}
  1484  				return err
  1485  			}
  1486  			return execCmd(ctx, c.l, roachprod, "get", c.name+":"+si, zipName /* src */, path /* dest */)
  1487  		}
  1488  		return nil
  1489  	})
  1490  }
  1491  
  1492  // FailOnDeadNodes fails the test if nodes that have a populated data dir are
  1493  // found to be not running. It prints both to t.l and the test output.
  1494  func (c *cluster) FailOnDeadNodes(ctx context.Context, t *test) {
  1495  	if c.spec.NodeCount == 0 {
   1496  		// A cluster with no nodes can happen during unit tests and implies nothing to do.
  1497  		return
  1498  	}
  1499  
  1500  	// Don't hang forever.
  1501  	_ = contextutil.RunWithTimeout(ctx, "detect dead nodes", time.Minute, func(ctx context.Context) error {
  1502  		output, err := execCmdWithBuffer(
  1503  			ctx, t.l, roachprod, "monitor", c.name, "--oneshot", "--ignore-empty-nodes",
  1504  		)
  1505  		// If there's an error, it means either that the monitor command failed
  1506  		// completely, or that it found a dead node worth complaining about.
  1507  		if err != nil {
  1508  			if ctx.Err() != nil {
  1509  				// Don't fail if we timed out.
  1510  				return nil
  1511  			}
  1512  			t.printfAndFail(0 /* skip */, "dead node detection: %s %s", err, output)
  1513  		}
  1514  		return nil
  1515  	})
  1516  }
  1517  
  1518  // CheckReplicaDivergenceOnDB runs a fast consistency check of the whole keyspace
   1519  // against the provided db. If an inconsistency is found, it is returned in the
   1520  // error. Note that errors returned directly from the consistency check query are
   1521  // swallowed, since such errors are frequently spurious and bear no relation to
   1522  // an actual inconsistency.
  1523  func (c *cluster) CheckReplicaDivergenceOnDB(ctx context.Context, db *gosql.DB) error {
  1524  	// NB: we set a statement_timeout since context cancellation won't work here,
  1525  	// see:
  1526  	// https://github.com/cockroachdb/cockroach/pull/34520
  1527  	//
  1528  	// We've seen the consistency checks hang indefinitely in some cases.
  1529  	rows, err := db.QueryContext(ctx, `
  1530  SET statement_timeout = '3m';
  1531  SELECT t.range_id, t.start_key_pretty, t.status, t.detail
  1532  FROM
  1533  crdb_internal.check_consistency(true, '', '') as t
  1534  WHERE t.status NOT IN ('RANGE_CONSISTENT', 'RANGE_INDETERMINATE')`)
  1535  	if err != nil {
  1536  		// TODO(tbg): the checks can fail for silly reasons like missing gossiped
  1537  		// descriptors, etc. -- not worth failing the test for. Ideally this would
  1538  		// be rock solid.
  1539  		c.l.Printf("consistency check failed with %v; ignoring", err)
  1540  		return nil
  1541  	}
  1542  	var finalErr error
  1543  	for rows.Next() {
  1544  		var rangeID int32
  1545  		var prettyKey, status, detail string
   1546  		if scanErr := rows.Scan(&rangeID, &prettyKey, &status, &detail); scanErr != nil {
  1547  			return scanErr
  1548  		}
  1549  		finalErr = errors.CombineErrors(finalErr,
  1550  			errors.Newf("r%d (%s) is inconsistent: %s %s\n", rangeID, prettyKey, status, detail))
  1551  	}
  1552  	if err := rows.Err(); err != nil {
  1553  		finalErr = errors.CombineErrors(finalErr, err)
  1554  	}
  1555  
  1556  	return finalErr
  1557  }
  1558  
  1559  // FailOnReplicaDivergence fails the test if
  1560  // crdb_internal.check_consistency(true, '', '') indicates that any ranges'
  1561  // replicas are inconsistent with each other. It uses the first node that
  1562  // is up to run the query.
  1563  func (c *cluster) FailOnReplicaDivergence(ctx context.Context, t *test) {
  1564  	if c.spec.NodeCount < 1 {
  1565  		return // unit tests
  1566  	}
  1567  
  1568  	// Find a live node to run against, if one exists.
  1569  	var db *gosql.DB
  1570  	for i := 1; i <= c.spec.NodeCount; i++ {
  1571  		// Don't hang forever.
  1572  		if err := contextutil.RunWithTimeout(
  1573  			ctx, "find live node", 5*time.Second,
  1574  			func(ctx context.Context) error {
  1575  				db = c.Conn(ctx, i)
  1576  				_, err := db.ExecContext(ctx, `;`)
  1577  				return err
  1578  			},
  1579  		); err != nil {
  1580  			_ = db.Close()
  1581  			db = nil
  1582  			continue
  1583  		}
  1584  		c.l.Printf("running (fast) consistency checks on node %d", i)
  1585  		break
  1586  	}
  1587  	if db == nil {
  1588  		c.l.Printf("no live node found, skipping consistency check")
  1589  		return
  1590  	}
  1591  	defer db.Close()
  1592  
  1593  	if err := contextutil.RunWithTimeout(
  1594  		ctx, "consistency check", time.Minute,
  1595  		func(ctx context.Context) error {
  1596  			return c.CheckReplicaDivergenceOnDB(ctx, db)
  1597  		},
  1598  	); err != nil {
  1599  		t.Fatal(err)
  1600  	}
  1601  }
  1602  
  1603  // FetchDmesg grabs the dmesg logs if possible. This requires being able to run
  1604  // `sudo dmesg` on the remote nodes.
  1605  func (c *cluster) FetchDmesg(ctx context.Context) error {
  1606  	if c.spec.NodeCount == 0 || c.isLocal() {
  1607  		// Having no nodes can happen during unit tests and implies there is
  1608  		// nothing to do. Also, don't grab dmesg on local runs.
  1609  		return nil
  1610  	}
  1611  
  1612  	c.l.Printf("fetching dmesg\n")
  1613  	c.status("fetching dmesg")
  1614  
  1615  	// Don't hang forever.
  1616  	return contextutil.RunWithTimeout(ctx, "dmesg", 20*time.Second, func(ctx context.Context) error {
  1617  		const name = "dmesg.txt"
  1618  		path := filepath.Join(c.t.ArtifactsDir(), name)
  1619  		if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
  1620  			return err
  1621  		}
  1622  		if err := execCmd(
  1623  			ctx, c.l, roachprod, "ssh", c.name, "--",
  1624  			"/bin/bash", "-c", "'sudo dmesg > "+name+"'",
  1625  		); err != nil {
  1626  			// Don't error out because it might've worked on some nodes. Fetching will
  1627  			// error out below but will get everything it can first.
  1628  			c.l.Printf("during dmesg fetching: %s", err)
  1629  		}
  1630  		return execCmd(ctx, c.l, roachprod, "get", c.name, name /* src */, path /* dest */)
  1631  	})
  1632  }
  1633  
  1634  // FetchJournalctl grabs the journalctl logs if possible. This requires being
  1635  // able to run `sudo journalctl` on the remote nodes.
  1636  func (c *cluster) FetchJournalctl(ctx context.Context) error {
  1637  	if c.spec.NodeCount == 0 || c.isLocal() {
  1638  		// Having no nodes can happen during unit tests and implies there is
  1639  		// nothing to do. Also, don't grab journalctl on local runs.
  1640  		return nil
  1641  	}
  1642  
  1643  	c.l.Printf("fetching journalctl\n")
  1644  	c.status("fetching journalctl")
  1645  
  1646  	// Don't hang forever.
  1647  	return contextutil.RunWithTimeout(ctx, "journalctl", 20*time.Second, func(ctx context.Context) error {
  1648  		const name = "journalctl.txt"
  1649  		path := filepath.Join(c.t.ArtifactsDir(), name)
  1650  		if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
  1651  			return err
  1652  		}
  1653  		if err := execCmd(
  1654  			ctx, c.l, roachprod, "ssh", c.name, "--",
  1655  			"/bin/bash", "-c", "'sudo journalctl > "+name+"'",
  1656  		); err != nil {
  1657  			// Don't error out because it might've worked on some nodes. Fetching will
  1658  			// error out below but will get everything it can first.
  1659  			c.l.Printf("during journalctl fetching: %s", err)
  1660  		}
  1661  		return execCmd(ctx, c.l, roachprod, "get", c.name, name /* src */, path /* dest */)
  1662  	})
  1663  }
  1664  
  1665  // FetchCores fetches any core files on the cluster.
  1666  func (c *cluster) FetchCores(ctx context.Context) error {
  1667  	if c.spec.NodeCount == 0 || c.isLocal() {
  1668  		// Having no nodes can happen during unit tests and implies there is
  1669  		// nothing to do. Also, don't grab cores on local runs.
  1670  		return nil
  1671  	}
  1672  
  1673  	if true {
  1674  		// TeamCity does not handle giant artifacts well. We'd generally profit
  1675  		// from having the cores, but we should push them straight into a temp
  1676  		// bucket on S3 instead. OTOH, the ROI of this may be low; I don't know
  1677  		// of a recent example where we've wanted the core dumps.
  1678  		c.l.Printf("skipped fetching cores\n")
  1679  		return nil
  1680  	}
  1681  
  1682  	c.l.Printf("fetching cores\n")
  1683  	c.status("fetching cores")
  1684  
  1685  	// Don't hang forever. The core files can be large, so we give a generous
  1686  	// timeout.
  1687  	return contextutil.RunWithTimeout(ctx, "cores", 60*time.Second, func(ctx context.Context) error {
  1688  		path := filepath.Join(c.t.ArtifactsDir(), "cores")
  1689  		return execCmd(ctx, c.l, roachprod, "get", c.name, "/mnt/data1/cores" /* src */, path /* dest */)
  1690  	})
  1691  }
  1692  
  1693  type closeLoggerOpt bool
  1694  
  1695  const (
  1696  	closeLogger     closeLoggerOpt = true
  1697  	dontCloseLogger                = false
  1698  )
  1699  
  1700  // Destroy calls `roachprod destroy` or `roachprod wipe` on the cluster.
  1701  // If called while another Destroy() or destroyInner() is in progress, the call
  1702  // blocks until that first call finishes.
  1703  //
  1704  // If c.Save() had previously been called, then Destroy() will not actually
  1705  // touch the cluster. It might still close c.l, though.
  1706  //
  1707  // Cluster destruction errors are swallowed.
  1708  //
  1709  // lo specifies if c.l should be closed or not. If c.l may still be in use by a
  1710  // test (i.e. if this Destroy is happening because of a timeout or a signal),
  1711  // then we don't want to close the logger.
  1712  // l is the logger that will log this destroy operation.
  1713  //
  1714  // This method generally does not react to ctx cancelation.
  1715  func (c *cluster) Destroy(ctx context.Context, lo closeLoggerOpt, l *logger) {
  1716  	if ctx.Err() != nil {
  1717  		return
  1718  	}
  1719  	if c.spec.NodeCount == 0 {
  1720  		// Having no nodes can happen during unit tests and implies there is not much to do.
  1721  		c.r.unregisterCluster(c)
  1722  		return
  1723  	}
  1724  
  1725  	ch := c.doDestroy(ctx, l)
  1726  	<-ch
  1727  	// NB: Closing the logger without waiting on c.destroyState.destroyed above
  1728  	// would be bad because we might cause the ongoing `roachprod destroy` to fail
  1729  	// by closing its stdout/stderr.
  1730  	if lo == closeLogger && c.l != nil {
  1731  		c.closeLogger()
  1732  	}
  1733  }
  1734  
  1735  func (c *cluster) doDestroy(ctx context.Context, l *logger) <-chan struct{} {
  1736  	var inFlight <-chan struct{}
  1737  	c.destroyState.mu.Lock()
  1738  	if c.destroyState.mu.saved {
  1739  		// Nothing to do. Short-circuit.
  1740  		c.destroyState.mu.Unlock()
  1741  		ch := make(chan struct{})
  1742  		close(ch)
  1743  		return ch
  1744  	}
  1745  	if c.destroyState.mu.destroyed == nil {
  1746  		c.destroyState.mu.destroyed = make(chan struct{})
  1747  	} else {
  1748  		inFlight = c.destroyState.mu.destroyed
  1749  	}
  1750  	c.destroyState.mu.Unlock()
  1751  	if inFlight != nil {
  1752  		return inFlight
  1753  	}
  1754  
  1755  	if clusterWipe {
  1756  		if c.destroyState.owned {
  1757  			l.PrintfCtx(ctx, "destroying cluster %s...", c)
  1758  			c.status("destroying cluster")
  1759  			// We use a non-cancelable context for running this command. Once we got
  1760  			// here, the cluster cannot be destroyed again, so we really want this
  1761  			// command to succeed.
  1762  			if err := execCmd(context.Background(), l, roachprod, "destroy", c.name); err != nil {
  1763  				l.ErrorfCtx(ctx, "error destroying cluster %s: %s", c, err)
  1764  			} else {
  1765  				l.PrintfCtx(ctx, "destroying cluster %s... done", c)
  1766  			}
  1767  			c.destroyState.alloc.Release()
  1768  		} else {
  1769  			l.PrintfCtx(ctx, "wiping cluster %s", c)
  1770  			c.status("wiping cluster")
  1771  			if err := execCmd(ctx, l, roachprod, "wipe", c.name); err != nil {
  1772  				l.Errorf("%s", err)
  1773  			}
  1774  		}
  1775  	} else {
  1776  		l.Printf("skipping cluster wipe\n")
  1777  	}
  1778  	c.r.unregisterCluster(c)
  1779  	c.destroyState.mu.Lock()
  1780  	ch := c.destroyState.mu.destroyed
  1781  	close(ch)
  1782  	c.destroyState.mu.Unlock()
  1783  	return ch
  1784  }
  1785  
  1786  // LoggedCommand returns a command whose output is redirected to the logs
  1787  // instead of to os.Stdout (which doesn't go anywhere I've been able to find).
  1788  // Don't use this if you're going to call cmd.CombinedOutput or cmd.Output.
  1789  func (c *cluster) LoggedCommand(ctx context.Context, arg0 string, args ...string) *exec.Cmd {
  1790  	cmd := exec.CommandContext(ctx, arg0, args...)
  1791  	cmd.Stdout = c.l.stdout
  1792  	cmd.Stderr = c.l.stderr
  1793  	return cmd
  1794  }
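
        // A minimal usage sketch, assuming ctx and c are in scope inside a test body
        // (the command itself is illustrative):
        //
        //	cmd := c.LoggedCommand(ctx, "du", "-sh", ".")
        //	if err := cmd.Run(); err != nil {
        //		c.t.Fatal(err) // stdout/stderr already went to c.l
        //	}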
  1795  
  1796  // Put copies a local file to all of the machines in a cluster.
  1797  // Put is DEPRECATED. Use PutE instead.
  1798  func (c *cluster) Put(ctx context.Context, src, dest string, opts ...option) {
  1799  	if err := c.PutE(ctx, c.l, src, dest, opts...); err != nil {
  1800  		c.t.Fatal(err)
  1801  	}
  1802  }
  1803  
  1804  // PutE puts a local file to all of the machines in a cluster.
  1805  func (c *cluster) PutE(ctx context.Context, l *logger, src, dest string, opts ...option) error {
  1806  	if ctx.Err() != nil {
  1807  		return errors.Wrap(ctx.Err(), "cluster.Put")
  1808  	}
  1809  
  1810  	c.status("uploading binary")
  1811  	defer c.status("")
  1812  
  1813  	err := execCmd(ctx, c.l, roachprod, "put", c.makeNodes(opts...), src, dest)
  1814  	if err != nil {
  1815  		return errors.Wrap(err, "cluster.Put")
  1816  	}
  1817  	return nil
  1818  }
  1819  
  1820  // Get gets files from remote hosts.
  1821  func (c *cluster) Get(ctx context.Context, l *logger, src, dest string, opts ...option) error {
  1822  	if ctx.Err() != nil {
  1823  		return errors.Wrap(ctx.Err(), "cluster.Get error")
  1824  	}
  1825  	c.status(fmt.Sprintf("getting %v", src))
  1826  	defer c.status("")
  1827  	return errors.Wrap(
  1828  		execCmd(ctx, l, roachprod, "get", c.makeNodes(opts...), src, dest),
  1829  		"cluster.Get error")
  1830  
  1831  }
  1832  
  1833  // PutString writes a string to the specified file on the remote(s).
  1834  func (c *cluster) PutString(
  1835  	ctx context.Context, content, dest string, mode os.FileMode, opts ...option,
  1836  ) error {
  1837  	if ctx.Err() != nil {
  1838  		return errors.Wrap(ctx.Err(), "cluster.PutString error")
  1839  	}
  1840  	c.status("uploading string")
  1841  	defer c.status("")
  1842  
  1843  	temp, err := ioutil.TempFile("", filepath.Base(dest))
  1844  	if err != nil {
  1845  		return errors.Wrap(err, "PutString")
  1846  	}
  1847  	if _, err := temp.WriteString(content); err != nil {
  1848  		return errors.Wrap(err, "PutString")
  1849  	}
  1850  	temp.Close()
  1851  	src := temp.Name()
  1852  
  1853  	if err := os.Chmod(src, mode); err != nil {
  1854  		return errors.Wrap(err, "PutString")
  1855  	}
  1856  	// NB: we intentionally don't remove the temp files. This is because roachprod
  1857  	// will symlink them when running locally.
  1858  
  1859  	if err := execCmd(ctx, c.l, roachprod, "put", c.makeNodes(opts...), src, dest); err != nil {
  1860  		return errors.Wrap(err, "PutString")
  1861  	}
  1862  	return nil
  1863  }
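
        // A usage sketch for PutString, assuming ctx and c are in scope; the script
        // contents and destination path are made up for illustration:
        //
        //	script := "#!/usr/bin/env bash\necho hello\n"
        //	if err := c.PutString(ctx, script, "./hello.sh", 0755, c.Node(1)); err != nil {
        //		c.t.Fatal(err)
        //	}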
  1864  
  1865  // GitClone clones a git repo from src into dest and checks out origin's
  1866  // version of the given branch. The src, dest, and branch arguments must not
  1867  // contain shell special characters.
  1868  func (c *cluster) GitClone(
  1869  	ctx context.Context, l *logger, src, dest, branch string, node nodeListOption,
  1870  ) error {
  1871  	return c.RunL(ctx, l, node, "bash", "-e", "-c", fmt.Sprintf(`'
  1872  if ! test -d %s; then
  1873    git clone -b %s --depth 1 %s %s
  1874  else
  1875    cd %s
  1876    git fetch origin
  1877    git checkout origin/%s
  1878  fi
  1879  '`, dest,
  1880  		branch, src, dest,
  1881  		dest,
  1882  		branch))
  1883  }
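
        // A usage sketch for GitClone, assuming ctx, t, and c are in scope; the
        // repository URL, destination, and branch are illustrative only:
        //
        //	if err := c.GitClone(
        //		ctx, t.l, "https://github.com/cockroachdb/cockroach.git",
        //		"/mnt/data1/cockroach", "master", c.Node(1),
        //	); err != nil {
        //		t.Fatal(err)
        //	}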
  1884  
  1885  // startArgs specifies extra arguments that are passed to `roachprod` during `c.Start`.
  1886  func startArgs(extraArgs ...string) option {
  1887  	return roachprodArgOption(extraArgs)
  1888  }
  1889  
  1890  // startArgsDontEncrypt will pass '--encrypt=false' to roachprod regardless of the
  1891  // --encrypt flag on roachtest. This is useful for tests that cannot pass with
  1892  // encryption enabled.
  1893  var startArgsDontEncrypt = startArgs("--encrypt=false")
  1894  
  1895  // racks is an option which specifies the number of racks to partition the nodes
  1896  // into.
  1897  func racks(n int) option {
  1898  	return startArgs(fmt.Sprintf("--racks=%d", n))
  1899  }
  1900  
  1901  // stopArgs specifies extra arguments that are passed to `roachprod` during `c.Stop`.
  1902  func stopArgs(extraArgs ...string) option {
  1903  	return roachprodArgOption(extraArgs)
  1904  }
  1905  
  1906  type roachprodArgOption []string
  1907  
  1908  func (o roachprodArgOption) option() {}
  1909  
  1910  func roachprodArgs(opts []option) []string {
  1911  	var args []string
  1912  	for _, opt := range opts {
  1913  		a, ok := opt.(roachprodArgOption)
  1914  		if !ok {
  1915  			continue
  1916  		}
  1917  		args = append(args, ([]string)(a)...)
  1918  	}
  1919  	return args
  1920  }
  1921  
  1922  // Restart restarts the specified cockroach node. It takes a test and, on error,
  1923  // calls t.Fatal().
  1924  func (c *cluster) Restart(ctx context.Context, t *test, node nodeListOption) {
  1925  	// We bound the time taken to restart a node through roachprod. Because
  1926  	// roachprod uses SSH, it's particularly vulnerable to network flakiness (as
  1927  	// seen in #35326) and may stall indefinitely. Setting up timeouts better
  1928  	// surfaces this kind of failure.
  1929  	//
  1930  	// TODO(irfansharif): The underlying issue here is the fact that we're running
  1931  	// roachprod commands that may (reasonably) fail due to connection issues, and
  1932  	// we're unable to retry them safely (the underlying commands are
  1933  	// non-idempotent). Presently we simply fail the entire test, when really we
  1934  	// should be able to retry the specific roachprod commands.
  1935  	var cancel func()
  1936  	ctx, cancel = context.WithTimeout(ctx, 30*time.Second)
  1937  	c.Stop(ctx, node)
  1938  	c.Start(ctx, t, node)
  1939  	cancel()
  1940  }
  1941  
  1942  // StartE starts cockroach nodes on a subset of the cluster. The nodes parameter
  1943  // can either be a specific node, empty (to indicate all nodes), or a pair of
  1944  // nodes indicating a range.
  1945  func (c *cluster) StartE(ctx context.Context, opts ...option) error {
  1946  	// If the test failed (indicated by a canceled ctx), short-circuit.
  1947  	if ctx.Err() != nil {
  1948  		return errors.Wrap(ctx.Err(), "cluster.StartE")
  1949  	}
  1953  	c.status("starting cluster")
  1954  	defer c.status()
  1955  	args := []string{
  1956  		roachprod,
  1957  		"start",
  1958  	}
  1959  	args = append(args, roachprodArgs(opts)...)
  1960  	args = append(args, c.makeNodes(opts...))
  1961  	if !argExists(args, "--encrypt") && c.encryptDefault {
  1962  		args = append(args, "--encrypt")
  1963  	}
  1964  	return execCmd(ctx, c.l, args...)
  1965  }
  1966  
  1967  // Start is like StartE() except it takes a test and, on error, calls t.Fatal().
  1968  func (c *cluster) Start(ctx context.Context, t *test, opts ...option) {
  1969  	FatalIfErr(t, c.StartE(ctx, opts...))
  1970  }
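
        // A sketch of how the option helpers above (startArgs, racks) combine with
        // node selectors when starting a subset of the cluster; the extra flag values
        // are illustrative:
        //
        //	c.Start(ctx, t, c.Range(1, 3), racks(3), startArgs("--args=--vmodule=replica=2"))
        //
        // startArgs values are forwarded verbatim to `roachprod start`, while the node
        // selector is folded into the cluster/node spec via c.makeNodes.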
  1971  
  1972  func argExists(args []string, target string) bool {
  1973  	for _, arg := range args {
  1974  		if arg == target || strings.HasPrefix(arg, target+"=") {
  1975  			return true
  1976  		}
  1977  	}
  1978  	return false
  1979  }
  1980  
  1981  // StopE stops cockroach nodes running on a subset of the cluster. See cluster.Start()
  1982  // for a description of the nodes parameter.
  1983  func (c *cluster) StopE(ctx context.Context, opts ...option) error {
  1984  	if ctx.Err() != nil {
  1985  		return errors.Wrap(ctx.Err(), "cluster.StopE")
  1986  	}
  1987  	args := []string{
  1988  		roachprod,
  1989  		"stop",
  1990  	}
  1991  	args = append(args, roachprodArgs(opts)...)
  1992  	args = append(args, c.makeNodes(opts...))
  1993  	c.status("stopping cluster")
  1994  	defer c.status()
  1995  	return execCmd(ctx, c.l, args...)
  1996  }
  1997  
  1998  // Stop is like StopE, except instead of returning an error, it does
  1999  // c.t.Fatal(). c.t needs to be set.
  2000  func (c *cluster) Stop(ctx context.Context, opts ...option) {
  2001  	if c.t.Failed() {
  2002  		// If the test has failed, don't try to limp along.
  2003  		return
  2004  	}
  2005  	if err := c.StopE(ctx, opts...); err != nil {
  2006  		c.t.Fatal(err)
  2007  	}
  2008  }
  2009  
  2010  // WipeE wipes a subset of the nodes in a cluster. See cluster.Start() for a
  2011  // description of the nodes parameter.
  2012  func (c *cluster) WipeE(ctx context.Context, l *logger, opts ...option) error {
  2013  	if ctx.Err() != nil {
  2014  		return errors.Wrap(ctx.Err(), "cluster.WipeE")
  2015  	}
  2016  	if c.spec.NodeCount == 0 {
  2017  		// For tests.
  2018  		return nil
  2019  	}
  2020  	c.status("wiping cluster")
  2021  	defer c.status()
  2022  	return execCmd(ctx, l, roachprod, "wipe", c.makeNodes(opts...))
  2023  }
  2024  
  2025  // Wipe is like WipeE, except instead of returning an error, it does
  2026  // c.t.Fatal(). c.t needs to be set.
  2027  func (c *cluster) Wipe(ctx context.Context, opts ...option) {
  2028  	if ctx.Err() != nil {
  2029  		return
  2030  	}
  2031  	if err := c.WipeE(ctx, c.l, opts...); err != nil {
  2032  		c.t.Fatal(err)
  2033  	}
  2034  }
  2035  
  2036  // Run runs a command on the specified node, calling c.t.Fatal on error.
  2037  func (c *cluster) Run(ctx context.Context, node nodeListOption, args ...string) {
  2038  	err := c.RunE(ctx, node, args...)
  2039  	if err != nil {
  2040  		c.t.Fatal(err)
  2041  	}
  2042  }
  2043  
  2044  // Reformat reformats the disk on the specified node.
  2045  func (c *cluster) Reformat(ctx context.Context, node nodeListOption, args ...string) {
  2046  	err := execCmd(ctx, c.l,
  2047  		append([]string{roachprod, "reformat", c.makeNodes(node), "--"}, args...)...)
  2048  	if err != nil {
  2049  		c.t.Fatal(err)
  2050  	}
  2051  }
  2052  
  2053  // Silence unused warning.
  2054  var _ = (&cluster{}).Reformat
  2055  
  2056  // Install installs a package on the specified nodes.
  2057  func (c *cluster) Install(
  2058  	ctx context.Context, l *logger, node nodeListOption, args ...string,
  2059  ) error {
  2060  	return execCmd(ctx, l,
  2061  		append([]string{roachprod, "install", c.makeNodes(node), "--"}, args...)...)
  2062  }
  2063  
  2064  var reOnlyAlphanumeric = regexp.MustCompile(`[^a-zA-Z0-9]+`)
  2065  
  2066  // cmdLogFileName comes up with a log file to use for the given argument string.
  2067  func cmdLogFileName(t time.Time, nodes nodeListOption, args ...string) string {
  2068  	// Make sure we treat {"./cockroach start"} like {"./cockroach", "start"}.
  2069  	args = strings.Split(strings.Join(args, " "), " ")
  2070  	prefix := []string{reOnlyAlphanumeric.ReplaceAllString(args[0], "")}
  2071  	for _, arg := range args[1:] {
  2072  		if s := reOnlyAlphanumeric.ReplaceAllString(arg, ""); s != arg {
  2073  			break
  2074  		}
  2075  		prefix = append(prefix, arg)
  2076  	}
  2077  	s := strings.Join(prefix, "_")
  2078  	const maxLen = 70
  2079  	if len(s) > maxLen {
  2080  		s = s[:maxLen]
  2081  	}
  2082  	logFile := fmt.Sprintf(
  2083  		"run_%s_n%s_%s",
  2084  		t.Format(`150405.000`),
  2085  		nodes.String()[1:],
  2086  		s,
  2087  	)
  2088  	return logFile
  2089  }
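
        // As a worked example, assuming c.Node(1).String() renders as ":1", a call at
        // 15:04:05.000 such as
        //
        //	cmdLogFileName(timeutil.Now(), c.Node(1), "./cockroach", "version")
        //
        // produces a name along the lines of "run_150405.000_n1_cockroach_version".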
  2090  
  2091  // RunE runs a command on the specified node, returning an error. The output
  2092  // will be redirected to a file which is logged via the cluster-wide logger in
  2093  // case of an error. Logs will sort chronologically and those belonging to
  2094  // failing invocations will be suffixed `.failed.log`.
  2095  func (c *cluster) RunE(ctx context.Context, node nodeListOption, args ...string) error {
  2096  	cmdString := strings.Join(args, " ")
  2097  	logFile := cmdLogFileName(timeutil.Now(), node, args...)
  2098  
  2099  	// NB: we set no prefix because it's only going to a file anyway.
  2100  	l, err := c.l.ChildLogger(logFile, quietStderr, quietStdout)
  2101  	if err != nil {
  2102  		return err
  2103  	}
  2104  	c.l.PrintfCtx(ctx, "> %s", cmdString)
  2105  	err = c.RunL(ctx, l, node, args...)
  2106  	l.Printf("> result: %+v", err)
  2107  	if err := ctx.Err(); err != nil {
  2108  		l.Printf("(note: incoming context was canceled: %s)", err)
  2109  	}
  2110  	physicalFileName := l.file.Name()
  2111  	l.close()
  2112  	if err != nil {
  2113  		_ = os.Rename(physicalFileName, strings.TrimSuffix(physicalFileName, ".log")+".failed.log")
  2114  	}
  2115  	err = errors.Wrapf(err, "output in %s", logFile)
  2116  	return err
  2117  }
  2118  
  2119  // RunL runs a command on the specified node, returning an error.
  2120  func (c *cluster) RunL(ctx context.Context, l *logger, node nodeListOption, args ...string) error {
  2121  	if err := errors.Wrap(ctx.Err(), "cluster.RunL"); err != nil {
  2122  		return err
  2123  	}
  2124  	return execCmd(ctx, l,
  2125  		append([]string{roachprod, "run", c.makeNodes(node), "--"}, args...)...)
  2126  }
  2127  
  2128  // RunWithBuffer runs a command on the specified node, returning the resulting combined stderr
  2129  // and stdout or an error.
  2130  func (c *cluster) RunWithBuffer(
  2131  	ctx context.Context, l *logger, node nodeListOption, args ...string,
  2132  ) ([]byte, error) {
  2133  	if err := errors.Wrap(ctx.Err(), "cluster.RunWithBuffer"); err != nil {
  2134  		return nil, err
  2135  	}
  2136  	return execCmdWithBuffer(ctx, l,
  2137  		append([]string{roachprod, "run", c.makeNodes(node), "--"}, args...)...)
  2138  }
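
        // A usage sketch contrasting RunE and RunWithBuffer, assuming ctx and c are in
        // scope (the command is illustrative):
        //
        //	// Output lands in a per-invocation log file under the artifacts dir.
        //	if err := c.RunE(ctx, c.Node(1), "./cockroach", "version"); err != nil {
        //		c.t.Fatal(err)
        //	}
        //	// Capture combined stdout/stderr for inspection instead.
        //	out, err := c.RunWithBuffer(ctx, c.l, c.Node(1), "./cockroach", "version")
        //	if err != nil {
        //		c.t.Fatal(err)
        //	}
        //	c.l.Printf("%s", out)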
  2139  
  2140  // pgURL returns the Postgres endpoint for the specified node. It accepts a flag
  2141  // specifying whether the URL should include the node's internal or external IP
  2142  // address. In general, inter-cluster communication should use internal IPs,
  2143  // and communication from a test driver to nodes in a cluster should use
  2144  // external IPs.
  2145  func (c *cluster) pgURL(ctx context.Context, node nodeListOption, external bool) []string {
  2146  	args := []string{roachprod, "pgurl"}
  2147  	if external {
  2148  		args = append(args, `--external`)
  2149  	}
  2150  	nodes := c.makeNodes(node)
  2151  	args = append(args, nodes)
  2152  	cmd := execCmdEx(ctx, c.l, args...)
  2153  	if cmd.err != nil {
  2154  		c.t.Fatal(errors.Wrapf(cmd.err, "failed to get pgurl for nodes: %s", nodes))
  2155  	}
  2156  	urls := strings.Split(strings.TrimSpace(cmd.stdout), " ")
  2157  	if len(urls) != len(node) {
  2158  		c.t.Fatalf(
  2159  			"pgurl for nodes %v got urls %v from stdout:\n%s\nstderr:\n%s",
  2160  			node, urls, cmd.stdout, cmd.stderr,
  2161  		)
  2162  	}
  2163  	for i := range urls {
  2164  		urls[i] = strings.Trim(urls[i], "'")
  2165  		if urls[i] == "" {
  2166  			c.t.Fatalf(
  2167  				"pgurl for nodes %v empty: got %v from\nstdout:\n%s\nstderr:\n%s",
  2168  				node, urls, cmd.stdout, cmd.stderr,
  2169  			)
  2170  		}
  2171  	}
  2172  	return urls
  2173  }
  2174  
  2175  // InternalPGUrl returns the internal Postgres endpoint for the specified nodes.
  2176  func (c *cluster) InternalPGUrl(ctx context.Context, node nodeListOption) []string {
  2177  	return c.pgURL(ctx, node, false /* external */)
  2178  }
  2179  
  2180  // Silence unused warning.
  2181  var _ = (&cluster{}).InternalPGUrl
  2182  
  2183  // ExternalPGUrl returns the external Postgres endpoint for the specified nodes.
  2184  func (c *cluster) ExternalPGUrl(ctx context.Context, node nodeListOption) []string {
  2185  	return c.pgURL(ctx, node, true /* external */)
  2186  }
  2187  
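        // addrToAdminUIAddr converts a SQL address to the corresponding Admin UI
        // address by bumping the port; for example, "10.0.0.1:26257" becomes
        // "10.0.0.1:26258", per roachprod's port+1 convention noted below.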
  2188  func addrToAdminUIAddr(c *cluster, addr string) string {
  2189  	host, port, err := net.SplitHostPort(addr)
  2190  	if err != nil {
  2191  		c.t.Fatal(err)
  2192  	}
  2193  	webPort, err := strconv.Atoi(port)
  2194  	if err != nil {
  2195  		c.t.Fatal(err)
  2196  	}
  2197  	// Roachprod serves the Admin UI on the node's port + 1.
  2198  	return fmt.Sprintf("%s:%d", host, webPort+1)
  2199  }
  2200  
  2201  func urlToAddr(c *cluster, pgURL string) string {
  2202  	u, err := url.Parse(pgURL)
  2203  	if err != nil {
  2204  		c.t.Fatal(err)
  2205  	}
  2206  	return u.Host
  2207  }
  2208  
  2209  func addrToHost(c *cluster, addr string) string {
  2210  	host, _ := addrToHostPort(c, addr)
  2211  	return host
  2212  }
  2213  
  2214  func addrToHostPort(c *cluster, addr string) (string, int) {
  2215  	host, portStr, err := net.SplitHostPort(addr)
  2216  	if err != nil {
  2217  		c.t.Fatal(err)
  2218  	}
  2219  	port, err := strconv.Atoi(portStr)
  2220  	if err != nil {
  2221  		c.t.Fatal(err)
  2222  	}
  2223  	return host, port
  2224  }
  2225  
  2226  // InternalAdminUIAddr returns the internal Admin UI address in the form host:port
  2227  // for the specified node.
  2228  func (c *cluster) InternalAdminUIAddr(ctx context.Context, node nodeListOption) []string {
  2229  	var addrs []string
  2230  	for _, u := range c.InternalAddr(ctx, node) {
  2231  		addrs = append(addrs, addrToAdminUIAddr(c, u))
  2232  	}
  2233  	return addrs
  2234  }
  2235  
  2236  // ExternalAdminUIAddr returns the external Admin UI address in the form host:port
  2237  // for the specified nodes.
  2238  func (c *cluster) ExternalAdminUIAddr(ctx context.Context, node nodeListOption) []string {
  2239  	var addrs []string
  2240  	for _, u := range c.ExternalAddr(ctx, node) {
  2241  		addrs = append(addrs, addrToAdminUIAddr(c, u))
  2242  	}
  2243  	return addrs
  2244  }
  2245  
  2246  // InternalAddr returns the internal address in the form host:port for the
  2247  // specified nodes.
  2248  func (c *cluster) InternalAddr(ctx context.Context, node nodeListOption) []string {
  2249  	var addrs []string
  2250  	for _, u := range c.pgURL(ctx, node, false /* external */) {
  2251  		addrs = append(addrs, urlToAddr(c, u))
  2252  	}
  2253  	return addrs
  2254  }
  2255  
  2256  // InternalIP returns the internal IP addresses for the specified nodes.
  2257  func (c *cluster) InternalIP(ctx context.Context, node nodeListOption) []string {
  2258  	var ips []string
  2259  	for _, addr := range c.InternalAddr(ctx, node) {
  2260  		ips = append(ips, addrToHost(c, addr))
  2261  	}
  2262  	return ips
  2263  }
  2264  
  2265  // ExternalAddr returns the external address in the form host:port for the
  2266  // specified nodes.
  2267  func (c *cluster) ExternalAddr(ctx context.Context, node nodeListOption) []string {
  2268  	var addrs []string
  2269  	for _, u := range c.pgURL(ctx, node, true /* external */) {
  2270  		addrs = append(addrs, urlToAddr(c, u))
  2271  	}
  2272  	return addrs
  2273  }
  2274  
  2275  // ExternalIP returns the external IP addresses for the specified nodes.
  2276  func (c *cluster) ExternalIP(ctx context.Context, node nodeListOption) []string {
  2277  	var ips []string
  2278  	for _, addr := range c.ExternalAddr(ctx, node) {
  2279  		ips = append(ips, addrToHost(c, addr))
  2280  	}
  2281  	return ips
  2282  }
  2283  
  2284  // Silence unused warning.
  2285  var _ = (&cluster{}).ExternalIP
  2286  
  2287  // Conn returns a SQL connection to the specified node.
  2288  func (c *cluster) Conn(ctx context.Context, node int) *gosql.DB {
  2289  	url := c.ExternalPGUrl(ctx, c.Node(node))[0]
  2290  	db, err := gosql.Open("postgres", url)
  2291  	if err != nil {
  2292  		c.t.Fatal(err)
  2293  	}
  2294  	return db
  2295  }
  2296  
  2297  // ConnE is like Conn but returns an error instead of calling c.t.Fatal on failure.
  2298  func (c *cluster) ConnE(ctx context.Context, node int) (*gosql.DB, error) {
  2299  	url := c.ExternalPGUrl(ctx, c.Node(node))[0]
  2300  	db, err := gosql.Open("postgres", url)
  2301  	if err != nil {
  2302  		return nil, err
  2303  	}
  2304  	return db, nil
  2305  }
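
        // A usage sketch, assuming ctx and c are in scope inside a test body; the
        // query is illustrative:
        //
        //	db, err := c.ConnE(ctx, 1)
        //	if err != nil {
        //		c.t.Fatal(err)
        //	}
        //	defer db.Close()
        //	var n int
        //	if err := db.QueryRowContext(ctx, "SELECT count(*) FROM system.users").Scan(&n); err != nil {
        //		c.t.Fatal(err)
        //	}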
  2306  
  2307  func (c *cluster) makeNodes(opts ...option) string {
  2308  	var r nodeListOption
  2309  	for _, o := range opts {
  2310  		if s, ok := o.(nodeSelector); ok {
  2311  			r = s.merge(r)
  2312  		}
  2313  	}
  2314  	return c.name + r.String()
  2315  }
  2316  
  2317  func (c *cluster) isLocal() bool {
  2318  	return c.name == "local"
  2319  }
  2320  
  2321  // Extend extends the cluster's expiration by d, after truncating d to minute
  2322  // granularity.
  2323  func (c *cluster) Extend(ctx context.Context, d time.Duration, l *logger) error {
  2324  	if ctx.Err() != nil {
  2325  		return errors.Wrap(ctx.Err(), "cluster.Extend")
  2326  	}
  2327  	minutes := int(d.Minutes())
  2328  	l.PrintfCtx(ctx, "extending cluster by %d minutes", minutes)
  2329  	if out, err := execCmdWithBuffer(ctx, l, roachprod, "extend", c.name,
  2330  		fmt.Sprintf("--lifetime=%dm", minutes),
  2331  	); err != nil {
  2332  		l.PrintfCtx(ctx, "roachprod extend failed: %s", out)
  2333  		return errors.Wrap(err, "roachprod extend failed")
  2334  	}
  2335  	// Update c.expiration. Keep it under the real expiration.
  2336  	c.expiration = c.expiration.Add(time.Duration(minutes-1) * time.Minute)
  2337  	return nil
  2338  }
  2339  
  2340  // getDiskUsageInBytes returns the disk usage of the node's store directory, in bytes. nodeIdx starts at one.
  2341  func getDiskUsageInBytes(
  2342  	ctx context.Context, c *cluster, logger *logger, nodeIdx int,
  2343  ) (int, error) {
  2344  	var out []byte
  2345  	for {
  2346  		if c.t.Failed() {
  2347  			return 0, errors.New("already failed")
  2348  		}
  2349  		var err error
  2350  		// `du` can warn if files get removed out from under it (which
  2351  		// happens during RocksDB compactions, for example). Discard its
  2352  		// stderr to avoid breaking Atoi later.
  2353  		// TODO(bdarnell): Refactor this stack to not combine stdout and
  2354  		// stderr so we don't need to do this (and the Warning check
  2355  		// below).
  2356  		out, err = c.RunWithBuffer(ctx, logger, c.Node(nodeIdx),
  2357  			"du -sk {store-dir} 2>/dev/null | grep -oE '^[0-9]+'")
  2358  		if err != nil {
  2359  			if ctx.Err() != nil {
  2360  				return 0, ctx.Err()
  2361  			}
  2362  			// If `du` fails, retry.
  2363  			// TODO(bdarnell): is this worth doing? It was originally added
  2364  			// because of the "files removed out from under it" problem, but
  2365  			// that doesn't result in a command failure, just a stderr
  2366  			// message.
  2367  			logger.Printf("retrying disk usage computation after spurious error: %s", err)
  2368  			continue
  2369  		}
  2370  		break
  2371  	}
  2372  
  2373  	str := string(out)
  2374  	// We need this check because sometimes the first line of the roachprod output is a warning
  2375  	// about adding an IP to the list of known hosts.
  2376  	if strings.Contains(str, "Warning") {
  2377  		str = strings.Split(str, "\n")[1]
  2378  	}
  2379  
  2380  	size, err := strconv.Atoi(strings.TrimSpace(str))
  2381  	if err != nil {
  2382  		return 0, err
  2383  	}
  2384  
  2385  	return size * 1024, nil
  2386  }
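
        // A usage sketch, assuming ctx, t, and c are in scope:
        //
        //	used, err := getDiskUsageInBytes(ctx, c, t.l, 1)
        //	if err != nil {
        //		t.Fatal(err)
        //	}
        //	t.l.Printf("node 1 store uses %d bytes", used)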
  2387  
  2388  type monitor struct {
  2389  	t         testI
  2390  	l         *logger
  2391  	nodes     string
  2392  	ctx       context.Context
  2393  	cancel    func()
  2394  	g         *errgroup.Group
  2395  	expDeaths int32 // atomically
  2396  }
  2397  
  2398  func newMonitor(ctx context.Context, c *cluster, opts ...option) *monitor {
  2399  	m := &monitor{
  2400  		t:     c.t,
  2401  		l:     c.l,
  2402  		nodes: c.makeNodes(opts...),
  2403  	}
  2404  	m.ctx, m.cancel = context.WithCancel(ctx)
  2405  	m.g, m.ctx = errgroup.WithContext(m.ctx)
  2406  	return m
  2407  }
  2408  
  2409  // ExpectDeath lets the monitor know that a node is about to be killed, and that
  2410  // this should be ignored.
  2411  func (m *monitor) ExpectDeath() {
  2412  	m.ExpectDeaths(1)
  2413  }
  2414  
  2415  // ExpectDeaths lets the monitor know that a specific number of nodes are about
  2416  // to be killed, and that they should be ignored.
  2417  func (m *monitor) ExpectDeaths(count int32) {
  2418  	atomic.AddInt32(&m.expDeaths, count)
  2419  }
  2420  
  2421  func (m *monitor) ResetDeaths() {
  2422  	atomic.StoreInt32(&m.expDeaths, 0)
  2423  }
  2424  
  2425  var errGoexit = errors.New("Goexit() was called")
  2426  
  2427  func (m *monitor) Go(fn func(context.Context) error) {
  2428  	m.g.Go(func() (err error) {
  2429  		var returned bool
  2430  		defer func() {
  2431  			if returned {
  2432  				return
  2433  			}
  2434  			if r := recover(); r != errGoexit && r != nil {
  2435  				// Pass any regular panics through.
  2436  				panic(r)
  2437  			} else {
  2438  				// If the invoked method called runtime.Goexit (as happens
  2439  				// when it calls t.Fatal), exit with a sentinel error here so
  2440  				// that the wrapped errgroup cancels itself.
  2441  				//
  2442  				// The trick is that the deferred function below panics
  2443  				// explicitly, which "overrides" the Goexit (normally
  2444  				// unrecoverable); recovering that panic here lets us return an error.
  2445  				err = errGoexit
  2446  			}
  2447  		}()
  2448  		if impl, ok := m.t.(*test); ok {
  2449  			// Automatically clear the worker status message when the goroutine exits.
  2450  			defer impl.WorkerStatus()
  2451  		}
  2452  		defer func() {
  2453  			if !returned {
  2454  				if r := recover(); r != nil {
  2455  					panic(r)
  2456  				}
  2457  				panic(errGoexit)
  2458  			}
  2459  		}()
  2460  		err = fn(m.ctx)
  2461  		returned = true
  2462  		return err
  2463  	})
  2464  }
  2465  
  2466  func (m *monitor) WaitE() error {
  2467  	if m.t.Failed() {
  2468  		// If the test has failed, don't try to limp along.
  2469  		return errors.New("already failed")
  2470  	}
  2471  
  2472  	return errors.Wrap(m.wait(roachprod, "monitor", m.nodes), "monitor failure")
  2473  }
  2474  
  2475  func (m *monitor) Wait() {
  2476  	if m.t.Failed() {
  2477  		// If the test has failed, don't try to limp along.
  2478  		return
  2479  	}
  2480  	if err := m.WaitE(); err != nil {
  2481  		// Note that we used to avoid fataling again if we had already fatal'ed.
  2482  		// However, this error here might be the one to actually report, see:
  2483  		// https://github.com/cockroachdb/cockroach/issues/44436
  2484  		m.t.Fatal(err)
  2485  	}
  2486  }
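
        // A sketch of the typical monitor pattern, assuming ctx, t, and c are in
        // scope; the restarted node and workload command are illustrative:
        //
        //	m := newMonitor(ctx, c, c.Range(1, 3))
        //	m.Go(func(ctx context.Context) error {
        //		return c.RunE(ctx, c.Node(4), "./workload", "run", "kv", "--duration=10m")
        //	})
        //	m.ExpectDeath()
        //	c.Stop(ctx, c.Node(1))
        //	c.Start(ctx, t, c.Node(1))
        //	m.Wait()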
  2487  
  2488  func (m *monitor) wait(args ...string) error {
  2489  	// It is surprisingly difficult to get the cancellation semantics exactly
  2490  	// right. We need to watch for the "workers" group (m.g) to finish, or for
  2491  	// the monitor command to emit an unexpected node failure, or for the monitor
  2492  	// command itself to exit. We want to capture whichever error happens first
  2493  	// and then cancel the other goroutines. This ordering prevents the usage of
  2494  	// an errgroup.Group for the goroutines below. Consider:
  2495  	//
  2496  	//   g, _ := errgroup.WithContext(m.ctx)
  2497  	//   g.Go(func(context.Context) error {
  2498  	//     defer m.cancel()
  2499  	//     return m.g.Wait()
  2500  	//   })
  2501  	//
  2502  	// Now consider what happens when an error is returned. Before the error
  2503  	// reaches the errgroup, we invoke the cancellation closure which can cause
  2504  	// the other goroutines to wake up and perhaps race and set the errgroup
  2505  	// error first.
  2506  	//
  2507  	// The solution is to implement our own errgroup mechanism here which allows
  2508  	// us to set the error before performing the cancellation.
  2509  
  2510  	var errOnce sync.Once
  2511  	var err error
  2512  	setErr := func(e error) {
  2513  		if e != nil {
  2514  			errOnce.Do(func() {
  2515  				err = e
  2516  			})
  2517  		}
  2518  	}
  2519  
  2520  	// 1. The first goroutine waits for the worker errgroup to exit.
  2521  	var wg sync.WaitGroup
  2522  	wg.Add(1)
  2523  	go func() {
  2524  		defer func() {
  2525  			m.cancel()
  2526  			wg.Done()
  2527  		}()
  2528  		setErr(errors.Wrap(m.g.Wait(), "monitor task failed"))
  2529  	}()
  2530  
  2531  	setMonitorCmdErr := func(err error) {
  2532  		setErr(errors.Wrap(err, "monitor command failure"))
  2533  	}
  2534  
  2535  	// 2. The second goroutine forks/execs the monitoring command.
  2536  	pipeR, pipeW := io.Pipe()
  2537  	wg.Add(1)
  2538  	go func() {
  2539  		defer func() {
  2540  			_ = pipeW.Close()
  2541  			wg.Done()
  2542  			// NB: we explicitly do not want to call m.cancel() here as we want the
  2543  			// goroutine that is reading the monitoring events to be able to decide
  2544  			// on the error if the monitoring command exits peacefully.
  2545  		}()
  2546  
  2547  		monL, err := m.l.ChildLogger(`MONITOR`)
  2548  		if err != nil {
  2549  			setMonitorCmdErr(err)
  2550  			return
  2551  		}
  2552  		defer monL.close()
  2553  
  2554  		cmd := exec.CommandContext(m.ctx, args[0], args[1:]...)
  2555  		cmd.Stdout = io.MultiWriter(pipeW, monL.stdout)
  2556  		cmd.Stderr = monL.stderr
  2557  		if err := cmd.Run(); err != nil {
  2558  			if !errors.Is(err, context.Canceled) && !strings.Contains(err.Error(), "killed") {
  2559  				// The expected reason for an error is that the monitor was killed due
  2560  				// to the context being canceled. Any other error is an actual error.
  2561  				setMonitorCmdErr(err)
  2562  				return
  2563  			}
  2564  		}
  2565  		// Returning will cause the pipe to be closed which will cause the reader
  2566  		// goroutine to exit and close the monitoring channel.
  2567  	}()
  2568  
  2569  	// 3. The third goroutine reads from the monitoring pipe, watching for any
  2570  	// unexpected death events.
  2571  	wg.Add(1)
  2572  	go func() {
  2573  		defer func() {
  2574  			_ = pipeR.Close()
  2575  			m.cancel()
  2576  			wg.Done()
  2577  		}()
  2578  
  2579  		scanner := bufio.NewScanner(pipeR)
  2580  		for scanner.Scan() {
  2581  			msg := scanner.Text()
  2582  			var id int
  2583  			var s string
  2584  			if n, _ := fmt.Sscanf(msg, "%d: %s", &id, &s); n == 2 {
  2585  				if strings.Contains(s, "dead") && atomic.AddInt32(&m.expDeaths, -1) < 0 {
  2586  					setErr(fmt.Errorf("unexpected node event: %s", msg))
  2587  					return
  2588  				}
  2589  			}
  2590  		}
  2591  	}()
  2592  
  2593  	wg.Wait()
  2594  	return err
  2595  }
  2596  
  2597  func waitForFullReplication(t *test, db *gosql.DB) {
  2598  	t.l.Printf("waiting for up-replication...\n")
  2599  	tStart := timeutil.Now()
  2600  	for ok := false; !ok; time.Sleep(time.Second) {
  2601  		if err := db.QueryRow(
  2602  			"SELECT min(array_length(replicas, 1)) >= 3 FROM crdb_internal.ranges",
  2603  		).Scan(&ok); err != nil {
  2604  			t.Fatal(err)
  2605  		}
  2606  		if timeutil.Since(tStart) > 30*time.Second {
  2607  			t.l.Printf("still waiting for full replication")
  2608  		}
  2609  	}
  2610  }
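
        // A usage sketch, assuming ctx, t, and c are in scope:
        //
        //	db := c.Conn(ctx, 1)
        //	defer db.Close()
        //	waitForFullReplication(t, db)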
  2611  
  2612  type loadGroup struct {
  2613  	roachNodes nodeListOption
  2614  	loadNodes  nodeListOption
  2615  }
  2616  
  2617  type loadGroupList []loadGroup
  2618  
  2619  func (lg loadGroupList) roachNodes() nodeListOption {
  2620  	var roachNodes nodeListOption
  2621  	for _, g := range lg {
  2622  		roachNodes = roachNodes.merge(g.roachNodes)
  2623  	}
  2624  	return roachNodes
  2625  }
  2626  
  2627  func (lg loadGroupList) loadNodes() nodeListOption {
  2628  	var loadNodes nodeListOption
  2629  	for _, g := range lg {
  2630  		loadNodes = loadNodes.merge(g.loadNodes)
  2631  	}
  2632  	return loadNodes
  2633  }
  2634  
  2635  // makeLoadGroups creates a loadGroupList that has an equal number of cockroach
  2636  // nodes per zone. It assumes that numLoadNodes <= numZones and that numZones is
  2637  // divisible by numLoadNodes.
  2638  func makeLoadGroups(c *cluster, numZones, numRoachNodes, numLoadNodes int) loadGroupList {
  2639  	if numLoadNodes > numZones {
  2640  		panic("cannot have more than one load node per zone")
  2641  	} else if numZones%numLoadNodes != 0 {
  2642  		panic("numZones must be divisible by numLoadNodes")
  2643  	}
  2644  	// roachprod allocates nodes over regions in a round-robin fashion.
  2645  	// If the number of nodes is not divisible by the number of regions, the
  2646  	// extra nodes are allocated in a round-robin fashion over the regions at
  2647  // the end of the cluster.
  2648  	loadNodesAtTheEnd := numLoadNodes%numZones != 0
  2649  	loadGroups := make(loadGroupList, numLoadNodes)
  2650  	roachNodesPerGroup := numRoachNodes / numLoadNodes
  2651  	for i := range loadGroups {
  2652  		if loadNodesAtTheEnd {
  2653  			first := i*roachNodesPerGroup + 1
  2654  			loadGroups[i].roachNodes = c.Range(first, first+roachNodesPerGroup-1)
  2655  			loadGroups[i].loadNodes = c.Node(numRoachNodes + i + 1)
  2656  		} else {
  2657  			first := i*(roachNodesPerGroup+1) + 1
  2658  			loadGroups[i].roachNodes = c.Range(first, first+roachNodesPerGroup-1)
  2659  			loadGroups[i].loadNodes = c.Node((i + 1) * (roachNodesPerGroup + 1))
  2660  		}
  2661  	}
  2662  	return loadGroups
  2663  }
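
        // As a worked example, makeLoadGroups(c, 3 /* zones */, 9 /* roach nodes */,
        // 3 /* load nodes */) on a 12-node cluster yields three groups, each pairing
        // three cockroach nodes with one load node: nodes 1-3 with node 4, nodes 5-7
        // with node 8, and nodes 9-11 with node 12.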