github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachprod/install/cluster_synced.go

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package install
    12  
    13  import (
    14  	"bufio"
    15  	"bytes"
    16  	"context"
    17  	"fmt"
    18  	"io"
    19  	"io/ioutil"
    20  	"log"
    21  	"math"
    22  	"os"
    23  	"os/exec"
    24  	"os/signal"
    25  	"path/filepath"
    26  	"sort"
    27  	"strings"
    28  	"sync"
    29  	"syscall"
    30  	"text/template"
    31  	"time"
    32  
    33  	"github.com/cockroachdb/cockroach/pkg/cmd/roachprod/config"
    34  	rperrors "github.com/cockroachdb/cockroach/pkg/cmd/roachprod/errors"
    35  	"github.com/cockroachdb/cockroach/pkg/cmd/roachprod/ssh"
    36  	"github.com/cockroachdb/cockroach/pkg/cmd/roachprod/ui"
    37  	clog "github.com/cockroachdb/cockroach/pkg/util/log"
    38  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    39  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    40  	"github.com/cockroachdb/errors"
    41  	crdberrors "github.com/cockroachdb/errors"
    42  	"golang.org/x/sync/errgroup"
    43  )
    44  
    45  // ClusterImpl abstracts the software running on the cluster: per-node directories, ports, URLs, and process start-up.
    46  type ClusterImpl interface {
    47  	Start(c *SyncedCluster, extraArgs []string)
    48  	CertsDir(c *SyncedCluster, index int) string
    49  	NodeDir(c *SyncedCluster, index int) string
    50  	LogDir(c *SyncedCluster, index int) string
    51  	NodeURL(c *SyncedCluster, host string, port int) string
    52  	NodePort(c *SyncedCluster, index int) int
    53  	NodeUIPort(c *SyncedCluster, index int) int
    54  }
    55  
    56  // A SyncedCluster is created from the information in the synced hosts file
    57  // and is used as the target for installing and managing various software
    58  // components.
    59  //
    60  // TODO(benesch): unify with CloudCluster.
    61  type SyncedCluster struct {
    62  	// name, vms, users, localities are populated at init time.
    63  	Name       string
    64  	VMs        []string
    65  	Users      []string
    66  	Localities []string
    67  	VPCs       []string
    68  	// all other fields are populated in newCluster.
    69  	Nodes          []int
    70  	Secure         bool
    71  	Env            string
    72  	Args           []string
    73  	Tag            string
    74  	Impl           ClusterImpl
    75  	UseTreeDist    bool
    76  	Quiet          bool
    77  	MaxConcurrency int // used in Parallel
    78  	// AuthorizedKeys is used by SetupSSH to add additional authorized keys.
    79  	AuthorizedKeys []byte
    80  
    81  	// Used to stash debug information.
    82  	DebugDir string
    83  }
    84  
    85  // CmdKind is the kind of command passed to SyncedCluster.Run().
    86  type CmdKind int
    87  
    88  // The kinds of commands passed to SyncedCluster.Run().
    89  const (
    90  	// A cockroach command is passed.
    91  	CockroachCmd CmdKind = iota
    92  
    93  	// An unclassified command is passed.
    94  	OtherCmd
    95  )
    96  
    97  func (ck CmdKind) classifyError(err error) rperrors.Error {
    98  	if ck == CockroachCmd {
    99  		return rperrors.ClassifyCockroachError(err)
   100  	}
   101  
   102  	return rperrors.ClassifyCmdError(err)
   103  }
   104  
   105  func (c *SyncedCluster) host(index int) string {
   106  	return c.VMs[index-1]
   107  }
   108  
   109  func (c *SyncedCluster) user(index int) string {
   110  	return c.Users[index-1]
   111  }
   112  
   113  func (c *SyncedCluster) locality(index int) string {
   114  	return c.Localities[index-1]
   115  }
   116  
   117  // IsLocal returns true if this is the special local cluster (config.Local).
   118  //
   119  // TODO(tschottdorf): roachprod should cleanly encapsulate the home directory
   120  // which is currently the biggest culprit for awkward one-offs.
   121  func (c *SyncedCluster) IsLocal() bool {
   122  	return c.Name == config.Local
   123  }
   124  
   125  // ServerNodes returns a copy of the list of nodes in the cluster.
   126  func (c *SyncedCluster) ServerNodes() []int {
   127  	return append([]int{}, c.Nodes...)
   128  }
   129  
   130  // GetInternalIP returns the internal IP address of the specified node.
   131  func (c *SyncedCluster) GetInternalIP(index int) (string, error) {
   132  	if c.IsLocal() {
   133  		return c.host(index), nil
   134  	}
   135  
   136  	session, err := c.newSession(index)
   137  	if err != nil {
   138  		return "", errors.Wrapf(err, "GetInternalIP: failed dial %s:%d", c.Name, index)
   139  	}
   140  	defer session.Close()
   141  
   142  	var stdout, stderr strings.Builder
   143  	session.SetStdout(&stdout)
   144  	session.SetStderr(&stderr)
   145  	cmd := `hostname --all-ip-addresses`
   146  	if err := session.Run(cmd); err != nil {
   147  		return "", errors.Wrapf(err,
   148  			"GetInternalIP: failed to execute hostname on %s:%d:\n(stdout) %s\n(stderr) %s",
   149  			c.Name, index, stdout.String(), stderr.String())
   150  	}
   151  	ip := strings.TrimSpace(stdout.String())
   152  	if ip == "" {
   153  		return "", errors.Errorf(
   154  			"empty internal IP returned, stdout:\n%s\nstderr:\n%s",
   155  			stdout.String(), stderr.String(),
   156  		)
   157  	}
   158  	return ip, nil
   159  }
   160  
   161  // Start starts the cluster nodes via the underlying ClusterImpl, passing c.Args as extra arguments.
   162  func (c *SyncedCluster) Start() {
   163  	c.Impl.Start(c, c.Args)
   164  }
   165  
   166  func (c *SyncedCluster) newSession(i int) (session, error) {
   167  	if c.IsLocal() {
   168  		return newLocalSession(), nil
   169  	}
   170  	return newRemoteSession(c.user(i), c.host(i), c.DebugDir)
   171  }
   172  
   173  // Stop sends the given signal to the roachprod-managed processes on every node, optionally waiting until they have exited.
   174  func (c *SyncedCluster) Stop(sig int, wait bool) {
   175  	display := fmt.Sprintf("%s: stopping", c.Name)
   176  	if wait {
   177  		display += " and waiting"
   178  	}
   179  	c.Parallel(display, len(c.Nodes), 0, func(i int) ([]byte, error) {
   180  		sess, err := c.newSession(c.Nodes[i])
   181  		if err != nil {
   182  			return nil, err
   183  		}
   184  		defer sess.Close()
   185  
   186  		var waitCmd string
   187  		if wait {
   188  			waitCmd = fmt.Sprintf(`
   189    for pid in ${pids}; do
   190      echo "${pid}: checking" >> %[1]s/roachprod.log
   191      while kill -0 ${pid}; do
   192        kill -0 ${pid} >> %[1]s/roachprod.log 2>&1
   193        echo "${pid}: still alive [$?]" >> %[1]s/roachprod.log
   194        ps axeww -o pid -o command >> %[1]s/roachprod.log
   195        sleep 1
   196      done
   197      echo "${pid}: dead" >> %[1]s/roachprod.log
   198    done
   199  `, c.Impl.LogDir(c, c.Nodes[i]))
   200  		}
   201  
   202  		// NB: the awkward-looking `awk` invocation serves to avoid having the
   203  		// awk process match its own output from `ps`.
   204  		cmd := fmt.Sprintf(`
   205  mkdir -p logs
   206  echo ">>> roachprod stop: $(date)" >> %[1]s/roachprod.log
   207  ps axeww -o pid -o command >> %[1]s/roachprod.log
   208  pids=$(ps axeww -o pid -o command | \
   209    sed 's/export ROACHPROD=//g' | \
   210    awk '/ROACHPROD=(%[2]d%[3]s)[ \/]/ { print $1 }')
   211  if [ -n "${pids}" ]; then
   212    kill -%[4]d ${pids}
   213  %[5]s
   214  fi
   215  `, c.Impl.LogDir(c, c.Nodes[i]), c.Nodes[i], c.escapedTag(), sig, waitCmd)
   216  		return sess.CombinedOutput(cmd)
   217  	})
   218  }
   219  
   220  // Wipe stops the cluster and removes data, logs and (unless preserveCerts is true) certificates from every node.
   221  func (c *SyncedCluster) Wipe(preserveCerts bool) {
   222  	display := fmt.Sprintf("%s: wiping", c.Name)
   223  	c.Stop(9, true /* wait */)
   224  	c.Parallel(display, len(c.Nodes), 0, func(i int) ([]byte, error) {
   225  		sess, err := c.newSession(c.Nodes[i])
   226  		if err != nil {
   227  			return nil, err
   228  		}
   229  		defer sess.Close()
   230  
   231  		var cmd string
   232  		if c.IsLocal() {
   233  			// Not all shells like brace expansion, so we'll do it here
   234  			dirs := []string{"data", "logs"}
   235  			if !preserveCerts {
   236  				dirs = append(dirs, "certs*")
   237  			}
   238  			for _, dir := range dirs {
   239  				cmd += fmt.Sprintf(`rm -fr ${HOME}/local/%d/%s ;`, c.Nodes[i], dir)
   240  			}
   241  		} else {
   242  			cmd = `sudo find /mnt/data* -maxdepth 1 -type f -exec rm -f {} \; &&
   243  sudo rm -fr /mnt/data*/{auxiliary,local,tmp,cassandra,cockroach,cockroach-temp*,mongo-data} &&
   244  sudo rm -fr logs &&
   245  `
   246  			if !preserveCerts {
   247  				cmd += "sudo rm -fr certs* ;\n"
   248  			}
   249  		}
   250  		return sess.CombinedOutput(cmd)
   251  	})
   252  }
   253  
   254  // Status prints, for each node, the running roachprod-managed processes (annotated with the cockroach version) or "not running".
   255  func (c *SyncedCluster) Status() {
   256  	display := fmt.Sprintf("%s: status", c.Name)
   257  	results := make([]string, len(c.Nodes))
   258  	c.Parallel(display, len(c.Nodes), 0, func(i int) ([]byte, error) {
   259  		sess, err := c.newSession(c.Nodes[i])
   260  		if err != nil {
   261  			results[i] = err.Error()
   262  			return nil, nil
   263  		}
   264  		defer sess.Close()
   265  
   266  		binary := cockroachNodeBinary(c, c.Nodes[i])
   267  		cmd := fmt.Sprintf(`out=$(ps axeww -o pid -o ucomm -o command | \
   268    sed 's/export ROACHPROD=//g' | \
   269    awk '/ROACHPROD=(%d%s)[ \/]/ {print $2, $1}'`,
   270  			c.Nodes[i], c.escapedTag())
   271  		cmd += ` | sort | uniq);
   272  vers=$(` + binary + ` version 2>/dev/null | awk '/Build Tag:/ {print $NF}')
   273  if [ -n "${out}" -a -n "${vers}" ]; then
   274    echo ${out} | sed "s/cockroach/cockroach-${vers}/g"
   275  else
   276    echo ${out}
   277  fi
   278  `
   279  		out, err := sess.CombinedOutput(cmd)
   280  		var msg string
   281  		if err != nil {
   282  			return nil, errors.Wrapf(err, "~ %s\n%s", cmd, out)
   283  		}
   284  		msg = strings.TrimSpace(string(out))
   285  		if msg == "" {
   286  			msg = "not running"
   287  		}
   288  		results[i] = msg
   289  		return nil, nil
   290  	})
   291  
   292  	for i, r := range results {
   293  		fmt.Printf("  %2d: %s\n", c.Nodes[i], r)
   294  	}
   295  }
   296  
   297  // NodeMonitorInfo is a message describing a cockroach process' status.
   298  type NodeMonitorInfo struct {
   299  	// The index of the node (in a SyncedCluster) at which the message originated.
   300  	Index int
   301  	// A message about the node. This is either a PID, "dead", "kill exited
   302  	// nonzero", or "skipped".
   303  	// Anything but a PID or "skipped" is an indication that there is some
   304  	// problem with the node and that the process is not running.
   305  	Msg string
   306  	// Err is an error that may occur when trying to probe the status of the node.
   307  	// If Err is non-nil, Msg is empty. After an error is returned, the node with
   308  	// the given index will no longer be probed. Errors typically indicate networking
   309  	// issues or nodes that have (physically) shut down.
   310  	Err error
   311  }
   312  
   313  // Monitor writes NodeMonitorInfo for the cluster nodes to the returned channel.
   314  // Infos sent to the channel always have the Index and exactly one of Msg or Err
   315  // set.
   316  //
   317  // If oneShot is true, infos are retrieved only once for each node and the
   318  // channel is subsequently closed; otherwise the process continues indefinitely
   319  // (emitting new information as the status of the cockroach process changes).
   320  //
   321  // If ignoreEmptyNodes is true, nodes on which no CockroachDB data is found
   322  // (in {store-dir}) will not be probed and a single message, "skipped", will
   323  // be emitted for them.
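        //
        // Example (sketch of how a caller might consume the channel; the printing
        // is illustrative only):
        //
        //	for info := range c.Monitor(false /* ignoreEmptyNodes */, true /* oneShot */) {
        //		if info.Err != nil {
        //			fmt.Printf("%d: error: %v\n", info.Index, info.Err)
        //			continue
        //		}
        //		fmt.Printf("%d: %s\n", info.Index, info.Msg)
        //	}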
   324  func (c *SyncedCluster) Monitor(ignoreEmptyNodes bool, oneShot bool) chan NodeMonitorInfo {
   325  	ch := make(chan NodeMonitorInfo)
   326  	nodes := c.ServerNodes()
   327  	var wg sync.WaitGroup
   328  
   329  	for i := range nodes {
   330  		wg.Add(1)
   331  		go func(i int) {
   332  			defer wg.Done()
   333  			sess, err := c.newSession(nodes[i])
   334  			if err != nil {
   335  				ch <- NodeMonitorInfo{Index: nodes[i], Err: err}
   337  				return
   338  			}
   339  			defer sess.Close()
   340  
   341  			p, err := sess.StdoutPipe()
   342  			if err != nil {
   343  				ch <- NodeMonitorInfo{Index: nodes[i], Err: err}
   345  				return
   346  			}
   347  
   348  			// On each monitored node, we loop looking for a cockroach process. In
   349  			// order to avoid polling with lsof, once we find a live process we
   350  			// repeatedly probe it with `kill -0` (sleeping a second between
   351  			// checks) until it exits.
   352  			// In one-shot mode we skip the waiting and return after the first
   353  			// assessment of the process' health.
   354  			data := struct {
   355  				OneShot     bool
   356  				IgnoreEmpty bool
   357  				Store       string
   358  				Port        int
   359  			}{
   360  				OneShot:     oneShot,
   361  				IgnoreEmpty: ignoreEmptyNodes,
   362  				Store:       Cockroach{}.NodeDir(c, nodes[i]),
   363  				Port:        Cockroach{}.NodePort(c, nodes[i]),
   364  			}
   365  
   366  			snippet := `
   367  lastpid=0
   368  {{ if .IgnoreEmpty}}
   369  if [ ! -f "{{.Store}}/CURRENT" ]; then
   370    echo "skipped"
   371    exit 0
   372  fi
   373  {{- end}}
   374  while :; do
   375    pid=$(lsof -i :{{.Port}} -sTCP:LISTEN | awk '!/COMMAND/ {print $2}')
   376    if [ "${pid}" != "${lastpid}" ]; then
   377      if [ -n "${lastpid}" -a -z "${pid}" ]; then
   378        echo dead
   379      fi
   380      lastpid=${pid}
   381      if [ -n "${pid}" ]; then
   382        echo ${pid}
   383      fi
   384    fi
   385  {{if .OneShot }}
   386    exit 0
   387  {{- end}}
   388    if [ -n "${lastpid}" ]; then
   389      while kill -0 "${lastpid}"; do
   390        sleep 1
   391      done
   392      echo "kill exited nonzero"
   393    else
   394      sleep 1
   395    fi
   396  done
   397  `
   398  
   399  			t := template.Must(template.New("script").Parse(snippet))
   400  			var buf bytes.Buffer
   401  			if err := t.Execute(&buf, data); err != nil {
   402  				ch <- NodeMonitorInfo{Index: nodes[i], Err: err}
   403  				return
   404  			}
   405  
   406  			// Request a PTY so that the script will receive a SIGPIPE when the
   407  			// session is closed.
   408  			if err := sess.RequestPty(); err != nil {
   409  				ch <- NodeMonitorInfo{Index: nodes[i], Err: err}
   410  				return
   411  			}
   412  			// Give the session a valid stdin pipe. When the monitored process
   413  			// exits, the script writes to stdout, which has a side effect of
   414  			// checking whether the stdout pipe has broken. This allows us to
   415  			// detect when the roachprod process is killed.
   416  			inPipe, err := sess.StdinPipe()
   417  			if err != nil {
   418  				ch <- NodeMonitorInfo{Index: nodes[i], Err: err}
   419  				return
   420  			}
   421  			defer inPipe.Close()
   422  
   423  			var readerWg sync.WaitGroup
   424  			readerWg.Add(1)
   425  			go func(p io.Reader) {
   426  				defer readerWg.Done()
   427  				r := bufio.NewReader(p)
   428  				for {
   429  					line, _, err := r.ReadLine()
   430  					if err == io.EOF {
   431  						return
   432  					}
   433  					ch <- NodeMonitorInfo{Index: nodes[i], Msg: string(line)}
   434  				}
   435  			}(p)
   436  
   437  			if err := sess.Start(buf.String()); err != nil {
   438  				ch <- NodeMonitorInfo{Index: nodes[i], Err: err}
   439  				return
   440  			}
   441  
   442  			readerWg.Wait()
   443  			// We must call `sess.Wait()` only after finishing reading from the stdout
   444  			// pipe. Otherwise it can be closed under us, causing the reader to loop
   445  			// infinitely receiving a non-`io.EOF` error.
   446  			if err := sess.Wait(); err != nil {
   447  				ch <- NodeMonitorInfo{Index: nodes[i], Err: err}
   448  				return
   449  			}
   450  		}(i)
   451  	}
   452  	go func() {
   453  		wg.Wait()
   454  		close(ch)
   455  	}()
   456  
   457  	return ch
   458  }
   459  
   460  // Run a command on >= 1 node in the cluster.
   461  //
   462  // When running on just one node, the command output is streamed to stdout.
   463  // When running on multiple nodes, the commands run in parallel, their output
   464  // is cached and then emitted all together once all commands are completed.
   465  //
   466  // stdout: Where stdout messages are written
   467  // stderr: Where stderr messages are written
   468  // nodes: The cluster nodes where the command will be run.
   469  // cmdKind: The type of command being run; this allows refined error reporting.
   470  // title: A description of the command being run that is output to the logs.
   471  // cmd: The command to run.
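        //
        // Example (sketch; assumes the cluster contains nodes 1-3):
        //
        //	err := c.Run(os.Stdout, os.Stderr, []int{1, 2, 3}, OtherCmd,
        //		"checking uptime", "uptime")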
   472  func (c *SyncedCluster) Run(
   473  	stdout, stderr io.Writer, nodes []int, cmdKind CmdKind, title, cmd string,
   474  ) error {
   475  	// Stream output if we're running the command on only 1 node.
   476  	stream := len(nodes) == 1
   477  	var display string
   478  	if !stream {
   479  		display = fmt.Sprintf("%s: %s", c.Name, title)
   480  	}
   481  
   482  	errors := make([]error, len(nodes))
   483  	results := make([]string, len(nodes))
   484  	c.Parallel(display, len(nodes), 0, func(i int) ([]byte, error) {
   485  		sess, err := c.newSession(nodes[i])
   486  		if err != nil {
   487  			errors[i] = err
   488  			results[i] = err.Error()
   489  			return nil, nil
   490  		}
   491  		defer sess.Close()
   492  
   493  		// Argument template expansion is node specific (e.g. for {store-dir}).
   494  		e := expander{
   495  			node: nodes[i],
   496  		}
   497  		expandedCmd, err := e.expand(c, cmd)
   498  		if err != nil {
   499  			return nil, err
   500  		}
   501  
   502  		// Be careful about changing these command strings. In particular, we need
   503  		// to support running commands in the background on both local and remote
   504  		// nodes. For example:
   505  		//
   506  		//   roachprod run cluster -- "sleep 60 &> /dev/null < /dev/null &"
   507  		//
   508  		// That command should return immediately. And a "roachprod status" should
   509  		// reveal that the sleep command is running on the cluster.
   510  		nodeCmd := fmt.Sprintf(`export ROACHPROD=%d%s GOTRACEBACK=crash && bash -c %s`,
   511  			nodes[i], c.Tag, ssh.Escape1(expandedCmd))
   512  		if c.IsLocal() {
   513  			nodeCmd = fmt.Sprintf("cd ${HOME}/local/%d ; %s", nodes[i], nodeCmd)
   514  		}
   515  
   516  		if stream {
   517  			sess.SetStdout(stdout)
   518  			sess.SetStderr(stderr)
   519  			errors[i] = sess.Run(nodeCmd)
   520  			if errors[i] != nil {
   521  				detailMsg := fmt.Sprintf("Node %d. Command with error:\n```\n%s\n```\n", nodes[i], cmd)
   522  				err = crdberrors.WithDetail(errors[i], detailMsg)
   523  				err = cmdKind.classifyError(err)
   524  				errors[i] = err
   525  			}
   526  			return nil, nil
   527  		}
   528  
   529  		out, err := sess.CombinedOutput(nodeCmd)
   530  		msg := strings.TrimSpace(string(out))
   531  		if err != nil {
   532  			detailMsg := fmt.Sprintf("Node %d. Command with error:\n```\n%s\n```\n", nodes[i], cmd)
   533  			err = crdberrors.WithDetail(err, detailMsg)
   534  			err = cmdKind.classifyError(err)
   535  			errors[i] = err
   536  			msg += fmt.Sprintf("\n%v", err)
   537  		}
   538  		results[i] = msg
   539  		return nil, nil
   540  	})
   541  
   542  	if !stream {
   543  		for i, r := range results {
   544  			fmt.Fprintf(stdout, "  %2d: %s\n", nodes[i], r)
   545  		}
   546  	}
   547  
   548  	return rperrors.SelectPriorityError(errors)
   549  }
   550  
   551  // Wait waits up to 5 minutes for each node to finish booting, as indicated by the presence of /mnt/data1/.roachprod-initialized.
   552  func (c *SyncedCluster) Wait() error {
   553  	display := fmt.Sprintf("%s: waiting for nodes to start", c.Name)
   554  	errs := make([]error, len(c.Nodes))
   555  	c.Parallel(display, len(c.Nodes), 0, func(i int) ([]byte, error) {
   556  		for j := 0; j < 600; j++ {
   557  			sess, err := c.newSession(c.Nodes[i])
   558  			if err != nil {
   559  				time.Sleep(500 * time.Millisecond)
   560  				continue
   561  			}
   562  			defer sess.Close()
   563  
   564  			_, err = sess.CombinedOutput("test -e /mnt/data1/.roachprod-initialized")
   565  			if err != nil {
   566  				time.Sleep(500 * time.Millisecond)
   567  				continue
   568  			}
   569  			return nil, nil
   570  		}
   571  		errs[i] = errors.New("timed out after 5m")
   572  		return nil, nil
   573  	})
   574  
   575  	var foundErr bool
   576  	for i, err := range errs {
   577  		if err != nil {
   578  			fmt.Printf("  %2d: %v\n", c.Nodes[i], err)
   579  			foundErr = true
   580  		}
   581  	}
   582  	if foundErr {
   583  		return errors.New("not all nodes booted successfully")
   584  	}
   585  	return nil
   586  }
   587  
   588  // SetupSSH configures the cluster for use with SSH. This is generally run after
   589  // the cloud.Cluster has been synced, which resets the SSH credentials on the
   590  // machines and sets them up for the current user. This method enables the
   591  // hosts to talk to each other and optionally configures additional keys to be
   592  // added to the hosts via the c.AuthorizedKeys field. It does so in the following
   593  // steps:
   594  //
   595  //   1. Creates an ssh key pair on the first host to be used on all hosts if
   596  //      none exists.
   597  //   2. Distributes the public key, private key, and authorized_keys file from
   598  //      the first host to the others.
   599  //   3. Merges the data in c.AuthorizedKeys with the existing authorized_keys
   600  //      files on all hosts.
   601  //
   602  // This call strives to be idempotent.
   603  func (c *SyncedCluster) SetupSSH() error {
   604  	if c.IsLocal() {
   605  		return nil
   606  	}
   607  
   608  	if len(c.Nodes) == 0 || len(c.Users) == 0 || len(c.VMs) == 0 {
   609  		return fmt.Errorf("%s: invalid cluster: nodes=%d users=%d hosts=%d",
   610  			c.Name, len(c.Nodes), len(c.Users), len(c.VMs))
   611  	}
   612  
   613  	// Generate an ssh key that we'll distribute to all of the nodes in the
   614  	// cluster in order to allow inter-node ssh.
   615  	var sshTar []byte
   616  	c.Parallel("generating ssh key", 1, 0, func(i int) ([]byte, error) {
   617  		sess, err := c.newSession(1)
   618  		if err != nil {
   619  			return nil, err
   620  		}
   621  		defer sess.Close()
   622  
   623  		// Create the ssh key and then tar up the public, private and
   624  		// authorized_keys files and output them to stdout. We'll take this output
   625  		// and pipe it back into tar on the other nodes in the cluster.
   626  		cmd := `
   627  test -f .ssh/id_rsa || \
   628    (ssh-keygen -q -f .ssh/id_rsa -t rsa -N '' && \
   629     cat .ssh/id_rsa.pub >> .ssh/authorized_keys);
   630  tar cf - .ssh/id_rsa .ssh/id_rsa.pub .ssh/authorized_keys
   631  `
   632  
   633  		var stdout bytes.Buffer
   634  		var stderr bytes.Buffer
   635  		sess.SetStdout(&stdout)
   636  		sess.SetStderr(&stderr)
   637  
   638  		if err := sess.Run(cmd); err != nil {
   639  			return nil, errors.Wrapf(err, "%s: stderr:\n%s", cmd, stderr.String())
   640  		}
   641  		sshTar = stdout.Bytes()
   642  		return nil, nil
   643  	})
   644  
   645  	// Skip the first node, which is where we generated the key.
   646  	nodes := c.Nodes[1:]
   647  	c.Parallel("distributing ssh key", len(nodes), 0, func(i int) ([]byte, error) {
   648  		sess, err := c.newSession(nodes[i])
   649  		if err != nil {
   650  			return nil, err
   651  		}
   652  		defer sess.Close()
   653  
   654  		sess.SetStdin(bytes.NewReader(sshTar))
   655  		cmd := `tar xf -`
   656  		if out, err := sess.CombinedOutput(cmd); err != nil {
   657  			return nil, errors.Wrapf(err, "%s: output:\n%s", cmd, out)
   658  		}
   659  		return nil, nil
   660  	})
   661  
   662  	// Populate the known_hosts file with both internal and external IPs of all
   663  	// of the nodes in the cluster. Note that as a side effect, this creates the
   664  	// known hosts file in unhashed format, working around a limitation of jsch
   665  	// (which is used in jepsen tests).
   666  	ips := make([]string, len(c.Nodes), len(c.Nodes)*2)
   667  	c.Parallel("retrieving hosts", len(c.Nodes), 0, func(i int) ([]byte, error) {
   668  		for j := 0; j < 20 && ips[i] == ""; j++ {
   669  			var err error
   670  			ips[i], err = c.GetInternalIP(c.Nodes[i])
   671  			if err != nil {
   672  				return nil, errors.Wrapf(err, "pgurls")
   673  			}
   674  			time.Sleep(time.Second)
   675  		}
   676  		if ips[i] == "" {
   677  			return nil, fmt.Errorf("retrieved empty IP address")
   678  		}
   679  		return nil, nil
   680  	})
   681  	for _, i := range c.Nodes {
   682  		ips = append(ips, c.host(i))
   683  	}
   684  	var knownHostsData []byte
   685  	c.Parallel("scanning hosts", 1, 0, func(i int) ([]byte, error) {
   686  		sess, err := c.newSession(c.Nodes[i])
   687  		if err != nil {
   688  			return nil, err
   689  		}
   690  		defer sess.Close()
   691  
   692  		// ssh-keyscan may return fewer than the desired number of entries if the
   693  		// remote nodes are not responding yet, so we loop until we have a scan that
   694  		// found host keys for all of the IPs. Merge the newly scanned keys with the
   695  		// existing list to make this process idempotent.
   696  		cmd := `
   697  set -e
   698  tmp="$(tempfile -d ~/.ssh -p 'roachprod' )"
   699  on_exit() {
   700      rm -f "${tmp}"
   701  }
   702  trap on_exit EXIT
   703  for i in {1..20}; do
   704    ssh-keyscan -T 60 -t rsa ` + strings.Join(ips, " ") + ` > "${tmp}"
   705    if [[ "$(wc < ${tmp} -l)" -eq "` + fmt.Sprint(len(ips)) + `" ]]; then
   706      [[ -f .ssh/known_hosts ]] && cat .ssh/known_hosts >> "${tmp}"
   707      sort -u < "${tmp}"
   708      exit 0
   709    fi
   710    sleep 1
   711  done
   712  exit 1
   713  `
   714  		var stdout bytes.Buffer
   715  		var stderr bytes.Buffer
   716  		sess.SetStdout(&stdout)
   717  		sess.SetStderr(&stderr)
   718  		if err := sess.Run(cmd); err != nil {
   719  			return nil, errors.Wrapf(err, "%s: stderr:\n%s", cmd, stderr.String())
   720  		}
   721  		knownHostsData = stdout.Bytes()
   722  		return nil, nil
   723  	})
   724  	c.Parallel("distributing known_hosts", len(c.Nodes), 0, func(i int) ([]byte, error) {
   725  		sess, err := c.newSession(c.Nodes[i])
   726  		if err != nil {
   727  			return nil, err
   728  		}
   729  		defer sess.Close()
   730  
   731  		sess.SetStdin(bytes.NewReader(knownHostsData))
   732  		const cmd = `
   733  known_hosts_data="$(cat)"
   734  set -e
   735  tmp="$(tempfile -p 'roachprod' -m 0644 )"
   736  on_exit() {
   737      rm -f "${tmp}"
   738  }
   739  trap on_exit EXIT
   740  echo "${known_hosts_data}" > "${tmp}"
   741  cat "${tmp}" >> ~/.ssh/known_hosts
   742  # If our bootstrapping user is not the shared user, install all of the
   743  # relevant ssh files from the bootstrapping user into the shared user's
   744  # .ssh directory.
   745  if [[ "$(whoami)" != "` + config.SharedUser + `" ]]; then
   746      # Ensure that the shared user has a .ssh directory
   747      sudo -u ` + config.SharedUser +
   748  			` bash -c "mkdir -p ~` + config.SharedUser + `/.ssh"
   749      # This somewhat absurd incantation ensures that we properly shell quote
   750      # filenames so that they both aren't expanded and work even if the filenames
   751      # include spaces.
   752      sudo find ~/.ssh -type f -execdir bash -c 'install \
   753          --owner ` + config.SharedUser + ` \
   754          --group ` + config.SharedUser + ` \
   755          --mode $(stat -c "%a" '"'"'{}'"'"') \
   756          '"'"'{}'"'"' ~` + config.SharedUser + `/.ssh' \;
   757  fi
   758  `
   759  		if out, err := sess.CombinedOutput(cmd); err != nil {
   760  			return nil, errors.Wrapf(err, "%s: output:\n%s", cmd, out)
   761  		}
   762  		return nil, nil
   763  	})
   764  	if len(c.AuthorizedKeys) > 0 {
   765  		// When clusters are created using cloud APIs, they only have a subset of
   766  		// desired keys installed on a subset of users. This code distributes
   767  		// additional authorized_keys to both the current user (your username on
   768  		// gce and the shared user on aws) as well as to the shared user on both
   769  		// platforms.
   770  		c.Parallel("adding additional authorized keys", len(c.Nodes), 0, func(i int) ([]byte, error) {
   771  			sess, err := c.newSession(c.Nodes[i])
   772  			if err != nil {
   773  				return nil, err
   774  			}
   775  			defer sess.Close()
   776  
   777  			sess.SetStdin(bytes.NewReader(c.AuthorizedKeys))
   778  			const cmd = `
   779  keys_data="$(cat)"
   780  set -e
   781  tmp1="$(tempfile -d ~/.ssh -p 'roachprod' )"
   782  tmp2="$(tempfile -d ~/.ssh -p 'roachprod' )"
   783  on_exit() {
   784      rm -f "${tmp1}" "${tmp2}"
   785  }
   786  trap on_exit EXIT
   787  if [[ -f ~/.ssh/authorized_keys ]]; then
   788      cat ~/.ssh/authorized_keys > "${tmp1}"
   789  fi
   790  echo "${keys_data}" >> "${tmp1}"
   791  sort -u < "${tmp1}" > "${tmp2}"
   792  install --mode 0600 "${tmp2}" ~/.ssh/authorized_keys
   793  if [[ "$(whoami)" != "` + config.SharedUser + `" ]]; then
   794      sudo install --mode 0600 \
   795          --owner ` + config.SharedUser + `\
   796          --group ` + config.SharedUser + `\
   797          "${tmp2}" ~` + config.SharedUser + `/.ssh/authorized_keys
   798  fi
   799  `
   800  			if out, err := sess.CombinedOutput(cmd); err != nil {
   801  				return nil, errors.Wrapf(err, "~ %s\n%s", cmd, out)
   802  			}
   803  			return nil, nil
   804  		})
   805  	}
   806  
   807  	return nil
   808  }
   809  
   810  // DistributeCerts will generate and distribute certificates to all of the
   811  // nodes.
   812  func (c *SyncedCluster) DistributeCerts() {
   813  	dir := ""
   814  	if c.IsLocal() {
   815  		dir = `${HOME}/local/1`
   816  	}
   817  
   818  	// Check to see if the certs have already been initialized.
   819  	var existsErr error
   820  	display := fmt.Sprintf("%s: checking certs", c.Name)
   821  	c.Parallel(display, 1, 0, func(i int) ([]byte, error) {
   822  		sess, err := c.newSession(1)
   823  		if err != nil {
   824  			return nil, err
   825  		}
   826  		defer sess.Close()
   827  		_, existsErr = sess.CombinedOutput(`test -e ` + filepath.Join(dir, `certs.tar`))
   828  		return nil, nil
   829  	})
   830  
   831  	if existsErr == nil {
   832  		return
   833  	}
   834  
   835  	// Gather the internal IP addresses for every node in the cluster. Even
   836  	// if a node won't be added to the cluster itself, we still add its IP
   837  	// address to the node cert.
   838  	var msg string
   839  	display = fmt.Sprintf("%s: initializing certs", c.Name)
   840  	nodes := allNodes(len(c.VMs))
   841  	var ips []string
   842  	if !c.IsLocal() {
   843  		ips = make([]string, len(nodes))
   844  		c.Parallel("", len(nodes), 0, func(i int) ([]byte, error) {
   845  			var err error
   846  			ips[i], err = c.GetInternalIP(nodes[i])
   847  			return nil, errors.Wrapf(err, "IPs")
   848  		})
   849  	}
   850  
   851  	// Generate the ca, client and node certificates on the first node.
   852  	c.Parallel(display, 1, 0, func(i int) ([]byte, error) {
   853  		sess, err := c.newSession(1)
   854  		if err != nil {
   855  			return nil, err
   856  		}
   857  		defer sess.Close()
   858  
   859  		var nodeNames []string
   860  		if c.IsLocal() {
   861  			// For local clusters, we only need to add one of the VM IP addresses.
   862  			nodeNames = append(nodeNames, "$(hostname)", c.VMs[0])
   863  		} else {
   864  			// Add both the local and external IP addresses, as well as the
   865  			// hostnames to the node certificate.
   866  			nodeNames = append(nodeNames, ips...)
   867  			nodeNames = append(nodeNames, c.VMs...)
   868  			for i := range c.VMs {
   869  				nodeNames = append(nodeNames, fmt.Sprintf("%s-%04d", c.Name, i+1))
   870  				// On AWS, nodes internally have a DNS name of the form ip-<ip address>,
   871  				// where dots have been replaced with dashes.
   872  				// See https://docs.aws.amazon.com/vpc/latest/userguide/vpc-dns.html#vpc-dns-hostnames
   873  				if strings.Contains(c.Localities[i], "cloud=aws") {
   874  					nodeNames = append(nodeNames, "ip-"+strings.ReplaceAll(ips[i], ".", "-"))
   875  				}
   876  			}
   877  		}
   878  
   879  		var cmd string
   880  		if c.IsLocal() {
   881  			cmd = `cd ${HOME}/local/1 ; `
   882  		}
   883  		cmd += fmt.Sprintf(`
   884  rm -fr certs
   885  mkdir -p certs
   886  %[1]s cert create-ca --certs-dir=certs --ca-key=certs/ca.key
   887  %[1]s cert create-client root --certs-dir=certs --ca-key=certs/ca.key
   888  %[1]s cert create-node localhost %[2]s --certs-dir=certs --ca-key=certs/ca.key
   889  tar cvf certs.tar certs
   890  `, cockroachNodeBinary(c, 1), strings.Join(nodeNames, " "))
   891  		if out, err := sess.CombinedOutput(cmd); err != nil {
   892  			msg = fmt.Sprintf("%s: %v", out, err)
   893  		}
   894  		return nil, nil
   895  	})
   896  
   897  	if msg != "" {
   898  		fmt.Fprintln(os.Stderr, msg)
   899  		os.Exit(1)
   900  	}
   901  
   902  	var tmpfileName string
   903  	if c.IsLocal() {
   904  		tmpfileName = os.ExpandEnv(filepath.Join(dir, "certs.tar"))
   905  	} else {
   906  		// Retrieve the certs.tar that was created on the first node.
   907  		tmpfile, err := ioutil.TempFile("", "certs")
   908  		if err != nil {
   909  			fmt.Fprintln(os.Stderr, err)
   910  			os.Exit(1)
   911  		}
   912  		_ = tmpfile.Close()
   913  		defer func() {
   914  			_ = os.Remove(tmpfile.Name()) // clean up
   915  		}()
   916  
   917  		if err := func() error {
   918  			return c.scp(fmt.Sprintf("%s@%s:certs.tar", c.user(1), c.host(1)), tmpfile.Name())
   919  		}(); err != nil {
   920  			fmt.Fprintln(os.Stderr, err)
   921  			os.Exit(1)
   922  		}
   923  
   924  		tmpfileName = tmpfile.Name()
   925  	}
   926  
   927  	// Read the certs.tar file we just downloaded. We'll be piping it to the
   928  	// other nodes in the cluster.
   929  	certsTar, err := ioutil.ReadFile(tmpfileName)
   930  	if err != nil {
   931  		fmt.Fprintln(os.Stderr, err)
   932  		os.Exit(1)
   933  	}
   934  
   935  	// Skip the first node, which is where we generated the certs.
   936  	display = c.Name + ": distributing certs"
   937  	nodes = nodes[1:]
   938  	c.Parallel(display, len(nodes), 0, func(i int) ([]byte, error) {
   939  		sess, err := c.newSession(nodes[i])
   940  		if err != nil {
   941  			return nil, err
   942  		}
   943  		defer sess.Close()
   944  
   945  		sess.SetStdin(bytes.NewReader(certsTar))
   946  		var cmd string
   947  		if c.IsLocal() {
   948  			cmd = fmt.Sprintf(`cd ${HOME}/local/%d ; `, nodes[i])
   949  		}
   950  		cmd += `tar xf -`
   951  		if out, err := sess.CombinedOutput(cmd); err != nil {
   952  			return nil, errors.Wrapf(err, "~ %s\n%s", cmd, out)
   953  		}
   954  		return nil, nil
   955  	})
   956  }
   957  
   958  const progressDone = "=======================================>"
   959  const progressTodo = "----------------------------------------"
   960  
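        // formatProgress renders a fixed-width progress bar for a fraction p in
        // [0, 1]. For example (illustrative, not byte-exact), formatProgress(0.25)
        // returns roughly "[=========>------------------------------] 25%".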
   961  func formatProgress(p float64) string {
   962  	i := int(math.Ceil(float64(len(progressDone)) * (1 - p)))
   963  	if i > len(progressDone) {
   964  		i = len(progressDone)
   965  	}
   966  	if i < 0 {
   967  		i = 0
   968  	}
   969  	return fmt.Sprintf("[%s%s] %.0f%%", progressDone[i:], progressTodo[:i], 100*p)
   970  }
   971  
   972  // Put copies src to dest on every node, via scp or a tree distribution scheme (or a symlink for local clusters).
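        //
        // Example (sketch; the binary name is illustrative):
        //
        //	c.Put("cockroach-linux-2.6.32-gnu-amd64", "./cockroach")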
   973  func (c *SyncedCluster) Put(src, dest string) {
   974  	// NB: This value was determined with a few experiments. Higher values were
   975  	// not tested.
   976  	const treeDistFanout = 10
   977  
   978  	var detail string
   979  	if !c.IsLocal() {
   980  		if c.UseTreeDist {
   981  			detail = " (dist)"
   982  		} else {
   983  			detail = " (scp)"
   984  		}
   985  	}
   986  	fmt.Printf("%s: putting%s %s %s\n", c.Name, detail, src, dest)
   987  
   988  	type result struct {
   989  		index int
   990  		err   error
   991  	}
   992  
   993  	results := make(chan result, len(c.Nodes))
   994  	lines := make([]string, len(c.Nodes))
   995  	var linesMu syncutil.Mutex
   996  	var wg sync.WaitGroup
   997  	wg.Add(len(c.Nodes))
   998  
   999  	// Each destination for the copy needs a source to copy from. We create a
  1000  	// channel that has capacity for each destination. If we try to add a source
  1001  	// and the channel is full we can simply drop that source as we know we won't
  1002  	// need to use it.
  1003  	sources := make(chan int, len(c.Nodes))
  1004  	pushSource := func(i int) {
  1005  		select {
  1006  		case sources <- i:
  1007  		default:
  1008  		}
  1009  	}
  1010  
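        	// A sketch of how tree distribution plays out (assuming the scp calls
        	// succeed): initially only the local machine (index -1) is a source; once
        	// a node finishes its copy it is pushed back as a source treeDistFanout
        	// times, so later destinations increasingly copy node-to-node instead of
        	// all pulling from the local machine.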
  1011  	if c.UseTreeDist {
  1012  		// In treedist mode, only add the local source initially.
  1013  		pushSource(-1)
  1014  	} else {
  1015  		// In non-treedist mode, add the local source N times (once for each
  1016  		// destination).
  1017  		for range c.Nodes {
  1018  			pushSource(-1)
  1019  		}
  1020  	}
  1021  
  1022  	mkpath := func(i int, dest string) (string, error) {
  1023  		if i == -1 {
  1024  			return src, nil
  1025  		}
  1026  		// Expand the destination to allow, for example, putting directly
  1027  		// into {store-dir}.
  1028  		e := expander{
  1029  			node: c.Nodes[i],
  1030  		}
  1031  		dest, err := e.expand(c, dest)
  1032  		if err != nil {
  1033  			return "", err
  1034  		}
  1035  		return fmt.Sprintf("%s@%s:%s", c.user(c.Nodes[i]), c.host(c.Nodes[i]), dest), nil
  1036  	}
  1037  
  1038  	for i := range c.Nodes {
  1039  		go func(i int, dest string) {
  1040  			defer wg.Done()
  1041  
  1042  			if c.IsLocal() {
  1043  				// Expand the destination to allow, for example, putting directly
  1044  				// into {store-dir}.
  1045  				e := expander{
  1046  					node: c.Nodes[i],
  1047  				}
  1048  				var err error
  1049  				dest, err = e.expand(c, dest)
  1050  				if err != nil {
  1051  					results <- result{i, err}
  1052  					return
  1053  				}
  1054  				if _, err := os.Stat(src); err != nil {
  1055  					results <- result{i, err}
  1056  					return
  1057  				}
  1058  				from, err := filepath.Abs(src)
  1059  				if err != nil {
  1060  					results <- result{i, err}
  1061  					return
  1062  				}
  1063  				// TODO(jlinder): this does not take into account things like
  1064  				// roachprod put local:1 /some/file.txt /some/dir
  1065  				// and will replace 'dir' with the contents of file.txt, instead
  1066  				// of creating /some/dir/file.txt.
  1067  				var to string
  1068  				if filepath.IsAbs(dest) {
  1069  					to = dest
  1070  				} else {
  1071  					to = fmt.Sprintf(os.ExpandEnv("${HOME}/local/%d/%s"), c.Nodes[i], dest)
  1072  				}
  1073  				// Remove the destination if it exists, ignoring errors which we'll
  1074  				// handle via the os.Symlink() call.
  1075  				_ = os.Remove(to)
  1076  				results <- result{i, os.Symlink(from, to)}
  1077  				return
  1078  			}
  1079  
  1080  			// Determine the source to copy from.
  1081  			//
  1082  			// TODO(peter): Take the cluster topology into account. We should
  1083  			// preferentially use a source in the same region and only perform a
  1084  			// single copy between regions. We have the region information and
  1085  			// achieving this approach is likely a generalization of the current
  1086  			// code.
  1087  			srcIndex := <-sources
  1088  			from, err := mkpath(srcIndex, dest)
  1089  			if err != nil {
  1090  				results <- result{i, err}
  1091  				return
  1092  			}
  1093  			// TODO(peter): For remote-to-remote copies, should the destination use
  1094  			// the internal IP address? The external address works, but it might be
  1095  			// slower.
  1096  			to, err := mkpath(i, dest)
  1097  			if err != nil {
  1098  				results <- result{i, err}
  1099  				return
  1100  			}
  1101  
  1102  			err = c.scp(from, to)
  1103  			results <- result{i, err}
  1104  
  1105  			if err != nil {
  1106  				// The copy failed. Re-add the original source.
  1107  				pushSource(srcIndex)
  1108  			} else {
  1109  				// The copy succeeded. Re-add the original source if it is remote.
  1110  				if srcIndex != -1 {
  1111  					pushSource(srcIndex)
  1112  				}
  1113  				// Add fanout number of new sources for the destination.
  1114  				for j := 0; j < treeDistFanout; j++ {
  1115  					pushSource(i)
  1116  				}
  1117  			}
  1118  		}(i, dest)
  1119  	}
  1120  
  1121  	go func() {
  1122  		wg.Wait()
  1123  		close(results)
  1124  	}()
  1125  
  1126  	var writer ui.Writer
  1127  	var ticker *time.Ticker
  1128  	if !c.Quiet {
  1129  		ticker = time.NewTicker(100 * time.Millisecond)
  1130  	} else {
  1131  		ticker = time.NewTicker(1000 * time.Millisecond)
  1132  	}
  1133  	defer ticker.Stop()
  1134  	haveErr := false
  1135  
  1136  	var spinner = []string{"|", "/", "-", "\\"}
  1137  	spinnerIdx := 0
  1138  
  1139  	for done := false; !done; {
  1140  		select {
  1141  		case <-ticker.C:
  1142  			if c.Quiet {
  1143  				fmt.Printf(".")
  1144  			}
  1145  		case r, ok := <-results:
  1146  			done = !ok
  1147  			if ok {
  1148  				linesMu.Lock()
  1149  				if r.err != nil {
  1150  					haveErr = true
  1151  					lines[r.index] = r.err.Error()
  1152  				} else {
  1153  					lines[r.index] = "done"
  1154  				}
  1155  				linesMu.Unlock()
  1156  			}
  1157  		}
  1158  		if !c.Quiet {
  1159  			linesMu.Lock()
  1160  			for i := range lines {
  1161  				fmt.Fprintf(&writer, "  %2d: ", c.Nodes[i])
  1162  				if lines[i] != "" {
  1163  					fmt.Fprintf(&writer, "%s", lines[i])
  1164  				} else {
  1165  					fmt.Fprintf(&writer, "%s", spinner[spinnerIdx%len(spinner)])
  1166  				}
  1167  				fmt.Fprintf(&writer, "\n")
  1168  			}
  1169  			linesMu.Unlock()
  1170  			_ = writer.Flush(os.Stdout)
  1171  			spinnerIdx++
  1172  		}
  1173  	}
  1174  
  1175  	if c.Quiet {
  1176  		fmt.Printf("\n")
  1177  		linesMu.Lock()
  1178  		for i := range lines {
  1179  			fmt.Printf("  %2d: %s\n", c.Nodes[i], lines[i])
  1180  		}
  1181  		linesMu.Unlock()
  1182  	}
  1183  
  1184  	if haveErr {
  1185  		log.Fatalf("put %s failed", src)
  1186  	}
  1187  }
  1188  
  1189  // Logs will sync the logs from c to dest, with each node's logs stored under
  1190  // dest in per-node directories, and stream the merged logs to out.
  1191  // For example, if dest is "tpcc-test.logs" then the logs for each node will be
  1192  // stored like:
  1193  //
  1194  //  tpcc-test.logs/1.logs/...
  1195  //  tpcc-test.logs/2.logs/...
  1196  //  ...
  1197  //
  1198  // Log file syncing uses rsync which attempts to be efficient when deciding
  1199  // which files to update. The logs are merged by calling
  1200  // `cockroach debug merge-logs <dest>/*/*` with the optional flag for filter.
  1201  // The syncing and merging happens in a loop which pauses <interval> between
  1202  // iterations and takes some care with the from/to flags in merge-logs to make
  1203  // new logs appear to be streamed. If <from> is zero streaming begins from now.
  1204  // If to is non-zero, when the stream of logs passes to, the function returns.
  1205  // <user> allows retrieval of logs from a roachprod cluster being run by another
  1206  // user and assumes that the current user used to create c has the ability to
  1207  // sudo into <user>.
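        //
        // Example (sketch; the durations and paths are illustrative):
        //
        //	err := c.Logs("logs", "tpcc-test.logs", "" /* user */, "" /* filter */,
        //		"" /* programFilter */, 10*time.Second, time.Time{} /* from: now */,
        //		time.Time{} /* to: stream until interrupted */, os.Stdout)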
  1208  func (c *SyncedCluster) Logs(
  1209  	src, dest, user, filter, programFilter string,
  1210  	interval time.Duration,
  1211  	from, to time.Time,
  1212  	out io.Writer,
  1213  ) error {
  1214  	rsyncNodeLogs := func(ctx context.Context, idx int) error {
  1215  		base := fmt.Sprintf("%d.logs", c.Nodes[idx-1])
  1216  		local := filepath.Join(dest, base) + "/"
  1217  		sshUser := c.user(c.Nodes[idx-1])
  1218  		rsyncArgs := []string{"-az", "--size-only"}
  1219  		var remote string
  1220  		if c.IsLocal() {
  1221  			// This is a bit of a hack: we guess that the parent of the log dir is
  1222  			// the "home" for the local node and that src is relative to that.
  1223  			localHome := filepath.Dir(c.Impl.LogDir(c, idx))
  1224  			remote = filepath.Join(localHome, src) + "/"
  1225  		} else {
  1226  			logDir := src
  1227  			if !filepath.IsAbs(logDir) && user != "" && user != sshUser {
  1228  				logDir = "~" + user + "/" + logDir
  1229  			}
  1230  			remote = fmt.Sprintf("%s@%s:%s/", c.user(c.Nodes[idx-1]),
  1231  				c.host(c.Nodes[idx-1]), logDir)
  1232  			// Use control master to mitigate SSH connection setup cost.
  1233  			rsyncArgs = append(rsyncArgs, "--rsh", "ssh "+
  1234  				"-o StrictHostKeyChecking=no "+
  1235  				"-o ControlMaster=auto "+
  1236  				"-o ControlPath=~/.ssh/%r@%h:%p "+
  1237  				"-o UserKnownHostsFile=/dev/null "+
  1238  				"-o ControlPersist=2m")
  1239  			// Use rsync-path flag to sudo into user if different from sshUser.
  1240  			if user != "" && user != sshUser {
  1241  				rsyncArgs = append(rsyncArgs, "--rsync-path",
  1242  					fmt.Sprintf("sudo -u %s rsync", user))
  1243  			}
  1244  		}
  1245  		rsyncArgs = append(rsyncArgs, remote, local)
  1246  		cmd := exec.CommandContext(ctx, "rsync", rsyncArgs...)
  1247  		var stderrBuf bytes.Buffer
  1248  		cmd.Stdout = os.Stdout
  1249  		cmd.Stderr = &stderrBuf
  1250  		if err := cmd.Run(); err != nil {
  1251  			if ctx.Err() != nil {
  1252  				return nil
  1253  			}
  1254  			return errors.Errorf("failed to rsync from %v to %v: %v\n%s",
  1255  				src, dest, err, stderrBuf.String())
  1256  		}
  1257  		return nil
  1258  	}
  1259  	rsyncLogs := func(ctx context.Context) error {
  1260  		g, gctx := errgroup.WithContext(ctx)
  1261  		for i := range c.Nodes {
  1262  			idx := c.Nodes[i]
  1263  			g.Go(func() error {
  1264  				return rsyncNodeLogs(gctx, idx)
  1265  			})
  1266  		}
  1267  		return g.Wait()
  1268  	}
  1269  	mergeLogs := func(ctx context.Context, prev, t time.Time) error {
  1270  		cmd := exec.CommandContext(ctx, "cockroach", "debug", "merge-logs",
  1271  			dest+"/*/*",
  1272  			"--from", prev.Format(time.RFC3339),
  1273  			"--to", t.Format(time.RFC3339))
  1274  		if filter != "" {
  1275  			cmd.Args = append(cmd.Args, "--filter", filter)
  1276  		}
  1277  		if programFilter != "" {
  1278  			cmd.Args = append(cmd.Args, "--program-filter", programFilter)
  1279  		}
  1280  		// For local clusters, capture the node ID from the sync path because the
  1281  		// host information is useless.
  1282  		if c.IsLocal() {
  1283  			cmd.Args = append(cmd.Args,
  1284  				"--file-pattern", "^(?:.*/)?(?P<id>[0-9]+).*/"+clog.FileNamePattern+"$",
  1285  				"--prefix", "${id}> ")
  1286  		}
  1287  		cmd.Stdout = out
  1288  		var errBuf bytes.Buffer
  1289  		cmd.Stderr = &errBuf
  1290  		if err := cmd.Run(); err != nil && ctx.Err() == nil {
  1291  			return fmt.Errorf("failed to run cockroach debug merge-logs:%v\n%v",
  1292  				err, errBuf.String())
  1293  		}
  1294  		return nil
  1295  	}
  1296  
  1297  	ctx, cancel := context.WithCancel(context.Background())
  1298  	defer cancel()
  1299  	if err := os.MkdirAll(dest, 0755); err != nil {
  1300  		return errors.Errorf("failed to create destination directory: %v", err)
  1301  	}
  1302  	// Cancel context upon signaling.
  1303  	ch := make(chan os.Signal, 1)
  1304  	signal.Notify(ch, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
  1305  	defer func() { signal.Stop(ch); close(ch) }()
  1306  	go func() { <-ch; cancel() }()
  1307  	// TODO(ajwerner): consider SIGHUP-ing cockroach before the rsync to avoid the delays
  1308  	prev := from
  1309  	if prev.IsZero() {
  1310  		prev = timeutil.Now().Add(-2 * time.Second).Truncate(time.Microsecond)
  1311  	}
  1312  	for to.IsZero() || prev.Before(to) {
  1313  		// Subtract ~1 second to deal with the flush delay in util/log.
  1314  		t := timeutil.Now().Add(-1100 * time.Millisecond).Truncate(time.Microsecond)
  1315  		if err := rsyncLogs(ctx); err != nil {
  1316  			return errors.Errorf("failed to sync logs: %v", err)
  1317  		}
  1318  		if !to.IsZero() && t.After(to) {
  1319  			t = to
  1320  		}
  1321  		if err := mergeLogs(ctx, prev, t); err != nil {
  1322  			return err
  1323  		}
  1324  		prev = t
  1325  		if !to.IsZero() && !prev.Before(to) {
  1326  			return nil
  1327  		}
  1328  		select {
  1329  		case <-time.After(interval):
  1330  		case <-ctx.Done():
  1331  			return nil
  1332  		}
  1333  	}
  1334  	return nil
  1335  }
  1336  
  1337  // Get copies src from each node to dest on the local machine; with multiple nodes, each node's copy goes to a per-node destination derived from dest.
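        //
        // Example (sketch; with multiple nodes the local copies land in
        // "1.remote-logs", "2.remote-logs", ...):
        //
        //	c.Get("logs", "remote-logs")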
  1338  func (c *SyncedCluster) Get(src, dest string) {
  1339  	// TODO(peter): Only get 10 nodes at a time. When a node completes, output a
  1340  	// line indicating that.
  1341  	var detail string
  1342  	if !c.IsLocal() {
  1343  		detail = " (scp)"
  1344  	}
  1345  	fmt.Printf("%s: getting%s %s %s\n", c.Name, detail, src, dest)
  1346  
  1347  	type result struct {
  1348  		index int
  1349  		err   error
  1350  	}
  1351  
  1352  	var writer ui.Writer
  1353  	results := make(chan result, len(c.Nodes))
  1354  	lines := make([]string, len(c.Nodes))
  1355  	var linesMu syncutil.Mutex
  1356  
  1357  	var wg sync.WaitGroup
  1358  	for i := range c.Nodes {
  1359  		wg.Add(1)
  1360  		go func(i int) {
  1361  			defer wg.Done()
  1362  
  1363  			src := src
  1364  			dest := dest
  1365  			if len(c.Nodes) > 1 {
  1366  				base := fmt.Sprintf("%d.%s", c.Nodes[i], filepath.Base(dest))
  1367  				dest = filepath.Join(filepath.Dir(dest), base)
  1368  			}
  1369  
  1370  			progress := func(p float64) {
  1371  				linesMu.Lock()
  1372  				defer linesMu.Unlock()
  1373  				lines[i] = formatProgress(p)
  1374  			}
  1375  
  1376  			if c.IsLocal() {
  1377  				if !filepath.IsAbs(src) {
  1378  					src = filepath.Join(fmt.Sprintf(os.ExpandEnv("${HOME}/local/%d"), c.Nodes[i]), src)
  1379  				}
  1380  
  1381  				var copy func(src, dest string, info os.FileInfo) error
  1382  				copy = func(src, dest string, info os.FileInfo) error {
  1383  					// Make sure the destination file is world readable.
  1384  					// See:
  1385  					// https://github.com/cockroachdb/cockroach/issues/44843
  1386  					mode := info.Mode() | 0444
  1387  					if info.IsDir() {
  1388  						if err := os.MkdirAll(dest, mode); err != nil {
  1389  							return err
  1390  						}
  1391  
  1392  						infos, err := ioutil.ReadDir(src)
  1393  						if err != nil {
  1394  							return err
  1395  						}
  1396  
  1397  						for _, info := range infos {
  1398  							if err := copy(
  1399  								filepath.Join(src, info.Name()),
  1400  								filepath.Join(dest, info.Name()),
  1401  								info,
  1402  							); err != nil {
  1403  								return err
  1404  							}
  1405  						}
  1406  						return nil
  1407  					}
  1408  
  1409  					if !mode.IsRegular() {
  1410  						return nil
  1411  					}
  1412  
  1413  					out, err := os.Create(dest)
  1414  					if err != nil {
  1415  						return err
  1416  					}
  1417  					defer out.Close()
  1418  
  1419  					if err := os.Chmod(out.Name(), mode); err != nil {
  1420  						return err
  1421  					}
  1422  
  1423  					in, err := os.Open(src)
  1424  					if err != nil {
  1425  						return err
  1426  					}
  1427  					defer in.Close()
  1428  
  1429  					p := &ssh.ProgressWriter{
  1430  						Writer:   out,
  1431  						Done:     0,
  1432  						Total:    info.Size(),
  1433  						Progress: progress,
  1434  					}
  1435  					_, err = io.Copy(p, in)
  1436  					return err
  1437  				}
  1438  
  1439  				info, err := os.Stat(src)
  1440  				if err != nil {
  1441  					results <- result{i, err}
  1442  					return
  1443  				}
  1444  				err = copy(src, dest, info)
  1445  				results <- result{i, err}
  1446  				return
  1447  			}
  1448  
  1449  			err := c.scp(fmt.Sprintf("%s@%s:%s", c.user(c.Nodes[i]), c.host(c.Nodes[i]), src), dest)
  1450  			if err == nil {
  1451  				// Make sure all created files and directories are world readable.
  1452  				// The CRDB process intentionally sets a 0007 umask (resulting in
  1453  				// non-world-readable files). This creates annoyances during CI
  1454  				// that we circumvent wholesale by adding o+r back here.
  1455  				// See:
  1456  				//
  1457  				// https://github.com/cockroachdb/cockroach/issues/44843
  1458  				chmod := func(path string, info os.FileInfo, err error) error {
  1459  					if err != nil {
  1460  						return err
  1461  					}
  1462  					const oRead = 0004
  1463  					if mode := info.Mode(); mode&oRead == 0 {
  1464  						if err := os.Chmod(path, mode|oRead); err != nil {
  1465  							return err
  1466  						}
  1467  					}
  1468  					return nil
  1469  				}
  1470  				err = filepath.Walk(dest, chmod)
  1471  			}
  1472  
  1473  			results <- result{i, err}
  1474  		}(i)
  1475  	}
  1476  
  1477  	go func() {
  1478  		wg.Wait()
  1479  		close(results)
  1480  	}()
  1481  
  1482  	var ticker *time.Ticker
  1483  	if !c.Quiet {
  1484  		ticker = time.NewTicker(100 * time.Millisecond)
  1485  	} else {
  1486  		ticker = time.NewTicker(1000 * time.Millisecond)
  1487  	}
  1488  	defer ticker.Stop()
  1489  	haveErr := false
  1490  
  1491  	var spinner = []string{"|", "/", "-", "\\"}
  1492  	spinnerIdx := 0
  1493  
  1494  	for done := false; !done; {
  1495  		select {
  1496  		case <-ticker.C:
  1497  			if c.Quiet {
  1498  				fmt.Printf(".")
  1499  			}
  1500  		case r, ok := <-results:
  1501  			done = !ok
  1502  			if ok {
  1503  				linesMu.Lock()
  1504  				if r.err != nil {
  1505  					haveErr = true
  1506  					lines[r.index] = r.err.Error()
  1507  				} else {
  1508  					lines[r.index] = "done"
  1509  				}
  1510  				linesMu.Unlock()
  1511  			}
  1512  		}
  1513  		if !c.Quiet {
  1514  			linesMu.Lock()
  1515  			for i := range lines {
  1516  				fmt.Fprintf(&writer, "  %2d: ", c.Nodes[i])
  1517  				if lines[i] != "" {
  1518  					fmt.Fprintf(&writer, "%s", lines[i])
  1519  				} else {
  1520  					fmt.Fprintf(&writer, "%s", spinner[spinnerIdx%len(spinner)])
  1521  				}
  1522  				fmt.Fprintf(&writer, "\n")
  1523  			}
  1524  			linesMu.Unlock()
  1525  			_ = writer.Flush(os.Stdout)
  1526  			spinnerIdx++
  1527  		}
  1528  	}
  1529  
  1530  	if c.Quiet {
  1531  		fmt.Printf("\n")
  1532  		linesMu.Lock()
  1533  		for i := range lines {
  1534  			fmt.Printf("  %2d: %s\n", c.Nodes[i], lines[i])
  1535  		}
  1536  		linesMu.Unlock()
  1537  	}
  1538  
  1539  	if haveErr {
  1540  		log.Fatalf("get %s failed", src)
  1541  	}
  1542  }
  1543  
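        // pgurls returns a map from node number to the SQL connection URL for that
        // node, built from each node's internal IP address.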
  1544  func (c *SyncedCluster) pgurls(nodes []int) map[int]string {
  1545  	hosts := c.pghosts(nodes)
  1546  	m := make(map[int]string, len(hosts))
  1547  	for node, host := range hosts {
  1548  		m[node] = c.Impl.NodeURL(c, host, c.Impl.NodePort(c, node))
  1549  	}
  1550  	return m
  1551  }
  1552  
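        // pghosts returns a map from node number to that node's internal IP address.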
  1553  func (c *SyncedCluster) pghosts(nodes []int) map[int]string {
  1554  	ips := make([]string, len(nodes))
  1555  	c.Parallel("", len(nodes), 0, func(i int) ([]byte, error) {
  1556  		var err error
  1557  		ips[i], err = c.GetInternalIP(nodes[i])
  1558  		return nil, errors.Wrapf(err, "pghosts")
  1559  	})
  1560  
  1561  	m := make(map[int]string, len(ips))
  1562  	for i, ip := range ips {
  1563  		m[nodes[i]] = ip
  1564  	}
  1565  	return m
  1566  }
  1567  
  1568  // SSH replaces the current process with an ssh session (or a local shell) on the first node, optionally running the given command there.
  1569  func (c *SyncedCluster) SSH(sshArgs, args []string) error {
  1570  	if len(c.Nodes) != 1 && len(args) == 0 {
  1571  		// If trying to ssh to more than 1 node and the ssh session is interactive,
  1572  		// try sshing with an iTerm2 split screen configuration.
  1573  		sshed, err := maybeSplitScreenSSHITerm2(c)
  1574  		if sshed {
  1575  			return err
  1576  		}
  1577  	}
  1578  
  1579  	// Perform template expansion on the arguments.
  1580  	e := expander{
  1581  		node: c.Nodes[0],
  1582  	}
  1583  	var expandedArgs []string
  1584  	for _, arg := range args {
  1585  		expandedArg, err := e.expand(c, arg)
  1586  		if err != nil {
  1587  			return err
  1588  		}
  1589  		expandedArgs = append(expandedArgs, strings.Split(expandedArg, " ")...)
  1590  	}
  1591  
  1592  	var allArgs []string
  1593  	if c.IsLocal() {
  1594  		allArgs = []string{
  1595  			"/bin/bash", "-c",
  1596  		}
  1597  		cmd := fmt.Sprintf("cd ${HOME}/local/%d ; ", c.Nodes[0])
  1598  		if len(args) == 0 /* interactive */ {
  1599  			cmd += "/bin/bash "
  1600  		}
  1601  		if len(args) > 0 {
  1602  			cmd += fmt.Sprintf("export ROACHPROD=%d%s ; ", c.Nodes[0], c.Tag)
  1603  			cmd += strings.Join(expandedArgs, " ")
  1604  		}
  1605  		allArgs = append(allArgs, cmd)
  1606  	} else {
  1607  		allArgs = []string{
  1608  			"ssh",
  1609  			fmt.Sprintf("%s@%s", c.user(c.Nodes[0]), c.host(c.Nodes[0])),
  1610  			"-o", "UserKnownHostsFile=/dev/null",
  1611  			"-o", "StrictHostKeyChecking=no",
  1612  		}
  1613  		allArgs = append(allArgs, sshAuthArgs()...)
  1614  		allArgs = append(allArgs, sshArgs...)
  1615  		if len(args) > 0 {
  1616  			allArgs = append(allArgs, fmt.Sprintf("export ROACHPROD=%d%s ;", c.Nodes[0], c.Tag))
  1617  		}
  1618  		allArgs = append(allArgs, expandedArgs...)
  1619  	}
  1620  
  1621  	sshPath, err := exec.LookPath(allArgs[0])
  1622  	if err != nil {
  1623  		return err
  1624  	}
  1625  	return syscall.Exec(sshPath, allArgs, os.Environ())
  1626  }
  1627  
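        // scp copies src to dest by shelling out to the scp binary with compression
        // (-C) and recursive copying (-r) enabled.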
  1628  func (c *SyncedCluster) scp(src, dest string) error {
  1629  	args := []string{
  1630  		"scp", "-r", "-C",
  1631  		"-o", "StrictHostKeyChecking=no",
  1632  	}
  1633  	args = append(args, sshAuthArgs()...)
  1634  	args = append(args, src, dest)
  1635  	cmd := exec.Command(args[0], args[1:]...)
  1636  	out, err := cmd.CombinedOutput()
  1637  	if err != nil {
  1638  		return errors.Wrapf(err, "~ %s\n%s", strings.Join(args, " "), out)
  1639  	}
  1640  	return nil
  1641  }
  1642  
  1643  // Parallel runs fn for each index in [0, count) with bounded concurrency.
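        // A concurrency of 0 means no explicit limit (all count invocations may run
        // at once), still capped by c.MaxConcurrency when that is set. Progress is
        // reported under the given display string, and the process exits fatally if
        // any invocation returns an error.
        //
        // Example (sketch; the command run by the callback is arbitrary):
        //
        //	c.Parallel("pinging nodes", len(c.Nodes), 0 /* concurrency */, func(i int) ([]byte, error) {
        //		sess, err := c.newSession(c.Nodes[i])
        //		if err != nil {
        //			return nil, err
        //		}
        //		defer sess.Close()
        //		return sess.CombinedOutput("true")
        //	})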
  1644  func (c *SyncedCluster) Parallel(
  1645  	display string, count, concurrency int, fn func(i int) ([]byte, error),
  1646  ) {
  1647  	if concurrency == 0 || concurrency > count {
  1648  		concurrency = count
  1649  	}
  1650  	if c.MaxConcurrency > 0 && concurrency > c.MaxConcurrency {
  1651  		concurrency = c.MaxConcurrency
  1652  	}
  1653  	type result struct {
  1654  		index int
  1655  		out   []byte
  1656  		err   error
  1657  	}
  1658  
  1659  	results := make(chan result, count)
  1660  	var wg sync.WaitGroup
  1661  	wg.Add(count)
  1662  
  1663  	var index int
  1664  	startNext := func() {
  1665  		go func(i int) {
  1666  			defer wg.Done()
  1667  			out, err := fn(i)
  1668  			results <- result{i, out, err}
  1669  		}(index)
  1670  		index++
  1671  	}
  1672  
  1673  	for index < concurrency {
  1674  		startNext()
  1675  	}
  1676  
  1677  	go func() {
  1678  		wg.Wait()
  1679  		close(results)
  1680  	}()
  1681  
  1682  	var writer ui.Writer
  1683  	out := io.Writer(os.Stdout)
  1684  	if display == "" {
  1685  		out = ioutil.Discard
  1686  	}
  1687  
  1688  	var ticker *time.Ticker
  1689  	if !c.Quiet {
  1690  		ticker = time.NewTicker(100 * time.Millisecond)
  1691  	} else {
  1692  		ticker = time.NewTicker(1000 * time.Millisecond)
  1693  		fmt.Fprintf(out, "%s", display)
  1694  	}
  1695  
  1696  	defer ticker.Stop()
  1697  	complete := make([]bool, count)
  1698  	var failed []result
  1699  
  1700  	var spinner = []string{"|", "/", "-", "\\"}
  1701  	spinnerIdx := 0
  1702  
  1703  	for done := false; !done; {
  1704  		select {
  1705  		case <-ticker.C:
  1706  			if c.Quiet {
  1707  				fmt.Fprintf(out, ".")
  1708  			}
  1709  		case r, ok := <-results:
  1710  			if r.err != nil {
  1711  				failed = append(failed, r)
  1712  			}
  1713  			done = !ok
  1714  			if ok {
  1715  				complete[r.index] = true
  1716  			}
  1717  			if index < count {
  1718  				startNext()
  1719  			}
  1720  		}
  1721  
  1722  		if !c.Quiet {
  1723  			fmt.Fprint(&writer, display)
  1724  			var n int
  1725  			for i := range complete {
  1726  				if complete[i] {
  1727  					n++
  1728  				}
  1729  			}
  1730  			fmt.Fprintf(&writer, " %d/%d", n, len(complete))
  1731  			if !done {
  1732  				fmt.Fprintf(&writer, " %s", spinner[spinnerIdx%len(spinner)])
  1733  			}
  1734  			fmt.Fprintf(&writer, "\n")
  1735  			_ = writer.Flush(out)
  1736  			spinnerIdx++
  1737  		}
  1738  	}
  1739  
  1740  	if c.Quiet {
  1741  		fmt.Fprintf(out, "\n")
  1742  	}
  1743  
  1744  	if len(failed) > 0 {
  1745  		sort.Slice(failed, func(i, j int) bool { return failed[i].index < failed[j].index })
  1746  		for _, f := range failed {
  1747  			fmt.Fprintf(os.Stderr, "%d: %+v: %s\n", f.index, f.err, f.out)
  1748  		}
  1749  		log.Fatal("command failed")
  1750  	}
  1751  }
  1752  
  1753  func (c *SyncedCluster) escapedTag() string {
  1754  	return strings.Replace(c.Tag, "/", "\\/", -1)
  1755  }